npm - @seanyao/roll - Versions diffs - 2026.528.2 → 2026.529.2 - Mend

@seanyao/roll 2026.528.2 → 2026.529.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

package/CHANGELOG.md +31 -8
package/README.md +2 -0
package/bin/roll +917 -50
package/lib/README.md +42 -0
package/lib/__pycache__/roll-loop-status.cpython-314.pyc +0 -0
package/lib/agent_routes_lint.py +203 -0
package/lib/i18n/README.md +54 -0
package/lib/i18n/doctor.sh +13 -0
package/lib/i18n/loop.sh +12 -12
package/lib/loop_pick_agent.py +245 -0
package/lib/prices/README.md +35 -0
package/lib/roll-help.py +1 -0
package/lib/roll-loop-status.py +109 -0
package/lib/test_quality_gate.py +143 -0
package/package.json +1 -1
package/skills/roll-brief/SKILL.md +7 -0
package/skills/roll-build/SKILL.md +95 -0
package/skills/roll-design/SKILL.md +45 -0
package/skills/roll-fix/SKILL.md +76 -0
package/skills/roll-loop/SKILL.md +13 -0
package/skills/roll-onboard/SKILL.md +6 -0

package/lib/roll-loop-status.py CHANGED Viewed

@@ -706,6 +706,97 @@ def rollup_for_story(cycles: List[Dict[str, Any]], story_id: str) -> Dict[str, A
             r["model"] = cy["model"]
     return r
+# US-SKILL-014: aggregate the last N self-score notes for the dashboard.
+# Reads .roll/notes/*.md (frontmatter format from US-SKILL-010), returns
+#   "self-score: mean 7.8 / min 4 / redo 2 (last 14)"
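+# or "" when the notes dir is missing or holds no *.md files, or
+#   "self-score: (n/a) — N sample(s), need 3 (last 14)"
+# when fewer than 3 notes in the window carry a parseable score
+# (N = scored notes found; the trailing number is the window size).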
+def _self_score_summary_line(notes_dir = None, window: int = 14) -> str:
+    notes_dir = notes_dir if notes_dir is not None else Path(".roll/notes")
+    if not notes_dir.exists():
+        return ""
+    files = sorted(notes_dir.glob("*.md"))[-window:]
+    if not files:
+        return ""
+    total = 0
+    count = 0
+    minv = 11
+    redo = 0
+    for f in files:
+        score = None
+        verdict = None
+        for line in f.read_text(errors="ignore").splitlines():
+            if line.startswith("score: "):
+                try:
+                    score = int(line.split(": ", 1)[1].strip())
+                except ValueError:
+                    score = None
+            elif line.startswith("verdict: "):
+                verdict = line.split(": ", 1)[1].strip()
+            if score is not None and verdict is not None:
+                break
+        if score is None:
+            continue
+        count += 1
+        total += score
+        if score < minv:
+            minv = score
+        if verdict == "regression":
+            redo += 1
+        elif verdict == "ok" and score < 6:
+            redo += 1
+    if count < 3:
+        return f"self-score: (n/a) — {count} sample(s), need 3 (last {window})"
+    mean = total / count
+    return f"self-score: mean {mean:.1f} / min {minv} / redo {redo} (last {window})"
+# US-AGENT-010: per-agent hit-rate summary for the ROLLUP block.
+# Aggregates the last `window_cycles` runs.jsonl records grouped by `agent`.
+# Returns a single-line string like
+#     "agents: pi 8/22 (36%) · deepseek 5/8 (63%) · claude 2/2 (n/a)"
+# Empty agents / missing agent field are skipped. Sample < min_sample renders
+# as "(n/a)" instead of a percentage to avoid noise from tiny windows.
+def _agent_summary_line(records: List[Dict[str, Any]], window_cycles: int = 50,
+                       min_sample: int = 5) -> str:
+    if not records or window_cycles <= 0:
+        return ""
+    # Take the most recent `window_cycles` records that have an agent field.
+    tail: List[Dict[str, Any]] = []
+    for rec in records[-window_cycles:]:
+        agent = (rec or {}).get("agent") or ""
+        if not agent:
+            continue
+        tail.append(rec)
+    if not tail:
+        return ""
+    counts: Dict[str, List[int]] = {}
+    # preserve first-seen order for stable output
+    order: List[str] = []
+    for rec in tail:
+        agent = rec.get("agent") or ""
+        if not agent:
+            continue
+        if agent not in counts:
+            counts[agent] = [0, 0]
+            order.append(agent)
+        counts[agent][1] += 1
+        if rec.get("status") == "built":
+            counts[agent][0] += 1
+    if not order:
+        return ""
+    parts: List[str] = []
+    for agent in order:
+        built, total = counts[agent]
+        if total < min_sample:
+            parts.append(f"{agent} {built}/{total} (n/a)")
+        else:
+            pct = round(100 * built / total) if total else 0
+            parts.append(f"{agent} {built}/{total} ({pct}%)")
+    return "agents: " + " · ".join(parts)
 def rollup_for_day(day_cycles: List[Dict[str, Any]]) -> Dict[str, Any]:
     # US-VIEW-012: track input + output separately so the daily summary can
     # show two metric rows. cache_read tokens deliberately excluded — they're
@@ -930,6 +1021,24 @@ def render(events, cron, state, backlog, *, days=3, lang="both", now=None,
                       d2["cost_by_cur"].get(_cur, 0.0),
                       partial=is_partial, symbol=_sym)
+    # US-AGENT-010: per-agent hit-rate summary (single line).
+    try:
+        runs_records = list(runs.values()) if isinstance(runs, dict) else list(runs or [])
+        runs_records.sort(key=lambda r: (r or {}).get("ts", ""))
+        _agent_line = _agent_summary_line(runs_records, window_cycles=50)
+    except Exception:
+        _agent_line = ""
+    if _agent_line:
+        print("  " + c("dim", _agent_line))
+    # US-SKILL-014: per-skill self-score trend (single line) under the agent line.
+    try:
+        _skill_line = _self_score_summary_line()
+    except Exception:
+        _skill_line = ""
+    if _skill_line:
+        print("  " + c("dim", _skill_line))
     print()
     print(c("faint", "─" * COLS))
     print()

package/lib/test_quality_gate.py ADDED Viewed

@@ -0,0 +1,143 @@
+#!/usr/bin/env python3
+"""Test quality merge gate (US-QA-012).
+Scan bats test files for rubric ❼ (inline external-tool behaviour) and ❽
+(file outside this repo) violations. Loop's auto-merge runs this between
+CI green and merge; non-zero exit holds the PR until either the test is
+fixed or PR description carries `[skip-test-quality]` (US-QA-013).
+Usage:
+  test_quality_gate.py [--skip] <bats-file> [<bats-file> …]
+Exit:
+  0 — clean OR --skip flag set
+  1 — one or more violations
+  2 — usage error
+"""
+from __future__ import annotations
+import re
+import sys
+from pathlib import Path
+from typing import List, Tuple
+# ❼ — inline external-tool patterns. We flag any line that matches AT LEAST
+# ONE of these tool markers, which signals hand-rolled parsing that a project
+# helper should own. A plain `grep -q` (no `-o`) matches none of the patterns
+# and is fine; an `awk '…'` script is flagged even without a pipe chain.
+INLINE_TOOL_PATTERNS = [
+    re.compile(r"\bsed\s+[^|]*[s/]"),         # sed with substitution / address
+    re.compile(r"\bawk\s+'"),                 # awk with script
+    re.compile(r"\bgrep\s+-[a-zA-Z]*o"),       # grep -o / -oE (extraction)
+    re.compile(r"\bfind\s+[^|]*-name"),         # find -name (path scanning)
+    re.compile(r"\bcut\s+-f"),                # cut -f (column extraction)
+    re.compile(r"\btr\s+-d"),                 # tr -d (char deletion)
+]
+# ❽ — paths outside this repo. We flag `~/.<name>` (dotfile dirs) and
+# absolute system paths. `BATS_TMPDIR` is the sandbox marker and is fine.
+OUTSIDE_PATTERNS = [
+    re.compile(r"~/\.[A-Za-z]"),                # ~/.codex, ~/.kimi, ~/.roll, etc.
+    re.compile(r"(?<![A-Za-z0-9])/etc/[A-Za-z]"),
+    re.compile(r"(?<![A-Za-z0-9])/usr/[A-Za-z]"),
+    re.compile(r"(?<![A-Za-z0-9])/var/[A-Za-z]"),
+]
+OUTSIDE_ALLOW = re.compile(r"BATS_TMPDIR")
+def _scan_lines(text: str) -> List[Tuple[int, str, str]]:
+    """Return list of (line_no, kind, snippet). kind is "❼" or "❽"."""
+    findings: List[Tuple[int, str, str]] = []
+    in_heredoc = False
+    heredoc_terminator: str = ""
+    lines = text.splitlines()
+    for idx, raw_line in enumerate(lines, start=1):
+        line = raw_line.rstrip("\n")
+        stripped = line.lstrip()
+        if in_heredoc:
+            if line.strip() == heredoc_terminator:
+                in_heredoc = False
+            continue
+        # Skip comments — comments can legitimately discuss sed/awk in prose.
+        if stripped.startswith("#"):
+            continue
+        # Skip @test header lines — bats decorators carry the test name
+        # which often quotes the patterns the test exercises (false positive).
+        if stripped.startswith("@test "):
+            continue
+        # Explicit allow marker for lines that legitimately exercise the
+        # gate itself (test fixture content), or for project doc-validation
+        # awks that don't test production code.
+        if "test-quality:allow" in line:
+            continue
+        # Heredoc start: << 'EOF' or <<EOF (optional quotes).
+        # After the heredoc declarator on this line, subsequent lines are
+        # data until the terminator appears alone on a line. Only terminators
+        # made of uppercase letters and underscores are recognised.
+        m = re.search(r"<<\s*['\"]?([A-Z_]+)['\"]?", line)
+        if m:
+            heredoc_terminator = m.group(1)
+            in_heredoc = True
+            # Don't scan this declarator line further — the leading code
+            # before "<<" might still contain tool patterns, but we'd be
+            # double-flagging here vs the line that actually executes.
+            continue
+        # ❼: any inline extraction/parsing tool on this line flags. Each
+        # pattern intentionally describes parsing intent (sed substitution,
+        # awk script, grep -o / -oE, find -name, cut -f, tr -d) — single
+        # grep -q without -o doesn't match and stays untouched.
+        if any(pat.search(line) for pat in INLINE_TOOL_PATTERNS):
+            findings.append((idx, "❼", line.strip()))
+        # ❽: any outside-path hit unless BATS_TMPDIR appears (sandbox marker).
+        if OUTSIDE_ALLOW.search(line):
+            continue
+        for pat in OUTSIDE_PATTERNS:
+            if pat.search(line):
+                findings.append((idx, "❽", line.strip()))
+                break  # one ❽ finding per line is enough
+    return findings
+def scan_file(path: Path) -> List[Tuple[int, str, str]]:
+    try:
+        text = path.read_text(errors="ignore")
+    except FileNotFoundError:
+        return [(0, "?", f"file not found: {path}")]
+    return _scan_lines(text)
+def main() -> int:
+    args = sys.argv[1:]
+    skip = False
+    files: List[str] = []
+    for a in args:
+        if a in ("--skip", "--skip-test-quality"):
+            skip = True
+        else:
+            files.append(a)
+    if not files:
+        print("usage: test_quality_gate.py [--skip] <bats-file> [<bats-file> …]",
+              file=sys.stderr)
+        return 2
+    if skip:
+        return 0
+    total = 0
+    for f in files:
+        findings = scan_file(Path(f))
+        for line_no, kind, snippet in findings:
+            print(f"{f}:{line_no}: {kind} {snippet}")
+            total += 1
+    return 1 if total > 0 else 0
+if __name__ == "__main__":
+    sys.exit(main())

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@seanyao/roll",
-  "version": "2026.528.2",
+  "version": "2026.529.2",
   "description": "Roll — Roll out features with AI agents",
   "scripts": {
     "test": "bash tests/run.sh"

package/skills/roll-brief/SKILL.md CHANGED Viewed

@@ -127,6 +127,13 @@ A simple heuristic — not a gate, just a signal for the human:
 |----|-------------|----------|
 | US-XXX | {标题} | 高 |
+<!-- US-AGENT-010: per-agent hit-rate summary (one line). Read the last
+     window_cycles records of runs.jsonl, group by `agent`, format as
+     `agents: pi 8/22 (36%) · deepseek 5/8 (63%)`. Sample < 5 → `(n/a)`.
+     Omit when no records have an agent field (legacy data). -->
+## Agent 路由命中率
+{agents: <name> built/total (pct%) · …}  ← from `runs.jsonl` last 50 cycles
 <!-- 仅当 roll-.dream 有新发现时输出 -->
 ## 悟见
 {来自 .roll/dream/ 的摘要}

package/skills/roll-build/SKILL.md CHANGED Viewed

@@ -52,6 +52,66 @@ Do not use for:
 Activate when input is a `US-[A-Z]+-[0-9]+` identifier.
+### Step 0: Pre-flight self-check (US-AGENT-007)
+Before reading the Story in depth or splitting actions, **read the Agent profile** from the story's feature md and decide whether this cycle can realistically deliver it. The check is mechanical:
+```
+inputs:
+  story.est_min       (from **Agent profile:** block, US-AGENT-001)
+  story.risk_zone     (low / medium / high)
+  story.chain_depth   (0 unless already a downgrade product)
+  agent.max_est_min   (from .roll/agent-routes.yaml for the current agent)
+  history.prefer_threshold (from .roll/agent-routes.yaml)
+  history.hit_rate    (this agent × this story_type, last window_cycles)
+verdict:
+  too_big when ANY of these is true:
+    1. story.est_min > agent.max_est_min   (hard capacity miss)
+    2. story.risk_zone not in agent.risk    (hard risk miss)
+    3. history.hit_rate < prefer_threshold AND story.chain_depth == 0
+       (soft signal: history says this agent's not on top of this type yet,
+        and we still have downgrade budget — don't burn a cycle)
+  ok otherwise
+```
+Output the verdict as the first line of the cycle response:
+```yaml
+verdict: ok    # or: too_big
+reason: <one short line — which condition triggered, with numbers>
+```
+When `verdict: ok` → continue to Step 1 normally.
+When `verdict: too_big` → go to **US-AGENT-008 self-downgrade path**, **but** first run the **US-AGENT-009 chain_depth cap check**:
+```bash
+# 0a. Cap check: refuse the third consecutive auto-split.
+#     exit 0 → split allowed; exit 1 → cap hit, take cap-hit path instead.
+if ! bash -c 'source "$(command -v roll)"; _loop_chain_depth_cap_check US-XXX-NNN'; then
+  # Cap hit (chain_depth ≥ 2): hold + ALERT, exit cleanly.
+  bash -c 'source "$(command -v roll)"; _loop_split_cap_hit US-XXX-NNN "depth >= 2, human triage required"'
+  exit 0
+fi
+# 1. Invoke roll-design to re-split the story into smaller sub-stories.
+#    Each sub-story carries chain_depth = (parent.chain_depth + 1).
+#    Sub-stories land as 📋 Todo with depends-on:<parent> chained.
+Skill("roll-design", "--from-story US-XXX-NNN")
+# 2. After the sub-stories are written to BACKLOG, flip the parent
+#    to 🚫 Hold and emit the downgrade event. The helper handles ALERT.
+bash -c 'source "$(command -v roll)"; _loop_self_downgrade US-XXX-NNN "too_big: <reason from verdict>" "US-XXX-NNNa,US-XXX-NNNb"'
+# 3. Exit cleanly — no TCR commits this cycle. The next loop cycle picks
+#    up the first sub-story (which is smaller and should pass pre-flight).
+exit 0
+```
+If `roll-design` cannot produce ≥2 sub-stories (story is already irreducible), fall through to **US-AGENT-009 cap-hit path** by invoking `_loop_split_cap_hit` directly. The cap is purely about stopping infinite split chains; even on the first re-split, if the design step gives up, the cap-hit handler raises ALERT for human triage.
+> Pre-flight is honest, not paranoid: a small story (est_min ≤ 5, chain_depth=0, low risk) should almost always go `ok`. The check pays off on the long tail — stories that look small but touch many files, or that the current agent has historically failed on.
 ### Step 1: Read the Story
 1. Open `.roll/backlog.md`, find the US row, follow the link to `.roll/features/<feature>.md`
@@ -64,6 +124,16 @@ Activate when input is a `US-[A-Z]+-[0-9]+` identifier.
 - Pick the smallest shippable Action first
 - **Granularity constraint**: Each Action completable in 2–5 minutes; split if larger
 - **No placeholders**: Action descriptions must be specific and directly executable
+- **Test-quality self-check (US-QA-011)** — for every Action that adds tests:
+  1. Tests call project functions / public command entry points; do NOT inline
+     external-tool behaviour (`sed`/`awk`/`grep`/`find`/`cut` pipelines that
+     duplicate logic already in `lib/` or `bin/`) — rubric ❼.
+  2. Tests sandbox filesystem state via `BATS_TMPDIR` (or equivalent); do NOT
+     touch or assert on paths outside this repo (`~/.codex`, `~/.kimi`,
+     `~/.roll/`, `/etc/...`) — rubric ❽.
+  3. If you can't satisfy (1) or (2), reshape the Action: extract a project
+     helper, redirect the env var to a tmp dir, or move the test to an
+     integration tier where the boundary is intentional and documented.
 #### 2.5 Parallel Dispatch (auto-determined)
@@ -636,8 +706,33 @@ Before creating any file or directory:
 - [ ] **.roll/backlog.md index status updated** (📋 → ✅, REQUIRED)
 - [ ] **`.roll/features/<feature>.md` US section updated** (Completed date + [x] ACs, REQUIRED)
 - [ ] **CHANGELOG.md staged and bundled** into completion commit via `$roll-.changelog` in Phase 11 (REQUIRED)
+- [ ] **Self-score note written (US-SKILL-010 / 012)** — see "Self-score" subsection below
 - [ ] Summary reported to user
+### Self-score (US-SKILL-012)
+Before reporting completion to the user, write one self-score note. The
+helper lands the note under `.roll/notes/<date>-roll-build-<US-id>-<epoch>.md`
+with YAML frontmatter so trend analysis (US-SKILL-014) can aggregate later:
+```bash
+bash -c 'source "$(command -v roll)"; \
+  _skill_write_self_score roll-build US-XXX-NNN <score 1..10> <good|ok|regression> "<rationale>"'
+```
+Score guidance (integer 1..10):
+- **9..10** — story shipped cleanly: AC fully met, TCR rhythm tight, no
+  re-tries from `verdict: too_big`, peer review concerns addressed inline.
+- **6..8** — shipped with caveats: re-tries on red, edge case left to a
+  follow-up FIX, documentation lagged behind code by one cycle, etc.
+- **1..5** — shipped but at low confidence: AC partially met (note which),
+  TCR rhythm broken (multiple revert iterations), or `regression` verdict.
+Verdict values:
+- `good` — story fully delivered; AC met; no concerning signal.
+- `ok` — shipped but with at least one documented trade-off (use rationale).
+- `regression` — story landed but another behaviour broke (rare; open a FIX).
 ---
 ## TCR Recovery Patterns

package/skills/roll-design/SKILL.md CHANGED Viewed

@@ -681,6 +681,11 @@ Note: `{DOMAIN}` maps to the Bounded Context name identified in DDD analysis.
 - Events raised: [{EventName}] → {consumer context}
 - Cross-context: {if touches another context, otherwise omit}
+**Agent profile:**
+- est_min: {1-30 整数,目标 5-10 min 一个 cycle 闭环}
+- risk_zone: {low / medium / high — 改文档 low,改用户可见行为 medium,改 loop infra 或安全/隔离基建 high}
+- chain_depth: 0  {若是自降级产出的子 story 则 +1,累计 ≥2 时第 3 次拒拆}
 **AC:**
 - [ ] {measurable criteria 1}
 - [ ] {measurable criteria 2}
@@ -700,6 +705,10 @@ Note: `{DOMAIN}` maps to the Bounded Context name identified in DDD analysis.
 - Integration test: `tests/integration/{flow}.test.ts`
 ```
+> **强制规则 — Agent profile 必须填**：Split into Stories 步骤产出的每个 US 都必须带 `**Agent profile:**` 子段，est_min / risk_zone 不可省（chain_depth 默认 0）。loop 路由（US-AGENT-004）和 agent 自评（US-AGENT-007）都靠这两个字段决策，缺了就回退到 cold_start_default 并 WARN。历史 US 不强制回填。
+>
+> **MUST fill** the `**Agent profile:**` block on every newly split US — `est_min` and `risk_zone` are non-optional. They drive loop routing and agent self-eval downstream.
 ### Closing Doc-Refresh Story Template — Phase N.M 收尾文档
 When any preceding US in the batch changes user-visible behavior, append this template story at the end of the batch. Wire it as `depends-on:` against every preceding user-facing US so it runs last.
@@ -762,6 +771,42 @@ Each story must be:
 ---
+## Self-score (US-SKILL-010 / 013)
+After Step 5 (Write to BACKLOG) completes — i.e. once the new US rows are
+landed and the user has either confirmed or chosen `No` (story still
+queued) — write a single self-score note covering the design session:
+```bash
+bash -c 'source "$(command -v roll)"; \
+  _skill_write_self_score roll-design US-XXX-NNN <score 1..10> <good|ok|regression> "<rationale>"'
+```
+Use the **first** US-id of the batch as the story handle (or the
+representative story when splitting). The note lands under
+`.roll/notes/<date>-roll-design-<id>-<epoch>.md` so US-SKILL-014 can
+read trend data.
+Score guidance for design quality (integer 1..10):
+- **9..10** — clean split: every US is INVEST-compliant, profile
+  (est_min / risk_zone / chain_depth) filled, doc-refresh closer wired
+  when user-visible behaviour changed, peer-review unnecessary or
+  reached AGREE quickly.
+- **6..8** — split shipped but with caveats: one US borderline on
+  INVEST (e.g. shared file conflict), or Discuss had ESCALATE before
+  settling, or profile was partial.
+- **1..5** — split shipped but rough: missing doc-refresh closer,
+  profile fields skipped, user story too coarse for an AI cycle; flag a
+  follow-up `roll-design` pass.
+Verdict values:
+- `good` — design + split are clean.
+- `ok` — design is acceptable with one or two trade-offs noted.
+- `regression` — the split visibly broke something earlier (rare; e.g.
+  invalidated a previously-stable depends-on chain).
+---
 ## Integration
 ### With roll-build

package/skills/roll-fix/SKILL.md CHANGED Viewed

@@ -87,6 +87,46 @@ Before creating any file or directory:
 ## TCR Workflow
+### 0. Pre-flight self-check (US-AGENT-007)
+Before locking the issue, read the FIX's Agent profile (est_min / risk_zone / chain_depth) from the linked feature md and decide whether this cycle should attempt the fix:
+```
+inputs:
+  fix.est_min       (from **Agent profile:** block on the FIX row's feature md)
+  fix.risk_zone     (low / medium / high)
+  fix.chain_depth   (0 unless already a downgrade product)
+  agent.max_est_min (from .roll/agent-routes.yaml for the current agent)
+  history.prefer_threshold + history.hit_rate (FIX history for this agent)
+verdict:
+  too_big when ANY:
+    1. fix.est_min > agent.max_est_min
+    2. fix.risk_zone not in agent.risk
+    3. history.hit_rate < prefer_threshold AND fix.chain_depth == 0
+  ok otherwise
+```
+Emit `verdict: ok` or `verdict: too_big` (with `reason:`) as the first cycle output line.
+- `ok` → continue with step 1 below normally
+- `too_big` → self-downgrade per US-AGENT-008, **gated by US-AGENT-009 cap check**:
+```bash
+# Cap check first (chain_depth ≥ 2 → refuse third auto-split).
+if ! bash -c 'source "$(command -v roll)"; _loop_chain_depth_cap_check FIX-XXX-NNN'; then
+  bash -c 'source "$(command -v roll)"; _loop_split_cap_hit FIX-XXX-NNN "depth >= 2"'
+  exit 0
+fi
+Skill("roll-design", "--from-story FIX-XXX-NNN")
+bash -c 'source "$(command -v roll)"; _loop_self_downgrade FIX-XXX-NNN "too_big: <reason>" "FIX-XXX-NNNa,FIX-XXX-NNNb"'
+exit 0
+```
+Original FIX goes to 🚫 Hold with `→ split to ...` annotation; sub-stories carry `chain_depth + 1`. Cap-hit path raises ALERT for human triage. Do NOT TCR a half fix.
+Bug fixes are usually small (est_min ≤ 5), so pre-flight is mostly a sanity barrier for FIXes whose underlying issue turns out structural — e.g. a "simple null check" that requires touching 12 files. Catching that upfront is cheaper than burning a cycle.
 ### 1. Lock the issue
    - state the user-visible issue or requested enhancement
    - define the scope boundary and non-goals
@@ -96,6 +136,15 @@ Before creating any file or directory:
    - define the online verification target
    - for hotfixes: include regression test to prevent recurrence
    - reference `$roll-.qa` for appropriate test type (unit/integration/E2E)
+   - **Test-quality self-check (US-QA-011)** — for any new test the fix adds:
+     1. The test must call project functions / public command entry points,
+        not inline `sed`/`awk`/`grep -o`/`find`/`cut` pipelines that
+        re-implement what `lib/` or `bin/` already does — rubric ❼.
+     2. The test must sandbox filesystem state via `BATS_TMPDIR` or an
+        equivalent helper; never assert on or write to paths outside this
+        repo (`~/.codex`, `~/.kimi`, `~/.roll/`, system paths) — rubric ❽.
+     3. If you can't satisfy (1) or (2), extract a project helper or
+        redirect the env var to a tmp dir before writing the test.
 ### 3. Test Design Review (TCR Core)
@@ -366,6 +415,33 @@ A minor change is only "done" when all are true:
 - [ ] Deployment completed
 - [ ] Online verification performed
 - [ ] **Verification Gate passed** (fresh evidence for tests, build, fix confirmation, no regression)
+- [ ] **Self-score note written (US-SKILL-010 / 011)** — before exit, the agent
+      writes a structured score note via `_skill_write_self_score` so trend
+      analysis (US-SKILL-014) and skill-self-scoring docs (US-SKILL-015) have
+      data to read.
+### Self-score (US-SKILL-011)
+Before exiting the cycle, write one self-score note. The helper validates
+inputs and lands the note under `.roll/notes/<date>-roll-fix-<FIX-id>-<epoch>.md`:
+```bash
+bash -c 'source "$(command -v roll)"; \
+  _skill_write_self_score roll-fix FIX-XXX-NNN <score 1..10> <good|ok|regression> "<rationale>"'
+```
+Score guidance (integer 1..10):
+- **9..10** — clean root-cause fix; regression test added; TCR cycle smooth.
+- **6..8** — fix shipped but with caveats (e.g. workaround, partial coverage,
+  or repeated TCR red iterations); rationale explains the trade-off.
+- **1..5** — fix landed but quality is below the bar (test coverage missing,
+  fix only narrows blast radius, repeated agent re-tries). Verdict should be
+  `ok`, or `regression` if a related test broke.
+Verdict values:
+- `good` — fix is the proper root-cause fix; no caveats.
+- `ok` — shipped but with documented trade-offs (use rationale to explain).
+- `regression` — the fix re-broke something else (rare; consider re-opening).
 ## Rubric

package/skills/roll-loop/SKILL.md CHANGED Viewed

@@ -240,6 +240,19 @@ Together these mean: only one loop runs at a time per project (LOCK), and within
 ### Step 3 — Route and Execute
+> **US-AGENT-006 — Per-story agent routing (pre-cycle)**
+>
+> Before this skill even starts, the runner inner script has already:
+> 1. Picked the next eligible Todo via `_loop_pick_next_story` (priority FIX > US > REFACTOR, manual-only / depends-on gates respected)
+> 2. Read its Agent profile (est_min / risk_zone) and routed an agent via `_loop_pick_agent_for_story` (hard rules from `.roll/agent-routes.yaml` + soft preference from `runs.jsonl`)
+> 3. Exported `ROLL_LOOP_ROUTED_STORY` / `ROLL_LOOP_ROUTED_AGENT` / `ROLL_LOOP_ROUTED_RULE` and printed `[loop] story <id> routed to <agent> via <rule_kind>` to cron.log
+>
+> When `ROLL_LOOP_ROUTED_STORY` is set, prefer it as `US_ID` for this cycle. The story has already been chosen by hard+soft routing rules — do not re-pick a different one unless that story can no longer be found in BACKLOG (e.g. status changed concurrently).
+>
+> Old single-agent fallback (`primary_agent` from `~/.roll/config.yaml`) still applies when:
+> - no story is pickable (empty Todo / all manual-only)
+> - the matching agent-routes.yaml has no agent that fits the story profile (then `cold_start_default` is used)
 For each item, **before invoking the executor skill**, mark the story 🔨 In Progress in the **main repo's** .roll/backlog.md so brief and peer agents can see it being worked on. The cycle worktree is gitignored at .roll/, so editing the worktree's own copy + committing carries no change back to main — write directly via the helper instead:
 ```bash

package/skills/roll-onboard/SKILL.md CHANGED Viewed

@@ -115,6 +115,12 @@ privacy:
 sync_targets: [claude, cursor]               # user's Q8
 enable_loop: false                            # user's Q9
+agent_routes_template: default                # user's Q10 — agent routing preset
+                                              # one of: default / minimal / heavy / skip
+                                              # default = pi/deepseek/claude + history (US-AGENT-002)
+                                              # minimal = single agent (pi), no history
+                                              # heavy   = pi/deepseek/claude/kimi + larger window
+                                              # skip    = don't seed .roll/agent-routes.yaml
 ```
 Then tell the user: