@seanyao/roll 2026.528.2 → 2026.529.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -706,6 +706,97 @@ def rollup_for_story(cycles: List[Dict[str, Any]], story_id: str) -> Dict[str, A
706
706
  r["model"] = cy["model"]
707
707
  return r
708
708
 
709
+
710
# US-SKILL-014: aggregate the last N self-score notes for the dashboard.
def _self_score_summary_line(notes_dir=None, window: int = 14) -> str:
    """Summarise recent self-score notes as a single dashboard line.

    Reads ``*.md`` files under ``notes_dir`` (default ``.roll/notes``), keeps
    the last ``window`` files in filename order, and extracts the first
    parseable ``score: <int>`` line and the ``verdict: <word>`` line from
    each one.

    Args:
        notes_dir: Directory holding the notes, as a ``str`` or ``Path``.
            ``None`` means ``.roll/notes`` relative to the current directory.
        window: Maximum number of note files to consider. Values <= 0 yield
            an empty summary.

    Returns:
        ``"self-score: mean 7.8 / min 4 / redo 2 (last 14)"`` when at least
        three notes carry a score;
        ``"self-score: (n/a) — N sample(s), need 3 (last W)"`` when fewer
        do; ``""`` when the directory is missing, holds no notes, or
        ``window`` is not positive.

    A note counts towards ``redo`` when its verdict is ``regression`` or when
    its verdict is ``ok`` with a score below 6.
    """
    # A slice of [-0:] would select every file, so reject empty windows early.
    if window <= 0:
        return ""
    # Path() accepts both str and Path, so callers may pass either.
    notes_dir = Path(notes_dir) if notes_dir is not None else Path(".roll/notes")
    if not notes_dir.exists():
        return ""
    # NOTE(review): "last N" is by filename order; this presumably relies on
    # the notes being date-prefixed — confirm against the note writer.
    files = sorted(notes_dir.glob("*.md"))[-window:]
    if not files:
        return ""

    scores = []
    redo = 0
    for note in files:
        try:
            text = note.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            # One unreadable note must not blank the whole summary.
            continue
        score = None
        verdict = None
        for line in text.splitlines():
            if line.startswith("score: "):
                try:
                    score = int(line.split(": ", 1)[1].strip())
                except ValueError:
                    score = None
            elif line.startswith("verdict: "):
                verdict = line.split(": ", 1)[1].strip()
            if score is not None and verdict is not None:
                break
        if score is None:
            continue
        scores.append(score)
        if verdict == "regression" or (verdict == "ok" and score < 6):
            redo += 1

    count = len(scores)
    if count < 3:
        return f"self-score: (n/a) — {count} sample(s), need 3 (last {window})"
    mean = sum(scores) / count
    return f"self-score: mean {mean:.1f} / min {min(scores)} / redo {redo} (last {window})"
753
+
754
+
755
# US-AGENT-010: per-agent hit-rate summary for the ROLLUP block.
def _agent_summary_line(records: List[Dict[str, Any]], window_cycles: int = 50,
                        min_sample: int = 5) -> str:
    """Render a one-line per-agent hit-rate summary.

    Takes the last ``window_cycles`` entries of ``records``, drops entries
    without a non-empty ``agent`` value, and groups the rest by agent. A
    record counts as a hit when its ``status`` equals ``"built"``.

    Args:
        records: Run records, oldest first. Entries that are not dicts or
            that lack an ``agent`` value are ignored.
        window_cycles: Number of trailing records to inspect. Values <= 0
            yield an empty summary.
        min_sample: Agents with fewer records than this render ``(n/a)``
            instead of a percentage, to avoid noise from tiny samples.

    Returns:
        A string such as
        ``"agents: pi 8/22 (36%) · deepseek 5/8 (63%) · claude 2/2 (n/a)"``
        with agents in first-seen order, or ``""`` when nothing qualifies.
    """
    if not records or window_cycles <= 0:
        return ""

    # Maps agent -> [built, total]. Dicts keep insertion order, which gives
    # the stable first-seen ordering of the output.
    counts: Dict[str, List[int]] = {}
    for rec in records[-window_cycles:]:
        if not isinstance(rec, dict):
            continue
        agent = rec.get("agent") or ""
        if not agent:
            continue
        tally = counts.setdefault(agent, [0, 0])
        tally[1] += 1
        if rec.get("status") == "built":
            tally[0] += 1
    if not counts:
        return ""

    parts: List[str] = []
    for agent, (built, total) in counts.items():
        if total < min_sample:
            parts.append(f"{agent} {built}/{total} (n/a)")
        else:
            # Integer round-half-up. The built-in round() rounds halves to
            # even, so 5/8 (62.5%) would show as 62% instead of 63%.
            pct = (200 * built + total) // (2 * total)
            parts.append(f"{agent} {built}/{total} ({pct}%)")
    return "agents: " + " · ".join(parts)
798
+
799
+
709
800
  def rollup_for_day(day_cycles: List[Dict[str, Any]]) -> Dict[str, Any]:
710
801
  # US-VIEW-012: track input + output separately so the daily summary can
711
802
  # show two metric rows. cache_read tokens deliberately excluded — they're
@@ -930,6 +1021,24 @@ def render(events, cron, state, backlog, *, days=3, lang="both", now=None,
930
1021
  d2["cost_by_cur"].get(_cur, 0.0),
931
1022
  partial=is_partial, symbol=_sym)
932
1023
 
1024
+ # US-AGENT-010: per-agent hit-rate summary (single line).
1025
+ try:
1026
+ runs_records = list(runs.values()) if isinstance(runs, dict) else list(runs or [])
1027
+ runs_records.sort(key=lambda r: (r or {}).get("ts", ""))
1028
+ _agent_line = _agent_summary_line(runs_records, window_cycles=50)
1029
+ except Exception:
1030
+ _agent_line = ""
1031
+ if _agent_line:
1032
+ print(" " + c("dim", _agent_line))
1033
+
1034
+ # US-SKILL-014: per-skill self-score trend (single line) under the agent line.
1035
+ try:
1036
+ _skill_line = _self_score_summary_line()
1037
+ except Exception:
1038
+ _skill_line = ""
1039
+ if _skill_line:
1040
+ print(" " + c("dim", _skill_line))
1041
+
933
1042
  print()
934
1043
  print(c("faint", "─" * COLS))
935
1044
  print()
@@ -0,0 +1,143 @@
1
+ #!/usr/bin/env python3
2
+ """Test quality merge gate (US-QA-012).
3
+
4
+ Scan bats test files for rubric ❼ (inline external-tool behaviour) and ❽
5
+ (file outside this repo) violations. Loop's auto-merge runs this between
6
+ CI green and merge; non-zero exit holds the PR until either the test is
7
+ fixed or PR description carries `[skip-test-quality]` (US-QA-013).
8
+
9
+ Usage:
10
+ test_quality_gate.py [--skip] <bats-file> [<bats-file> …]
11
+
12
+ Exit:
13
+ 0 — clean OR --skip flag set
14
+ 1 — one or more violations
15
+ 2 — usage error
16
+ """
17
+ from __future__ import annotations
18
+
19
+ import re
20
+ import sys
21
+ from pathlib import Path
22
+ from typing import List, Tuple
23
+
24
+
25
# ❼ — inline external-tool patterns. A line is flagged as soon as ANY ONE of
# these patterns matches (see the any() check in _scan_lines); each pattern is
# meant to describe parsing/extraction intent, so a plain `grep -q` does not
# match. A lone `awk '...'` DOES match.
# NOTE(review): in the sed pattern `[s/]` is a character class (any "s" or
# "/" after `sed`), so it matches more than substitutions — confirm intent
# before tightening.
INLINE_TOOL_PATTERNS = [
    re.compile(r"\bsed\s+[^|]*[s/]"),     # sed with substitution / address
    re.compile(r"\bawk\s+'"),             # awk with script
    re.compile(r"\bgrep\s+-[a-zA-Z]*o"),  # grep -o / -oE (extraction)
    re.compile(r"\bfind\s+[^|]*-name"),   # find -name (path scanning)
    re.compile(r"\bcut\s+-f"),            # cut -f (column extraction)
    re.compile(r"\btr\s+-d"),             # tr -d (char deletion)
]

# ❽ — paths outside this repo. We flag `~/.<name>` (dotfile dirs) and
# absolute system paths. `BATS_TMPDIR` is the sandbox marker and is fine.
OUTSIDE_PATTERNS = [
    re.compile(r"~/\.[A-Za-z]"),          # ~/.codex, ~/.kimi, ~/.roll, etc.
    re.compile(r"(?<![A-Za-z0-9])/etc/[A-Za-z]"),
    re.compile(r"(?<![A-Za-z0-9])/usr/[A-Za-z]"),
    re.compile(r"(?<![A-Za-z0-9])/var/[A-Za-z]"),
]
OUTSIDE_ALLOW = re.compile(r"BATS_TMPDIR")

# Heredoc operator: `<<WORD`, `<< 'WORD'`, `<<"WORD"` or the tab-stripping
# `<<-WORD`. The lookarounds reject the here-string operator `<<<`, and the
# delimiter is captured in full (digits and lowercase included) so that
# `<<EOF2` is not truncated to `EOF`.
_HEREDOC_START = re.compile(
    r"(?<!<)<<(?!<)-?\s*(['\"]?)([A-Za-z_][A-Za-z0-9_]*)\1"
)


def _scan_lines(text: str) -> List[Tuple[int, str, str]]:
    """Scan bats source text for rubric ❼ / ❽ violations.

    Args:
        text: Full content of one bats file.

    Returns:
        A list of ``(line_no, kind, snippet)`` tuples in line order, where
        ``line_no`` is 1-based, ``kind`` is ``"❼"`` or ``"❽"`` and ``snippet``
        is the offending line with surrounding whitespace stripped. A line
        may produce one finding of each kind.

    Skipped lines: comments, ``@test`` headers, lines carrying the
    ``test-quality:allow`` marker, and heredoc bodies (including the line
    that opens the heredoc and the terminator line).
    """
    findings: List[Tuple[int, str, str]] = []
    lines = text.splitlines()
    heredoc_end = 0  # 1-based index of the terminator of the current heredoc
    for idx, raw_line in enumerate(lines, start=1):
        if idx <= heredoc_end:
            # Heredoc payload (or its terminator): data, not commands.
            continue

        line = raw_line.rstrip("\n")
        stripped = line.lstrip()

        # Skip comments — comments can legitimately discuss sed/awk in prose.
        if stripped.startswith("#"):
            continue

        # Skip @test header lines — the test name often quotes the very
        # patterns the test exercises (false positive).
        if stripped.startswith("@test "):
            continue

        # Explicit allow marker for lines that legitimately exercise the
        # gate itself (test fixture content), or for project doc-validation
        # awks that don't test production code.
        if "test-quality:allow" in line:
            continue

        m = _HEREDOC_START.search(line)
        if m:
            terminator = m.group(2)
            end = next(
                (j for j in range(idx + 1, len(lines) + 1)
                 if lines[j - 1].strip() == terminator),
                0,
            )
            if end:
                # Real heredoc: skip the opening line and everything up to
                # and including the terminator.
                heredoc_end = end
                continue
            # No terminator anywhere below: either `<<` was not a heredoc
            # (e.g. an arithmetic shift) or the heredoc is malformed. Fail
            # closed — keep scanning instead of ignoring the rest of the file.

        # ❼: any single inline extraction/parsing tool pattern flags the line.
        if any(pat.search(line) for pat in INLINE_TOOL_PATTERNS):
            findings.append((idx, "❼", line.strip()))

        # ❽: any outside-path hit unless BATS_TMPDIR appears (sandbox marker).
        if OUTSIDE_ALLOW.search(line):
            continue
        # One ❽ finding per line is enough.
        if any(pat.search(line) for pat in OUTSIDE_PATTERNS):
            findings.append((idx, "❽", line.strip()))

    return findings
107
+
108
+
109
def scan_file(path: Path) -> List[Tuple[int, str, str]]:
    """Read one bats file and return its rubric findings.

    Args:
        path: File to scan.

    Returns:
        The findings from ``_scan_lines``. If the file cannot be read, a
        single pseudo-finding ``(0, "?", <message>)`` is returned so the
        caller reports it and the gate exits non-zero rather than crashing.
    """
    try:
        # Decode explicitly as UTF-8 so results don't depend on the locale;
        # undecodable bytes are dropped rather than raising.
        text = path.read_text(encoding="utf-8", errors="ignore")
    except FileNotFoundError:
        return [(0, "?", f"file not found: {path}")]
    except OSError as exc:
        # Directory, permission problem, etc.
        return [(0, "?", f"cannot read {path}: {exc.strerror or exc}")]
    return _scan_lines(text)
115
+
116
+
117
def main() -> int:
    """Command-line entry point for the test-quality gate.

    Reads ``sys.argv[1:]``. ``--skip`` / ``--skip-test-quality`` disable the
    gate; every other argument is treated as a bats file path.

    Returns:
        0 when the skip flag is set or no file has findings, 1 when at least
        one finding was printed, 2 on usage error (no files and no skip flag).
    """
    skip = False
    files: List[str] = []
    for arg in sys.argv[1:]:
        if arg in ("--skip", "--skip-test-quality"):
            skip = True
        else:
            files.append(arg)

    # The documented contract is "0 — clean OR --skip flag set": honour the
    # flag before the usage check so `--skip` with an empty file list (e.g. a
    # PR touching no bats files) does not turn into a usage error.
    if skip:
        return 0
    if not files:
        print("usage: test_quality_gate.py [--skip] <bats-file> [<bats-file> …]",
              file=sys.stderr)
        return 2

    total = 0
    for name in files:
        for line_no, kind, snippet in scan_file(Path(name)):
            print(f"{name}:{line_no}: {kind} {snippet}")
            total += 1
    return 1 if total > 0 else 0


if __name__ == "__main__":
    sys.exit(main())
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@seanyao/roll",
3
- "version": "2026.528.2",
3
+ "version": "2026.529.2",
4
4
  "description": "Roll — Roll out features with AI agents",
5
5
  "scripts": {
6
6
  "test": "bash tests/run.sh"
@@ -127,6 +127,13 @@ A simple heuristic — not a gate, just a signal for the human:
127
127
  |----|-------------|----------|
128
128
  | US-XXX | {标题} | 高 |
129
129
 
130
+ <!-- US-AGENT-010: per-agent hit-rate summary (one line). Read the last
131
+ window_cycles records of runs.jsonl, group by `agent`, format as
132
+ `agents: pi 8/22 (36%) · deepseek 5/8 (63%)`. Sample < 5 → `(n/a)`.
133
+ Omit when no records have an agent field (legacy data). -->
134
+ ## Agent 路由命中率
135
+ {agents: <name> built/total (pct%) · …} ← from `runs.jsonl` last 50 cycles
136
+
130
137
  <!-- 仅当 roll-.dream 有新发现时输出 -->
131
138
  ## 悟见
132
139
  {来自 .roll/dream/ 的摘要}
@@ -52,6 +52,66 @@ Do not use for:
52
52
 
53
53
  Activate when input is a `US-[A-Z]+-[0-9]+` identifier.
54
54
 
55
+ ### Step 0: Pre-flight self-check (US-AGENT-007)
56
+
57
+ Before reading the Story in depth or splitting actions, **read the Agent profile** from the story's feature md and decide whether this cycle can realistically deliver it. The check is mechanical:
58
+
59
+ ```
60
+ inputs:
61
+ story.est_min (from **Agent profile:** block, US-AGENT-001)
62
+ story.risk_zone (low / medium / high)
63
+ story.chain_depth (0 unless already a downgrade product)
64
+ agent.max_est_min (from .roll/agent-routes.yaml for the current agent)
65
+ history.prefer_threshold (from .roll/agent-routes.yaml)
66
+ history.hit_rate (this agent × this story_type, last window_cycles)
67
+
68
+ verdict:
69
+ too_big when ANY of these is true:
70
+ 1. story.est_min > agent.max_est_min (hard capacity miss)
71
+ 2. story.risk_zone not in agent.risk (hard risk miss)
72
+ 3. history.hit_rate < prefer_threshold AND story.chain_depth == 0
73
+ (soft signal: history says this agent's not on top of this type yet,
74
+ and we still have downgrade budget — don't burn a cycle)
75
+ ok otherwise
76
+ ```
77
+
78
+ Output the verdict as the first line of the cycle response:
79
+
80
+ ```yaml
81
+ verdict: ok # or: too_big
82
+ reason: <one short line — which condition triggered, with numbers>
83
+ ```
84
+
85
+ When `verdict: ok` → continue to Step 1 normally.
86
+ When `verdict: too_big` → go to **US-AGENT-008 self-downgrade path**, **but** first run the **US-AGENT-009 chain_depth cap check**:
87
+
88
+ ```bash
89
+ # 0a. Cap check: refuse the third consecutive auto-split.
90
+ # exit 0 → split allowed; exit 1 → cap hit, take cap-hit path instead.
91
+ if ! bash -c 'source "$(command -v roll)"; _loop_chain_depth_cap_check US-XXX-NNN'; then
92
+ # Cap hit (chain_depth ≥ 2): hold + ALERT, exit cleanly.
93
+ bash -c 'source "$(command -v roll)"; _loop_split_cap_hit US-XXX-NNN "depth >= 2, human triage required"'
94
+ exit 0
95
+ fi
96
+
97
+ # 1. Invoke roll-design to re-split the story into smaller sub-stories.
98
+ # Each sub-story carries chain_depth = (parent.chain_depth + 1).
99
+ # Sub-stories land as 📋 Todo with depends-on:<parent> chained.
100
+ Skill("roll-design", "--from-story US-XXX-NNN")
101
+
102
+ # 2. After the sub-stories are written to BACKLOG, flip the parent
103
+ # to 🚫 Hold and emit the downgrade event. The helper handles ALERT.
104
+ bash -c 'source "$(command -v roll)"; _loop_self_downgrade US-XXX-NNN "too_big: <reason from verdict>" "US-XXX-NNNa,US-XXX-NNNb"'
105
+
106
+ # 3. Exit cleanly — no TCR commits this cycle. The next loop cycle picks
107
+ # up the first sub-story (which is smaller and should pass pre-flight).
108
+ exit 0
109
+ ```
110
+
111
+ If `roll-design` cannot produce ≥2 sub-stories (story is already irreducible), fall through to **US-AGENT-009 cap-hit path** by invoking `_loop_split_cap_hit` directly. The cap is purely about stopping infinite split chains; even on the first re-split, if the design step gives up, the cap-hit handler raises ALERT for human triage.
112
+
113
+ > Pre-flight is honest, not paranoid: a small story (est_min ≤ 5, chain_depth=0, low risk) should almost always get `ok`. The check pays off on the long tail — stories that look small but touch many files, or story types on which the current agent has historically failed.
114
+
55
115
  ### Step 1: Read the Story
56
116
 
57
117
  1. Open `.roll/backlog.md`, find the US row, follow the link to `.roll/features/<feature>.md`
@@ -64,6 +124,16 @@ Activate when input is a `US-[A-Z]+-[0-9]+` identifier.
64
124
  - Pick the smallest shippable Action first
65
125
  - **Granularity constraint**: Each Action completable in 2–5 minutes; split if larger
66
126
  - **No placeholders**: Action descriptions must be specific and directly executable
127
+ - **Test-quality self-check (US-QA-011)** — for every Action that adds tests:
128
+ 1. Tests call project functions / public command entry points; do NOT inline
129
+ external-tool behaviour (`sed`/`awk`/`grep`/`find`/`cut` pipelines that
130
+ duplicate logic already in `lib/` or `bin/`) — rubric ❼.
131
+ 2. Tests sandbox filesystem state via `BATS_TMPDIR` (or equivalent); do NOT
132
+ touch or assert on paths outside this repo (`~/.codex`, `~/.kimi`,
133
+ `~/.roll/`, `/etc/...`) — rubric ❽.
134
+ 3. If you can't satisfy (1) or (2), reshape the Action: extract a project
135
+ helper, redirect the env var to a tmp dir, or move the test to an
136
+ integration tier where the boundary is intentional and documented.
67
137
 
68
138
  #### 2.5 Parallel Dispatch (auto-determined)
69
139
 
@@ -636,8 +706,33 @@ Before creating any file or directory:
636
706
  - [ ] **.roll/backlog.md index status updated** (📋 → ✅, REQUIRED)
637
707
  - [ ] **`.roll/features/<feature>.md` US section updated** (Completed date + [x] ACs, REQUIRED)
638
708
  - [ ] **CHANGELOG.md staged and bundled** into completion commit via `$roll-.changelog` in Phase 11 (REQUIRED)
709
+ - [ ] **Self-score note written (US-SKILL-010 / 012)** — see "Self-score" subsection below
639
710
  - [ ] Summary reported to user
640
711
 
712
+ ### Self-score (US-SKILL-012)
713
+
714
+ Before reporting completion to the user, write one self-score note. The
715
+ helper lands the note under `.roll/notes/<date>-roll-build-<US-id>-<epoch>.md`
716
+ with YAML frontmatter so trend analysis (US-SKILL-014) can aggregate later:
717
+
718
+ ```bash
719
+ bash -c 'source "$(command -v roll)"; \
720
+ _skill_write_self_score roll-build US-XXX-NNN <score 1..10> <good|ok|regression> "<rationale>"'
721
+ ```
722
+
723
+ Score guidance (integer 1..10):
724
+ - **9..10** — story shipped cleanly: AC fully met, TCR rhythm tight, no
725
+ re-tries from `verdict: too_big`, peer review concerns addressed inline.
726
+ - **6..8** — shipped with caveats: re-tries on red, edge case left to a
727
+ follow-up FIX, documentation lagged behind code by one cycle, etc.
728
+ - **1..5** — shipped but at low confidence: AC partially met (note which),
729
+ TCR rhythm broken (multiple revert iterations), or `regression` verdict.
730
+
731
+ Verdict values:
732
+ - `good` — story fully delivered; AC met; no concerning signal.
733
+ - `ok` — shipped but with at least one documented trade-off (use rationale).
734
+ - `regression` — story landed but another behaviour broke (rare; open a FIX).
735
+
641
736
  ---
642
737
 
643
738
  ## TCR Recovery Patterns
@@ -681,6 +681,11 @@ Note: `{DOMAIN}` maps to the Bounded Context name identified in DDD analysis.
681
681
  - Events raised: [{EventName}] → {consumer context}
682
682
  - Cross-context: {if touches another context, otherwise omit}
683
683
 
684
+ **Agent profile:**
685
+ - est_min: {1-30 整数,目标 5-10 min 一个 cycle 闭环}
686
+ - risk_zone: {low / medium / high — 改文档 low,改用户可见行为 medium,改 loop infra 或安全/隔离基建 high}
687
+ - chain_depth: 0 {若是自降级产出的子 story 则 +1,累计 ≥2 时第 3 次拒拆}
688
+
684
689
  **AC:**
685
690
  - [ ] {measurable criteria 1}
686
691
  - [ ] {measurable criteria 2}
@@ -700,6 +705,10 @@ Note: `{DOMAIN}` maps to the Bounded Context name identified in DDD analysis.
700
705
  - Integration test: `tests/integration/{flow}.test.ts`
701
706
  ```
702
707
 
708
+ > **强制规则 — Agent profile 必须填**:Split into Stories 步骤产出的每个 US 都必须带 `**Agent profile:**` 子段,est_min / risk_zone 不可省(chain_depth 默认 0)。loop 路由(US-AGENT-004)和 agent 自评(US-AGENT-007)都靠这两个字段决策,缺了就回退到 cold_start_default 并 WARN。历史 US 不强制回填。
709
+ >
710
+ > **MUST fill** the `**Agent profile:**` block on every newly split US — `est_min` and `risk_zone` are non-optional. They drive loop routing and agent self-eval downstream.
711
+
703
712
  ### Closing Doc-Refresh Story Template — Phase N.M 收尾文档
704
713
 
705
714
  When any preceding US in the batch changes user-visible behavior, append this template story at the end of the batch. Wire it as `depends-on:` against every preceding user-facing US so it runs last.
@@ -762,6 +771,42 @@ Each story must be:
762
771
 
763
772
  ---
764
773
 
774
+ ## Self-score (US-SKILL-010 / 013)
775
+
776
+ After Step 5 (Write to BACKLOG) completes — i.e. once the new US rows are
777
+ landed and the user has either confirmed or chosen `No` (story still
778
+ queued) — write a single self-score note covering the design session:
779
+
780
+ ```bash
781
+ bash -c 'source "$(command -v roll)"; \
782
+ _skill_write_self_score roll-design US-XXX-NNN <score 1..10> <good|ok|regression> "<rationale>"'
783
+ ```
784
+
785
+ Use the **first** US-id of the batch as the story handle (or the
786
+ representative story when splitting). The note lands under
787
+ `.roll/notes/<date>-roll-design-<id>-<epoch>.md` so US-SKILL-014 can
788
+ read trend data.
789
+
790
+ Score guidance for design quality (integer 1..10):
791
+ - **9..10** — clean split: every US is INVEST-compliant, profile
792
+ (est_min / risk_zone / chain_depth) filled, doc-refresh closer wired
793
+ when user-visible behaviour changed, peer-review unnecessary or
794
+ reached AGREE quickly.
795
+ - **6..8** — split shipped but with caveats: one US borderline on
796
+ INVEST (e.g. shared file conflict), or Discuss had ESCALATE before
797
+ settling, or profile was partial.
798
+ - **1..5** — split shipped but rough: missing doc-refresh closer,
799
+ profile fields skipped, user story too coarse for an AI cycle; flag a
800
+ follow-up `roll-design` pass.
801
+
802
+ Verdict values:
803
+ - `good` — design + split are clean.
804
+ - `ok` — design is acceptable with one or two trade-offs noted.
805
+ - `regression` — the split visibly broke something earlier (rare; e.g.
806
+ invalidated a previously-stable depends-on chain).
807
+
808
+ ---
809
+
765
810
  ## Integration
766
811
 
767
812
  ### With roll-build
@@ -87,6 +87,46 @@ Before creating any file or directory:
87
87
 
88
88
  ## TCR Workflow
89
89
 
90
+ ### 0. Pre-flight self-check (US-AGENT-007)
91
+
92
+ Before locking the issue, read the FIX's Agent profile (est_min / risk_zone / chain_depth) from the linked feature md and decide whether this cycle should attempt the fix:
93
+
94
+ ```
95
+ inputs:
96
+ fix.est_min (from **Agent profile:** block on the FIX row's feature md)
97
+ fix.risk_zone (low / medium / high)
98
+ fix.chain_depth (0 unless already a downgrade product)
99
+ agent.max_est_min (from .roll/agent-routes.yaml for the current agent)
100
+ history.prefer_threshold + history.hit_rate (FIX history for this agent)
101
+
102
+ verdict:
103
+ too_big when ANY:
104
+ 1. fix.est_min > agent.max_est_min
105
+ 2. fix.risk_zone not in agent.risk
106
+ 3. history.hit_rate < prefer_threshold AND fix.chain_depth == 0
107
+ ok otherwise
108
+ ```
109
+
110
+ Emit `verdict: ok` or `verdict: too_big` (with `reason:`) as the first cycle output line.
111
+
112
+ - `ok` → continue with step 1 below normally
113
+ - `too_big` → self-downgrade per US-AGENT-008, **gated by US-AGENT-009 cap check**:
114
+
115
+ ```bash
116
+ # Cap check first (chain_depth ≥ 2 → refuse third auto-split).
117
+ if ! bash -c 'source "$(command -v roll)"; _loop_chain_depth_cap_check FIX-XXX-NNN'; then
118
+ bash -c 'source "$(command -v roll)"; _loop_split_cap_hit FIX-XXX-NNN "depth >= 2"'
119
+ exit 0
120
+ fi
121
+ Skill("roll-design", "--from-story FIX-XXX-NNN")
122
+ bash -c 'source "$(command -v roll)"; _loop_self_downgrade FIX-XXX-NNN "too_big: <reason>" "FIX-XXX-NNNa,FIX-XXX-NNNb"'
123
+ exit 0
124
+ ```
125
+
126
+ Original FIX goes to 🚫 Hold with `→ split to ...` annotation; sub-stories carry `chain_depth + 1`. Cap-hit path raises ALERT for human triage. Do NOT TCR a half fix.
127
+
128
+ Bug fixes are usually small (est_min ≤ 5), so pre-flight is mostly a sanity barrier for FIXes whose underlying issue turns out structural — e.g. a "simple null check" that requires touching 12 files. Catching that upfront is cheaper than burning a cycle.
129
+
90
130
  ### 1. Lock the issue
91
131
  - state the user-visible issue or requested enhancement
92
132
  - define the scope boundary and non-goals
@@ -96,6 +136,15 @@ Before creating any file or directory:
96
136
  - define the online verification target
97
137
  - for hotfixes: include regression test to prevent recurrence
98
138
  - reference `$roll-.qa` for appropriate test type (unit/integration/E2E)
139
+ - **Test-quality self-check (US-QA-011)** — for any new test the fix adds:
140
+ 1. The test must call project functions / public command entry points,
141
+ not inline `sed`/`awk`/`grep -o`/`find`/`cut` pipelines that
142
+ re-implement what `lib/` or `bin/` already does — rubric ❼.
143
+ 2. The test must sandbox filesystem state via `BATS_TMPDIR` or an
144
+ equivalent helper; never assert on or write to paths outside this
145
+ repo (`~/.codex`, `~/.kimi`, `~/.roll/`, system paths) — rubric ❽.
146
+ 3. If you can't satisfy (1) or (2), extract a project helper or
147
+ redirect the env var to a tmp dir before writing the test.
99
148
 
100
149
  ### 3. Test Design Review (TCR Core)
101
150
 
@@ -366,6 +415,33 @@ A minor change is only "done" when all are true:
366
415
  - [ ] Deployment completed
367
416
  - [ ] Online verification performed
368
417
  - [ ] **Verification Gate passed** (fresh evidence for tests, build, fix confirmation, no regression)
418
+ - [ ] **Self-score note written (US-SKILL-010 / 011)** — before exit, the agent
419
+ writes a structured score note via `_skill_write_self_score` so trend
420
+ analysis (US-SKILL-014) and skill-self-scoring docs (US-SKILL-015) have
421
+ data to read.
422
+
423
+ ### Self-score (US-SKILL-011)
424
+
425
+ Before exiting the cycle, write one self-score note. The helper validates
426
+ inputs and lands the note under `.roll/notes/<date>-roll-fix-<FIX-id>-<epoch>.md`:
427
+
428
+ ```bash
429
+ bash -c 'source "$(command -v roll)"; \
430
+ _skill_write_self_score roll-fix FIX-XXX-NNN <score 1..10> <good|ok|regression> "<rationale>"'
431
+ ```
432
+
433
+ Score guidance (integer 1..10):
434
+ - **9..10** — clean root-cause fix; regression test added; TCR cycle smooth.
435
+ - **6..8** — fix shipped but with caveats (e.g. workaround, partial coverage,
436
+ or repeated TCR red iterations); rationale explains the trade-off.
437
+ - **1..5** — fix landed but quality is below the bar (test coverage missing,
438
+ fix only narrows blast radius, repeated agent re-tries). Verdict should be
439
+ `ok`, or `regression` if a related test broke.
440
+
441
+ Verdict values:
442
+ - `good` — fix is the proper root-cause fix; no caveats.
443
+ - `ok` — shipped but with documented trade-offs (use rationale to explain).
444
+ - `regression` — the fix re-broke something else (rare; consider re-opening).
369
445
 
370
446
  ## Rubric
371
447
 
@@ -240,6 +240,19 @@ Together these mean: only one loop runs at a time per project (LOCK), and within
240
240
 
241
241
  ### Step 3 — Route and Execute
242
242
 
243
+ > **US-AGENT-006 — Per-story agent routing (pre-cycle)**
244
+ >
245
+ > Before this skill even starts, the runner inner script has already:
246
+ > 1. Picked the next eligible Todo via `_loop_pick_next_story` (priority FIX > US > REFACTOR, manual-only / depends-on gates respected)
247
+ > 2. Read its Agent profile (est_min / risk_zone) and routed an agent via `_loop_pick_agent_for_story` (hard rules from `.roll/agent-routes.yaml` + soft preference from `runs.jsonl`)
248
+ > 3. Exported `ROLL_LOOP_ROUTED_STORY` / `ROLL_LOOP_ROUTED_AGENT` / `ROLL_LOOP_ROUTED_RULE` and printed `[loop] story <id> routed to <agent> via <rule_kind>` to cron.log
249
+ >
250
+ > When `ROLL_LOOP_ROUTED_STORY` is set, prefer it as `US_ID` for this cycle. The story has already been chosen by hard+soft routing rules — do not re-pick a different one unless that story can no longer be found in BACKLOG (e.g. status changed concurrently).
251
+ >
252
+ > Old single-agent fallback (`primary_agent` from `~/.roll/config.yaml`) still applies when:
253
+ > - no story is pickable (empty Todo / all manual-only)
254
+ > - the matching agent-routes.yaml has no agent that fits the story profile (then `cold_start_default` is used)
255
+
243
256
  For each item, **before invoking the executor skill**, mark the story 🔨 In Progress in the **main repo's** .roll/backlog.md so brief and peer agents can see it being worked on. The cycle worktree is gitignored at .roll/, so editing the worktree's own copy + committing carries no change back to main — write directly via the helper instead:
244
257
 
245
258
  ```bash
@@ -115,6 +115,12 @@ privacy:
115
115
 
116
116
  sync_targets: [claude, cursor] # user's Q8
117
117
  enable_loop: false # user's Q9
118
+ agent_routes_template: default # user's Q10 — agent routing preset
119
+ # one of: default / minimal / heavy / skip
120
+ # default = pi/deepseek/claude + history (US-AGENT-002)
121
+ # minimal = single agent (pi), no history
122
+ # heavy = pi/deepseek/claude/kimi + larger window
123
+ # skip = don't seed .roll/agent-routes.yaml
118
124
  ```
119
125
 
120
126
  Then tell the user: