@seanyao/roll 2026.528.2 → 2026.529.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +31 -8
- package/README.md +2 -0
- package/bin/roll +917 -50
- package/lib/README.md +42 -0
- package/lib/__pycache__/roll-loop-status.cpython-314.pyc +0 -0
- package/lib/agent_routes_lint.py +203 -0
- package/lib/i18n/README.md +54 -0
- package/lib/i18n/doctor.sh +13 -0
- package/lib/i18n/loop.sh +12 -12
- package/lib/loop_pick_agent.py +245 -0
- package/lib/prices/README.md +35 -0
- package/lib/roll-help.py +1 -0
- package/lib/roll-loop-status.py +109 -0
- package/lib/test_quality_gate.py +143 -0
- package/package.json +1 -1
- package/skills/roll-brief/SKILL.md +7 -0
- package/skills/roll-build/SKILL.md +95 -0
- package/skills/roll-design/SKILL.md +45 -0
- package/skills/roll-fix/SKILL.md +76 -0
- package/skills/roll-loop/SKILL.md +13 -0
- package/skills/roll-onboard/SKILL.md +6 -0
package/lib/roll-loop-status.py
CHANGED
|
@@ -706,6 +706,97 @@ def rollup_for_story(cycles: List[Dict[str, Any]], story_id: str) -> Dict[str, A
|
|
|
706
706
|
r["model"] = cy["model"]
|
|
707
707
|
return r
|
|
708
708
|
|
|
709
|
+
|
|
710
|
+
# US-SKILL-014: aggregate the last N self-score notes for the dashboard.
|
|
711
|
+
# Reads .roll/notes/*.md (frontmatter format from US-SKILL-010), returns
|
|
712
|
+
# "self-score: mean 7.8 / min 4 / redo 2 (last 14)"
|
|
713
|
+
# or "" when no notes / "self-score: (n/a) — K sample(s), need 3 (last N)"
|
|
714
|
+
# when sample is too small.
|
|
715
|
+
def _self_score_summary_line(notes_dir=None, window: int = 14) -> str:
    """Return a one-line summary of the most recent self-score notes.

    Takes the last ``window`` ``*.md`` files of ``notes_dir`` (in filename
    sort order) and scans each one line by line for ``score: <int>`` and
    ``verdict: <text>`` entries, stopping as soon as both have been seen.
    Files that yield no integer score are ignored.

    Args:
        notes_dir: Directory holding the notes, as a ``str`` or ``Path``.
            Defaults to ``.roll/notes`` relative to the working directory.
        window: Maximum number of note files to consider.

    Returns:
        ``""`` when ``window`` is not positive, the directory is missing, or
        it contains no ``*.md`` file;
        ``"self-score: (n/a) — K sample(s), need 3 (last N)"`` when fewer
        than three files carried a score;
        otherwise ``"self-score: mean M / min S / redo R (last N)"`` where
        ``redo`` counts notes whose verdict is ``regression``, or ``ok`` with
        a score below 6. ``N`` is always the requested ``window``.
    """
    if window <= 0:
        # ``files[-0:]`` would select every file instead of none.
        return ""
    notes_dir = Path(".roll/notes") if notes_dir is None else Path(notes_dir)
    if not notes_dir.exists():
        return ""
    files = sorted(notes_dir.glob("*.md"))[-window:]
    if not files:
        return ""

    scores = []
    redo = 0
    for f in files:
        try:
            text = f.read_text(errors="ignore")
        except OSError:
            # A directory named "*.md" or an unreadable note must not hide
            # the summary for all the other notes.
            continue
        score = None
        verdict = None
        for line in text.splitlines():
            if line.startswith("score: "):
                try:
                    score = int(line.split(": ", 1)[1].strip())
                except ValueError:
                    score = None
            elif line.startswith("verdict: "):
                verdict = line.split(": ", 1)[1].strip()
            if score is not None and verdict is not None:
                break
        if score is None:
            continue
        scores.append(score)
        if verdict == "regression" or (verdict == "ok" and score < 6):
            redo += 1

    count = len(scores)
    if count < 3:
        return f"self-score: (n/a) — {count} sample(s), need 3 (last {window})"
    mean = sum(scores) / count
    return f"self-score: mean {mean:.1f} / min {min(scores)} / redo {redo} (last {window})"
|
|
753
|
+
|
|
754
|
+
|
|
755
|
+
# US-AGENT-010: per-agent hit-rate summary for the ROLLUP block.
|
|
756
|
+
# Aggregates the last `window_cycles` runs.jsonl records grouped by `agent`.
|
|
757
|
+
# Returns a single-line string like
|
|
758
|
+
# "agents: pi 8/22 (36%) · deepseek 5/8 (63%) · claude 2/2 (n/a)"
|
|
759
|
+
# Empty agents / missing agent field are skipped. Sample < min_sample renders
|
|
760
|
+
# as "(n/a)" instead of a percentage to avoid noise from tiny windows.
|
|
761
|
+
def _agent_summary_line(records: List[Dict[str, Any]], window_cycles: int = 50,
                        min_sample: int = 5) -> str:
    """Return the per-agent hit-rate line for the ROLLUP block.

    Looks at the last ``window_cycles`` entries of ``records`` and groups
    those that carry a non-empty ``agent`` value. A record counts as a hit
    when its ``status`` equals ``"built"``.

    Args:
        records: Run records, oldest first; entries that are not dicts or
            have no ``agent`` are skipped.
        window_cycles: How many trailing records to inspect. Records without
            an agent still occupy a slot in this window.
        min_sample: Agents with fewer records than this render ``(n/a)``
            instead of a percentage.

    Returns:
        ``"agents: <name> built/total (pct%) · ..."`` with agents in
        first-seen order, or ``""`` when there is nothing to report.
    """
    if not records or window_cycles <= 0:
        return ""

    # agent -> [built, total]; dict insertion order gives first-seen order.
    counts: Dict[str, List[int]] = {}
    for rec in records[-window_cycles:]:
        if not isinstance(rec, dict):
            continue
        agent = rec.get("agent") or ""
        if not agent:
            continue
        tally = counts.setdefault(agent, [0, 0])
        tally[1] += 1
        if rec.get("status") == "built":
            tally[0] += 1
    if not counts:
        return ""

    parts: List[str] = []
    for agent, (built, total) in counts.items():
        if total < min_sample:
            parts.append(f"{agent} {built}/{total} (n/a)")
        else:
            # Round half up in integer arithmetic. round() rounds halves to
            # even, which would print 5/8 as 62% instead of 63%.
            pct = (200 * built + total) // (2 * total)
            parts.append(f"{agent} {built}/{total} ({pct}%)")
    return "agents: " + " · ".join(parts)
|
|
798
|
+
|
|
799
|
+
|
|
709
800
|
def rollup_for_day(day_cycles: List[Dict[str, Any]]) -> Dict[str, Any]:
|
|
710
801
|
# US-VIEW-012: track input + output separately so the daily summary can
|
|
711
802
|
# show two metric rows. cache_read tokens deliberately excluded — they're
|
|
@@ -930,6 +1021,24 @@ def render(events, cron, state, backlog, *, days=3, lang="both", now=None,
|
|
|
930
1021
|
d2["cost_by_cur"].get(_cur, 0.0),
|
|
931
1022
|
partial=is_partial, symbol=_sym)
|
|
932
1023
|
|
|
1024
|
+
# US-AGENT-010: per-agent hit-rate summary (single line).
|
|
1025
|
+
try:
|
|
1026
|
+
runs_records = list(runs.values()) if isinstance(runs, dict) else list(runs or [])
|
|
1027
|
+
runs_records.sort(key=lambda r: (r or {}).get("ts", ""))
|
|
1028
|
+
_agent_line = _agent_summary_line(runs_records, window_cycles=50)
|
|
1029
|
+
except Exception:
|
|
1030
|
+
_agent_line = ""
|
|
1031
|
+
if _agent_line:
|
|
1032
|
+
print(" " + c("dim", _agent_line))
|
|
1033
|
+
|
|
1034
|
+
# US-SKILL-014: per-skill self-score trend (single line) under the agent line.
|
|
1035
|
+
try:
|
|
1036
|
+
_skill_line = _self_score_summary_line()
|
|
1037
|
+
except Exception:
|
|
1038
|
+
_skill_line = ""
|
|
1039
|
+
if _skill_line:
|
|
1040
|
+
print(" " + c("dim", _skill_line))
|
|
1041
|
+
|
|
933
1042
|
print()
|
|
934
1043
|
print(c("faint", "─" * COLS))
|
|
935
1044
|
print()
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Test quality merge gate (US-QA-012).
|
|
3
|
+
|
|
4
|
+
Scan bats test files for rubric ❼ (inline external-tool behaviour) and ❽
|
|
5
|
+
(file outside this repo) violations. Loop's auto-merge runs this between
|
|
6
|
+
CI green and merge; non-zero exit holds the PR until either the test is
|
|
7
|
+
fixed or PR description carries `[skip-test-quality]` (US-QA-013).
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
test_quality_gate.py [--skip] <bats-file> [<bats-file> …]
|
|
11
|
+
|
|
12
|
+
Exit:
|
|
13
|
+
0 — clean OR --skip flag set
|
|
14
|
+
1 — one or more violations
|
|
15
|
+
2 — usage error
|
|
16
|
+
"""
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import re
|
|
20
|
+
import sys
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from typing import List, Tuple
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# ❼ — inline external-tool patterns. A line is flagged as soon as it matches
# ANY ONE of these markers: each describes parsing/extraction intent, which
# signals a hand-rolled pipeline duplicating what a project helper should own.
# A plain `grep -q` (no `-o`) matches none of them and is fine.
INLINE_TOOL_PATTERNS = [
    re.compile(r"\bsed\s+[^|]*[s/]"),     # sed with substitution / address
    re.compile(r"\bawk\s+'"),             # awk with script
    re.compile(r"\bgrep\s+-[a-zA-Z]*o"),  # grep -o / -oE (extraction)
    re.compile(r"\bfind\s+[^|]*-name"),   # find -name (path scanning)
    re.compile(r"\bcut\s+-f"),            # cut -f (column extraction)
    re.compile(r"\btr\s+-d"),             # tr -d (char deletion)
]

# ❽ — paths outside this repo. We flag `~/.<name>` (dotfile dirs) and
# absolute system paths. `BATS_TMPDIR` is the sandbox marker and is fine.
OUTSIDE_PATTERNS = [
    re.compile(r"~/\.[A-Za-z]"),          # ~/.codex, ~/.kimi, ~/.roll, etc.
    re.compile(r"(?<![A-Za-z0-9])/etc/[A-Za-z]"),
    re.compile(r"(?<![A-Za-z0-9])/usr/[A-Za-z]"),
    re.compile(r"(?<![A-Za-z0-9])/var/[A-Za-z]"),
]
OUTSIDE_ALLOW = re.compile(r"BATS_TMPDIR")

# Heredoc opener: `<<WORD`, `<<-WORD`, `<< 'WORD'`, `<<"WORD"`, `<<\WORD`.
# The lookarounds reject the `<<<` here-string operator, and the whole
# identifier is captured so that a delimiter such as `EOF2` is matched in
# full rather than as its uppercase prefix.
_HEREDOC_START = re.compile(
    r"(?<!<)<<(?!<)-?\s*\\?(['\"]?)([A-Za-z_][A-Za-z0-9_]*)\1"
)


def _scan_lines(text: str) -> List[Tuple[int, str, str]]:
    """Scan bats source text for rubric ❼ / ❽ violations.

    Lines are skipped, in this order, when they are heredoc data, comments,
    ``@test`` headers, carry the ``test-quality:allow`` marker, or open a
    heredoc. A remaining line yields a ``"❼"`` finding when any
    ``INLINE_TOOL_PATTERNS`` entry matches, and a ``"❽"`` finding when any
    ``OUTSIDE_PATTERNS`` entry matches and the line does not mention
    ``BATS_TMPDIR``.

    Args:
        text: Full contents of one bats file.

    Returns:
        ``(line_no, kind, snippet)`` tuples in file order; ``line_no`` is
        1-based and ``snippet`` is the stripped line. One line can produce
        both a ❼ and a ❽ finding.
    """
    findings: List[Tuple[int, str, str]] = []
    in_heredoc = False
    heredoc_terminator = ""
    for idx, line in enumerate(text.splitlines(), start=1):
        stripped = line.lstrip()

        if in_heredoc:
            # Heredoc bodies are data; only look for the closing delimiter.
            # strip() also accepts the tab-indented terminator of `<<-`.
            if line.strip() == heredoc_terminator:
                in_heredoc = False
            continue

        # Skip comments — comments can legitimately discuss sed/awk in prose.
        if stripped.startswith("#"):
            continue

        # Skip @test header lines — the test name often quotes the patterns
        # the test exercises (false positive).
        if stripped.startswith("@test "):
            continue

        # Explicit allow marker for lines that legitimately exercise the
        # gate itself, or for doc-validation awks that don't test
        # production code.
        if "test-quality:allow" in line:
            continue

        # Heredoc start: following lines are data until the delimiter
        # appears alone on a line. The declarator line itself is not scanned.
        opener = _HEREDOC_START.search(line)
        if opener:
            heredoc_terminator = opener.group(2)
            in_heredoc = True
            continue

        # ❼: any single inline extraction/parsing tool on this line flags.
        if any(pat.search(line) for pat in INLINE_TOOL_PATTERNS):
            findings.append((idx, "❼", line.strip()))

        # ❽: any outside-path hit unless BATS_TMPDIR appears (sandbox marker).
        if OUTSIDE_ALLOW.search(line):
            continue
        if any(pat.search(line) for pat in OUTSIDE_PATTERNS):
            findings.append((idx, "❽", line.strip()))  # one ❽ per line

    return findings
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def scan_file(path: Path) -> List[Tuple[int, str, str]]:
    """Read one bats file and return its rubric findings.

    The file is read as text with undecodable bytes dropped, then handed to
    ``_scan_lines``.

    Args:
        path: File to scan.

    Returns:
        The findings from ``_scan_lines``. When the file cannot be read, a
        single pseudo-finding with line number ``0`` and kind ``"?"`` is
        returned instead, so ``main`` reports the problem and still scans
        the remaining files.
    """
    try:
        text = path.read_text(errors="ignore")
    except FileNotFoundError:
        return [(0, "?", f"file not found: {path}")]
    except OSError as exc:
        # Directory, permission denied, etc.: report it like a missing file
        # rather than aborting the whole gate with a traceback.
        return [(0, "?", f"cannot read {path}: {exc}")]
    return _scan_lines(text)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def main() -> int:
    """Command-line entry point of the test-quality gate.

    Every argument other than ``--skip`` / ``--skip-test-quality`` is taken
    as a bats file path. Each finding is printed to stdout as
    ``<file>:<line>: <kind> <snippet>``.

    Returns:
        2 when no file argument was given (usage goes to stderr), 0 when a
        skip flag is present or no finding was reported, 1 otherwise.
    """
    skip_flags = ("--skip", "--skip-test-quality")
    cli_args = sys.argv[1:]
    targets = [arg for arg in cli_args if arg not in skip_flags]

    # The usage check comes first: a skip flag without files is still an error.
    if not targets:
        print("usage: test_quality_gate.py [--skip] <bats-file> [<bats-file> …]",
              file=sys.stderr)
        return 2
    if any(arg in skip_flags for arg in cli_args):
        return 0

    violations = 0
    for target in targets:
        for line_no, kind, snippet in scan_file(Path(target)):
            print(f"{target}:{line_no}: {kind} {snippet}")
            violations += 1
    if violations:
        return 1
    return 0
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
if __name__ == "__main__":
|
|
143
|
+
sys.exit(main())
|
package/package.json
CHANGED
|
@@ -127,6 +127,13 @@ A simple heuristic — not a gate, just a signal for the human:
|
|
|
127
127
|
|----|-------------|----------|
|
|
128
128
|
| US-XXX | {标题} | 高 |
|
|
129
129
|
|
|
130
|
+
<!-- US-AGENT-010: per-agent hit-rate summary (one line). Read the last
|
|
131
|
+
window_cycles records of runs.jsonl, group by `agent`, format as
|
|
132
|
+
`agents: pi 8/22 (36%) · deepseek 5/8 (63%)`. Sample < 5 → `(n/a)`.
|
|
133
|
+
Omit when no records have an agent field (legacy data). -->
|
|
134
|
+
## Agent 路由命中率
|
|
135
|
+
{agents: <name> built/total (pct%) · …} ← from `runs.jsonl` last 50 cycles
|
|
136
|
+
|
|
130
137
|
<!-- 仅当 roll-.dream 有新发现时输出 -->
|
|
131
138
|
## 悟见
|
|
132
139
|
{来自 .roll/dream/ 的摘要}
|
|
@@ -52,6 +52,66 @@ Do not use for:
|
|
|
52
52
|
|
|
53
53
|
Activate when input is a `US-[A-Z]+-[0-9]+` identifier.
|
|
54
54
|
|
|
55
|
+
### Step 0: Pre-flight self-check (US-AGENT-007)
|
|
56
|
+
|
|
57
|
+
Before reading the Story in depth or splitting actions, **read the Agent profile** from the story's feature md and decide whether this cycle can realistically deliver it. The check is mechanical:
|
|
58
|
+
|
|
59
|
+
```
|
|
60
|
+
inputs:
|
|
61
|
+
story.est_min (from **Agent profile:** block, US-AGENT-001)
|
|
62
|
+
story.risk_zone (low / medium / high)
|
|
63
|
+
story.chain_depth (0 unless already a downgrade product)
|
|
64
|
+
agent.max_est_min (from .roll/agent-routes.yaml for the current agent)
|
|
65
|
+
history.prefer_threshold (from .roll/agent-routes.yaml)
|
|
66
|
+
history.hit_rate (this agent × this story_type, last window_cycles)
|
|
67
|
+
|
|
68
|
+
verdict:
|
|
69
|
+
too_big when ANY of these is true:
|
|
70
|
+
1. story.est_min > agent.max_est_min (hard capacity miss)
|
|
71
|
+
2. story.risk_zone not in agent.risk (hard risk miss)
|
|
72
|
+
3. history.hit_rate < prefer_threshold AND story.chain_depth == 0
|
|
73
|
+
(soft signal: history says this agent's not on top of this type yet,
|
|
74
|
+
and we still have downgrade budget — don't burn a cycle)
|
|
75
|
+
ok otherwise
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
Output the verdict as the first line of the cycle response:
|
|
79
|
+
|
|
80
|
+
```yaml
|
|
81
|
+
verdict: ok # or: too_big
|
|
82
|
+
reason: <one short line — which condition triggered, with numbers>
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
When `verdict: ok` → continue to Step 1 normally.
|
|
86
|
+
When `verdict: too_big` → go to **US-AGENT-008 self-downgrade path**, **but** first run the **US-AGENT-009 chain_depth cap check**:
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
# 0a. Cap check: refuse the third consecutive auto-split.
|
|
90
|
+
# exit 0 → split allowed; exit 1 → cap hit, take cap-hit path instead.
|
|
91
|
+
if ! bash -c 'source "$(command -v roll)"; _loop_chain_depth_cap_check US-XXX-NNN'; then
|
|
92
|
+
# Cap hit (chain_depth ≥ 2): hold + ALERT, exit cleanly.
|
|
93
|
+
bash -c 'source "$(command -v roll)"; _loop_split_cap_hit US-XXX-NNN "depth >= 2, human triage required"'
|
|
94
|
+
exit 0
|
|
95
|
+
fi
|
|
96
|
+
|
|
97
|
+
# 1. Invoke roll-design to re-split the story into smaller sub-stories.
|
|
98
|
+
# Each sub-story carries chain_depth = (parent.chain_depth + 1).
|
|
99
|
+
# Sub-stories land as 📋 Todo with depends-on:<parent> chained.
|
|
100
|
+
Skill("roll-design", "--from-story US-XXX-NNN")
|
|
101
|
+
|
|
102
|
+
# 2. After the sub-stories are written to BACKLOG, flip the parent
|
|
103
|
+
# to 🚫 Hold and emit the downgrade event. The helper handles ALERT.
|
|
104
|
+
bash -c 'source "$(command -v roll)"; _loop_self_downgrade US-XXX-NNN "too_big: <reason from verdict>" "US-XXX-NNNa,US-XXX-NNNb"'
|
|
105
|
+
|
|
106
|
+
# 3. Exit cleanly — no TCR commits this cycle. The next loop cycle picks
|
|
107
|
+
# up the first sub-story (which is smaller and should pass pre-flight).
|
|
108
|
+
exit 0
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
If `roll-design` cannot produce ≥2 sub-stories (story is already irreducible), fall through to **US-AGENT-009 cap-hit path** by invoking `_loop_split_cap_hit` directly. The cap is purely about stopping infinite split chains; even on the first re-split, if the design step gives up, the cap-hit handler raises ALERT for human triage.
|
|
112
|
+
|
|
113
|
+
> Pre-flight is honest, not paranoid: a small story (est_min ≤ 5, chain_depth=0, low risk) should almost always go `ok`. The check pays off on the long tail — stories that look small but compose tons of files, or that the current agent has historically failed.
|
|
114
|
+
|
|
55
115
|
### Step 1: Read the Story
|
|
56
116
|
|
|
57
117
|
1. Open `.roll/backlog.md`, find the US row, follow the link to `.roll/features/<feature>.md`
|
|
@@ -64,6 +124,16 @@ Activate when input is a `US-[A-Z]+-[0-9]+` identifier.
|
|
|
64
124
|
- Pick the smallest shippable Action first
|
|
65
125
|
- **Granularity constraint**: Each Action completable in 2–5 minutes; split if larger
|
|
66
126
|
- **No placeholders**: Action descriptions must be specific and directly executable
|
|
127
|
+
- **Test-quality self-check (US-QA-011)** — for every Action that adds tests:
|
|
128
|
+
1. Tests call project functions / public command entry points; do NOT inline
|
|
129
|
+
external-tool behaviour (`sed`/`awk`/`grep`/`find`/`cut` pipelines that
|
|
130
|
+
duplicate logic already in `lib/` or `bin/`) — rubric ❼.
|
|
131
|
+
2. Tests sandbox filesystem state via `BATS_TMPDIR` (or equivalent); do NOT
|
|
132
|
+
touch or assert on paths outside this repo (`~/.codex`, `~/.kimi`,
|
|
133
|
+
`~/.roll/`, `/etc/...`) — rubric ❽.
|
|
134
|
+
3. If you can't satisfy (1) or (2), reshape the Action: extract a project
|
|
135
|
+
helper, redirect the env var to a tmp dir, or move the test to an
|
|
136
|
+
integration tier where the boundary is intentional and documented.
|
|
67
137
|
|
|
68
138
|
#### 2.5 Parallel Dispatch (auto-determined)
|
|
69
139
|
|
|
@@ -636,8 +706,33 @@ Before creating any file or directory:
|
|
|
636
706
|
- [ ] **.roll/backlog.md index status updated** (📋 → ✅, REQUIRED)
|
|
637
707
|
- [ ] **`.roll/features/<feature>.md` US section updated** (Completed date + [x] ACs, REQUIRED)
|
|
638
708
|
- [ ] **CHANGELOG.md staged and bundled** into completion commit via `$roll-.changelog` in Phase 11 (REQUIRED)
|
|
709
|
+
- [ ] **Self-score note written (US-SKILL-010 / 012)** — see "Self-score" subsection below
|
|
639
710
|
- [ ] Summary reported to user
|
|
640
711
|
|
|
712
|
+
### Self-score (US-SKILL-012)
|
|
713
|
+
|
|
714
|
+
Before reporting completion to the user, write one self-score note. The
|
|
715
|
+
helper lands the note under `.roll/notes/<date>-roll-build-<US-id>-<epoch>.md`
|
|
716
|
+
with YAML frontmatter so trend analysis (US-SKILL-014) can aggregate later:
|
|
717
|
+
|
|
718
|
+
```bash
|
|
719
|
+
bash -c 'source "$(command -v roll)"; \
|
|
720
|
+
_skill_write_self_score roll-build US-XXX-NNN <score 1..10> <good|ok|regression> "<rationale>"'
|
|
721
|
+
```
|
|
722
|
+
|
|
723
|
+
Score guidance (integer 1..10):
|
|
724
|
+
- **9..10** — story shipped cleanly: AC fully met, TCR rhythm tight, no
|
|
725
|
+
re-tries from `verdict: too_big`, peer review concerns addressed inline.
|
|
726
|
+
- **6..8** — shipped with caveats: re-tries on red, edge case left to a
|
|
727
|
+
follow-up FIX, documentation lagged behind code by one cycle, etc.
|
|
728
|
+
- **1..5** — shipped but at low confidence: AC partially met (note which),
|
|
729
|
+
TCR rhythm broken (multiple revert iterations), or `regression` verdict.
|
|
730
|
+
|
|
731
|
+
Verdict values:
|
|
732
|
+
- `good` — story fully delivered; AC met; no concerning signal.
|
|
733
|
+
- `ok` — shipped but with at least one documented trade-off (use rationale).
|
|
734
|
+
- `regression` — story landed but another behaviour broke (rare; open a FIX).
|
|
735
|
+
|
|
641
736
|
---
|
|
642
737
|
|
|
643
738
|
## TCR Recovery Patterns
|
|
@@ -681,6 +681,11 @@ Note: `{DOMAIN}` maps to the Bounded Context name identified in DDD analysis.
|
|
|
681
681
|
- Events raised: [{EventName}] → {consumer context}
|
|
682
682
|
- Cross-context: {if touches another context, otherwise omit}
|
|
683
683
|
|
|
684
|
+
**Agent profile:**
|
|
685
|
+
- est_min: {1-30 整数,目标 5-10 min 一个 cycle 闭环}
|
|
686
|
+
- risk_zone: {low / medium / high — 改文档 low,改用户可见行为 medium,改 loop infra 或安全/隔离基建 high}
|
|
687
|
+
- chain_depth: 0 {若是自降级产出的子 story 则 +1,累计 ≥2 时第 3 次拒拆}
|
|
688
|
+
|
|
684
689
|
**AC:**
|
|
685
690
|
- [ ] {measurable criteria 1}
|
|
686
691
|
- [ ] {measurable criteria 2}
|
|
@@ -700,6 +705,10 @@ Note: `{DOMAIN}` maps to the Bounded Context name identified in DDD analysis.
|
|
|
700
705
|
- Integration test: `tests/integration/{flow}.test.ts`
|
|
701
706
|
```
|
|
702
707
|
|
|
708
|
+
> **强制规则 — Agent profile 必须填**:Split into Stories 步骤产出的每个 US 都必须带 `**Agent profile:**` 子段,est_min / risk_zone 不可省(chain_depth 默认 0)。loop 路由(US-AGENT-004)和 agent 自评(US-AGENT-007)都靠这两个字段决策,缺了就回退到 cold_start_default 并 WARN。历史 US 不强制回填。
|
|
709
|
+
>
|
|
710
|
+
> **MUST fill** the `**Agent profile:**` block on every newly split US — `est_min` and `risk_zone` are non-optional. They drive loop routing and agent self-eval downstream.
|
|
711
|
+
|
|
703
712
|
### Closing Doc-Refresh Story Template — Phase N.M 收尾文档
|
|
704
713
|
|
|
705
714
|
When any preceding US in the batch changes user-visible behavior, append this template story at the end of the batch. Wire it as `depends-on:` against every preceding user-facing US so it runs last.
|
|
@@ -762,6 +771,42 @@ Each story must be:
|
|
|
762
771
|
|
|
763
772
|
---
|
|
764
773
|
|
|
774
|
+
## Self-score (US-SKILL-010 / 013)
|
|
775
|
+
|
|
776
|
+
After Step 5 (Write to BACKLOG) completes — i.e. once the new US rows are
|
|
777
|
+
landed and the user has either confirmed or chosen `No` (story still
|
|
778
|
+
queued) — write a single self-score note covering the design session:
|
|
779
|
+
|
|
780
|
+
```bash
|
|
781
|
+
bash -c 'source "$(command -v roll)"; \
|
|
782
|
+
_skill_write_self_score roll-design US-XXX-NNN <score 1..10> <good|ok|regression> "<rationale>"'
|
|
783
|
+
```
|
|
784
|
+
|
|
785
|
+
Use the **first** US-id of the batch as the story handle (or the
|
|
786
|
+
representative story when splitting). The note lands under
|
|
787
|
+
`.roll/notes/<date>-roll-design-<id>-<epoch>.md` so US-SKILL-014 can
|
|
788
|
+
read trend data.
|
|
789
|
+
|
|
790
|
+
Score guidance for design quality (integer 1..10):
|
|
791
|
+
- **9..10** — clean split: every US is INVEST-compliant, profile
|
|
792
|
+
(est_min / risk_zone / chain_depth) filled, doc-refresh closer wired
|
|
793
|
+
when user-visible behaviour changed, peer-review unnecessary or
|
|
794
|
+
reached AGREE quickly.
|
|
795
|
+
- **6..8** — split shipped but with caveats: one US borderline on
|
|
796
|
+
INVEST (e.g. shared file conflict), or Discuss had ESCALATE before
|
|
797
|
+
settling, or profile was partial.
|
|
798
|
+
- **1..5** — split shipped but rough: missing doc-refresh closer,
|
|
799
|
+
profile fields skipped, user story too coarse for an AI cycle; flag a
|
|
800
|
+
follow-up `roll-design` pass.
|
|
801
|
+
|
|
802
|
+
Verdict values:
|
|
803
|
+
- `good` — design + split are clean.
|
|
804
|
+
- `ok` — design is acceptable with one or two trade-offs noted.
|
|
805
|
+
- `regression` — the split visibly broke something earlier (rare; e.g.
|
|
806
|
+
invalidated a previously-stable depends-on chain).
|
|
807
|
+
|
|
808
|
+
---
|
|
809
|
+
|
|
765
810
|
## Integration
|
|
766
811
|
|
|
767
812
|
### With roll-build
|
package/skills/roll-fix/SKILL.md
CHANGED
|
@@ -87,6 +87,46 @@ Before creating any file or directory:
|
|
|
87
87
|
|
|
88
88
|
## TCR Workflow
|
|
89
89
|
|
|
90
|
+
### 0. Pre-flight self-check (US-AGENT-007)
|
|
91
|
+
|
|
92
|
+
Before locking the issue, read the FIX's Agent profile (est_min / risk_zone / chain_depth) from the linked feature md and decide whether this cycle should attempt the fix:
|
|
93
|
+
|
|
94
|
+
```
|
|
95
|
+
inputs:
|
|
96
|
+
fix.est_min (from **Agent profile:** block on the FIX row's feature md)
|
|
97
|
+
fix.risk_zone (low / medium / high)
|
|
98
|
+
fix.chain_depth (0 unless already a downgrade product)
|
|
99
|
+
agent.max_est_min (from .roll/agent-routes.yaml for the current agent)
|
|
100
|
+
history.prefer_threshold + history.hit_rate (FIX history for this agent)
|
|
101
|
+
|
|
102
|
+
verdict:
|
|
103
|
+
too_big when ANY:
|
|
104
|
+
1. fix.est_min > agent.max_est_min
|
|
105
|
+
2. fix.risk_zone not in agent.risk
|
|
106
|
+
3. history.hit_rate < prefer_threshold AND fix.chain_depth == 0
|
|
107
|
+
ok otherwise
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
Emit `verdict: ok` or `verdict: too_big` (with `reason:`) as the first cycle output line.
|
|
111
|
+
|
|
112
|
+
- `ok` → continue with step 1 below normally
|
|
113
|
+
- `too_big` → self-downgrade per US-AGENT-008, **gated by US-AGENT-009 cap check**:
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
# Cap check first (chain_depth ≥ 2 → refuse third auto-split).
|
|
117
|
+
if ! bash -c 'source "$(command -v roll)"; _loop_chain_depth_cap_check FIX-XXX-NNN'; then
|
|
118
|
+
bash -c 'source "$(command -v roll)"; _loop_split_cap_hit FIX-XXX-NNN "depth >= 2"'
|
|
119
|
+
exit 0
|
|
120
|
+
fi
|
|
121
|
+
Skill("roll-design", "--from-story FIX-XXX-NNN")
|
|
122
|
+
bash -c 'source "$(command -v roll)"; _loop_self_downgrade FIX-XXX-NNN "too_big: <reason>" "FIX-XXX-NNNa,FIX-XXX-NNNb"'
|
|
123
|
+
exit 0
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
Original FIX goes to 🚫 Hold with `→ split to ...` annotation; sub-stories carry `chain_depth + 1`. Cap-hit path raises ALERT for human triage. Do NOT TCR a half fix.
|
|
127
|
+
|
|
128
|
+
Bug fixes are usually small (est_min ≤ 5), so pre-flight is mostly a sanity barrier for FIXes whose underlying issue turns out structural — e.g. a "simple null check" that requires touching 12 files. Catching that upfront is cheaper than burning a cycle.
|
|
129
|
+
|
|
90
130
|
### 1. Lock the issue
|
|
91
131
|
- state the user-visible issue or requested enhancement
|
|
92
132
|
- define the scope boundary and non-goals
|
|
@@ -96,6 +136,15 @@ Before creating any file or directory:
|
|
|
96
136
|
- define the online verification target
|
|
97
137
|
- for hotfixes: include regression test to prevent recurrence
|
|
98
138
|
- reference `$roll-.qa` for appropriate test type (unit/integration/E2E)
|
|
139
|
+
- **Test-quality self-check (US-QA-011)** — for any new test the fix adds:
|
|
140
|
+
1. The test must call project functions / public command entry points,
|
|
141
|
+
not inline `sed`/`awk`/`grep -o`/`find`/`cut` pipelines that
|
|
142
|
+
re-implement what `lib/` or `bin/` already does — rubric ❼.
|
|
143
|
+
2. The test must sandbox filesystem state via `BATS_TMPDIR` or an
|
|
144
|
+
equivalent helper; never assert on or write to paths outside this
|
|
145
|
+
repo (`~/.codex`, `~/.kimi`, `~/.roll/`, system paths) — rubric ❽.
|
|
146
|
+
3. If you can't satisfy (1) or (2), extract a project helper or
|
|
147
|
+
redirect the env var to a tmp dir before writing the test.
|
|
99
148
|
|
|
100
149
|
### 3. Test Design Review (TCR Core)
|
|
101
150
|
|
|
@@ -366,6 +415,33 @@ A minor change is only "done" when all are true:
|
|
|
366
415
|
- [ ] Deployment completed
|
|
367
416
|
- [ ] Online verification performed
|
|
368
417
|
- [ ] **Verification Gate passed** (fresh evidence for tests, build, fix confirmation, no regression)
|
|
418
|
+
- [ ] **Self-score note written (US-SKILL-010 / 011)** — before exit, the agent
|
|
419
|
+
writes a structured score note via `_skill_write_self_score` so trend
|
|
420
|
+
analysis (US-SKILL-014) and skill-self-scoring docs (US-SKILL-015) have
|
|
421
|
+
data to read.
|
|
422
|
+
|
|
423
|
+
### Self-score (US-SKILL-011)
|
|
424
|
+
|
|
425
|
+
Before exiting the cycle, write one self-score note. The helper validates
|
|
426
|
+
inputs and lands the note under `.roll/notes/<date>-roll-fix-<FIX-id>-<epoch>.md`:
|
|
427
|
+
|
|
428
|
+
```bash
|
|
429
|
+
bash -c 'source "$(command -v roll)"; \
|
|
430
|
+
_skill_write_self_score roll-fix FIX-XXX-NNN <score 1..10> <good|ok|regression> "<rationale>"'
|
|
431
|
+
```
|
|
432
|
+
|
|
433
|
+
Score guidance (integer 1..10):
|
|
434
|
+
- **9..10** — clean root-cause fix; regression test added; TCR cycle smooth.
|
|
435
|
+
- **6..8** — fix shipped but with caveats (e.g. workaround, partial coverage,
|
|
436
|
+
or repeated TCR red iterations); rationale explains the trade-off.
|
|
437
|
+
- **1..5** — fix landed but quality is below the bar (test coverage missing,
|
|
438
|
+
fix only narrows blast radius, repeated agent re-tries). Verdict should be
|
|
439
|
+
`ok`, or `regression` if a related test broke.
|
|
440
|
+
|
|
441
|
+
Verdict values:
|
|
442
|
+
- `good` — fix is the proper root-cause fix; no caveats.
|
|
443
|
+
- `ok` — shipped but with documented trade-offs (use rationale to explain).
|
|
444
|
+
- `regression` — the fix re-broke something else (rare; consider re-opening).
|
|
369
445
|
|
|
370
446
|
## Rubric
|
|
371
447
|
|
|
@@ -240,6 +240,19 @@ Together these mean: only one loop runs at a time per project (LOCK), and within
|
|
|
240
240
|
|
|
241
241
|
### Step 3 — Route and Execute
|
|
242
242
|
|
|
243
|
+
> **US-AGENT-006 — Per-story agent routing (pre-cycle)**
|
|
244
|
+
>
|
|
245
|
+
> Before this skill even starts, the runner inner script has already:
|
|
246
|
+
> 1. Picked the next eligible Todo via `_loop_pick_next_story` (priority FIX > US > REFACTOR, manual-only / depends-on gates respected)
|
|
247
|
+
> 2. Read its Agent profile (est_min / risk_zone) and routed an agent via `_loop_pick_agent_for_story` (hard rules from `.roll/agent-routes.yaml` + soft preference from `runs.jsonl`)
|
|
248
|
+
> 3. Exported `ROLL_LOOP_ROUTED_STORY` / `ROLL_LOOP_ROUTED_AGENT` / `ROLL_LOOP_ROUTED_RULE` and printed `[loop] story <id> routed to <agent> via <rule_kind>` to cron.log
|
|
249
|
+
>
|
|
250
|
+
> When `ROLL_LOOP_ROUTED_STORY` is set, prefer it as `US_ID` for this cycle. The story has already been chosen by hard+soft routing rules — do not re-pick a different one unless that story can no longer be found in BACKLOG (e.g. status changed concurrently).
|
|
251
|
+
>
|
|
252
|
+
> Old single-agent fallback (`primary_agent` from `~/.roll/config.yaml`) still applies when:
|
|
253
|
+
> - no story is pickable (empty Todo / all manual-only)
|
|
254
|
+
> - the matching agent-routes.yaml has no agent that fits the story profile (then `cold_start_default` is used)
|
|
255
|
+
|
|
243
256
|
For each item, **before invoking the executor skill**, mark the story 🔨 In Progress in the **main repo's** .roll/backlog.md so brief and peer agents can see it being worked on. The cycle worktree is gitignored at .roll/, so editing the worktree's own copy + committing carries no change back to main — write directly via the helper instead:
|
|
244
257
|
|
|
245
258
|
```bash
|
|
@@ -115,6 +115,12 @@ privacy:
|
|
|
115
115
|
|
|
116
116
|
sync_targets: [claude, cursor] # user's Q8
|
|
117
117
|
enable_loop: false # user's Q9
|
|
118
|
+
agent_routes_template: default # user's Q10 — agent routing preset
|
|
119
|
+
# one of: default / minimal / heavy / skip
|
|
120
|
+
# default = pi/deepseek/claude + history (US-AGENT-002)
|
|
121
|
+
# minimal = single agent (pi), no history
|
|
122
|
+
# heavy = pi/deepseek/claude/kimi + larger window
|
|
123
|
+
# skip = don't seed .roll/agent-routes.yaml
|
|
118
124
|
```
|
|
119
125
|
|
|
120
126
|
Then tell the user:
|