@seanyao/roll 2026.527.1 → 2026.528.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,27 @@
1
1
  # Changelog
2
2
 
3
+ ## v2026.528.2
4
+
5
+ ### Added
6
+
7
+ - **loop 换机器跑不会再拿过期 backlog** — 每轮自动拉最新项目元数据 `[loop]`
8
+ - **CI 红了 loop 不再干等** — 先试着自己修,修不好再找人 `[loop]`
9
+
10
+ ### Fixed
11
+
12
+ - **`roll loop log` 现在真的能看了** — 每轮 cycle 的留档修好了 `[loop]`
13
+ - **loop 跑完的终端窗口不再瞬间清空** — 关闭前能看到本轮摘要 `[loop]`
14
+
15
+ ## v2026.528.1
16
+
17
+ ### Added
18
+
19
+ - **`roll test` — 测试隔离运行,不再误伤本机 loop 服务** `[loop]`
20
+
21
+ ### Fixed
22
+
23
+ - **Kimi CLI 升级改名为 kimi-code 后,roll 现在能正常识别** `[loop]`
24
+
3
25
  ## v2026.527.1
4
26
 
5
27
  ### Added
package/README.md CHANGED
@@ -69,6 +69,7 @@ roll loop on # let AI work through the backlog (optional)
69
69
  | Configuration (env vars) | [guide/en/configuration.md](guide/en/configuration.md) | [guide/zh/configuration.md](guide/zh/configuration.md) |
70
70
  | Skill selection guide | [guide/en/skills.md](guide/en/skills.md) | [guide/zh/skills.md](guide/zh/skills.md) |
71
71
  | Slides (deck generator) | [guide/en/slides.md](guide/en/slides.md) | [guide/zh/slides.md](guide/zh/slides.md) |
72
+ | Test isolation (`roll test` + Tart VM) | [guide/en/test-isolation.md](guide/en/test-isolation.md) | [guide/zh/test-isolation.md](guide/zh/test-isolation.md) |
72
73
  | Cross-machine sync | [guide/en/loop.md#cross-machine-sync](guide/en/loop.md#cross-machine-sync) | [guide/zh/loop.md#跨机器同步](guide/zh/loop.md#%E8%B7%A8%E6%9C%BA%E5%99%A8%E5%90%8C%E6%AD%A5) |
73
74
  | Pricing (cost visibility) | [guide/en/pricing.md](guide/en/pricing.md) | [guide/zh/pricing.md](guide/zh/pricing.md) |
74
75
  | FAQ (troubleshooting) | [guide/en/faq.md](guide/en/faq.md) | [guide/zh/faq.md](guide/zh/faq.md) |
package/bin/roll CHANGED
@@ -4,7 +4,7 @@ set -euo pipefail
4
4
  # Roll — AI Agent Convention Manager
5
5
  # Single source of truth for how all AI coding agents behave.
6
6
 
7
- VERSION="2026.527.1"
7
+ VERSION="2026.528.2"
8
8
  ROLL_HOME="${ROLL_HOME:-${HOME}/.roll}"
9
9
  ROLL_CONFIG="${ROLL_HOME}/config.yaml"
10
10
  ROLL_GLOBAL="${ROLL_HOME}/conventions/global"
@@ -70,6 +70,11 @@ ai_tool_name() {
70
70
  # Antigravity (agy) reuses ~/.gemini/ from the deprecated Gemini CLI for
71
71
  # its config dir, so a literal `gemini` basename now identifies agy.
72
72
  [[ "$bn" == "gemini" ]] && bn="agy"
73
+ # FIX-126: Kimi upstream renamed its CLI to kimi-code and its config dir
74
+ # to ~/.kimi-code/; map both old and new basenames to the canonical
75
+ # "kimi" agent identifier so downstream argv / config / sync paths stay
76
+ # uniform across the upgrade.
77
+ [[ "$bn" == "kimi-code" ]] && bn="kimi"
73
78
  echo "$bn"
74
79
  }
75
80
 
@@ -266,6 +271,7 @@ _ensure_config_entries() {
266
271
  "ai_claude:~/.claude|CLAUDE.md|CLAUDE.md"
267
272
  "ai_agy:~/.gemini|GEMINI.md|GEMINI.md"
268
273
  "ai_kimi:~/.kimi|AGENTS.md|AGENTS.md"
274
+ "ai_kimi_code:~/.kimi-code|AGENTS.md|AGENTS.md"
269
275
  "ai_codex:~/.codex|AGENTS.md|AGENTS.md"
270
276
  "ai_cursor:~/.cursor|.cursor-rules|.cursor-rules"
271
277
  "ai_trae:~/.trae|user_rules.md|project_rules.md"
@@ -490,6 +496,7 @@ _install_local() {
490
496
  ai_claude: ~/.claude|CLAUDE.md|CLAUDE.md
491
497
  ai_gemini: ~/.gemini|GEMINI.md|GEMINI.md
492
498
  ai_kimi: ~/.kimi|AGENTS.md|AGENTS.md
499
+ ai_kimi_code: ~/.kimi-code|AGENTS.md|AGENTS.md
493
500
  ai_codex: ~/.codex|AGENTS.md|AGENTS.md
494
501
  ai_cursor: ~/.cursor|.cursor-rules|.cursor-rules
495
502
  ai_trae: ~/.trae|user_rules.md|project_rules.md
@@ -740,6 +747,22 @@ _setup_snapshot() {
740
747
  # _ROLL_SETUP_STATE. Caller passes the watch dir(s) plus the command + args.
741
748
  # stdout/stderr of the inner command are suppressed (same as the previous
742
749
  # pattern in cmd_setup) to keep the v2 UI render the only user-visible output.
750
# US-INFRA-008: point core.hooksPath at the repo-relative `hooks` directory so
# the TCR pre-commit gate is not silently skipped in fresh clones, worktrees,
# or automated environments.
#   $1  repository path (defaults to $PWD)
# A custom (non-default) hooksPath is left alone, a path that is not a git
# repository is skipped silently, and the function always returns 0.
_ensure_hooks_path() {
  local repo="${1:-$PWD}"
  local configured

  # Nothing to configure outside a git repository.
  if ! git -C "$repo" rev-parse --git-dir >/dev/null 2>&1; then
    return 0
  fi

  # `git config <key>` exits non-zero when the key is unset; read that as "".
  configured=$(git -C "$repo" config core.hooksPath 2>/dev/null || true)

  # Take over only when the value is unset or still the git default.
  case "$configured" in
    ""|".git/hooks")
      git -C "$repo" config core.hooksPath hooks 2>/dev/null || true
      ;;
  esac
  return 0
}
765
+
743
766
  _run_setup_step() {
744
767
  local watch="$1"; shift
745
768
  local before after
@@ -782,7 +805,7 @@ cmd_setup() {
782
805
  esac
783
806
  }
784
807
 
785
- local _ai_dirs="$HOME/.claude:$HOME/.gemini:$HOME/.kimi:$HOME/.codex:$HOME/.cursor:$HOME/.trae:$HOME/.config/opencode:$HOME/.openclaw:$HOME/.pi:$HOME/.deepseek:$HOME/.qwen"
808
+ local _ai_dirs="$HOME/.claude:$HOME/.gemini:$HOME/.kimi:$HOME/.kimi-code:$HOME/.codex:$HOME/.cursor:$HOME/.trae:$HOME/.config/opencode:$HOME/.openclaw:$HOME/.pi:$HOME/.deepseek:$HOME/.qwen"
786
809
 
787
810
  _run_setup_step "$ROLL_HOME" _install_local "$force"
788
811
  _record "$(_state_to_marker "$_ROLL_SETUP_STATE")" "Install templates & conventions to ~/.roll"
@@ -796,6 +819,10 @@ cmd_setup() {
796
819
  _run_setup_step "$ROLL_HOME/.peer-state" _peer_ensure_state_dir
797
820
  _record "$(_state_to_marker "$_ROLL_SETUP_STATE")" "Initialize peer-review state directory"
798
821
 
822
+ # US-INFRA-008: configure git hooks path so TCR pre-commit gate works in this repo
823
+ _run_setup_step "$PWD" _ensure_hooks_path
824
+ _record "$(_state_to_marker "$_ROLL_SETUP_STATE")" "Configure git hooks path"
825
+
799
826
  if command -v tmux >/dev/null 2>&1; then
800
827
  _record skip "Ensure tmux is installed (already present)"
801
828
  else
@@ -1928,6 +1955,18 @@ PY
1928
1955
  return 0
1929
1956
  fi
1930
1957
 
1958
+ # FIX-125: cycle-context tripwire. Apply phase below runs launchctl unload
1959
+ # and rm against ${HOME}/Library/LaunchAgents/<plist> (bin/roll:1957-1958).
1960
+ # From inside a loop cycle this would mutate the host's launchd domain
1961
+ # using another project's identity. Doc-only offboards (no plists) stay
1962
+ # allowed so cycles can still call offboard for non-launchd cleanup.
1963
+ if [ "${#plists[@]}" -gt 0 ] && _loop_in_cycle; then
1964
+ err "Refusing to unload launchd plists from inside a loop cycle (FIX-125)."
1965
+ echo " Run 'roll offboard --confirm' from a terminal outside the cycle," >&2
1966
+ echo " or pause the loop first: 'roll loop pause'." >&2
1967
+ return 1
1968
+ fi
1969
+
1931
1970
  # Apply. Guard every loop with a count check — `set -u` upstream makes
1932
1971
  # naked `"${arr[@]}"` over an empty array a hard error on bash 5.0.
1933
1972
  echo "$(msg offboard.applying_offboard)"
@@ -2700,7 +2739,7 @@ _peer_dispatch_in_tmux() {
2700
2739
  {
2701
2740
  printf '#!/bin/bash -l\n'
2702
2741
  # FIX-050: portable PATH assembly (was hardcoded /opt/homebrew/bin)
2703
- printf 'for _d in /opt/homebrew/bin /usr/local/bin /opt/local/bin "$HOME/.local/bin"; do\n'
2742
+ printf 'for _d in /opt/homebrew/bin /usr/local/bin /opt/local/bin "$HOME/.local/bin" "$HOME/.kimi-code/bin"; do\n'
2704
2743
  printf ' case ":$PATH:" in *":$_d:"*) ;; *) [ -d "$_d" ] && PATH="$_d:$PATH" ;; esac\n'
2705
2744
  printf 'done; export PATH\n'
2706
2745
  printf '%s > %q 2> %q || true\n' "$cmd_str" "$out_file" "$err_file"
@@ -3133,9 +3172,20 @@ _agent_argv() {
3133
3172
  *) _AGENT_ARGV=(claude -p "$prompt") ;;
3134
3173
  esac ;;
3135
3174
  kimi)
3175
+ # FIX-126: Kimi upstream renamed binary from kimi-cli → kimi-code.
3176
+ # Prefer the new name when present; fall back through legacy names
3177
+ # so users mid-upgrade keep working until they reinstall.
3178
+ local _kimi_bin
3179
+ if command -v kimi-code >/dev/null 2>&1; then
3180
+ _kimi_bin=kimi-code
3181
+ elif command -v kimi-cli >/dev/null 2>&1; then
3182
+ _kimi_bin=kimi-cli
3183
+ else
3184
+ _kimi_bin=kimi
3185
+ fi
3136
3186
  case "$mode" in
3137
- interactive) _AGENT_ARGV=(kimi "$prompt") ;;
3138
- *) _AGENT_ARGV=(kimi --quiet -p "$prompt") ;;
3187
+ interactive) _AGENT_ARGV=("$_kimi_bin" "$prompt") ;;
3188
+ *) _AGENT_ARGV=("$_kimi_bin" --quiet -p "$prompt") ;;
3139
3189
  esac ;;
3140
3190
  deepseek)
3141
3191
  # deepseek has the same argv shape in both modes (positional prompt).
@@ -4417,6 +4467,397 @@ cmd_agent() {
4417
4467
  esac
4418
4468
  }
4419
4469
 
4470
+ # ═══════════════════════════════════════════════════════════════════════════════
4471
+ # ISOLATION — pluggable adapter for running tests in an isolated environment
4472
+ # (US-ISO-001). Phase 1 supports two providers: `none` (default — direct host
4473
+ # execution) and `tart` (US-ISO-002 — macOS VM). The dispatcher reads
4474
+ # .roll/local.yaml's `test_isolation.type` and routes to
4475
+ # `_isolation_<type>_<method>`. See .roll/features/engineering-infrastructure/
4476
+ # dev-vm-isolation-plan.md for the full interface contract.
4477
+ # ═══════════════════════════════════════════════════════════════════════════════
4478
+
4479
+ _ISOLATION_SUPPORTED_TYPES="none tart"
4480
+
4481
# Print the configured isolation provider (`test_isolation.type` from
# .roll/local.yaml, resolved relative to the current directory) on stdout.
# Prints "none" when the file is absent, python3 is unavailable, PyYAML
# cannot be imported, the file cannot be parsed, or the key is missing or
# not a non-empty string.
_isolation_get_type() {
  local cfg_type=""
  if [[ -f .roll/local.yaml ]] && command -v python3 >/dev/null 2>&1; then
    cfg_type=$(python3 - <<'PY' 2>/dev/null
import sys

# Import and parse failures both mean "no usable config": exit quietly.
try:
    import yaml
    with open(".roll/local.yaml") as fh:
        doc = yaml.safe_load(fh) or {}
except Exception:
    sys.exit(0)

section = doc.get("test_isolation")
kind = section.get("type") if isinstance(section, dict) else None
if isinstance(kind, str) and kind:
    print(kind)
PY
)
  fi
  # Empty result (any of the fallbacks above) maps to the default provider.
  printf '%s\n' "${cfg_type:-none}"
}
4510
+
4511
# Dispatch an isolation-adapter method to the configured provider.
# Usage:   _isolation_dispatch <method> [args...]
# Methods: init / provision / exec / status / reset / destroy
# Returns: 1 when the method is missing, the configured type is not listed in
#          _ISOLATION_SUPPORTED_TYPES, or the provider lacks the method;
#          otherwise the status of `_isolation_<type>_<method> [args...]`.
_isolation_dispatch() {
  # ${1:-} rather than $1: the script runs under `set -u`, and a bare $1 with
  # no arguments would abort with "unbound variable" before the usage error
  # below could be printed.
  local method="${1:-}"
  if [[ -z "$method" ]]; then
    err "isolation: missing method"
    echo "  usage: _isolation_dispatch <init|provision|exec|status|reset|destroy> [args...]" >&2
    return 1
  fi
  shift

  # Resolve provider; emit a fallback-INFO line only when the config file is
  # missing (so an explicit `type: none` stays quiet). Goes to stderr so the
  # actual dispatch output (e.g. exec stdout) stays clean.
  local type; type=$(_isolation_get_type)
  if [[ "$type" = "none" ]] && [[ ! -f .roll/local.yaml ]]; then
    info "isolation: no test_isolation config, falling back to type=none (host)" >&2
  fi

  # Reject unknown types up front so the error names the provider rather than
  # a missing function ("you typed it wrong" vs "the adapter is broken").
  local supported_ok=0 t
  for t in $_ISOLATION_SUPPORTED_TYPES; do
    if [[ "$type" = "$t" ]]; then
      supported_ok=1
      break
    fi
  done
  if (( ! supported_ok )); then
    err "isolation: unknown type '$type' in .roll/local.yaml"
    echo "  supported types: ${_ISOLATION_SUPPORTED_TYPES// /, }" >&2
    return 1
  fi

  local fn="_isolation_${type}_${method}"
  if ! declare -F "$fn" >/dev/null 2>&1; then
    err "isolation: provider '$type' has no '${method}' implementation"
    return 1
  fi
  "$fn" "$@"
}
4550
+
4551
# ── `none` adapter (default — direct host execution) ──────────────────────
# There is no environment to manage, so init / provision / destroy succeed
# without doing anything, exec runs its arguments as a command in the current
# shell, status always reports "ready", and reset only prints a note.

_isolation_none_init() {
  return 0
}

_isolation_none_provision() {
  return 0
}

# Run "$@" as-is on the host; the command's exit status is the result.
_isolation_none_exec() {
  "$@"
}

_isolation_none_status() {
  printf '%s\n' "ready"
  return 0
}

# US-ISO-004 AC: nothing to reset for type=none — explain on stderr and
# still succeed, since this is not a failure.
_isolation_none_reset() {
  info "isolation type 'none' has nothing to reset (host execution is stateless)" >&2
  return 0
}

_isolation_none_destroy() {
  return 0
}
4566
+
4567
# ─── reset lock (US-ISO-004) ──────────────────────────────────────────────
# A single lockfile under .roll/ (relative to the current directory) keeps
# two `roll test --reset` runs from racing, and lets the test-execution path
# bail out fast instead of running against a half-rebuilt VM. `--where` is
# read-only and deliberately does not consult the lock.

# Print the lockfile path.
_isolation_reset_lock_path() {
  echo ".roll/.iso-reset.lock"
}

# Returns 0 when the lockfile exists, 1 otherwise.
_isolation_reset_lock_held() {
  [[ -f "$(_isolation_reset_lock_path)" ]]
}

# Try to take the lock, recording this shell's PID in the lockfile.
# Returns 0 if the caller now holds the lock; 1 if someone else does or the
# lockfile could not be written.
_isolation_reset_acquire_lock() {
  local lock; lock=$(_isolation_reset_lock_path)
  mkdir -p "$(dirname "$lock")" || return 1
  # With noclobber set, the redirect fails when the file already exists, so
  # the existence check and the create happen as one step. A separate
  # `[[ -f ]]` test followed by a plain `>` would let two concurrent callers
  # both pass the test and both believe they own the lock.
  ( set -o noclobber; echo "$$" > "$lock" ) 2>/dev/null
}

# Drop the lock. Safe to call when no lockfile exists.
_isolation_reset_release_lock() {
  rm -f "$(_isolation_reset_lock_path)"
}
4594
+
4595
# ── `tart` adapter (US-ISO-002 — macOS Apple Silicon VM via Tart) ─────────
# Each accessor prints one setting on stdout. Environment overrides exist as
# test hooks; an unset or empty override falls back to the default:
#   _TART_VM_NAME    — VM identifier            (default: roll-dev-test)
#   _TART_BASE_IMAGE — OCI base image           (default: cirruslabs macos-tahoe-base)
#   _TART_SSH_USER   — SSH user inside the VM   (default: admin)

_isolation_tart_vm_name() {
  local vm="${_TART_VM_NAME:-roll-dev-test}"
  printf '%s\n' "$vm"
}

_isolation_tart_base_image() {
  local image="${_TART_BASE_IMAGE:-ghcr.io/cirruslabs/macos-tahoe-base:latest}"
  printf '%s\n' "$image"
}

_isolation_tart_ssh_user() {
  local login="${_TART_SSH_USER:-admin}"
  printf '%s\n' "$login"
}
4604
+
4605
# Succeed only when `uname` reports Darwin and `uname -m` reports arm64;
# otherwise print the bilingual error and return 1.
_isolation_tart_check_platform() {
  if [[ "$(uname)" == "Darwin" && "$(uname -m)" == "arm64" ]]; then
    return 0
  fi
  err "Tart 仅支持 Apple Silicon macOS"
  err "Tart only supports Apple Silicon macOS"
  return 1
}
4613
+
4614
# Succeed when a `tart` command is resolvable; otherwise print an install
# hint and return 1.
_isolation_tart_check_binary() {
  if command -v tart >/dev/null 2>&1; then
    return 0
  fi
  err "tart binary not found"
  err "  install via: brew install cirruslabs/cli/tart"
  return 1
}
4622
+
4623
# Returns 0 when the configured VM appears in `tart list`, non-zero
# otherwise. Prints nothing on stdout — callers rely on the exit status only.
# The name is accepted in either of the first two columns: a bare name list
# has it in column 1, while a table with a leading Source column has it in
# column 2.
# NOTE(review): the Source/Name column layout is assumed from Tart's table
# output — confirm against the Tart version in use.
_isolation_tart_vm_present() {
  local name; name=$(_isolation_tart_vm_name)
  tart list 2>/dev/null \
    | awk -v n="$name" '$1 == n || $2 == n { found = 1 } END { exit !found }'
}
4629
+
4630
# Print the VM's address on stdout and return 0. Returns 1 (printing
# nothing) when `tart ip` fails or its output is not a dotted-quad of
# decimal digits.
_isolation_tart_ip() {
  local vm addr
  vm=$(_isolation_tart_vm_name)
  if ! addr=$(tart ip "$vm" 2>/dev/null); then
    return 1
  fi
  # Shape check only: four dot-separated digit runs.
  if [[ ! "$addr" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
    return 1
  fi
  printf '%s\n' "$addr"
}
4638
+
4639
# Report the VM's state on stdout as exactly one of
#   not-installed | stopped | running | ready
# and always return 0.
#   not-installed — wrong platform, no `tart` command, or VM not listed
#   stopped       — VM listed but no usable IP
#   running       — IP available but the SSH probe fails
#   ready         — IP available and a trivial SSH command succeeds
_isolation_tart_status() {
  local addr login state

  if ! _isolation_tart_check_platform >/dev/null 2>&1 \
     || ! command -v tart >/dev/null 2>&1 \
     || ! _isolation_tart_vm_present; then
    echo "not-installed"
    return 0
  fi

  if ! addr=$(_isolation_tart_ip); then
    echo "stopped"
    return 0
  fi

  # The VM has an address; a non-interactive `true` over SSH with a short
  # connect timeout decides between "running" and "ready".
  login=$(_isolation_tart_ssh_user)
  state="running"
  ssh -o BatchMode=yes -o ConnectTimeout=3 -o StrictHostKeyChecking=no \
      "${login}@${addr}" "true" >/dev/null 2>&1 && state="ready"
  echo "$state"
  return 0
}
4660
+
4661
# init: make sure the VM slot exists by cloning the base image into it.
# Idempotent — when the VM is already listed, no clone is attempted.
# Returns 1 on a failed platform/binary check, otherwise 0 or the status of
# `tart clone`.
_isolation_tart_init() {
  _isolation_tart_check_platform || return 1
  _isolation_tart_check_binary || return 1

  local vm image
  vm=$(_isolation_tart_vm_name)
  _isolation_tart_vm_present && return 0

  image=$(_isolation_tart_base_image)
  tart clone "$image" "$vm"
}
4673
+
4674
# provision: install the runtime dependencies inside the VM over SSH. Each
# package is probed with `brew list` and installed only when the probe fails.
# Requires the VM to already have an IP; returns 1 with an error otherwise.
# The result is the status of the ssh command.
# NOTE(review): the first probe checks `bats` but installs `bats-core` —
# confirm the probe name matches the installed formula.
_isolation_tart_provision() {
  _isolation_tart_check_platform || return 1
  _isolation_tart_check_binary || return 1

  local addr login
  if ! addr=$(_isolation_tart_ip); then
    err "tart provision: VM not running"
    return 1
  fi
  login=$(_isolation_tart_ssh_user)

  ssh -o BatchMode=yes -o StrictHostKeyChecking=no "${login}@${addr}" \
    "brew list bats >/dev/null 2>&1 || brew install bats-core; \
     brew list node >/dev/null 2>&1 || brew install node; \
     brew list bash >/dev/null 2>&1 || brew install bash"
}
4687
+
4688
# exec: run "$@" inside the VM over SSH; the ssh exit status is the result.
# When the VM has no IP yet, it is started in the background with the current
# directory shared under the name `roll`, and its IP is polled for roughly
# 30 seconds; returns 1 when no IP appears in time.
# Arguments are treated as an argv (the same contract as the `none` adapter):
# each one is shell-quoted before being sent, because ssh joins its command
# arguments with spaces and the remote shell would otherwise re-split any
# argument containing whitespace or metacharacters. With no arguments, ssh is
# invoked without a remote command, as before.
_isolation_tart_exec() {
  _isolation_tart_check_platform || return 1
  _isolation_tart_check_binary || return 1
  local name; name=$(_isolation_tart_vm_name)
  local ip
  if ! ip=$(_isolation_tart_ip); then
    # VM stopped — start it in the background with the repo mounted.
    local repo_root; repo_root="$(pwd -P)"
    tart run --dir="roll:${repo_root}" "$name" >/dev/null 2>&1 &
    # Wait up to ~30s for IP to come up.
    local i=0
    while (( i < 30 )); do
      if ip=$(_isolation_tart_ip); then break; fi
      sleep 1
      i=$((i + 1))
    done
    [[ -n "${ip:-}" ]] || { err "tart exec: VM failed to start in 30s"; return 1; }
  fi
  local user; user=$(_isolation_tart_ssh_user)

  if (( $# == 0 )); then
    ssh -o BatchMode=yes -o StrictHostKeyChecking=no "${user}@${ip}"
    return
  fi

  # %q quotes each argument so the remote shell reconstructs the same argv.
  local remote_cmd
  printf -v remote_cmd '%q ' "$@"
  ssh -o BatchMode=yes -o StrictHostKeyChecking=no "${user}@${ip}" "${remote_cmd% }"
}
4711
+
4712
# reset: stop and delete the VM, clone it again from the base image, then
# attempt to provision it. Target: ≤90s as perceived by the caller.
# Stop and delete are best-effort (errors hidden and ignored). The clone is
# issued unconditionally rather than through init, and a failed clone returns
# 1. A provisioning failure is tolerated here; it is expected to show up in a
# later status check.
_isolation_tart_reset() {
  _isolation_tart_check_platform || return 1
  _isolation_tart_check_binary || return 1

  local vm image step
  vm=$(_isolation_tart_vm_name)
  image=$(_isolation_tart_base_image)

  for step in stop delete; do
    tart "$step" "$vm" 2>/dev/null || true
  done

  if ! tart clone "$image" "$vm"; then
    return 1
  fi
  _isolation_tart_provision || true
}
4727
+
4728
# destroy: stop and delete the VM without rebuilding it. Both steps are
# best-effort, so the function returns 0 once the platform and binary checks
# pass.
_isolation_tart_destroy() {
  _isolation_tart_check_platform || return 1
  _isolation_tart_check_binary || return 1

  local vm step
  vm=$(_isolation_tart_vm_name)
  for step in stop delete; do
    tart "$step" "$vm" 2>/dev/null || true
  done
  return 0
}
4737
+
4738
+ # ─── cmd_test ────────────────────────────────────────────────────────────
4739
+ # US-ISO-003: `roll test` — runs the project's test suite through the
4740
+ # isolation dispatcher. The configured `test_isolation.type` determines
4741
+ # where the tests execute (host shell vs Tart VM). When type=tart and
4742
+ # the VM fails to start, the failure surfaces non-zero — no silent
4743
+ # fallback to host, since that would lie about where the tests ran.
4744
+
4745
# Print, as a single machine-readable token, where the test suite would run:
#   host            — isolation type `none`
#   tart:<ip>       — Tart VM is ready/running and its IP could be read
#   tart:<status>   — any other Tart status (or the IP lookup failed)
#   unknown:<type>  — unrecognised isolation type
_cmd_test_where() {
  local kind; kind=$(_isolation_get_type)

  if [[ "$kind" == "none" ]]; then
    echo "host"
    return
  fi
  if [[ "$kind" != "tart" ]]; then
    echo "unknown:${kind}"
    return
  fi

  local state detail
  state=$(_isolation_tart_status)
  detail="$state"
  if [[ "$state" == "ready" || "$state" == "running" ]]; then
    # Prefer the address; keep the status word when it cannot be read.
    detail=$(_isolation_tart_ip 2>/dev/null) || detail="$state"
  fi
  echo "tart:${detail}"
}
4775
+
4776
# Print the `roll test` help text on stdout.
_cmd_test_usage() {
  cat <<'EOF'
Usage: roll test [--where | --reset] [--] [<extra-args>...]

Runs the project's test suite through the isolation adapter chosen in
.roll/local.yaml:

  test_isolation:
    type: none   (default) Direct host execution — same shell as `npm test`.
    type: tart   Inside the Apple-Silicon `roll-dev-test` Tart VM,
                 so tests can't reach the host's launchd / shared
                 roll state. Tart isn't auto-installed; run
                 `brew install cirruslabs/cli/tart` first.

Flags:
  --where      Print where tests will run, then exit (e.g. `host`,
               `tart:192.168.64.5`, `tart:stopped`).
  --reset      Rebuild the isolation environment to a clean baseline.
               type=tart: stop → delete → clone → provision (~90s).
               type=none: prints a note and exits 0 (host is stateless).
               Holds a lockfile under .roll/.iso-reset.lock; concurrent
               `roll test` invocations fast-fail with a clear error.
  --help, -h   Show this help.

Examples:
  roll test                   Run the suite in whatever the config says.
  roll test -- --tier=fast    Forward arguments to npm test.
  roll test --where           Don't run; just report routing.
  roll test --reset           Rebuild the VM (or host no-op).

When type=tart and the VM can't be reached, the command exits non-zero
rather than silently falling back to host execution.
EOF
}

# `roll test` entry point.
#   --help | -h   anywhere before `--`: print help, return 0 (US-ISO-005)
#   --where       print the routing token, return 0
#   --reset       rebuild the isolation environment under the reset lock;
#                 returns the dispatcher's status
#   [--] args...  run `npm test args...` through the isolation dispatcher
# Returns 1 without running anything when a reset lock is present.
cmd_test() {
  # US-ISO-005: `--help` / `-h` anywhere in pre-`--` args shows help and
  # exits 0, so `roll test --reset --help` is a help lookup, not a reset.
  # Args appearing after `--` are forwarded verbatim and not intercepted.
  local _a
  for _a in "$@"; do
    case "$_a" in
      --) break ;;
      --help|-h) set -- --help; break ;;
    esac
  done

  case "${1:-}" in
    --help|-h)
      _cmd_test_usage
      return 0
      ;;
    --where)
      _cmd_test_where
      return 0
      ;;
    --reset)
      # Refuse if another reset is in progress — fast-fail beats blocking
      # on a half-rebuilt VM (US-ISO-004 AC).
      if _isolation_reset_lock_held; then
        err "roll test --reset: another reset is already in progress"
        echo "  lock: $(_isolation_reset_lock_path) (delete manually if stale)" >&2
        return 1
      fi
      _isolation_reset_acquire_lock || {
        err "roll test --reset: failed to acquire reset lock"
        return 1
      }
      # Capture the status and release explicitly instead of relying on a
      # RETURN trap: under `set -e` a failing dispatch would terminate the
      # shell before the function returns, leaving a stale lock behind, and
      # a RETURN trap would also stay installed after this function ends.
      local _rc=0
      _isolation_dispatch reset || _rc=$?
      _isolation_reset_release_lock
      return "$_rc"
      ;;
    --)
      shift
      ;;
  esac

  # Test-execution path. If a reset is in progress, bail rather than racing
  # into a half-rebuilt VM — user can `roll test --where` to inspect state.
  if _isolation_reset_lock_held; then
    err "roll test: a reset is in progress (lock: $(_isolation_reset_lock_path))"
    echo "  re-run once the reset completes, or delete the lockfile if stale" >&2
    return 1
  fi

  # Pass remaining args through to npm test inside the configured adapter.
  _isolation_dispatch exec npm test "$@"
}
4860
+
4420
4861
  # ═══════════════════════════════════════════════════════════════════════════════
4421
4862
  # LOOP — autonomous BACKLOG executor management
4422
4863
  # ═══════════════════════════════════════════════════════════════════════════════
@@ -5142,6 +5583,8 @@ _detect_path_prepend() {
5142
5583
  [[ -d /usr/local/bin ]] && dirs+=("/usr/local/bin")
5143
5584
  [[ -d /opt/local/bin ]] && dirs+=("/opt/local/bin")
5144
5585
  [[ -d "$HOME/.local/bin" ]] && dirs+=("$HOME/.local/bin")
5586
+ # FIX-129: kimi-code installs to ~/.kimi-code/bin (not brew/local), launchd misses it
5587
+ [[ -d "$HOME/.kimi-code/bin" ]] && dirs+=("$HOME/.kimi-code/bin")
5145
5588
  dirs+=("/usr/bin" "/bin" "/usr/sbin" "/sbin")
5146
5589
  for d in "${dirs[@]}"; do
5147
5590
  case ":$seen:" in *":$d:"*) continue ;; esac
@@ -5307,7 +5750,7 @@ set -o pipefail
5307
5750
  # FIX-050: portable PATH assembly — launchd/cron deliver a bare PATH that
5308
5751
  # misses brew-installed tools (tmux, claude, node, …). Iterate candidate
5309
5752
  # dirs; only prepend when present and not already in PATH. Idempotent.
5310
- for _d in /opt/homebrew/bin /usr/local/bin /opt/local/bin "\$HOME/.local/bin"; do
5753
+ for _d in /opt/homebrew/bin /usr/local/bin /opt/local/bin "\$HOME/.local/bin" "\$HOME/.kimi-code/bin"; do
5311
5754
  case ":\$PATH:" in *":\$_d:"*) ;; *) [ -d "\$_d" ] && PATH="\$_d:\$PATH" ;; esac
5312
5755
  done
5313
5756
  export PATH
@@ -5640,6 +6083,10 @@ _phase_begin startup
5640
6083
  _phase_end startup ok
5641
6084
  _phase_begin preflight
5642
6085
  cd "${project_path}" 2>/dev/null || true
6086
+ # US-INFRA-008: ensure git hooks are wired so TCR pre-commit gate can't be bypassed
6087
+ _ensure_hooks_path "${project_path}" 2>/dev/null || true
6088
+ # US-LOOP-056: sync .roll/ meta from roll-meta remote before backlog scan
6089
+ _loop_sync_meta "${project_path}" || true
5643
6090
  # FIX-104: GC stale merged temp branches at cycle entry — before worktree setup
5644
6091
  # and before any early-exit gate (pre-run abort, CI red precheck). The post-claude
5645
6092
  # call site doesn't cover those paths, so merged branches accumulated on origin.
@@ -5951,7 +6398,7 @@ INNER
5951
6398
  # FIX-050: portable PATH assembly before any brew-tool lookup (tmux, caffeinate
5952
6399
  # on some systems, claude). Mirrors the inner script's bootstrap so even when
5953
6400
  # launchd's plist EnvironmentVariables is stale, the runner self-repairs.
5954
- for _d in /opt/homebrew/bin /usr/local/bin /opt/local/bin "\$HOME/.local/bin"; do
6401
+ for _d in /opt/homebrew/bin /usr/local/bin /opt/local/bin "\$HOME/.local/bin" "\$HOME/.kimi-code/bin"; do
5955
6402
  case ":\$PATH:" in *":\$_d:"*) ;; *) [ -d "\$_d" ] && PATH="\$_d:\$PATH" ;; esac
5956
6403
  done
5957
6404
  export PATH
@@ -6038,10 +6485,24 @@ if command -v tmux >/dev/null 2>&1; then
6038
6485
  tmux list-sessions -F "#{session_name}" 2>/dev/null | grep "^roll-loop-${slug}\$" | while read _s; do
6039
6486
  tmux kill-session -t "\$_s" 2>/dev/null || true
6040
6487
  done
6041
- tmux new-session -d -s "\$SESSION" -x 200 -y 50 "bash \"\$INNER_SCRIPT\""
6042
- CYCLE_LOG_RAW="${project_path}/.roll/cycle-logs/.pipe-\$\$.raw"
6488
+ # FIX-132: syntax-check the inner script before spawning the tmux session.
6489
+ # A heredoc quoting regression or mid-cycle regeneration can silently produce
6490
+ # a syntactically broken script; catching it here prevents the session from
6491
+ # starting in a corrupted state and logging a misleading "exited 0, retrying".
6492
+ if ! bash -n "\$INNER_SCRIPT" 2>>"\$LOG"; then
6493
+ echo "[\$(date '+%Y-%m-%dT%H:%M:%S%z')] ABORT: inner script failed syntax check — cycle skipped (see log: \$LOG)" >> "\$LOG"
6494
+ exit 1
6495
+ fi
6496
+ # FIX-130: export ROLL_CYCLE_LOG_RAW BEFORE spawning the tmux session so
6497
+ # the inner script inherits it (env vars are inherited at spawn time, not
6498
+ # retroactively — exporting after new-session means inner never sees it and
6499
+ # _inner_cleanup skips log archiving, leaving only orphan .pipe-*.raw files).
6043
6500
  mkdir -p "${project_path}/.roll/cycle-logs"
6501
+ # Clean orphan .pipe-*.raw files from previous crashed cycles
6502
+ find "${project_path}/.roll/cycle-logs" -name '.pipe-*.raw' -delete 2>/dev/null || true
6503
+ CYCLE_LOG_RAW="${project_path}/.roll/cycle-logs/.pipe-\$\$.raw"
6044
6504
  export ROLL_CYCLE_LOG_RAW="\$CYCLE_LOG_RAW"
6505
+ tmux new-session -d -s "\$SESSION" -x 200 -y 50 "bash \"\$INNER_SCRIPT\""
6045
6506
  tmux pipe-pane -t "\$SESSION" "tee -a \"\$LOG\" >> \"\$ROLL_CYCLE_LOG_RAW\""
6046
6507
  # Auto-attach popup: when not muted, spawn a Terminal.app window attached
6047
6508
  # to the tmux session so the user can watch the loop work in real time.
@@ -6060,7 +6521,9 @@ if command -v tmux >/dev/null 2>&1; then
6060
6521
  # window closes the instant the tmux session ends (cycle_end kills
6061
6522
  # the session) and the entire scrollback disappears with it; the
6062
6523
  # cron-<slug>.log file still has the full transcript as a fallback.
6063
- printf '#!/bin/bash\\ntmux attach -t %s\\necho\\necho "================================================================"\\necho " cycle ended. log: ~/.shared/roll/loop/cron-%s.log"\\necho " press enter to close this window."\\necho "================================================================"\\nread _\\n' \\
6524
+ # FIX-131: after tmux session ends, open the cron log with less so the
6525
+ # user can scroll through the full cycle output instead of seeing nothing.
6526
+ printf '#!/bin/bash\\ntmux attach -t %s 2>/dev/null\\nLOGFILE=~/.shared/roll/loop/cron-%s.log\\necho\\nif [ -f "\$LOGFILE" ]; then\\n echo "================================================================"\\n echo " Cycle ended — showing log (arrows to scroll, q to close)"\\n echo "================================================================"\\n less -R +G "\$LOGFILE"\\nelse\\n echo "================================================================"\\n echo " Cycle ended. Log not found: \$LOGFILE"\\n echo " press enter to close."\\n echo "================================================================"\\n read _\\nfi\\n' \\
6064
6527
  "\$SESSION" "${slug}" > "\$_attach_cmd" 2>/dev/null || true
6065
6528
  chmod +x "\$_attach_cmd" 2>/dev/null || true
6066
6529
  open -g -a Terminal "\$_attach_cmd" >/dev/null 2>&1 || true
@@ -6289,10 +6752,11 @@ cmd_loop() {
6289
6752
  resume) _loop_resume ;;
6290
6753
  reset) _loop_reset ;;
6291
6754
  gc) shift; _loop_gc "$@" ;;
6292
- notify) _notify "${1:-roll}" "${2:-}" ;;
6293
- enforce-tcr) _loop_enforce_tcr "${1:-}" "${2:-}" ;;
6294
- precheck-ci) _loop_precheck_ci ;;
6295
- branches) _loop_branches "$(pwd -P)" ;;
6755
+ notify) _notify "${1:-roll}" "${2:-}" ;;
6756
+ enforce-tcr) _loop_enforce_tcr "${1:-}" "${2:-}" ;;
6757
+ precheck-ci) _loop_precheck_ci ;;
6758
+ hotfix-head-context) _loop_hotfix_head_context "${1:-}" ;;
6759
+ branches) _loop_branches "$(pwd -P)" ;;
6296
6760
  *) cat <<'HELP'
6297
6761
  Usage: roll loop <on|off|now|test|status|monitor|runs|log|story|events|attach|mute|unmute|pause|resume|reset|gc|branches>
6298
6762
 
@@ -6768,11 +7232,33 @@ _loop_attach() {
6768
7232
  exec tmux attach -t "$session"
6769
7233
  }
6770
7234
 
7235
+ # FIX-125: succeed (status 0) when the current process runs inside a loop cycle.
7236
+ # A cycle is recognised by either marker variable being non-empty:
7237
+ # ROLL_LOOP_AGENT (exported by the cycle runner, bin/roll:5736) or
7238
+ # ROLL_CYCLE_LOG_RAW (exported by the outer cycle script, bin/roll:6044).
7239
+ # Callers that touch ${HOME}/Library/LaunchAgents directly (_loop_gc, cmd_offboard)
7240
+ # use it to refuse host-loop mutations from a cycle. Read-only ops are unaffected.
7241
+ _loop_in_cycle() {
7242
+ [[ -n "${ROLL_LOOP_AGENT:-}${ROLL_CYCLE_LOG_RAW:-}" ]]
7243
+ }
7244
+
6771
7245
  # US-LOOP-021: garbage-collect orphan slugs, tmp debris, and expired backups.
6772
7246
  # Usage: _loop_gc [--dry-run] [--keep-days N]
6773
7247
  # Keeps backups/migrated files within N days (default 30).
6774
7248
  # Retention order: ROLL_LOOP_GC_RETENTION_DAYS env > .roll/local.yaml loop_gc.retention_days > 30.
6775
7249
  _loop_gc() {
7250
+ # FIX-125: refuse from inside a loop cycle. Phase 1 below scans/mutates
7251
+ # ${HOME}/Library/LaunchAgents directly (bin/roll:6814,6847) — running it
7252
+ # from a cycle would let one project's tick remove another project's plist
7253
+ # under the host's launchd domain. Read-only ops (status, runs) are
7254
+ # unaffected; only the GC mutator is gated.
7255
+ if _loop_in_cycle; then
7256
+ echo "roll loop gc: refusing — cycle-context tripwire (FIX-125)" >&2
7257
+ echo " This command scans ~/Library/LaunchAgents directly. Running it" >&2
7258
+ echo " from inside a loop cycle is a known host-state corruption path." >&2
7259
+ return 1
7260
+ fi
7261
+
6776
7262
  local dry_run=false
6777
7263
  local keep_days=30
6778
7264
 
@@ -7298,6 +7784,66 @@ _ci_wait() {
7298
7784
  }
7299
7785
 
7300
7786
  # Pre-run CI health check — call before picking up new stories.
7787
+ # US-LOOP-056: sync .roll/ (roll-meta private submodule) before each cycle so
7788
+ # the cycle always runs against the latest backlog. Fail-soft: every path
7789
+ # returns 0, so the cycle continues with stale/existing meta when sync fails.
7790
+ # Argument: $1 = project path (the directory that contains .roll/).
7791
+ # Statuses emitted via _loop_event meta_sync:
7792
+ # ok – fetch + reset --hard origin/main succeeded
7793
+ # stale – fetch or reset failed; existing .roll/ used as fallback
7794
+ # (none) – .roll/ is not its own git checkout or has no origin remote; silent skip
7795
+ #
7796
+ # Env override: ROLL_LOOP_META_SYNC_TIMEOUT (default 15) controls fetch timeout.
7797
+ _loop_sync_meta() {
7798
+ local project_path="$1"
7799
+ local roll_meta="${project_path}/.roll"
7800
+ local timeout_sec="${ROLL_LOOP_META_SYNC_TIMEOUT:-15}"
7801
+ local cid="${CYCLE_ID:-unknown}"
7802
+ local slug="${_LOOP_PROJ_SLUG:-$(_project_slug 2>/dev/null || echo unknown)}"
7803
+ local shared_dir="${_SHARED_ROOT:-$HOME/.shared/roll}/loop"
7804
+ local fail_counter="${shared_dir}/meta-sync-fail-${slug}"
7805
+
7806
+ # Probe the remote only when .roll/ is its own checkout (.git dir or submodule gitlink).
7807
+ # Otherwise `git -C` resolves to the enclosing project repo and the reset below would hit it.
7808
+ local remote_url=""
7809
+ [ -e "${roll_meta}/.git" ] && remote_url=$(git -C "$roll_meta" remote get-url origin 2>/dev/null || echo "")
7810
+ if [ -z "$remote_url" ]; then
7811
+ return 0
7812
+ fi
7813
+
7814
+ # Attempt fetch; bounded by timeout(1) only when that command is installed
7815
+ local _fetch_ok=0
7816
+ if command -v timeout >/dev/null 2>&1; then
7817
+ timeout "$timeout_sec" git -C "$roll_meta" fetch --quiet 2>/dev/null && _fetch_ok=1
7818
+ else
7819
+ git -C "$roll_meta" fetch --quiet 2>/dev/null && _fetch_ok=1
7820
+ fi
7821
+
7822
+ if [ "$_fetch_ok" -eq 1 ]; then
7823
+ if git -C "$roll_meta" reset --hard origin/main --quiet 2>/dev/null; then
7824
+ _loop_event meta_sync "$cid" "ok" "" 2>/dev/null || true
7825
+ # US-LOOP-057: reset consecutive failure counter on success
7826
+ rm -f "$fail_counter" 2>/dev/null || true
7827
+ return 0
7828
+ fi
7829
+ fi
7830
+
7831
+ # Fetch or reset failed — stale .roll/ used; cycle continues
7832
+ _loop_event meta_sync "$cid" "stale" "fetch/reset failed" 2>/dev/null || true
7833
+
7834
+ # US-LOOP-057: bump failure counter (non-numeric content restarts at 0); ALERT after 3 consecutive failures
7835
+ mkdir -p "$shared_dir" 2>/dev/null || true
7836
+ local count=0
7837
+ [ -f "$fail_counter" ] && count=$(cat "$fail_counter" 2>/dev/null || echo 0)
7838
+ case "$count" in ''|*[!0-9]*) count=0 ;; esac; count=$(( 10#$count + 1 ))
7839
+ printf '%s\n' "$count" > "$fail_counter" 2>/dev/null || true
7840
+ if [ "$count" -ge 3 ]; then
7841
+ printf '[%s] roll-meta sync consecutive failures: %d times. Check SSH key / network.\n Last error: fetch/reset failed for %s\n' \
7842
+ "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$count" "$remote_url" >> "${shared_dir}/ALERT-${slug}.md" 2>/dev/null || true
7843
+ fi
7844
+ return 0
7845
+ }
7846
+
7301
7847
  # Refuses to build on a red base (HEAD CI failed). Lenient on unknown states
7302
7848
  # (gh missing, repo unparseable, no runs yet) — the post-build _loop_enforce_ci
7303
7849
  # is the strict gate.
@@ -7330,6 +7876,38 @@ _loop_precheck_ci() {
7330
7876
  run_states=$(echo "$runs" \
7331
7877
  | jq -r '[.[] | "\(.status // "?")/\(.conclusion // "null")"] | unique | join(", ")' \
7332
7878
  2>/dev/null || echo "?")
7879
+
7880
+ # US-LOOP-046/048: check whether hot-fix path is allowed before aborting.
7881
+ # ROLL_LOOP_NO_HEAL=1 or ROLL_LOOP_HEAL_MAX=0 → fall through to original abort.
7882
+ local _heal_max="${ROLL_LOOP_HEAL_MAX:-2}"
7883
+ if [[ "${ROLL_LOOP_NO_HEAL:-}" != "1" ]] && [[ "$_heal_max" -gt 0 ]]; then
7884
+ local _state_file="${_SHARED_ROOT:-$HOME/.shared/roll}/loop/state-${_LOOP_PROJ_SLUG:-$(basename "$PWD")}.yaml"
7885
+ local _heal_key="heal_count_head_${commit:0:8}"
7886
+ local _count=0
7887
+ [[ -f "$_state_file" ]] && _count=$(grep "^${_heal_key}:" "$_state_file" 2>/dev/null | awk '{print $2}' || echo 0)
7888
+ _count=$(( ${_count:-0} + 0 )) # coerce to int
7889
+ if [[ "$_count" -lt "$_heal_max" ]]; then
7890
+ # Increment counter and signal hot-fix path to the agent
7891
+ _count=$(( _count + 1 ))
7892
+ mkdir -p "$(dirname "$_state_file")" 2>/dev/null || true
7893
+ if [[ -f "$_state_file" ]]; then
7894
+ # Update existing key or append
7895
+ if grep -q "^${_heal_key}:" "$_state_file" 2>/dev/null; then
7896
+ local _tmp; _tmp=$(mktemp)
7897
+ grep -v "^${_heal_key}:" "$_state_file" > "$_tmp" 2>/dev/null || true
7898
+ printf '%s: %d\n' "$_heal_key" "$_count" >> "$_tmp"
7899
+ mv "$_tmp" "$_state_file"
7900
+ else
7901
+ printf '%s: %d\n' "$_heal_key" "$_count" >> "$_state_file"
7902
+ fi
7903
+ else
7904
+ printf '%s: %d\n' "$_heal_key" "$_count" > "$_state_file"
7905
+ fi
7906
+ # Exit 2 signals the agent: CI is red, hot-fix path is available
7907
+ return 2
7908
+ fi
7909
+ fi
7910
+
7333
7911
  err "$(msg loop.pre_run_ci_check_head_ci ${short})"
7334
7912
  mkdir -p "$(dirname "$_LOOP_ALERT")"
7335
7913
  cat > "$_LOOP_ALERT" << EOF
@@ -7352,6 +7930,62 @@ EOF
7352
7930
  return 0
7353
7931
  }
7354
7932
 
7933
+ # US-LOOP-047: hot-fix context factory for HEAD CI failures. $1 = commit (default: HEAD).
7934
+ # Writes recent commits, the commit's --stat summary and up to 200 lines of failed-run
7935
+ # logs to /tmp/roll-heal-head-<sha8>.log; prints the path. Returns 1 on no commit or unwritable log.
7936
+ _loop_hotfix_head_context() {
7937
+ local commit="${1:-$(git rev-parse HEAD 2>/dev/null)}"
7938
+ [[ -z "$commit" ]] && return 1
7939
+ local short="${commit:0:8}"
7940
+ local outfile="/tmp/roll-heal-head-${short}.log"
7941
+ local slug; _gh_resolve slug || slug="unknown"
7942
+
7943
+ {
7944
+ printf '=== CI Hot-fix Context: HEAD %s ===\n\n' "$short"
7945
+ printf '--- Recent commits ---\n'
7946
+ git log --oneline -5 "$commit" 2>/dev/null || true
7947
+ printf '\n--- Diff of last commit ---\n'
7948
+ git show --stat "$commit" 2>/dev/null | head -40 || true
7949
+ printf '\n--- CI failure logs (head 200 lines) ---\n'
7950
+ local run_id
7951
+ run_id=$(gh -R "$slug" run list --commit "$commit" \
7952
+ --json databaseId,conclusion -L 5 2>/dev/null \
7953
+ | jq -r '.[] | select(.conclusion=="failure") | .databaseId' 2>/dev/null | head -1)
7954
+ if [[ -n "$run_id" ]]; then
7955
+ gh -R "$slug" run view --log-failed "$run_id" 2>/dev/null | head -200 || true
7956
+ else
7957
+ printf '(no failed run found for commit %s)\n' "$short"
7958
+ fi
7959
+ } > "$outfile" 2>/dev/null || return 1
7960
+ printf '%s\n' "$outfile"
7961
+ return 0
7962
+ }
7963
+
7964
+ # US-LOOP-050: PR hot-fix entry point.
7965
+ # Writes the failed-run log (first 200 lines) of the first failing CI run listed for the PR's
7966
+ # head branch (never another branch's run) to /tmp/roll-heal-pr-<n>.log and prints that path.
7967
+ # Usage: _loop_hot_fix_pr <pr_number> (returns 1 when the number is missing or the repo is unresolved)
7968
+ _loop_hot_fix_pr() {
7969
+ local pr_num="$1"
7970
+ [[ -z "$pr_num" ]] && return 1
7971
+ local slug; _gh_resolve slug || return 1
7972
+ local outfile="/tmp/roll-heal-pr-${pr_num}.log"
7973
+ local head_ref run_id=""
7974
+ head_ref=$(gh -R "$slug" pr view "$pr_num" --json headRefName --jq '.headRefName' 2>/dev/null || true)
7975
+ [[ -n "$head_ref" ]] && run_id=$(gh -R "$slug" run list --branch "$head_ref" --json databaseId,conclusion -L 20 2>/dev/null \
7976
+ | jq -r '.[] | select(.conclusion=="failure") | .databaseId' 2>/dev/null | head -1)
7977
+ {
7978
+ printf '=== PR #%s CI Hot-fix Context ===\n\n' "$pr_num"
7979
+ if [[ -n "$run_id" ]]; then
7980
+ gh -R "$slug" run view --log-failed "$run_id" 2>/dev/null | head -200 || true
7981
+ else
7982
+ printf '(no failed run found for PR #%s)\n' "$pr_num"
7983
+ fi
7984
+ } > "$outfile" 2>/dev/null
7985
+ printf '%s\n' "$outfile"
7986
+ return 0
7987
+ }
7988
+
7355
7989
  # _loop_diagnose_open_prs <slug>
7356
7990
  # Appended to ALERT when CI is red on HEAD.
7357
7991
  # For each open PR targeting main: lists CI failing tests + changed files,
@@ -7586,7 +8220,14 @@ _loop_pr_classify() {
7586
8220
  local mergeable="${4:-}"
7587
8221
 
7588
8222
  case "$head_ref" in
7589
- loop/*) echo "loop_self"; return 0 ;;
8223
+ loop/*)
8224
+ # US-LOOP-049: loop/* PRs with CI failure get their own classification
8225
+ # so _loop_pr_inbox can route them to the PR hot-fix path.
8226
+ if [[ "$ci_state" == "failure" ]]; then
8227
+ echo "loop_self_ci_red"; return 0
8228
+ fi
8229
+ echo "loop_self"; return 0
8230
+ ;;
7590
8231
  esac
7591
8232
 
7592
8233
  case "$human_review" in
@@ -9854,6 +10495,7 @@ main() {
9854
10495
  doctor) cmd_doctor "$@" ;;
9855
10496
  review-pr) cmd_review_pr "$@" ;;
9856
10497
  slides) cmd_slides "$@" ;;
10498
+ test) cmd_test "$@" ;;
9857
10499
  prices) cmd_prices "$@" ;;
9858
10500
  changelog) cmd_changelog "$@" ;;
9859
10501
  version|--version|-v) echo "roll v${VERSION}" ;;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@seanyao/roll",
3
- "version": "2026.527.1",
3
+ "version": "2026.528.2",
4
4
  "description": "Roll — Roll out features with AI agents",
5
5
  "scripts": {
6
6
  "test": "bash tests/run.sh"
@@ -121,7 +121,7 @@ Document structure (two-layer separation):
121
121
  3. **FIX / IDEA detail files use ID-prefixed filenames**: `.roll/features/<epic>/FIX-097.md`, not `.roll/features/<epic>/some-descriptive-slug.md`. Reason: a single FIX is one card, not a long-lived feature; the ID is the most stable handle, descriptive slugs date quickly and break links. US can keep feature-slug naming (US lives inside a multi-Story feature file). Quick lookup: `ls .roll/features/<epic>/FIX-*.md` finds all bugs in that area without grepping content.
122
122
  4. .roll/backlog.md only contains index rows (one row per US), **do not write** AC / Files / Notes
123
123
  5. Domain model files go in `.roll/domain/` — create on first greenfield design, update incrementally
124
- 6. **Do not** write to `~/.kimi/` or any global config directory
124
+ 6. **Do not** write to `~/.kimi/`, `~/.kimi-code/`, or any global config directory
125
125
 
126
126
  **File path resolution order:**
127
127
  1. Determine Feature ownership (based on the requirement domain: compiler / ingest / qa / ...)
@@ -125,16 +125,26 @@ readers. The rule mirrors the gate in Step 2.
125
125
  ### Step 1.5 — Pre-run CI Health Check
126
126
 
127
127
  Call `roll loop precheck-ci` before scanning BACKLOG. This is a **defensive gate**
128
- against building on a broken base if the most recent commit on the branch
129
- has red CI, the loop must not stack new commits on top (which would create the
130
- exact stuck-red state FIX-026 traces to).
131
-
132
- - HEAD CI green / pending / no-run-yet proceed to Step 2.
133
- - HEAD CI red write ALERT, **do not pick up any stories this cycle**,
134
- exit cleanly. The next cycle will retry; the human must fix CI manually
135
- (typically by reverting or pushing a green commit) before the loop resumes.
136
- - `gh` missing or repo unparseable → graceful skip (`roll loop precheck-ci`
137
- returns 0); the post-build `_loop_enforce_ci` remains the strict gate.
128
+ against building on a broken base. Check the **exit code** and route accordingly:
129
+
130
+ | Exit code | Meaning | Action |
131
+ |-----------|---------|--------|
132
+ | `0` | CI green / pending / unknown | Proceed to Step 1.6 (PR Inbox) and Step 2 (BACKLOG scan) |
133
+ | `1` | CI red AND heal exhausted or `ROLL_LOOP_NO_HEAL=1` | ALERT already written; exit cleanly this cycle |
134
+ | `2` | CI red AND heal attempt allowed (US-LOOP-046) | **Hot-fix path** — skip BACKLOG, fix CI instead (see below) |
135
+
136
+ `gh` missing or repo unparseable → `precheck-ci` returns `0`; graceful skip.
137
+
138
+ **Hot-fix path (exit code 2) — US-LOOP-046:**
139
+
140
+ Do NOT pick any BACKLOG stories this cycle. Instead:
141
+
142
+ 1. Capture context: `roll loop hotfix-head-context` → prints path to context log
143
+ 2. Invoke `Skill("roll-fix")` with brief:
144
+ `"CI red on HEAD. Failing run logs at <context-log-path>. Diagnose root cause, fix via TCR, commit, push. Do NOT change BACKLOG status."`
145
+ 3. After `roll-fix` completes, re-run `roll ci --wait` to verify the fix
146
+ 4. If CI is still red: run `roll loop precheck-ci` again; if it returns `1` (heal exhausted),
147
+ exit cleanly — ALERT was already written by the precheck
138
148
 
139
149
  ### Step 1.6 — PR Inbox (US-AUTO-034)
140
150