@seanyao/roll 2.603.1 → 2.604.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +40 -11
- package/bin/roll +84 -757
- package/lib/changelog_audit.py +149 -0
- package/lib/changelog_generate.py +41 -23
- package/lib/consistency_check.py +409 -0
- package/lib/i18n/consistency.sh +8 -0
- package/lib/prices/snapshot-2026-05-22.json +1 -7
- package/lib/prices/snapshot-2026-05-23-deepseek.json +0 -2
- package/lib/prices/snapshot-2026-06-02-kimi.json +0 -1
- package/lib/prices_fetcher.py +1 -20
- package/lib/roll-loop-status.py +1 -1
- package/package.json +1 -1
- package/lib/__pycache__/changelog_generate.cpython-314.pyc +0 -0
- package/lib/__pycache__/github_sync.cpython-314.pyc +0 -0
- package/lib/__pycache__/loop-fmt.cpython-314.pyc +0 -0
- package/lib/__pycache__/loop_result_eval.cpython-314.pyc +0 -0
- package/lib/__pycache__/loop_unstick.cpython-314.pyc +0 -0
- package/lib/__pycache__/model_prices.cpython-314.pyc +0 -0
- package/lib/__pycache__/prices_fetcher.cpython-314.pyc +0 -0
- package/lib/__pycache__/roll-home.cpython-314.pyc +0 -0
- package/lib/__pycache__/roll-loop-status.cpython-314.pyc +0 -0
- package/lib/__pycache__/roll_git.cpython-314.pyc +0 -0
- package/lib/__pycache__/roll_render.cpython-314.pyc +0 -0
- package/lib/__pycache__/slides-render.cpython-314.pyc +0 -0
- package/lib/agent_usage/__pycache__/__init__.cpython-314.pyc +0 -0
- package/lib/agent_usage/__pycache__/gemini.cpython-314.pyc +0 -0
- package/lib/agent_usage/__pycache__/kimi.cpython-314.pyc +0 -0
- package/lib/agent_usage/__pycache__/openai.cpython-314.pyc +0 -0
- package/lib/agent_usage/__pycache__/pi.cpython-314.pyc +0 -0
- package/lib/agent_usage/__pycache__/pi_emit.cpython-314.pyc +0 -0
- package/lib/agent_usage/__pycache__/qwen.cpython-314.pyc +0 -0
package/bin/roll
CHANGED
|
@@ -4,7 +4,7 @@ set -euo pipefail
|
|
|
4
4
|
# Roll — AI Agent Convention Manager
|
|
5
5
|
# Single source of truth for how all AI coding agents behave.
|
|
6
6
|
|
|
7
|
-
VERSION="2.603.1"
|
|
7
|
+
VERSION="2.604.1"
|
|
8
8
|
ROLL_HOME="${ROLL_HOME:-${HOME}/.roll}"
|
|
9
9
|
ROLL_CONFIG="${ROLL_HOME}/config.yaml"
|
|
10
10
|
ROLL_GLOBAL="${ROLL_HOME}/conventions/global"
|
|
@@ -4046,6 +4046,11 @@ _peer_call() {
|
|
|
4046
4046
|
_watchdog_pid=$!
|
|
4047
4047
|
wait "$_peer_pid" 2>/dev/null || _peer_exit=$?
|
|
4048
4048
|
# Cancel watchdog if agent finished on time.
|
|
4049
|
+
# FIX-181: kill children (sleep) first so they cannot outlive the
|
|
4050
|
+
# watchdog and later hit a reused PID, then kill the watchdog itself.
|
|
4051
|
+
if command -v pkill >/dev/null 2>&1; then
|
|
4052
|
+
pkill -P "$_watchdog_pid" 2>/dev/null || true
|
|
4053
|
+
fi
|
|
4049
4054
|
kill "$_watchdog_pid" 2>/dev/null || true
|
|
4050
4055
|
wait "$_watchdog_pid" 2>/dev/null || true
|
|
4051
4056
|
output="$(cat "$_out" 2>/dev/null || true)"
|
|
@@ -5658,7 +5663,7 @@ cmd_changelog() {
|
|
|
5658
5663
|
esac
|
|
5659
5664
|
done
|
|
5660
5665
|
local raw
|
|
5661
|
-
raw=$(python3 "${ROLL_PKG_DIR}/lib/changelog_generate.py" "${pyargs[@]}") || return 1
|
|
5666
|
+
raw=$(python3 "${ROLL_PKG_DIR}/lib/changelog_generate.py" ${pyargs[@]+"${pyargs[@]}"}) || return 1
|
|
5662
5667
|
if [ "$is_json" = 1 ]; then printf '%s\n' "$raw"; return 0; fi
|
|
5663
5668
|
local final="$raw"
|
|
5664
5669
|
if [ "$want_ai" = 1 ]; then
|
|
@@ -5697,6 +5702,34 @@ EOF
|
|
|
5697
5702
|
esac
|
|
5698
5703
|
}
|
|
5699
5704
|
|
|
5705
|
+
# ─── roll consistency check — unified consistency orchestrator (US-CONSIST-001) ──
|
|
5706
|
+
# cmd_consistency [check] [ARGS...] | help | -h | --help
# Entry point for `roll consistency`.
#   check (default)   Runs ${ROLL_PKG_DIR}/lib/consistency_check.py with python3,
#                     forwarding every remaining argument unchanged.
#   help, -h, --help  Prints usage.
# A leading option other than -h/--help is treated as an option of the default
# `check` subcommand, so `roll consistency --json` behaves exactly like
# `roll consistency check --json` instead of failing as an unknown subcommand.
# Returns: the checker's exit status for `check`, 0 for help, 1 for an
# unknown subcommand (reported through err/msg).
cmd_consistency() {
  local subcmd="check"
  case "${1:-}" in
    "")
      # No subcommand (or an empty one): fall back to `check`. Drop the empty
      # placeholder if one was actually passed.
      if [ "$#" -gt 0 ]; then shift; fi
      ;;
    --help|-h)
      subcmd="help"
      shift
      ;;
    -*)
      # Option for the default `check`; keep "$@" intact so it is forwarded.
      ;;
    *)
      subcmd="$1"
      shift
      ;;
  esac

  case "$subcmd" in
    check)
      python3 "${ROLL_PKG_DIR}/lib/consistency_check.py" "$@"
      ;;
    help)
      # Quoted delimiter: the usage text is literal, nothing is expanded.
      cat <<'EOF'
Usage: roll consistency <subcommand>

  check [--json] [--project-dir DIR]   逐维度跑一致性检查
      Run checks across five dimensions (code, docs, i18n, tests, site)
      and produce a structured pass/gap report.

  roll consistency check          # human-readable report
  roll consistency check --json   # machine-readable JSON
EOF
      ;;
    *)
      err "$(msg consistency.unknown_sub "$subcmd")"
      err "Try: roll consistency check"
      return 1
      ;;
  esac
}
|
|
5732
|
+
|
|
5700
5733
|
# ─── roll config — unified read/list/set for loop schedule keys (US-LOOP-033) ──
|
|
5701
5734
|
#
|
|
5702
5735
|
# One interactive entry point so users don't have to remember whether a key
|
|
@@ -6192,14 +6225,14 @@ cmd_review_pr() {
|
|
|
6192
6225
|
|
|
6193
6226
|
local slug; slug=$(_gh_repo_slug) || { err "Not a GitHub repo — review-pr requires GitHub remote"; return 1; }
|
|
6194
6227
|
|
|
6195
|
-
local pr_json
|
|
6196
|
-
pr_json=$(gh -R "$slug" pr view "$pr_number" --json title,body
|
|
6228
|
+
local pr_json diff
|
|
6229
|
+
pr_json=$(gh -R "$slug" pr view "$pr_number" --json title,body 2>&1) \
|
|
6197
6230
|
|| { err "gh pr view failed: ${pr_json}"; return 1; }
|
|
6231
|
+
diff=$(gh -R "$slug" pr diff "$pr_number" 2>/dev/null) || true
|
|
6198
6232
|
|
|
6199
6233
|
local title body diff
|
|
6200
6234
|
title=$(echo "$pr_json" | jq -r '.title // ""')
|
|
6201
6235
|
body=$(echo "$pr_json" | jq -r '.body // ""')
|
|
6202
|
-
diff=$(echo "$pr_json" | jq -r '.diff // ""')
|
|
6203
6236
|
|
|
6204
6237
|
if echo "$body" | grep -qF '[skip-ai-review]'; then
|
|
6205
6238
|
gh -R "$slug" pr review "$pr_number" --approve -b "Auto-approved: [skip-ai-review] detected" 2>/dev/null || true
|
|
@@ -8301,96 +8334,6 @@ PRRUNNER
|
|
|
8301
8334
|
chmod +x "$script_path"
|
|
8302
8335
|
}
|
|
8303
8336
|
|
|
8304
|
-
# _write_ci_loop_runner_script <script_path> <project_path> <roll_bin> <log_path>
|
|
8305
|
-
# US-AUTO-045 Phase 2: the script the com.roll.ci.<slug> launchd plist runs
|
|
8306
|
-
# every 5 min. Mirrors _write_pr_loop_runner_script — lightweight (no agent,
|
|
8307
|
-
# no tmux): portable PATH, a single-flight re-entry lock (pid+ts, 15-min
|
|
8308
|
-
# staleness so a crashed pass self-heals next tick), then drives the _ci_scan
|
|
8309
|
-
# orchestrator via the `roll _ci_scan` dispatch.
|
|
8310
|
-
_write_ci_loop_runner_script() {
|
|
8311
|
-
local script_path="$1" project_path="$2" roll_bin="$3" log_path="$4"
|
|
8312
|
-
mkdir -p "$(dirname "$script_path")"
|
|
8313
|
-
local lock="${project_path}/.roll/loop/.ci-loop.lock"
|
|
8314
|
-
cat > "$script_path" << CIRUNNER
|
|
8315
|
-
#!/bin/bash -l
|
|
8316
|
-
set -o pipefail
|
|
8317
|
-
# Portable PATH: launchd delivers a bare PATH missing brew/local tools. Idempotent.
|
|
8318
|
-
for _d in /opt/homebrew/bin /usr/local/bin /opt/local/bin "\$HOME/.local/bin" "\$HOME/.kimi-code/bin"; do
|
|
8319
|
-
case ":\$PATH:" in *":\$_d:"*) ;; *) [ -d "\$_d" ] && PATH="\$_d:\$PATH" ;; esac
|
|
8320
|
-
done
|
|
8321
|
-
export PATH
|
|
8322
|
-
# Single-flight re-entry guard: one CI-loop pass at a time. 5-min cadence;
|
|
8323
|
-
# 15-min (900s) staleness so a crashed/hung pass self-heals on the next tick.
|
|
8324
|
-
LOCK="${lock}"
|
|
8325
|
-
mkdir -p "\$(dirname "\$LOCK")"
|
|
8326
|
-
if [ -f "\$LOCK" ]; then
|
|
8327
|
-
_pp=""; _pt=""
|
|
8328
|
-
IFS=: read -r _pp _pt < "\$LOCK" 2>/dev/null || true
|
|
8329
|
-
_now=\$(date -u +%s)
|
|
8330
|
-
if [ -n "\$_pp" ] && [ -n "\$_pt" ] && kill -0 "\$_pp" 2>/dev/null && [ "\$((_now - _pt))" -lt 900 ]; then
|
|
8331
|
-
exit 0
|
|
8332
|
-
fi
|
|
8333
|
-
rm -f "\$LOCK"
|
|
8334
|
-
fi
|
|
8335
|
-
printf '%s:%s\n' "\$\$" "\$(date -u +%s)" > "\$LOCK"
|
|
8336
|
-
trap 'rm -f "\$LOCK"' EXIT
|
|
8337
|
-
cd "${project_path}" || exit 0
|
|
8338
|
-
bash "${roll_bin}" _ci_scan >> "${log_path}" 2>&1 || true
|
|
8339
|
-
CIRUNNER
|
|
8340
|
-
chmod +x "$script_path"
|
|
8341
|
-
}
|
|
8342
|
-
|
|
8343
|
-
# _write_alert_loop_runner_script <script_path> <project_path> <roll_bin> <log_path>
|
|
8344
|
-
# US-AUTO-046 Phase 2: the script the com.roll.alert.<slug> launchd plist runs
|
|
8345
|
-
# every 1 min. Mirrors _write_ci_loop_runner_script — lightweight (no agent,
|
|
8346
|
-
# no tmux): portable PATH, a single-flight re-entry lock (pid+ts), then drives
|
|
8347
|
-
# the Phase-1 _alert_dispatch consumer via the `roll _alert_dispatch` dispatch.
|
|
8348
|
-
# _alert_dispatch reads $_LOOP_ALERT, parses + notifies + records to
|
|
8349
|
-
# alert-log.jsonl, then rotates the file. Staleness is 180s (3 ticks at the
|
|
8350
|
-
# 1-min cadence) so a crashed/hung pass self-heals quickly.
|
|
8351
|
-
_write_alert_loop_runner_script() {
|
|
8352
|
-
local script_path="$1" project_path="$2" roll_bin="$3" log_path="$4"
|
|
8353
|
-
mkdir -p "$(dirname "$script_path")"
|
|
8354
|
-
local lock="${project_path}/.roll/loop/.alert-loop.lock"
|
|
8355
|
-
local slug; slug=$(_project_slug "${project_path}")
|
|
8356
|
-
cat > "$script_path" << ALERTRUNNER
|
|
8357
|
-
#!/bin/bash -l
|
|
8358
|
-
set -o pipefail
|
|
8359
|
-
# Portable PATH: launchd delivers a bare PATH missing brew/local tools. Idempotent.
|
|
8360
|
-
for _d in /opt/homebrew/bin /usr/local/bin /opt/local/bin "\$HOME/.local/bin" "\$HOME/.kimi-code/bin"; do
|
|
8361
|
-
case ":\$PATH:" in *":\$_d:"*) ;; *) [ -d "\$_d" ] && PATH="\$_d:\$PATH" ;; esac
|
|
8362
|
-
done
|
|
8363
|
-
export PATH
|
|
8364
|
-
# Single-flight re-entry guard: one alert-loop pass at a time. 1-min cadence;
|
|
8365
|
-
# 180s staleness so a crashed/hung pass self-heals within a few ticks.
|
|
8366
|
-
LOCK="${lock}"
|
|
8367
|
-
mkdir -p "\$(dirname "\$LOCK")"
|
|
8368
|
-
if [ -f "\$LOCK" ]; then
|
|
8369
|
-
_pp=""; _pt=""
|
|
8370
|
-
IFS=: read -r _pp _pt < "\$LOCK" 2>/dev/null || true
|
|
8371
|
-
_now=\$(date -u +%s)
|
|
8372
|
-
if [ -n "\$_pp" ] && [ -n "\$_pt" ] && kill -0 "\$_pp" 2>/dev/null && [ "\$((_now - _pt))" -lt 180 ]; then
|
|
8373
|
-
exit 0
|
|
8374
|
-
fi
|
|
8375
|
-
rm -f "\$LOCK"
|
|
8376
|
-
fi
|
|
8377
|
-
printf '%s:%s\n' "\$\$" "\$(date -u +%s)" > "\$LOCK"
|
|
8378
|
-
trap 'rm -f "\$LOCK"' EXIT
|
|
8379
|
-
cd "${project_path}" || exit 0
|
|
8380
|
-
# FIX-171: bake the project-local runtime dir directly; do not rely on
|
|
8381
|
-
# _loop_runtime_dir which may fail to resolve in fresh shells. Set
|
|
8382
|
-
# _LOOP_ALERT so the dispatched roll reads the project-local ALERT file,
|
|
8383
|
-
# but do not override an externally-supplied value (test sandboxes).
|
|
8384
|
-
_LOOP_RT_DIR="${project_path}/.roll/loop"
|
|
8385
|
-
if [ -d "\$_LOOP_RT_DIR" ]; then
|
|
8386
|
-
: "\${_LOOP_ALERT:=\${_LOOP_RT_DIR}/ALERT-${slug}.md}"
|
|
8387
|
-
export _LOOP_ALERT
|
|
8388
|
-
fi
|
|
8389
|
-
bash "${roll_bin}" _alert_dispatch >> "${log_path}" 2>&1 || true
|
|
8390
|
-
ALERTRUNNER
|
|
8391
|
-
chmod +x "$script_path"
|
|
8392
|
-
}
|
|
8393
|
-
|
|
8394
8337
|
# Like _write_runner_script but prepends an active window guard.
|
|
8395
8338
|
# Silently exits when current hour is outside [active_start, active_end).
|
|
8396
8339
|
# When tmux is available, wraps the inner command in a detached tmux session
|
|
@@ -9733,15 +9676,11 @@ _install_launchd_plists() {
|
|
|
9733
9676
|
|
|
9734
9677
|
# US-AUTO-044: "pr" is the 4th service — a 5-min PR Loop (period=5, empty hour
|
|
9735
9678
|
# → StartInterval=300). No skill (it drives _loop_pr_inbox, not an agent).
|
|
9736
|
-
|
|
9737
|
-
|
|
9738
|
-
|
|
9739
|
-
|
|
9740
|
-
local
|
|
9741
|
-
local skill_names=("roll-loop" "roll-.dream" "roll-brief" "" "" "")
|
|
9742
|
-
local periods=("$loop_period" "60" "60" "5" "5" "1")
|
|
9743
|
-
local offsets=("$loop_offset" "$dream_minute" "$brief_minute" "0" "0" "0")
|
|
9744
|
-
local hours=("" "$dream_hour" "$brief_hour" "" "" "")
|
|
9679
|
+
local services=("loop" "dream" "pr")
|
|
9680
|
+
local skill_names=("roll-loop" "roll-.dream" "")
|
|
9681
|
+
local periods=("$loop_period" "60" "5")
|
|
9682
|
+
local offsets=("$loop_offset" "$dream_minute" "0")
|
|
9683
|
+
local hours=("" "$dream_hour" "")
|
|
9745
9684
|
|
|
9746
9685
|
local updated=0
|
|
9747
9686
|
local slug; slug=$(_project_slug "$project_path")
|
|
@@ -9774,22 +9713,8 @@ _install_launchd_plists() {
|
|
|
9774
9713
|
local pr_log="${project_path}/.roll/loop/pr.log"
|
|
9775
9714
|
mkdir -p "${project_path}/.roll/loop"
|
|
9776
9715
|
_write_pr_loop_runner_script "$runner" "$project_path" "${ROLL_PKG_DIR}/bin/roll" "$pr_log"
|
|
9777
|
-
elif [[ "$svc" == "ci" ]]; then
|
|
9778
|
-
# US-AUTO-045 Phase 2: lightweight CI Loop runner — drives _ci_scan every
|
|
9779
|
-
# 5 min (no agent, no tmux). Records run timing, auto-reruns transient
|
|
9780
|
-
# failures, and surfaces flaky / degradation stories.
|
|
9781
|
-
local ci_log="${project_path}/.roll/loop/ci.log"
|
|
9782
|
-
mkdir -p "${project_path}/.roll/loop"
|
|
9783
|
-
_write_ci_loop_runner_script "$runner" "$project_path" "${ROLL_PKG_DIR}/bin/roll" "$ci_log"
|
|
9784
|
-
elif [[ "$svc" == "alert" ]]; then
|
|
9785
|
-
# US-AUTO-046 Phase 2: lightweight Alert Loop runner — drives _alert_dispatch
|
|
9786
|
-
# every 1 min (no agent, no tmux). Consumes _LOOP_ALERT: parse → notify →
|
|
9787
|
-
# record to alert-log.jsonl → rotate the file.
|
|
9788
|
-
local alert_log="${project_path}/.roll/loop/alert.log"
|
|
9789
|
-
mkdir -p "${project_path}/.roll/loop"
|
|
9790
|
-
_write_alert_loop_runner_script "$runner" "$project_path" "${ROLL_PKG_DIR}/bin/roll" "$alert_log"
|
|
9791
9716
|
else
|
|
9792
|
-
#
|
|
9717
|
+
# dream cron log is project-local, mirroring loop (FIX-139).
|
|
9793
9718
|
local log="${project_path}/.roll/${svc}/cron.log"
|
|
9794
9719
|
mkdir -p "${project_path}/.roll/${svc}"
|
|
9795
9720
|
_write_runner_script "$runner" "$project_path" "cd \"${project_path}\" && ${cmd}" "$log"
|
|
@@ -9987,7 +9912,7 @@ _loop_on() {
|
|
|
9987
9912
|
# does not disturb the overrides DB.
|
|
9988
9913
|
local uid; uid=$(id -u)
|
|
9989
9914
|
local all_loaded=true
|
|
9990
|
-
for svc in loop dream
|
|
9915
|
+
for svc in loop dream pr; do
|
|
9991
9916
|
local label; label=$(_launchd_label "$svc" "$project_path")
|
|
9992
9917
|
local plist; plist=$(_launchd_plist_path "$svc" "$project_path")
|
|
9993
9918
|
if ! _launchd_is_loaded "$label"; then
|
|
@@ -10054,7 +9979,7 @@ _loop_off() {
|
|
|
10054
9979
|
if [[ "$(uname)" == "Darwin" ]]; then
|
|
10055
9980
|
local any_loaded=false
|
|
10056
9981
|
local _skip_off; _launchd_should_skip_registry && _skip_off=1 || _skip_off=0
|
|
10057
|
-
for svc in loop dream
|
|
9982
|
+
for svc in loop dream pr; do
|
|
10058
9983
|
local label; label=$(_launchd_label "$svc" "$project_path")
|
|
10059
9984
|
if _launchd_is_loaded "$label"; then
|
|
10060
9985
|
any_loaded=true
|
|
@@ -10069,7 +9994,7 @@ _loop_off() {
|
|
|
10069
9994
|
fi
|
|
10070
9995
|
local slug; slug=$(_project_slug "$project_path")
|
|
10071
9996
|
local uid; uid=$(id -u)
|
|
10072
|
-
for svc in loop dream
|
|
9997
|
+
for svc in loop dream pr; do
|
|
10073
9998
|
rm -f "${_SHARED_ROOT}/${svc}/run-${slug}.sh"
|
|
10074
9999
|
# FIX-081: reverse the FIX-059 auto-bootstrap guard. `_install_launchd_plists`
|
|
10075
10000
|
# writes `launchctl disable gui/<UID>/<label>` for every brand-new plist
|
|
@@ -10405,7 +10330,7 @@ _legacy_loop_status() {
|
|
|
10405
10330
|
echo ""
|
|
10406
10331
|
if [[ "$(uname)" == "Darwin" ]]; then
|
|
10407
10332
|
echo -e " Services Agent: ${CYAN}${agent}${NC}"
|
|
10408
|
-
for svc in loop dream
|
|
10333
|
+
for svc in loop dream pr; do
|
|
10409
10334
|
local state; state=$(_launchd_svc_state "$svc" "$project_path")
|
|
10410
10335
|
if [[ "$svc" == "loop" ]] && $_is_paused; then
|
|
10411
10336
|
local _paused_at; _paused_at=$(grep '^paused_at:' "$_LOOP_STATE" 2>/dev/null | awk '{print $2}' | tr -d '"')
|
|
@@ -10419,7 +10344,7 @@ _legacy_loop_status() {
|
|
|
10419
10344
|
echo -e " ${YELLOW}loop ⏸ paused${NC}${_dur} run: roll loop resume"
|
|
10420
10345
|
else
|
|
10421
10346
|
local _tick_age=""
|
|
10422
|
-
case "$svc" in pr
|
|
10347
|
+
case "$svc" in pr)
|
|
10423
10348
|
_tick_age=$(_loop_tick_age "$svc")
|
|
10424
10349
|
[ -n "$_tick_age" ] && _tick_age=" tick ${_tick_age}"
|
|
10425
10350
|
esac
|
|
@@ -11601,7 +11526,7 @@ _loop_pr_heal_self() {
|
|
|
11601
11526
|
|
|
11602
11527
|
local agent; agent="$(_project_agent 2>/dev/null)"; agent="${agent:-claude}"
|
|
11603
11528
|
|
|
11604
|
-
( echo "$BASHPID" > "$lock"
|
|
11529
|
+
( echo "${BASHPID:-$$}" > "$lock"
|
|
11605
11530
|
_loop_pr_do_heal "$num" "$head_ref" "$slug" "$agent" >/dev/null 2>&1
|
|
11606
11531
|
rm -f "$lock"
|
|
11607
11532
|
) &
|
|
@@ -11828,54 +11753,25 @@ _loop_is_roll_meta_story() {
|
|
|
11828
11753
|
|
|
11829
11754
|
# _loop_pr_classify <head_ref> <human_review_state> <ci_state> <mergeable_state>
|
|
11830
11755
|
# Prints one of:
|
|
11831
|
-
#
|
|
11832
|
-
#
|
|
11833
|
-
#
|
|
11834
|
-
#
|
|
11835
|
-
# eligible
|
|
11836
|
-
# Exit 0 always — callers parse the printed token.
|
|
11756
|
+
# ci_red — CI failed → heal
|
|
11757
|
+
# stale — needs rebase / conflicting / behind
|
|
11758
|
+
# ready — CI green + clean → merge
|
|
11759
|
+
# Human review intentionally irrelevant — CI is the only gate.
|
|
11837
11760
|
_loop_pr_classify() {
  # The four-argument positional contract is kept for callers; only the CI
  # state and the merge state influence the printed verdict.
  local head_ref="${1:-}" human_review="${2:-}"
  local ci_state="${3:-}" mergeable="${4:-}"

  # A branch that is behind or conflicting is reported as stale first,
  # regardless of the CI state.
  if [[ "$mergeable" == "BEHIND" || "$mergeable" == "DIRTY" || "$mergeable" == "CONFLICTING" ]]; then
    echo "stale"
    return 0
  fi

  # Mergeable but CI failed.
  if [[ "$ci_state" == "failure" ]]; then
    echo "ci_red"
    return 0
  fi

  # Everything else (including empty/unknown inputs) is reported as ready.
  echo "ready"
}
|
|
11880
11776
|
|
|
11881
11777
|
# _loop_pr_rebase_circuit <pr_number>
|
|
@@ -12013,6 +11909,9 @@ _loop_pr_rebase_stale() {
|
|
|
12013
11909
|
fi
|
|
12014
11910
|
|
|
12015
11911
|
git fetch origin "$head_ref" 2>/dev/null || return 0
|
|
11912
|
+
# Reset local tracking branch to the freshly-fetched remote state
|
|
11913
|
+
# before rebasing, otherwise force-push destroys commits pushed by others.
|
|
11914
|
+
git checkout -B "$head_ref" "origin/$head_ref" 2>/dev/null || return 0
|
|
12016
11915
|
|
|
12017
11916
|
# FIX-159: save original branch so we can restore it unconditionally
|
|
12018
11917
|
local _orig
|
|
@@ -12135,44 +12034,29 @@ _loop_pr_inbox() {
|
|
|
12135
12034
|
verdict=$(_loop_pr_classify "$head_ref" "$human_review" "$ci_state" "$mergeable")
|
|
12136
12035
|
|
|
12137
12036
|
case "$verdict" in
|
|
12138
|
-
|
|
12139
|
-
# Green self-PR: merge when clean, else rebase onto main first. A
|
|
12140
|
-
# loop/* or claude/* PR that fell BEHIND or now CONFLICTS with main can
|
|
12141
|
-
# never auto-merge until rebased — eager-merge alone would leave it
|
|
12142
|
-
# stuck open forever. Rebase is circuit-gated (≥3 attempts/24h → ALERT)
|
|
12143
|
-
# and merges on a later tick once the rebased head is green + clean.
|
|
12144
|
-
case "$mergeable" in
|
|
12145
|
-
BEHIND|DIRTY|CONFLICTING)
|
|
12146
|
-
if _loop_pr_rebase_circuit "$num"; then
|
|
12147
|
-
_loop_pr_rebase_stale "$num" "$head_ref" || true
|
|
12148
|
-
fi
|
|
12149
|
-
;;
|
|
12150
|
-
*)
|
|
12151
|
-
_loop_pr_merge_self_eager "$num" "$ci_state" "$mergeable" "$slug"
|
|
12152
|
-
;;
|
|
12153
|
-
esac
|
|
12154
|
-
;;
|
|
12155
|
-
loop_self_ci_red)
|
|
12156
|
-
# US-LOOP-062a: a red loop/* PR (classified by US-LOOP-049) is now
|
|
12157
|
-
# background-healed: bounded retries via heal budget + dynamic agent,
|
|
12158
|
-
# falling back to the deduped [TYPE:loop-pr-ci-red] ALERT (FIX-158's
|
|
12159
|
-
# surfacing) when heal is disabled/exhausted. Re-wires US-LOOP-050.
|
|
12037
|
+
ci_red)
|
|
12160
12038
|
_loop_pr_heal_self "$num" "$head_ref" "$slug" || true
|
|
12161
12039
|
;;
|
|
12162
|
-
blocked_human_request_changes)
|
|
12163
|
-
: # skip — last human review requested changes; wait for the author
|
|
12164
|
-
;;
|
|
12165
|
-
blocked_human_approved)
|
|
12166
|
-
# US-LOOP-062b: human approved — merge directly when green + mergeable
|
|
12167
|
-
# (don't wait for repo auto-merge, which may be off).
|
|
12168
|
-
_loop_pr_merge_approved "$num" "$ci_state" "$mergeable" "$slug" || true
|
|
12169
|
-
;;
|
|
12170
12040
|
stale)
|
|
12171
12041
|
_loop_pr_rebase_circuit "$num" || true
|
|
12172
|
-
_loop_pr_rebase_stale "$num" "$head_ref" || true
|
|
12042
|
+
if _loop_pr_rebase_stale "$num" "$head_ref" || true; then
|
|
12043
|
+
# Re-fetch PR state after rebase — if now clean, merge immediately.
|
|
12044
|
+
local _re_view
|
|
12045
|
+
_re_view=$(gh -R "$slug" pr view "$num" --json mergeStateStatus,statusCheckRollup 2>/dev/null) || true
|
|
12046
|
+
if [ -n "$_re_view" ]; then
|
|
12047
|
+
local _re_ci _re_mb
|
|
12048
|
+
_re_ci=$(echo "$_re_view" | jq -r '
|
|
12049
|
+
if (.statusCheckRollup | length) == 0 then ""
|
|
12050
|
+
elif any(.statusCheckRollup[]?; .conclusion == "FAILURE") then "failure"
|
|
12051
|
+
elif all(.statusCheckRollup[]?; .conclusion == "SUCCESS" or .conclusion == "SKIPPED") then "success"
|
|
12052
|
+
else "pending" end' 2>/dev/null)
|
|
12053
|
+
_re_mb=$(echo "$_re_view" | jq -r '.mergeStateStatus // ""' 2>/dev/null)
|
|
12054
|
+
_loop_pr_merge_self_eager "$num" "$_re_ci" "$_re_mb" "$slug"
|
|
12055
|
+
fi
|
|
12056
|
+
fi
|
|
12173
12057
|
;;
|
|
12174
|
-
|
|
12175
|
-
|
|
12058
|
+
ready)
|
|
12059
|
+
_loop_pr_merge_self_eager "$num" "$ci_state" "$mergeable" "$slug"
|
|
12176
12060
|
;;
|
|
12177
12061
|
esac
|
|
12178
12062
|
|
|
@@ -12370,569 +12254,13 @@ _loop_pr_route() {
|
|
|
12370
12254
|
return 0
|
|
12371
12255
|
}
|
|
12372
12256
|
|
|
12373
|
-
#
|
|
12374
|
-
|
|
12375
|
-
# These six helpers collect CI timing data, classify failures, auto-rerun
|
|
12376
|
-
# transient flakes, and surface flaky / degradation signals as backlog
|
|
12377
|
-
# entries. They are NOT yet wired into any runner or launchd plist — that is
|
|
12378
|
-
# Phase 2 (wired by hand). Each is unit-tested in
|
|
12379
|
-
# tests/unit/roll_loop_ci_loop.bats with gh stubbed. Do not delete or inline.
|
|
12380
|
-
#
|
|
12381
|
-
# State lives under project-local .roll/state/:
|
|
12382
|
-
# ci-timing.jsonl append-only NDJSON, one line per recorded CI run
|
|
12383
|
-
# ci-rerun-state.yaml minimal YAML: rerun attempt count per run_id
|
|
12384
|
-
# _LOOP_ALERT is the existing shared alert file (real failures, rerun limits).
|
|
12385
|
-
|
|
12386
|
-
# _ci_state_dir
|
|
12387
|
-
# Echo the project-local CI state directory, creating it if needed.
|
|
12388
|
-
# Resolves relative to the current working dir's .roll/ (tests cd into a
|
|
12389
|
-
# sandbox; the live loop runner cds into the project root).
|
|
12390
|
-
_ci_state_dir() {
|
|
12257
|
+
# _alert_log_file — echo path to alert-log.jsonl (used by `roll alert log` CLI).
|
|
12258
|
+
_alert_log_file() {
|
|
12391
12259
|
local dir=".roll/state"
|
|
12392
12260
|
mkdir -p "$dir" 2>/dev/null || true
|
|
12393
|
-
echo "$dir"
|
|
12394
|
-
}
|
|
12395
|
-
|
|
12396
|
-
# _ci_record_timing <run_json>
|
|
12397
|
-
# Parse one `gh run list --json ...` object and append a flat NDJSON line to
|
|
12398
|
-
# ci-timing.jsonl. Idempotent: a run_id already present in the file is
|
|
12399
|
-
# skipped. Duration is computed from createdAt → updatedAt (gh exposes no
|
|
12400
|
-
# native duration field). Returns 0 always (loop-safe).
|
|
12401
|
-
_ci_record_timing() {
|
|
12402
|
-
local json="$1"
|
|
12403
|
-
[ -n "$json" ] || return 0
|
|
12404
|
-
|
|
12405
|
-
local run_id workflow conclusion status created updated
|
|
12406
|
-
run_id=$(echo "$json" | jq -r '.databaseId // ""' 2>/dev/null)
|
|
12407
|
-
[ -n "$run_id" ] || return 0
|
|
12408
|
-
|
|
12409
|
-
local dir; dir=$(_ci_state_dir)
|
|
12410
|
-
local file="${dir}/ci-timing.jsonl"
|
|
12411
|
-
|
|
12412
|
-
# Idempotency: skip if this run_id is already recorded with a non-empty
|
|
12413
|
-
# conclusion. If the existing record has an empty conclusion and the new
|
|
12414
|
-
# data has a conclusion, update in-place so in-progress runs are completed.
|
|
12415
|
-
if [ -f "$file" ] && grep -q "\"run_id\":${run_id}," "$file" 2>/dev/null; then
|
|
12416
|
-
local existing_conclusion new_conclusion
|
|
12417
|
-
existing_conclusion=$(grep "\"run_id\":${run_id}," "$file" 2>/dev/null | jq -r '.conclusion // ""' 2>/dev/null)
|
|
12418
|
-
new_conclusion=$(echo "$json" | jq -r '.conclusion // ""' 2>/dev/null)
|
|
12419
|
-
if [ -n "$existing_conclusion" ] || [ -z "$new_conclusion" ]; then
|
|
12420
|
-
return 0
|
|
12421
|
-
fi
|
|
12422
|
-
# Remove the stale line so the new record can be appended below.
|
|
12423
|
-
local tmpfile="${file}.tmp.$$"
|
|
12424
|
-
grep -v "\"run_id\":${run_id}," "$file" > "$tmpfile" 2>/dev/null || true
|
|
12425
|
-
mv "$tmpfile" "$file"
|
|
12426
|
-
fi
|
|
12427
|
-
|
|
12428
|
-
workflow=$(echo "$json" | jq -r '.workflowName // .name // ""' 2>/dev/null)
|
|
12429
|
-
conclusion=$(echo "$json" | jq -r '.conclusion // ""' 2>/dev/null)
|
|
12430
|
-
status=$(echo "$json" | jq -r '.status // ""' 2>/dev/null)
|
|
12431
|
-
created=$(echo "$json" | jq -r '.createdAt // ""' 2>/dev/null)
|
|
12432
|
-
updated=$(echo "$json" | jq -r '.updatedAt // ""' 2>/dev/null)
|
|
12433
|
-
|
|
12434
|
-
# Duration in seconds from ISO-8601 timestamps; 0 if either is missing or
|
|
12435
|
-
# unparseable. `date -j` (BSD) and `date -d` (GNU) differ — try both.
|
|
12436
|
-
local dur=0 c_epoch u_epoch
|
|
12437
|
-
if [ -n "$created" ] && [ -n "$updated" ]; then
|
|
12438
|
-
c_epoch=$(_ci_iso_to_epoch "$created")
|
|
12439
|
-
u_epoch=$(_ci_iso_to_epoch "$updated")
|
|
12440
|
-
if [ -n "$c_epoch" ] && [ -n "$u_epoch" ] && [ "$u_epoch" -ge "$c_epoch" ] 2>/dev/null; then
|
|
12441
|
-
dur=$((u_epoch - c_epoch))
|
|
12442
|
-
fi
|
|
12443
|
-
fi
|
|
12444
|
-
|
|
12445
|
-
printf '{"run_id":%s,"workflow":"%s","conclusion":"%s","status":"%s","duration_sec":%s,"recorded_at":"%s"}\n' \
|
|
12446
|
-
"$run_id" "$workflow" "$conclusion" "$status" "$dur" \
|
|
12447
|
-
"$(date -u +%Y-%m-%dT%H:%M:%SZ)" >> "$file"
|
|
12448
|
-
return 0
|
|
12449
|
-
}
|
|
12450
|
-
|
|
12451
|
-
# _ci_iso_to_epoch <iso8601>
|
|
12452
|
-
# Convert an ISO-8601 UTC timestamp (2026-05-30T10:00:00Z) to epoch seconds.
|
|
12453
|
-
# Echoes nothing on failure. Handles both BSD (macOS) and GNU date.
|
|
12454
|
-
_ci_iso_to_epoch() {
|
|
12455
|
-
local iso="$1"
|
|
12456
|
-
[ -n "$iso" ] || return 0
|
|
12457
|
-
local e
|
|
12458
|
-
# GNU date
|
|
12459
|
-
e=$(date -u -d "$iso" +%s 2>/dev/null) && { echo "$e"; return 0; }
|
|
12460
|
-
# BSD date (strip trailing Z, parse explicit format)
|
|
12461
|
-
local trimmed="${iso%Z}"
|
|
12462
|
-
e=$(date -u -j -f "%Y-%m-%dT%H:%M:%S" "$trimmed" +%s 2>/dev/null) && { echo "$e"; return 0; }
|
|
12463
|
-
return 0
|
|
12464
|
-
}
|
|
12465
|
-
|
|
12466
|
-
# _ci_classify_failure <run_id>
|
|
12467
|
-
# Inspect `gh run view <id> --log-failed` and classify the failure as
|
|
12468
|
-
# "transient" (infra flake: network, timeout, runner death) or "real"
|
|
12469
|
-
# (genuine test/build failure). Echoes "transient" or "real".
|
|
12470
|
-
# Empty / unavailable logs default to "real" (fail safe — don't auto-rerun
|
|
12471
|
-
# something we can't read).
|
|
12472
|
-
_ci_classify_failure() {
|
|
12473
|
-
local run_id="$1"
|
|
12474
|
-
[ -n "$run_id" ] || { echo "real"; return 0; }
|
|
12475
|
-
local slug; _gh_resolve slug 2>/dev/null || slug=""
|
|
12476
|
-
|
|
12477
|
-
local log
|
|
12478
|
-
if [ -n "$slug" ]; then
|
|
12479
|
-
log=$(gh -R "$slug" run view "$run_id" --log-failed 2>/dev/null)
|
|
12480
|
-
else
|
|
12481
|
-
log=$(gh run view "$run_id" --log-failed 2>/dev/null)
|
|
12482
|
-
fi
|
|
12483
|
-
|
|
12484
|
-
# Transient signatures: network/infra failures that a rerun typically clears.
|
|
12485
|
-
if echo "$log" | grep -qiE 'ETIMEDOUT|ECONNRESET|ENOTFOUND|EAI_AGAIN|shutdown signal|runner.*(error|lost|terminated)|The runner has received a shutdown|503 Service|connection reset|TLS handshake|i/o timeout|could not resolve host'; then
|
|
12486
|
-
echo "transient"
|
|
12487
|
-
return 0
|
|
12488
|
-
fi
|
|
12489
|
-
echo "real"
|
|
12490
|
-
return 0
|
|
12491
|
-
}
|
|
12492
|
-
|
|
12493
|
-
# _ci_rerun_state_file
|
|
12494
|
-
# Echo path to ci-rerun-state.yaml (creating the dir).
|
|
12495
|
-
_ci_rerun_state_file() {
|
|
12496
|
-
local dir; dir=$(_ci_state_dir)
|
|
12497
|
-
echo "${dir}/ci-rerun-state.yaml"
|
|
12498
|
-
}
|
|
12499
|
-
|
|
12500
|
-
# _ci_rerun_attempts <run_id>
|
|
12501
|
-
# Echo the recorded rerun attempt count for <run_id> (0 if none).
|
|
12502
|
-
_ci_rerun_attempts() {
|
|
12503
|
-
local run_id="$1"
|
|
12504
|
-
local file; file=$(_ci_rerun_state_file)
|
|
12505
|
-
[ -f "$file" ] || { echo 0; return 0; }
|
|
12506
|
-
local n
|
|
12507
|
-
n=$(awk -v key="\"${run_id}\":" '$1 == key { print $2 }' "$file" 2>/dev/null | head -1)
|
|
12508
|
-
case "$n" in
|
|
12509
|
-
''|*[!0-9]*) echo 0 ;;
|
|
12510
|
-
*) echo "$n" ;;
|
|
12511
|
-
esac
|
|
12512
|
-
}
|
|
12513
|
-
|
|
12514
|
-
# _ci_rerun_state_write <run_id> <attempts>
|
|
12515
|
-
# Set the attempt count for <run_id> in ci-rerun-state.yaml. Minimal YAML
|
|
12516
|
-
# writer (we own the schema): one `"<run_id>": <n>` line per run.
|
|
12517
|
-
_ci_rerun_state_write() {
|
|
12518
|
-
local run_id="$1" attempts="$2"
|
|
12519
|
-
local file; file=$(_ci_rerun_state_file)
|
|
12520
|
-
[ -f "$file" ] || : > "$file"
|
|
12521
|
-
local tmp; tmp=$(mktemp)
|
|
12522
|
-
awk -v key="\"${run_id}\":" -v val="$attempts" '
|
|
12523
|
-
$1 == key { print key " " val; found=1; next }
|
|
12524
|
-
{ print }
|
|
12525
|
-
END { if (!found) print key " " val }
|
|
12526
|
-
' "$file" > "$tmp" && mv "$tmp" "$file"
|
|
12527
|
-
}
|
|
12528
|
-
|
|
12529
|
-
# _ci_rerun_transient <run_id>
#   Retry a transient CI failure automatically, at most twice per run.
#     fewer than 2 recorded attempts -> `gh run rerun`, bump the stored
#                                       count, echo "rerun"
#     2 or more recorded attempts    -> append an error line to $_LOOP_ALERT,
#                                       echo "limit"
#   An empty <run_id> is a silent no-op. Always returns 0 (loop-safe).
_ci_rerun_transient() {
  local run_id="$1"
  if [ -z "$run_id" ]; then
    return 0
  fi

  # _gh_resolve fills `slug` by name; fall back to gh's own repo detection.
  local slug
  _gh_resolve slug 2>/dev/null || slug=""

  local attempts
  attempts=$(_ci_rerun_attempts "$run_id")

  if ! [ "$attempts" -ge 2 ]; then
    # Below the cap: the rerun is best-effort; the attempt is counted either way.
    if [ -z "$slug" ]; then
      gh run rerun "$run_id" >/dev/null 2>&1 || true
    else
      gh -R "$slug" run rerun "$run_id" >/dev/null 2>&1 || true
    fi
    _ci_rerun_state_write "$run_id" "$((attempts + 1))"
    echo "rerun"
    return 0
  fi

  # Cap reached: raise an error-level alert instead of rerunning again.
  local alert_file="$_LOOP_ALERT" stamp
  mkdir -p "$(dirname "$alert_file")" 2>/dev/null || true
  stamp=$(date -u +%Y-%m-%dT%H:%M:%SZ)
  printf '[%s] [error] [TYPE:ci-rerun-limit] CI rerun reached limit: run #%s (%s attempts)\n' \
    "$stamp" "$run_id" "$attempts" >> "$alert_file"
  echo "limit"
  return 0
}
|
|
12557
|
-
|
|
12558
|
-
# _ci_open_story <type> <title>
#   Append a FIX or US row to .roll/backlog.md's `| ID | Description | Status |`
#   table (the row is appended at the end of the file).
#
#   Idempotent: if a line containing <title> and `📋 Todo` already exists,
#   nothing is written and "skip" is echoed. "skip" is also echoed when the
#   backlog file does not exist. New IDs auto-increment from the highest
#   existing <TYPE>-NNN, zero-padded to three digits.
#
#   Echoes the new ID on success. Missing arguments are a silent no-op.
#   Always returns 0 (loop-safe).
_ci_open_story() {
  local type="$1" title="$2"
  [ -n "$type" ] && [ -n "$title" ] || return 0

  # Resolve the backlog file (project-local, relative to the current dir).
  local backlog=".roll/backlog.md"
  [ -f "$backlog" ] || { echo "skip"; return 0; }

  # Idempotency: same title already queued as Todo -> skip.
  # `--` keeps a title that starts with "-" from being parsed as an option.
  if grep -F -- "$title" "$backlog" 2>/dev/null | grep -q '📋 Todo'; then
    echo "skip"
    return 0
  fi

  # Auto-increment: find the max existing <TYPE>-NNN id. -w restricts the
  # match to whole words so e.g. HOTFIX-050 is not counted as FIX-050.
  local prefix max next id
  prefix=$(printf '%s' "$type" | tr '[:lower:]' '[:upper:]')
  max=$(grep -owE -- "${prefix}-[0-9]+" "$backlog" 2>/dev/null \
    | sed "s/^${prefix}-//" \
    | sort -n | tail -1)
  case "$max" in ''|*[!0-9]*) max=0 ;; esac
  # 10# prefix forces base-10: a zero-padded id like 008/009 would otherwise
  # be parsed as octal and either misnumber (010 -> 8) or error out.
  next=$((10#$max + 1))
  id=$(printf '%s-%03d' "$prefix" "$next")

  # If the file does not end in a newline, add one first so the new row
  # starts on its own line instead of being glued onto the last one.
  if [ -s "$backlog" ] && [ -n "$(tail -c 1 "$backlog")" ]; then
    printf '\n' >> "$backlog"
  fi

  printf '| %s | %s | 📋 Todo |\n' "$id" "$title" >> "$backlog"
  echo "$id"
  return 0
}
|
|
12594
|
-
|
|
12595
|
-
# _ci_detect_flaky
#   Scan the last 20 ci-timing.jsonl lines, group by workflow, and flag any
#   workflow whose recent runs have a 20%-80% failure rate (2..8 failures of
#   the last 10, with at least 4 records) as flaky, opening a FIX story
#   titled "flaky: <workflow>" via _ci_open_story. `failure`, `timed_out`
#   and `cancelled` conclusions all count as failures.
#   A missing timing file is a no-op. Returns 0 (loop-safe).
_ci_detect_flaky() {
  local dir; dir=$(_ci_state_dir)
  local file="${dir}/ci-timing.jsonl"
  [ -f "$file" ] || return 0

  # Per workflow: count total + failures over the most recent 10 records.
  # awk reads the last 20 lines (tail) and keeps the last 10 per workflow.
  # Output is collected into a variable (not piped to `while`) so an empty
  # result or an intermediate nonzero exit cannot trip a caller ERR trap.
  local flaky_wfs
  flaky_wfs=$(tail -n 20 "$file" 2>/dev/null | awk '
    {
      # crude field extraction from flat JSON line
      wf=""; concl="";
      if (match($0, /"workflow":"[^"]*"/))   { wf=substr($0,RSTART+12,RLENGTH-13) }
      if (match($0, /"conclusion":"[^"]*"/)) { concl=substr($0,RSTART+14,RLENGTH-15) }
      if (wf=="") next
      order[wf]=order[wf]" "NR
      val[wf"|"NR]=concl
    }
    END {
      for (wf in order) {
        n=split(order[wf], idx, " ")
        # keep most recent 10
        start=1; if (n-10 > 0) start=n-9
        total=0; fail=0
        for (i=start;i<=n;i++) {
          if (idx[i]=="") continue
          total++
          c=val[wf"|"idx[i]]
          if (c=="failure" || c=="timed_out" || c=="cancelled") fail++
        }
        if (total>=4 && fail>=2 && fail<=8 && fail*100 <= total*80 && fail*100 >= total*20) {
          print wf
        }
      }
    }
  ' || true)

  # One workflow name per line. Read line-wise so names containing spaces or
  # glob characters ("Unit Tests", "build*") stay intact instead of being
  # word-split or pathname-expanded.
  local wf
  while IFS= read -r wf; do
    [ -n "$wf" ] || continue
    _ci_open_story FIX "flaky: ${wf}" >/dev/null || true
  done <<< "$flaky_wfs"
  return 0
}
|
|
12644
|
-
|
|
12645
|
-
# _ci_detect_degradation
#   Scan the last 20 ci-timing.jsonl lines, compute the mean duration per
#   workflow, and open a US story when a workflow crosses its threshold:
#     name contains "unit"        (case-insensitive)  mean > 300s (5 min)
#     name contains "integration" (case-insensitive)  mean > 900s (15 min)
#
#   The story title embeds the current mean, so the title-based idempotency
#   check in _ci_open_story cannot recognise an earlier story once the mean
#   shifts by a second. To avoid queueing a new row on every tick, a workflow
#   that already has a `📋 Todo` degradation row in .roll/backlog.md is
#   skipped here.
#   A missing timing file is a no-op. Returns 0 (loop-safe).
_ci_detect_degradation() {
  local dir; dir=$(_ci_state_dir)
  local file="${dir}/ci-timing.jsonl"
  [ -f "$file" ] || return 0

  # One "workflow<TAB>mean" record per degraded workflow.
  local degraded
  degraded=$(tail -n 20 "$file" 2>/dev/null | awk '
    {
      wf=""; dur=0;
      if (match($0, /"workflow":"[^"]*"/))      { wf=substr($0,RSTART+12,RLENGTH-13) }
      if (match($0, /"duration_sec":[0-9]+/))   { dur=substr($0,RSTART+15,RLENGTH-15)+0 }
      if (wf=="") next
      sum[wf]+=dur; cnt[wf]++
    }
    END {
      for (wf in sum) {
        if (cnt[wf]==0) continue
        avg=sum[wf]/cnt[wf]
        lc=tolower(wf)
        if (index(lc,"unit")>0 && avg>300) { print wf "\t" int(avg) }
        else if (index(lc,"integration")>0 && avg>900) { print wf "\t" int(avg) }
      }
    }
  ' || true)

  local backlog=".roll/backlog.md"
  local line wf avg
  # Read record-wise and split on the TAB with parameter expansion: no IFS
  # juggling, no pathname expansion of workflow names, no `cut` forks.
  while IFS= read -r line; do
    [ -n "$line" ] || continue
    wf=${line%%$'\t'*}
    avg=${line##*$'\t'}
    [ -n "$wf" ] || continue

    # Already queued for this workflow (with whatever mean) -> do not add
    # another row.
    if [ -f "$backlog" ] \
      && grep -F -- "CI degradation: ${wf} avg " "$backlog" 2>/dev/null \
        | grep -q '📋 Todo'; then
      continue
    fi

    _ci_open_story US "CI degradation: ${wf} avg ${avg}s exceeds threshold" >/dev/null || true
  done <<< "$degraded"
  return 0
}
|
|
12692
|
-
|
|
12693
|
-
# _ci_scan
#   US-AUTO-045 Phase 2 orchestrator, the entry point driven by the CI Loop
#   runner. Steps:
#     1. resolve the repo slug and list `main`-branch runs through gh;
#     2. hand every run to _ci_record_timing;
#     3. for each run whose conclusion is `failure`, classify it and
#        auto-rerun it when the classifier says "transient";
#     4. run the flaky and degradation detectors over accumulated history;
#     5. write a tick record.
#   Any gh problem (unresolvable slug, failed list, empty output) writes an
#   "idle" tick and returns 0, so a tick never errors out the service.
_ci_scan() {
  local slug
  if ! _gh_resolve slug 2>/dev/null; then
    _loop_write_tick "ci" "idle" "gh_unavailable"
    return 0
  fi

  local runs_json
  if ! runs_json=$(gh -R "$slug" run list --branch main \
      --json databaseId,workflowName,name,conclusion,status,createdAt,updatedAt \
      2>/dev/null); then
    _loop_write_tick "ci" "idle" "gh_error"
    return 0
  fi
  if [ -z "$runs_json" ]; then
    _loop_write_tick "ci" "idle" "empty_response"
    return 0
  fi

  # An empty list ("[]") yields zero iterations but still reaches the
  # detectors below: they look at stored history, not only this tick.
  local count
  count=$(echo "$runs_json" | jq 'length' 2>/dev/null || echo 0)
  case "$count" in
    ''|*[!0-9]*) count=0 ;;
  esac

  local i=0 run_json conclusion run_id kind
  while [ "$i" -lt "$count" ]; do
    run_json=$(echo "$runs_json" | jq -c ".[$i]" 2>/dev/null)
    _ci_record_timing "$run_json"

    conclusion=$(echo "$run_json" | jq -r '.conclusion // ""' 2>/dev/null)
    if [ "$conclusion" = "failure" ]; then
      run_id=$(echo "$run_json" | jq -r '.databaseId // ""' 2>/dev/null)
      if [ -n "$run_id" ]; then
        kind=$(_ci_classify_failure "$run_id")
        if [ "$kind" = "transient" ]; then
          _ci_rerun_transient "$run_id" >/dev/null
        fi
      fi
    fi
    i=$((i + 1))
  done

  _ci_detect_flaky
  _ci_detect_degradation
  _loop_write_tick "ci" "acted" "scan_done"
  return 0
}
|
|
12736
|
-
|
|
12737
|
-
# ═══════════════════════════════════════════════════════════════════════════════
|
|
12738
|
-
# US-AUTO-046 Phase 1: dedicated Alert Loop helpers (loop-safe, pure bash)
|
|
12739
|
-
# ═══════════════════════════════════════════════════════════════════════════════
|
|
12740
|
-
# These consume the existing $_LOOP_ALERT file — until now a write-only dumb file
|
|
12741
|
-
# that every loop appends to but nobody reads. The Alert Loop turns it into a
|
|
12742
|
-
# real consumer: parse → dedup (1h per category) → notify (error always) →
|
|
12743
|
-
# log → rotate. They are NOT yet wired into any runner or launchd plist — that
|
|
12744
|
-
# is Phase 2 (wired by hand). Each is unit-tested in
|
|
12745
|
-
# tests/unit/roll_loop_alert_loop.bats with _notify stubbed. Do not delete or
|
|
12746
|
-
# inline.
|
|
12747
|
-
#
|
|
12748
|
-
# State lives under project-local .roll/state/ (shared with the CI Loop):
|
|
12749
|
-
# alert-log.jsonl append-only NDJSON, one line per consumed alert
|
|
12750
|
-
# $_LOOP_ALERT.prev is the rotated copy (kept for debugging).
|
|
12751
|
-
#
|
|
12752
|
-
# Line format ($_LOOP_ALERT) — new tagged format, old format read-compatible:
|
|
12753
|
-
# [2026-05-26T10:00:00] [error] [TYPE:ci-real-failure] CI failed: run #123
|
|
12754
|
-
# [2026-05-26T10:00:00] some legacy message → level=warn category=legacy
|
|
12755
|
-
|
|
12756
|
-
# _alert_parse_file [file]
#   Turn every meaningful line of $_LOOP_ALERT (or <file>) into one
#   TAB-separated record on stdout:
#       ts <TAB> level <TAB> category <TAB> message
#   Recognised leading tags, in this order, each optional:
#       [timestamp]   [error|warn|info]   [TYPE:category]
#   Lines without a level tag get level=warn; lines without a (non-empty)
#   TYPE tag get category=legacy. A leading "ALERT:" keyword on the remaining
#   message is dropped. Blank lines and markdown chrome (lines starting with
#   `#` or `**`, optionally indented) are ignored.
#   Prints nothing for a missing file. Always returns 0 (loop-safe).
_alert_parse_file() {
  local file="${1:-$_LOOP_ALERT}"
  if [ -z "$file" ] || [ ! -f "$file" ]; then
    return 0
  fi

  awk '
    # blank lines, markdown headers, bold ack footers: not alerts
    /^[ \t]*$/    { next }
    /^[ \t]*#/    { next }
    /^[ \t]*\*\*/ { next }

    {
      rest  = $0
      stamp = ""
      sev   = "warn"
      kind  = "legacy"

      # [timestamp]: every pattern is anchored, so a match starts at column 1
      if (match(rest, /^\[[^]]*\]/)) {
        stamp = substr(rest, 2, RLENGTH - 2)
        rest  = substr(rest, RLENGTH + 1)
        sub(/^[ \t]+/, "", rest)
      }

      # [error] / [warn] / [info]
      if (match(rest, /^\[(error|warn|info)\]/)) {
        sev  = substr(rest, 2, RLENGTH - 2)
        rest = substr(rest, RLENGTH + 1)
        sub(/^[ \t]+/, "", rest)
      }

      # [TYPE:category]: an empty category keeps the legacy default
      if (match(rest, /^\[TYPE:[^]]*\]/)) {
        tag  = substr(rest, 7, RLENGTH - 7)
        if (tag != "") kind = tag
        rest = substr(rest, RLENGTH + 1)
        sub(/^[ \t]+/, "", rest)
      }

      # legacy "ALERT:" keyword in front of the message
      sub(/^ALERT:[ \t]*/, "", rest)

      printf "%s\t%s\t%s\t%s\n", stamp, sev, kind, rest
    }
  ' "$file"
  return 0
}
|
|
12808
|
-
|
|
12809
|
-
# _alert_log_file
#   Print the path of alert-log.jsonl inside the directory reported by
#   _ci_state_dir, so the Alert Loop and the CI Loop share one state dir.
#   (NOTE(review): directory creation, if any, happens inside _ci_state_dir,
#   which is defined outside this section — confirm.)
_alert_log_file() {
  local state_dir
  state_dir=$(_ci_state_dir)
  printf '%s\n' "${state_dir}/alert-log.jsonl"
}
|
|
12816
12263
|
|
|
12817
|
-
# _alert_should_notify <category> <level>
#   Echo "true" when the alert should raise a notification, "false" when it
#   is throttled.
#     level=error    -> always "true"
#     anything else  -> "false" only if alert-log.jsonl holds a notified=1
#                       entry for the same category whose recorded_at is
#                       less than 3600 seconds old; otherwise "true"
#   A missing log, no matching entry, or an unparseable timestamp all fall
#   back to "true". Always returns 0.
_alert_should_notify() {
  local category="$1" level="$2"

  if [ "$level" = "error" ]; then
    echo "true"
    return 0
  fi

  local log
  log=$(_alert_log_file)
  if [ ! -f "$log" ]; then
    echo "true"
    return 0
  fi

  local now
  now=$(date -u +%s)

  # recorded_at of the newest notified entry for this category
  local stamp
  stamp=$(
    grep -F "\"category\":\"${category}\"" "$log" 2>/dev/null \
      | grep -F '"notified":1' \
      | tail -1 \
      | sed -n 's/.*"recorded_at":"\([^"]*\)".*/\1/p'
  )

  local verdict="true" last_epoch
  if [ -n "$stamp" ]; then
    last_epoch=$(_ci_iso_to_epoch "$stamp")
    # Inside the one-hour window -> suppress.
    if [ -n "$last_epoch" ] && [ "$((now - last_epoch))" -lt 3600 ] 2>/dev/null; then
      verdict="false"
    fi
  fi

  echo "$verdict"
  return 0
}
|
|
12851
|
-
|
|
12852
|
-
# _alert_json_escape <string>
#   Private helper: print <string> escaped for use inside a JSON string
#   literal (backslash, double quote, TAB and CR).
_alert_json_escape() {
  local s="$1"
  s=${s//\\/\\\\}      # backslash first, so later escapes are not doubled
  s=${s//\"/\\\"}
  s=${s//$'\t'/\\t}
  s=${s//$'\r'/\\r}
  printf '%s' "$s"
}

# _alert_write_log <ts> <level> <category> <message> <notified>
#   Append one NDJSON record to alert-log.jsonl. <notified> is the literal
#   string "true"/"false" (or 1/0) and is normalized to 1/0; anything else
#   counts as 0. recorded_at is the consumption time (UTC), distinct from
#   the alert's own <ts>.
#
#   <ts> and <message> are JSON-escaped, including TAB and CR: alert messages
#   may contain tabs, and a raw control character inside a JSON string makes
#   the line invalid. <category> is written as given because
#   _alert_should_notify looks it up with a literal `"category":"<category>"`
#   match.
#   Loop-safe (returns 0).
_alert_write_log() {
  local ts="$1" level="$2" category="$3" message="$4" notified="$5"
  local file; file=$(_alert_log_file)

  local n=0
  case "$notified" in true|1) n=1 ;; esac

  local esc_ts esc_msg
  esc_ts=$(_alert_json_escape "$ts")
  esc_msg=$(_alert_json_escape "$message")

  printf '{"ts":"%s","level":"%s","category":"%s","message":"%s","notified":%s,"recorded_at":"%s"}\n' \
    "$esc_ts" "$level" "$category" "$esc_msg" "$n" \
    "$(date -u +%Y-%m-%dT%H:%M:%SZ)" >> "$file"
  return 0
}
|
|
12873
|
-
|
|
12874
|
-
# _alert_rotate [file]
#   Copy $_LOOP_ALERT (or <file>) to <file>.prev, then empty the original in
#   place. When the source does not exist nothing happens and an older .prev
#   stays as it is. Loop-safe (returns 0).
#
#   US-AUTO-046 (kimi peer-review Q2): copy + truncate, deliberately not `mv`.
#   Renaming would put a new inode at the path, so a producer loop
#   (main/pr/ci) that had already opened its `>>` descriptor would keep
#   appending to what is now `.prev` and the alert would be lost silently.
#   Truncating keeps the same inode at the path, so such a writer still
#   lands in the live alert file and is picked up on the next tick.
_alert_rotate() {
  local file="${1:-$_LOOP_ALERT}"

  if [ -z "$file" ] || [ ! -f "$file" ]; then
    return 0
  fi

  # The snapshot is best-effort; the truncate happens regardless.
  cat "$file" > "${file}.prev" 2>/dev/null || true
  : > "$file"
  return 0
}
|
|
12894
|
-
|
|
12895
|
-
# _alert_dispatch [file]
#   Main consumer entry point. Parse $_LOOP_ALERT (or <file>), and for each
#   alert: ask _alert_should_notify, fire _notify when allowed, and record
#   the alert (with its notified flag) through _alert_write_log. Afterwards
#   the alert file is rotated and an "acted" tick is written.
#
#   Idle outcomes (each writes an "idle" tick and returns 0):
#     no_file     the alert file is missing
#     empty_file  the file exists but is empty; it is left in place
#     no_parsed   the file held no parseable alert; it is still rotated
#   Loop-safe (always returns 0).
_alert_dispatch() {
  local file="${1:-$_LOOP_ALERT}"
  [ -n "$file" ] && [ -f "$file" ] || { _loop_write_tick "alert" "idle" "no_file"; return 0; }
  # Empty file -> nothing to consume, leave it in place.
  [ -s "$file" ] || { _loop_write_tick "alert" "idle" "empty_file"; return 0; }

  local parsed; parsed=$(_alert_parse_file "$file")
  [ -n "$parsed" ] || { _alert_rotate "$file"; _loop_write_tick "alert" "idle" "no_parsed"; return 0; }

  local line rest ts level category message notify
  local tab=$'\t'
  # Records are "ts<TAB>level<TAB>category<TAB>message", one per line.
  # They are read line-wise and split with parameter expansion, not with an
  # unquoted `for` plus `cut`: that avoids pathname expansion of alert text
  # (a message such as "*" must stay "*"), keeps an empty leading ts field,
  # leaves IFS untouched, and forks nothing per alert. The message keeps any
  # tabs it contains.
  # The records are fed on fd 3 so a command in the loop body that reads
  # stdin cannot swallow the remaining records.
  while IFS= read -r line <&3; do
    [ -n "$line" ] || continue
    ts=${line%%"$tab"*};        rest=${line#*"$tab"}
    level=${rest%%"$tab"*};     rest=${rest#*"$tab"}
    category=${rest%%"$tab"*};  message=${rest#*"$tab"}

    notify=$(_alert_should_notify "$category" "$level")
    if [ "$notify" = "true" ]; then
      _notify "roll alert: ${level}" "${message}" || true
      _alert_write_log "$ts" "$level" "$category" "$message" "true"
    else
      _alert_write_log "$ts" "$level" "$category" "$message" "false"
    fi
  done 3<<< "$parsed"

  _alert_rotate "$file"
  _loop_write_tick "alert" "acted" "dispatch_done"
  return 0
}
|
|
12935
|
-
|
|
12936
12264
|
# FIX-070: flip a story row in the main repo's .roll/backlog.md between
|
|
12937
12265
|
# 📋 Todo and 🔨 In Progress. The cycle worktree is gitignored at .roll/,
|
|
12938
12266
|
# so editing the worktree copy + committing leaves no trace in git — and
|
|
@@ -14480,7 +13808,7 @@ _loop_monitor() {
|
|
|
14480
13808
|
dream_sched=$(printf "%02d:%02d" "$dream_hour" "$dream_minute")
|
|
14481
13809
|
brief_sched=$(printf "%02d:%02d" "$brief_hour" "$brief_minute")
|
|
14482
13810
|
|
|
14483
|
-
local svcs=("loop" "dream" "
|
|
13811
|
+
local svcs=("loop" "dream" "pr")
|
|
14484
13812
|
local scheds=("$loop_sched" "$dream_sched" "$brief_sched")
|
|
14485
13813
|
for i in "${!svcs[@]}"; do
|
|
14486
13814
|
local svc="${svcs[$i]}" schedule="${scheds[$i]}"
|
|
@@ -15909,11 +15237,10 @@ main() {
|
|
|
15909
15237
|
test) cmd_test "$@" ;;
|
|
15910
15238
|
prices) cmd_prices "$@" ;;
|
|
15911
15239
|
changelog) cmd_changelog "$@" ;;
|
|
15240
|
+
consistency) cmd_consistency "$@" ;;
|
|
15912
15241
|
config) cmd_config "$@" ;;
|
|
15913
15242
|
_loop_render_exit_summary) _loop_render_exit_summary "$@" ;;
|
|
15914
15243
|
_loop_pr_inbox) _loop_pr_inbox "$@" ;;
|
|
15915
|
-
_ci_scan) _ci_scan "$@" ;;
|
|
15916
|
-
_alert_dispatch) _alert_dispatch "$@" ;;
|
|
15917
15244
|
version|--version|-v) echo "roll v${VERSION}" ;;
|
|
15918
15245
|
help|--help|-h) _help "$@" ;;
|
|
15919
15246
|
"") [[ -f ".roll/backlog.md" ]] && _home || { _help; _show_changelog; } ;;
|