agent-control-plane 0.1.8 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/bin/pr-risk.sh +54 -10
  2. package/hooks/heartbeat-hooks.sh +166 -13
  3. package/package.json +8 -2
  4. package/references/commands.md +1 -0
  5. package/tools/bin/agent-project-cleanup-session +143 -2
  6. package/tools/bin/agent-project-heartbeat-loop +29 -2
  7. package/tools/bin/agent-project-publish-issue-pr +178 -62
  8. package/tools/bin/agent-project-reconcile-issue-session +230 -5
  9. package/tools/bin/agent-project-reconcile-pr-session +104 -13
  10. package/tools/bin/agent-project-run-claude-session +19 -1
  11. package/tools/bin/agent-project-run-codex-resilient +121 -16
  12. package/tools/bin/agent-project-run-codex-session +61 -11
  13. package/tools/bin/agent-project-run-openclaw-session +274 -7
  14. package/tools/bin/agent-project-sync-anchor-repo +13 -2
  15. package/tools/bin/agent-project-worker-status +19 -14
  16. package/tools/bin/cleanup-worktree.sh +4 -1
  17. package/tools/bin/dashboard-launchd-bootstrap.sh +16 -4
  18. package/tools/bin/ensure-runtime-sync.sh +182 -0
  19. package/tools/bin/flow-config-lib.sh +76 -30
  20. package/tools/bin/flow-resident-worker-lib.sh +28 -2
  21. package/tools/bin/flow-shell-lib.sh +28 -8
  22. package/tools/bin/heartbeat-safe-auto.sh +32 -0
  23. package/tools/bin/issue-publish-localization-guard.sh +142 -0
  24. package/tools/bin/prepare-worktree.sh +3 -1
  25. package/tools/bin/project-launchd-bootstrap.sh +17 -4
  26. package/tools/bin/project-runtime-supervisor.sh +7 -1
  27. package/tools/bin/project-runtimectl.sh +78 -15
  28. package/tools/bin/provider-cooldown-state.sh +1 -1
  29. package/tools/bin/render-flow-config.sh +16 -1
  30. package/tools/bin/reuse-issue-worktree.sh +46 -0
  31. package/tools/bin/run-codex-task.sh +2 -2
  32. package/tools/bin/scaffold-profile.sh +2 -2
  33. package/tools/bin/start-issue-worker.sh +118 -16
  34. package/tools/bin/start-resident-issue-loop.sh +1 -0
  35. package/tools/bin/sync-shared-agent-home.sh +26 -0
  36. package/tools/bin/test-smoke.sh +6 -1
  37. package/tools/dashboard/app.js +91 -3
  38. package/tools/dashboard/dashboard_snapshot.py +119 -0
  39. package/tools/dashboard/styles.css +43 -0
  40. package/tools/templates/issue-prompt-template.md +18 -66
  41. package/tools/templates/legacy/issue-prompt-template-pre-slim.md +109 -0
  42. package/bin/audit-issue-routing.sh +0 -74
  43. package/tools/bin/audit-agent-worktrees.sh +0 -310
  44. package/tools/bin/audit-issue-routing.sh +0 -11
  45. package/tools/bin/audit-retained-layout.sh +0 -58
  46. package/tools/bin/audit-retained-overlap.sh +0 -135
  47. package/tools/bin/audit-retained-worktrees.sh +0 -228
  48. package/tools/bin/check-skill-contracts.sh +0 -324
@@ -153,6 +153,54 @@ reap_stale_run_dir() {
153
153
  mv "$RUN_DIR" "${HISTORY_ROOT}/${SESSION}-stale-$(date +%Y%m%d-%H%M%S)"
154
154
  }
155
155
 
156
# Print the archived run directory for a session that sorts last by name.
#
# Arguments:
#   $1 - directory that holds archived run directories
#   $2 - session name whose archives are wanted
#
# Only direct child directories of $1 named "<session>-*" are considered, and
# "<session>-stale-*" entries are excluded. Nothing is printed when no entry
# matches. Returns 1 without output when $1 is empty or not a directory, or
# when $2 is empty.
find_archived_issue_session_dir() {
  local archive_root="${1:-}"
  local session_name="${2:-}"

  if [[ -z "$archive_root" || ! -d "$archive_root" || -z "$session_name" ]]; then
    return 1
  fi

  find "$archive_root" -mindepth 1 -maxdepth 1 -type d \
    -name "${session_name}-*" ! -name "${session_name}-stale-*" 2>/dev/null \
    | sort -r \
    | head -n 1
}
165
+
166
# Print the value recorded for one key in ISSUE_RETRY_STATE.
#
# Arguments:
#   $1 - key to look up (required)
#
# ISSUE_RETRY_STATE is read as lines split on "="; for the first line whose
# first field equals the key, everything after the first "=" is printed, so
# values that themselves contain "=" come back whole. Prints nothing when no
# line matches.
issue_retry_state_value() {
  local wanted_key="${1:?retry-state key required}"
  local awk_program='$1 == target_key { print substr($0, index($0, "=") + 1); exit }'

  awk -F= -v target_key="${wanted_key}" "${awk_program}" <<<"${ISSUE_RETRY_STATE:-}"
}
170
+
171
# Print the archived run directory whose host-side publish can be replayed.
#
# Reads LAST_REASON from the retry state and the archived run for $SESSION
# under $HISTORY_ROOT. The directory is printed only when all of these hold:
#   - LAST_REASON is host-publish-failed or issue-worker-blocked
#   - the archive contains run.env, runner.env and result.env
#   - runner.env records RUNNER_STATE=succeeded
#   - result.env records OUTCOME=implemented and ACTION=host-publish-issue-pr
# Returns 1 without output otherwise.
issue_host_publish_replay_dir() {
  local prior_reason=""
  local archive_dir=""
  local runner_state=""
  local outcome=""
  local action=""
  local required_file=""

  prior_reason="$(issue_retry_state_value LAST_REASON)"
  if [[ "${prior_reason}" != "host-publish-failed" && "${prior_reason}" != "issue-worker-blocked" ]]; then
    return 1
  fi

  archive_dir="$(find_archived_issue_session_dir "$HISTORY_ROOT" "$SESSION" || true)"
  [[ -n "${archive_dir}" ]] || return 1
  for required_file in run.env runner.env result.env; do
    [[ -f "${archive_dir}/${required_file}" ]] || return 1
  done

  # Each lookup takes the text between the first and second "=" on the first
  # matching line.
  runner_state="$(awk -F= '/^RUNNER_STATE=/{print $2; exit}' "${archive_dir}/runner.env")"
  outcome="$(awk -F= '/^OUTCOME=/{print $2; exit}' "${archive_dir}/result.env")"
  action="$(awk -F= '/^ACTION=/{print $2; exit}' "${archive_dir}/result.env")"

  if [[ "${runner_state}" != "succeeded" \
    || "${outcome}" != "implemented" \
    || "${action}" != "host-publish-issue-pr" ]]; then
    return 1
  fi

  printf '%s\n' "${archive_dir}"
}
197
+
198
# Re-run reconciliation for $SESSION instead of starting a new worker.
#
# Arguments:
#   $1 - archived run directory being replayed (required; only logged here)
#
# Writes an ISSUE_HOST_PUBLISH_REPLAY marker line to stderr, then runs
# reconcile-issue-worker.sh for the session. The return status is that of the
# reconcile script.
replay_issue_host_publish_retry() {
  local replay_source_dir="${1:?archived dir required}"
  local reconcile_script=""

  printf 'ISSUE_HOST_PUBLISH_REPLAY=session=%s archived_run_dir=%s\n' \
    "${SESSION}" "${replay_source_dir}" >&2

  reconcile_script="${WORKSPACE_DIR}/bin/reconcile-issue-worker.sh"
  bash "${reconcile_script}" "${SESSION}"
}
203
+
156
204
  if tmux has-session -t "$SESSION" 2>/dev/null; then
157
205
  echo "worker session already exists: $SESSION" >&2
158
206
  exit 1
@@ -180,6 +228,9 @@ EOF
180
228
  ISSUE_REQUIRES_LOCAL_WORKSPACE_INSTALL="$(
181
229
  ISSUE_BODY="$ISSUE_BODY" bash "$LOCAL_INSTALL_POLICY_BIN"
182
230
  )"
231
+ ISSUE_RETRY_STATE="$(
232
+ bash "${WORKSPACE_DIR}/bin/retry-state.sh" issue "$ISSUE_ID" get 2>/dev/null || true
233
+ )"
183
234
  if [[ "${ISSUE_SCHEDULE_INTERVAL_SECONDS}" =~ ^[1-9][0-9]*$ ]]; then
184
235
  TEMPLATE_FILE="${SCHEDULED_TEMPLATE_FILE}"
185
236
  fi
@@ -227,6 +278,16 @@ if [[ -d "$RUN_DIR" ]]; then
227
278
  reap_stale_run_dir
228
279
  fi
229
280
 
281
+ ISSUE_HOST_PUBLISH_REPLAY_DIR="$(issue_host_publish_replay_dir || true)"
282
+ if [[ -n "${ISSUE_HOST_PUBLISH_REPLAY_DIR}" ]]; then
283
+ if ! replay_issue_host_publish_retry "${ISSUE_HOST_PUBLISH_REPLAY_DIR}"; then
284
+ echo "host publish replay failed for session ${SESSION}" >&2
285
+ exit 1
286
+ fi
287
+ launch_success="yes"
288
+ exit 0
289
+ fi
290
+
230
291
  block_if_recurring_checklist_complete
231
292
 
232
293
  mkdir -p "$RUN_DIR"
@@ -348,9 +409,6 @@ if (completedPrs.length > 0) {
348
409
  process.stdout.write(`${lines.join('\n')}\n`);
349
410
  EOF
350
411
  ISSUE_RECURRING_CONTEXT="$(cat "$ISSUE_RECURRING_CONTEXT_FILE")"
351
- ISSUE_RETRY_STATE="$(
352
- bash "${WORKSPACE_DIR}/bin/retry-state.sh" issue "$ISSUE_ID" get 2>/dev/null || true
353
- )"
354
412
  ISSUE_BLOCKER_CONTEXT="$(
355
413
  ISSUE_JSON="$ISSUE_JSON" ISSUE_RETRY_STATE="$ISSUE_RETRY_STATE" node <<'EOF'
356
414
  const issue = JSON.parse(process.env.ISSUE_JSON || '{}');
@@ -386,6 +444,41 @@ const blockerComment = [...(issue.comments || [])]
386
444
  ),
387
445
  );
388
446
 
447
// Maps the body of a blocker comment to a retry reason code.
// Returns an empty string when nothing recognizable is found.
const inferCommentReason = (bodyText) => {
  const text = String(bodyText || '');

  // An explicit "Failure reason:" marker is checked first: the reason is the
  // first backtick-quoted span after the marker. The backtick is built from
  // its char code, presumably to keep the literal character out of the
  // surrounding shell heredoc (verify before changing).
  const label = 'Failure reason:';
  const labelAt = text.search(/Failure reason:/i);
  if (labelAt !== -1) {
    const tick = String.fromCharCode(96);
    const segments = text.slice(labelAt + label.length).split(tick);
    const quoted = segments[1];
    if (quoted) {
      return quoted.trim();
    }
  }

  // Otherwise fall back to known blocker headings and phrases, in this order.
  const fallbackRules = [
    [/^# Blocker: Verification requirements were not satisfied$/im, 'verification-guard-blocked'],
    [/^# Blocker: Localization requirements were not satisfied$/im, 'localization-guard-blocked'],
    [/^# Blocker: (All checklist items already completed|Worker produced no publishable delta)$/im, 'no-publishable-commits'],
    [/scope guard/i, 'scope-guard-blocked'],
    [/^# Blocker: Provider quota is currently exhausted$/im, 'provider-quota-limit'],
  ];
  const hit = fallbackRules.find((rule) => rule[0].test(text));
  return hit ? hit[1] : '';
};
476
+
477
+ const effectiveLastReason =
478
+ lastReason && lastReason !== 'issue-worker-blocked'
479
+ ? lastReason
480
+ : inferCommentReason(blockerComment?.body || '') || lastReason;
481
+
389
482
  if (!blockerComment || !blockerComment.body) {
390
483
  const fallbackLines = [
391
484
  '',
@@ -393,13 +486,13 @@ if (!blockerComment || !blockerComment.body) {
393
486
  'This issue is being retried after an `agent-blocked` stop.',
394
487
  '- First resolve the prior blocker instead of repeating the same broad implementation path.',
395
488
  ];
396
- if (lastReason) {
397
- fallbackLines.push(`- Last recorded blocker: \`${lastReason}\`.`);
489
+ if (effectiveLastReason) {
490
+ fallbackLines.push('- Last recorded blocker: `' + effectiveLastReason + '`.');
398
491
  }
399
492
  if (attempts > 0) {
400
- fallbackLines.push(`- Blocked retries so far: ${attempts}.`);
493
+ fallbackLines.push('- Blocked retries so far: ' + attempts + '.');
401
494
  }
402
- if (lastReason === 'scope-guard-blocked' && attempts >= 2) {
495
+ if (effectiveLastReason === 'scope-guard-blocked' && attempts >= 2) {
403
496
  fallbackLines.push(
404
497
  '- This issue has already hit the scope guard multiple times. Do not attempt another broad multi-surface patch.',
405
498
  `- Either ship one focused slice that stays under the scope guard, or create focused follow-up issues with \`bash "$FLOW_TOOLS_DIR/create-follow-up-issue.sh" --parent ${issue.number} --title "..." --body-file /tmp/follow-up.md\` and supersede the umbrella.`,
@@ -420,22 +513,24 @@ const lines = [
420
513
  '- Address the blocker below before attempting a new implementation/publish cycle.',
421
514
  ];
422
515
 
423
- if (lastReason) {
424
- lines.push(`- Last recorded blocker: \`${lastReason}\`.`);
516
+ if (effectiveLastReason) {
517
+ lines.push('- Last recorded blocker: `' + effectiveLastReason + '`.');
425
518
  }
426
519
  if (attempts > 0) {
427
- lines.push(`- Blocked retries so far: ${attempts}.`);
520
+ lines.push('- Blocked retries so far: ' + attempts + '.');
428
521
  }
429
522
  if (nextAttemptAt) {
430
- lines.push(`- Last scheduled retry target was ${nextAttemptAt}.`);
523
+ lines.push('- Last scheduled retry target was ' + nextAttemptAt + '.');
431
524
  }
432
- if (lastReason === 'scope-guard-blocked') {
525
+ if (effectiveLastReason === 'scope-guard-blocked') {
433
526
  lines.push('- Treat this as a scope problem first: narrow to one safe slice or decompose into focused follow-up issues.');
434
527
  if (attempts >= 2) {
435
528
  lines.push(`- Because the scope guard has already fired multiple times, do not retry the same umbrella patch. Use \`bash "$FLOW_TOOLS_DIR/create-follow-up-issue.sh" --parent ${issue.number} --title "..." --body-file /tmp/follow-up.md\` for the remaining slices, then supersede the umbrella if you covered the full decomposition.`);
436
529
  }
437
- } else if (lastReason === 'verification-guard-blocked') {
530
+ } else if (effectiveLastReason === 'verification-guard-blocked') {
438
531
  lines.push('- Add the missing verification or shrink the touched surface before attempting another publish cycle.');
532
+ } else if (effectiveLastReason === 'localization-guard-blocked') {
533
+ lines.push('- Finish moving the remaining user-facing literals behind translation keys before attempting another publish cycle.');
439
534
  }
440
535
 
441
536
  lines.push('', clippedBody);
@@ -547,10 +642,11 @@ open_or_reuse_issue_worktree() {
547
642
  RESIDENT_OPENCLAW_CONFIG_PATH="${current_resident_openclaw_config_path}"
548
643
  RESIDENT_TASK_COUNT="$(( ${TASK_COUNT:-0} + 1 ))"
549
644
  RESIDENT_WORKTREE_REUSED="yes"
550
- if [[ "${CODING_WORKER}" == "openclaw" && -n "${previous_issue_id}" && "${previous_issue_id}" != "${current_issue_id}" ]]; then
645
+ if [[ "${CODING_WORKER}" == "openclaw" ]]; then
551
646
  # Keep the resident lane's warm workspace/agent files, but rotate the
552
- # OpenClaw conversation thread when switching issues to reduce context drift.
553
- RESIDENT_OPENCLAW_SESSION_ID="$(flow_resident_issue_openclaw_session_id "${CONFIG_YAML}" "${current_issue_id}")"
647
+ # OpenClaw conversation thread every cycle so a new task does not inherit
648
+ # stale conversational context from the previous one.
649
+ RESIDENT_OPENCLAW_SESSION_ID="$(flow_resident_issue_openclaw_session_id "${CONFIG_YAML}" "${current_issue_id}" "${RESIDENT_TASK_COUNT}")"
554
650
  fi
555
651
  if reuse_output="$("${WORKSPACE_DIR}/bin/reuse-issue-worktree.sh" "${WORKTREE}" "${ISSUE_ID}" "${ISSUE_SLUG}" 2>&1)"; then
556
652
  WORKTREE_OUT="${reuse_output}"
@@ -558,6 +654,9 @@ open_or_reuse_issue_worktree() {
558
654
  printf 'RESIDENT_REUSE_FALLBACK=issue-%s reason=%s\n' "${ISSUE_ID}" "$(printf '%s' "${reuse_output}" | tr '\n' ' ' | sed 's/ */ /g')" >&2
559
655
  RESIDENT_TASK_COUNT="1"
560
656
  RESIDENT_WORKTREE_REUSED="no"
657
+ if [[ "${CODING_WORKER}" == "openclaw" ]]; then
658
+ RESIDENT_OPENCLAW_SESSION_ID="$(flow_resident_issue_openclaw_session_id "${CONFIG_YAML}" "${current_issue_id}" "${RESIDENT_TASK_COUNT}")"
659
+ fi
561
660
  if [[ "$ISSUE_REQUIRES_LOCAL_WORKSPACE_INSTALL" == "yes" ]]; then
562
661
  WORKTREE_OUT="$(ACP_WORKTREE_LOCAL_INSTALL=true F_LOSNING_WORKTREE_LOCAL_INSTALL=true "${WORKSPACE_DIR}/bin/new-worktree.sh" "$ISSUE_ID" "$ISSUE_SLUG")"
563
662
  else
@@ -567,6 +666,9 @@ open_or_reuse_issue_worktree() {
567
666
  else
568
667
  RESIDENT_TASK_COUNT="1"
569
668
  RESIDENT_WORKTREE_REUSED="no"
669
+ if [[ "${CODING_WORKER}" == "openclaw" ]]; then
670
+ RESIDENT_OPENCLAW_SESSION_ID="$(flow_resident_issue_openclaw_session_id "${CONFIG_YAML}" "${current_issue_id}" "${RESIDENT_TASK_COUNT}")"
671
+ fi
570
672
  if [[ "$ISSUE_REQUIRES_LOCAL_WORKSPACE_INSTALL" == "yes" ]]; then
571
673
  WORKTREE_OUT="$(ACP_WORKTREE_LOCAL_INSTALL=true F_LOSNING_WORKTREE_LOCAL_INSTALL=true "${WORKSPACE_DIR}/bin/new-worktree.sh" "$ISSUE_ID" "$ISSUE_SLUG")"
572
674
  else
@@ -785,6 +785,7 @@ while true; do
785
785
  controller_refresh_execution_context
786
786
  controller_refresh_issue_lane_context "${is_scheduled}" "${schedule_interval_seconds}"
787
787
  controller_track_provider_selection "provider-selection"
788
+ controller_write_state "starting" ""
788
789
 
789
790
  if controller_yield_to_live_lane_peer; then
790
791
  break
@@ -69,6 +69,31 @@ sync_skill_copies() {
69
69
  fi
70
70
  }
71
71
 
72
# Replace untouched copies of the pre-slim issue prompt template in profiles.
#
# For every directory directly under the profile registry root that contains
# a control-plane.yaml, the profile's templates/issue-prompt-template.md is
# overwritten with the current template from FLOW_SKILL_SOURCE, but only when
# it is byte-identical to the legacy pre-slim template. Profile templates
# that differ from the legacy file are left alone.
#
# Returns 0 without doing anything when the registry root, the current
# template, or the legacy template is missing.
refresh_legacy_profile_templates() {
  local registry_root=""
  local shipped_template=""
  local pre_slim_template=""
  local profile_home=""
  local installed_template=""

  registry_root="$(resolve_flow_profile_registry_root)"
  shipped_template="${FLOW_SKILL_SOURCE}/tools/templates/issue-prompt-template.md"
  pre_slim_template="${FLOW_SKILL_SOURCE}/tools/templates/legacy/issue-prompt-template-pre-slim.md"

  if [[ ! -d "${registry_root}" || ! -f "${shipped_template}" || ! -f "${pre_slim_template}" ]]; then
    return 0
  fi

  while IFS= read -r profile_home; do
    [[ -n "${profile_home}" ]] || continue
    installed_template="${profile_home}/templates/issue-prompt-template.md"
    [[ -f "${installed_template}" ]] || continue
    # cmp -s succeeds only for identical files, so edited templates are skipped.
    cmp -s "${installed_template}" "${pre_slim_template}" || continue
    cp "${shipped_template}" "${installed_template}"
  done < <(find "${registry_root}" -mindepth 2 -maxdepth 2 -type f -name 'control-plane.yaml' -exec dirname {} \; 2>/dev/null | sort)
}
96
+
72
97
  remove_repo_local_profile_dirs() {
73
98
  local candidate=""
74
99
 
@@ -210,5 +235,6 @@ fi
210
235
  sync_skill_copies
211
236
  remove_repo_local_profile_dirs
212
237
  normalize_script_permissions
238
+ refresh_legacy_profile_templates
213
239
 
214
240
  printf 'SHARED_AGENT_HOME=%s\n' "${TARGET_HOME}"
@@ -58,7 +58,12 @@ run_step() {
58
58
  return "${status}"
59
59
  }
60
60
 
61
- run_step "check-skill-contracts" bash "${check_contracts_script}"
61
+ if [[ -f "${check_contracts_script}" ]]; then
62
+ run_step "check-skill-contracts" bash "${check_contracts_script}"
63
+ else
64
+ printf 'SMOKE_STEP=%s\n' "check-skill-contracts"
65
+ printf 'SMOKE_STEP_STATUS=%s\n' "skipped"
66
+ fi
62
67
 
63
68
  run_profile_smoke_fixture() (
64
69
  set -euo pipefail
@@ -2,6 +2,8 @@ const refreshButton = document.querySelector("#refresh-button");
2
2
  const generatedAtNode = document.querySelector("#generated-at");
3
3
  const overviewNode = document.querySelector("#overview");
4
4
  const profilesNode = document.querySelector("#profiles");
5
+ const seenAlertIds = new Set();
6
+ let notificationPermissionRequested = false;
5
7
 
6
8
  function relativeTime(input) {
7
9
  if (!input) return "n/a";
@@ -45,7 +47,7 @@ function renderResult(row) {
45
47
 
46
48
  function renderControllerState(row) {
47
49
  const state = row.state || "n/a";
48
- const stale = state !== "stopped" && row.controller_live === false;
50
+ const stale = row.controller_stale === true || (state !== "stopped" && row.controller_live === false);
49
51
  const label = stale ? `${state} (stale)` : state;
50
52
  return `<span class="status-pill ${statusClass(stale ? "stale" : state)}">${label}</span>`;
51
53
  }
@@ -61,9 +63,10 @@ function renderOverview(snapshot) {
61
63
  acc.controllers += profile.counts.live_resident_controllers;
62
64
  acc.cooldowns += profile.counts.provider_cooldowns;
63
65
  acc.queue += profile.counts.queued_issues;
66
+ acc.alerts += profile.counts.alerts || 0;
64
67
  return acc;
65
68
  },
66
- { activeRuns: 0, runningRuns: 0, implementedRuns: 0, reportedRuns: 0, blockedRuns: 0, controllers: 0, cooldowns: 0, queue: 0 },
69
+ { activeRuns: 0, runningRuns: 0, implementedRuns: 0, reportedRuns: 0, blockedRuns: 0, controllers: 0, cooldowns: 0, queue: 0, alerts: 0 },
67
70
  );
68
71
 
69
72
  overviewNode.innerHTML = [
@@ -75,6 +78,7 @@ function renderOverview(snapshot) {
75
78
  ["Blocked", totals.blockedRuns],
76
79
  ["Live Controllers", totals.controllers],
77
80
  ["Provider Cooldowns", totals.cooldowns],
81
+ ["Alerts", totals.alerts],
78
82
  ["Queued Issues", totals.queue],
79
83
  ]
80
84
  .map(
@@ -104,6 +108,36 @@ function renderTable(columns, rows, emptyMessage = "No data right now.") {
104
108
  return `<div class="table-wrap"><table><thead><tr>${headers}</tr></thead><tbody>${body}</tbody></table></div>`;
105
109
  }
106
110
 
111
/**
 * Render the alert cards for one profile as an HTML string.
 *
 * Alert fields carry text lifted from run logs and comment artifacts, so every
 * field is HTML-escaped before it is placed into markup that the caller
 * assigns to innerHTML.
 *
 * @param {Array<Object>} alerts Alert records (title, message, kind, severity,
 *   session, task_kind, task_id, reset_at, updated_at). A missing or
 *   non-array value is treated as an empty list.
 * @returns {string} Markup for the alert list, or an empty-state placeholder.
 */
function renderAlerts(alerts) {
  const items = Array.isArray(alerts) ? alerts : [];
  if (!items.length) {
    return `<div class="empty-state">No active alerts for this profile.</div>`;
  }

  // Neutralize the characters that are significant in HTML text and
  // attribute values; null and undefined render as an empty string.
  const escapeHtml = (value) =>
    String(value === null || value === undefined ? "" : value)
      .replace(/&/g, "&amp;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;")
      .replace(/"/g, "&quot;")
      .replace(/'/g, "&#39;");

  const cards = items.map((alert) => {
    const resetText = alert.reset_at ? `Reset: ${escapeHtml(alert.reset_at)}` : "Reset: n/a";
    const updatedText = alert.updated_at
      ? `${relativeTime(alert.updated_at)} · ${escapeHtml(alert.updated_at)}`
      : "updated n/a";
    return `
      <article class="alert-card ${statusClass(alert.severity || "warn")}">
        <div class="alert-header">
          <div>
            <h4>${escapeHtml(alert.title)}</h4>
            <div class="muted mono">${escapeHtml(alert.session || "n/a")} · ${escapeHtml(alert.task_kind || "task")} ${escapeHtml(alert.task_id || "")}</div>
          </div>
          <span class="badge warn">${escapeHtml(alert.kind)}</span>
        </div>
        <p>${escapeHtml(alert.message)}</p>
        <div class="alert-meta">
          <span>${resetText}</span>
          <span>${updatedText}</span>
        </div>
      </article>
    `;
  });

  return `
    <div class="alert-list">
      ${cards.join("")}
    </div>
  `;
}
140
+
107
141
  function renderProfile(profile) {
108
142
  const providerBadges = [
109
143
  profile.coding_worker ? `<span class="badge good">${profile.coding_worker}</span>` : "",
@@ -123,7 +157,10 @@ function renderProfile(profile) {
123
157
  ["Reported", profile.counts.reported_runs],
124
158
  ["Blocked", profile.counts.blocked_runs],
125
159
  ["Live controllers", profile.counts.live_resident_controllers],
160
+ ["Stale controllers", profile.counts.stale_resident_controllers],
126
161
  ["Provider cooldowns", profile.counts.provider_cooldowns],
162
+ ["Alerts", profile.counts.alerts || 0],
163
+ ["Issue retries", profile.counts.active_retries],
127
164
  ["Queued issues", profile.counts.queued_issues],
128
165
  ["Scheduled", profile.counts.scheduled_issues],
129
166
  ]
@@ -165,6 +202,18 @@ function renderProfile(profile) {
165
202
  "No resident controllers recorded for this profile.",
166
203
  );
167
204
 
205
+ const retryTable = renderTable(
206
+ [
207
+ { label: "Issue", key: "issue_id" },
208
+ { label: "Status", render: (row) => `<span class="status-pill ${row.ready ? "" : "waiting-provider"}">${row.ready ? "ready" : "retrying"}</span>` },
209
+ { label: "Reason", render: (row) => row.last_reason || "n/a" },
210
+ { label: "Attempts", key: "attempts" },
211
+ { label: "Next attempt", render: (row) => row.next_attempt_at ? `${relativeTime(row.next_attempt_at)}<div class="muted">${row.next_attempt_at}</div>` : "n/a" },
212
+ ],
213
+ profile.issue_retries || [],
214
+ "No issue retries recorded.",
215
+ );
216
+
168
217
  const workerTable = renderTable(
169
218
  [
170
219
  { label: "Key", render: (row) => `<div class="mono">${row.key}</div>` },
@@ -226,6 +275,11 @@ function renderProfile(profile) {
226
275
  </header>
227
276
  <section class="overview">${summaryCards}</section>
228
277
  <section class="profile-grid">
278
+ <section class="panel">
279
+ <h3>Host Alerts</h3>
280
+ <p class="panel-subtitle">High-signal operational blockers surfaced from active run logs and comment artifacts.</p>
281
+ ${renderAlerts(profile.alerts || [])}
282
+ </section>
229
283
  <section class="panel">
230
284
  <h3>Active Runs</h3>
231
285
  <p class="panel-subtitle">Lifecycle shows technical session completion. Result shows what the run achieved: implemented, reported, or blocked.</p>
@@ -233,9 +287,13 @@ function renderProfile(profile) {
233
287
  </section>
234
288
  <section class="panel">
235
289
  <h3>Resident Controllers</h3>
236
- <p class="panel-subtitle">Includes provider wait and failover telemetry.</p>
290
+ <p class="panel-subtitle">Includes provider wait and failover telemetry. Stale controllers show a warning.</p>
237
291
  ${controllerTable}
238
292
  </section>
293
+ <section class="panel half">
294
+ <h3>Issue Retries</h3>
295
+ ${retryTable}
296
+ </section>
239
297
  <section class="panel">
240
298
  <h3>Resident Worker Metadata</h3>
241
299
  ${workerTable}
@@ -257,6 +315,35 @@ function renderProfile(profile) {
257
315
  `;
258
316
  }
259
317
 
318
/**
 * Raise a browser notification for each alert not yet seen in this page
 * session.
 *
 * Permission is requested at most once per page load. Alerts are tracked by
 * id in seenAlertIds so a refresh does not notify again. Notifications are
 * best-effort: any failure to request permission or to construct a
 * notification is swallowed, so the caller's snapshot rendering is not
 * reported as failed because of it.
 *
 * @param {Object} snapshot Dashboard snapshot; only its `alerts` array is read.
 * @returns {Promise<void>}
 */
async function maybeNotifyAlerts(snapshot) {
  const alerts = ((snapshot && snapshot.alerts) || []).filter((alert) => alert && alert.id);
  if (!alerts.length || typeof window.Notification === "undefined") return;

  if (window.Notification.permission === "default" && !notificationPermissionRequested) {
    notificationPermissionRequested = true;
    try {
      await window.Notification.requestPermission();
    } catch (_error) {
      return;
    }
  }

  if (window.Notification.permission !== "granted") return;

  for (const alert of alerts) {
    if (seenAlertIds.has(alert.id)) continue;
    seenAlertIds.add(alert.id);
    const bodyParts = [];
    if (alert.session) bodyParts.push(alert.session);
    if (alert.reset_at) bodyParts.push(`reset ${alert.reset_at}`);
    if (alert.message) bodyParts.push(alert.message);
    try {
      new window.Notification(alert.title || "ACP alert", {
        body: bodyParts.join(" · ").slice(0, 240),
        tag: alert.id,
      });
    } catch (_error) {
      // The constructor can throw in some environments even when permission
      // is granted. Stop here rather than let the error reach the caller,
      // which would replace the rendered dashboard with an error message.
      return;
    }
  }
}
346
+
260
347
  async function loadSnapshot() {
261
348
  refreshButton.disabled = true;
262
349
  try {
@@ -268,6 +355,7 @@ async function loadSnapshot() {
268
355
  generatedAtNode.textContent = `Snapshot: ${snapshot.generated_at}`;
269
356
  renderOverview(snapshot);
270
357
  profilesNode.innerHTML = snapshot.profiles.map(renderProfile).join("");
358
+ await maybeNotifyAlerts(snapshot);
271
359
  } catch (error) {
272
360
  generatedAtNode.textContent = `Snapshot load failed: ${error.message}`;
273
361
  profilesNode.innerHTML = `<article class="profile"><div class="empty-state">${error.message}</div></article>`;
@@ -143,6 +143,19 @@ def file_mtime_iso(path: Path) -> str:
143
143
  return datetime.fromtimestamp(path.stat().st_mtime, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
144
144
 
145
145
 
146
def read_tail_text(path: Path, max_bytes: int = 65536) -> str:
    """Return at most the last ``max_bytes`` bytes of ``path`` as text.

    The size is measured on the open handle and the read itself is capped, so
    a file that is still being appended to cannot yield more than
    ``max_bytes``.

    Args:
        path: File to read.
        max_bytes: Upper bound on the number of trailing bytes read. Values
            of zero or less yield an empty string.

    Returns:
        The tail decoded as UTF-8 with undecodable bytes replaced (the cut can
        land inside a multi-byte character). An empty string when ``path`` is
        not a regular file or cannot be read.
    """
    import os  # function-scope import: the module import block is not in view

    if max_bytes <= 0 or not path.is_file():
        return ""
    try:
        with path.open("rb") as handle:
            size = handle.seek(0, os.SEEK_END)
            handle.seek(max(size - max_bytes, 0))
            data = handle.read(max_bytes)
    except OSError:
        return ""
    return data.decode("utf-8", errors="replace")
157
+
158
+
146
159
  def classify_run_result(status: str, outcome: str, failure_reason: str) -> tuple[str, str]:
147
160
  normalized_status = (status or "").strip().upper()
148
161
  normalized_outcome = (outcome or "").strip()
@@ -167,6 +180,59 @@ def classify_run_result(status: str, outcome: str, failure_reason: str) -> tuple
167
180
  return ("unknown", normalized_status or "Unknown")
168
181
 
169
182
 
183
# Case-insensitive patterns that recognize a GitHub rate-limit message on a
# single line. Each match runs from the trigger phrase to the end of that
# line. The optional ``reset`` group never captures, because the greedy
# ``[^\n]*`` before it already consumes the rest of the line; any reset text
# has to be pulled out of the whole match instead.
GITHUB_RATE_LIMIT_PATTERNS = [
    re.compile(expression, flags=re.IGNORECASE)
    for expression in (
        r"GitHub core API[^\n]*?rate limit[^\n]*(?:reset(?:s| into)?(?: at)?\s+(?P<reset>[^.\n]+))?",
        r"gh:\s*API rate limit exceeded[^\n]*(?:reset(?:s| into)?(?: at)?\s+(?P<reset>[^.\n]+))?",
    )
]
193
+
194
+
195
def summarize_whitespace(text: str) -> str:
    """Collapse every run of whitespace in ``text`` to one space and trim the ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
197
+
198
+
199
def extract_github_rate_limit_alert(run_dir: Path, run: dict[str, Any]) -> dict[str, Any] | None:
    """Build an alert if a run's artifacts mention a GitHub rate limit.

    The tails of ``issue-comment.md``, ``pr-comment.md`` and ``<session>.log``
    inside ``run_dir`` are scanned in that order against
    ``GITHUB_RATE_LIMIT_PATTERNS``; the first match in the first matching
    file wins.

    Args:
        run_dir: Directory holding the run's artifacts.
        run: Run record. ``session``, ``task_kind``, ``task_id`` and
            ``updated_at`` are read when present; a missing ``session`` is
            treated as an empty string instead of raising ``KeyError``.

    Returns:
        The alert dict, or ``None`` when no candidate file matches.
    """
    session = run.get("session", "")
    candidate_files = (
        run_dir / "issue-comment.md",
        run_dir / "pr-comment.md",
        run_dir / f"{session}.log",
    )
    for path in candidate_files:
        text = read_tail_text(path)
        if not text:
            continue
        for pattern in GITHUB_RATE_LIMIT_PATTERNS:
            match = pattern.search(text)
            if match is None:
                continue
            summary = summarize_whitespace(match.group(0))
            # The reset time is taken from the matched line itself; the
            # pattern's own ``reset`` group is only a fallback.
            reset_match = re.search(r"reset(?:s| into)?(?: at)?\s+([^.\n]+)", summary, re.IGNORECASE)
            reset_at = summarize_whitespace(
                (reset_match.group(1) if reset_match else "")
                or match.groupdict().get("reset")
                or ""
            )
            if not summary:
                summary = "GitHub core API rate limit is blocking host-side actions."
            if reset_at and reset_at not in summary:
                summary = f"{summary} Reset: {reset_at}."
            return {
                # The id embeds the reset text (or the source file name), so a
                # different reset value yields a different id.
                "id": f"github-core-rate-limit:{session}:{reset_at or path.name}",
                "kind": "github-core-rate-limit",
                "severity": "warn",
                "title": "GitHub core API rate limit blocks host actions",
                "message": summary,
                "session": session,
                "task_kind": run.get("task_kind", ""),
                "task_id": run.get("task_id", ""),
                "reset_at": reset_at,
                "updated_at": run.get("updated_at", "") or file_mtime_iso(path),
                "source_file": str(path),
            }
    return None
234
+
235
+
170
236
  def collect_runs(runs_root: Path) -> list[dict[str, Any]]:
171
237
  if not runs_root.is_dir():
172
238
  return []
@@ -221,10 +287,26 @@ def collect_runs(runs_root: Path) -> list[dict[str, Any]]:
221
287
  "provider_pool_name": run_env.get("ACTIVE_PROVIDER_POOL_NAME", ""),
222
288
  "run_dir": str(run_dir),
223
289
  }
290
+ alert = extract_github_rate_limit_alert(run_dir, item)
291
+ item["alerts"] = [alert] if alert else []
224
292
  runs.append(item)
225
293
  return runs
226
294
 
227
295
 
296
def controller_is_stale(
    env: dict[str, str],
    controller_path: Path,
    stale_after_seconds: float = 600,
) -> bool:
    """Report whether a controller record looks abandoned.

    Args:
        env: Parsed controller state; ``CONTROLLER_STATE`` and
            ``CONTROLLER_PID`` are read.
        controller_path: The controller's state file, whose modification time
            is used as the last-update time.
        stale_after_seconds: Maximum state-file age in seconds before a
            controller with a live PID is considered stale.

    Returns:
        ``False`` when the state is ``stopped`` or empty. Otherwise ``True``
        when ``pid_alive`` reports the recorded PID as not alive, or when the
        state file is older than ``stale_after_seconds``. ``False`` when the
        file cannot be stat'ed.
    """
    if env.get("CONTROLLER_STATE", "") in {"stopped", ""}:
        return False
    if not pid_alive(env.get("CONTROLLER_PID", "")):
        return True
    try:
        modified_at = controller_path.stat().st_mtime
    except OSError:
        # The state file vanished or is unreadable: its age is unknown, so the
        # controller is not flagged.
        return False
    age_seconds = datetime.now(timezone.utc).timestamp() - modified_at
    return age_seconds > stale_after_seconds
308
+
309
+
228
310
  def collect_resident_controllers(state_root: Path) -> list[dict[str, Any]]:
229
311
  controllers_root = state_root / "resident-workers" / "issues"
230
312
  if not controllers_root.is_dir():
@@ -240,6 +322,7 @@ def collect_resident_controllers(state_root: Path) -> list[dict[str, Any]]:
240
322
  "session": env.get("SESSION", ""),
241
323
  "controller_pid": controller_pid,
242
324
  "controller_live": pid_alive(controller_pid),
325
+ "controller_stale": controller_is_stale(env, path),
243
326
  "mode": env.get("CONTROLLER_MODE", ""),
244
327
  "loop_count": safe_int(env.get("CONTROLLER_LOOP_COUNT")),
245
328
  "state": env.get("CONTROLLER_STATE", ""),
@@ -346,6 +429,32 @@ def collect_scheduled_issues(state_root: Path) -> list[dict[str, Any]]:
346
429
  return items
347
430
 
348
431
 
432
def collect_issue_retries(state_root: Path) -> list[dict[str, Any]]:
    """Return one retry/backoff record per ``*.env`` file under ``retries/issues``.

    Records are ordered by state-file modification time, newest first. The
    issue id is the file name without its suffix. A record is ``ready`` unless
    its ``NEXT_ATTEMPT_EPOCH`` is set and still lies in the future.

    Args:
        state_root: Profile state directory that may contain ``retries/issues``.

    Returns:
        The retry records, or an empty list when the directory does not exist.
    """
    retries_dir = state_root / "retries" / "issues"
    if not retries_dir.is_dir():
        return []

    current_epoch = int(datetime.now(timezone.utc).timestamp())
    env_paths = sorted(
        retries_dir.glob("*.env"),
        key=lambda candidate: candidate.stat().st_mtime,
        reverse=True,
    )

    records: list[dict[str, Any]] = []
    for env_path in env_paths:
        values = read_env_file(env_path)
        next_epoch = safe_int(values.get("NEXT_ATTEMPT_EPOCH"))
        still_waiting = bool(next_epoch and next_epoch > current_epoch)
        record = {
            "issue_id": env_path.stem,
            "attempts": safe_int(values.get("ATTEMPTS")) or 0,
            "next_attempt_epoch": next_epoch,
            "next_attempt_at": values.get("NEXT_ATTEMPT_AT", ""),
            "last_reason": values.get("LAST_REASON", ""),
            "updated_at": values.get("UPDATED_AT", "") or file_mtime_iso(env_path),
            "ready": not still_waiting,
            "state_file": str(env_path),
        }
        records.append(record)
    return records
456
+
457
+
349
458
  def collect_issue_queue(state_root: Path) -> dict[str, list[dict[str, Any]]]:
350
459
  queue_root = state_root / "resident-workers" / "issue-queue"
351
460
  pending_root = queue_root / "pending"
@@ -387,7 +496,9 @@ def build_profile_snapshot(profile_id: str, registry_root: Path) -> dict[str, An
387
496
  resident_workers = collect_resident_workers(state_root)
388
497
  cooldowns = collect_provider_cooldowns(state_root)
389
498
  scheduled = collect_scheduled_issues(state_root)
499
+ retries = collect_issue_retries(state_root)
390
500
  queue = collect_issue_queue(state_root)
501
+ alerts = [alert for run in runs for alert in run.get("alerts", [])]
391
502
 
392
503
  return {
393
504
  "id": profile_id,
@@ -422,17 +533,22 @@ def build_profile_snapshot(profile_id: str, registry_root: Path) -> dict[str, An
422
533
  ),
423
534
  "resident_controllers": len(controllers),
424
535
  "live_resident_controllers": sum(1 for item in controllers if item["state"] != "stopped" and item["controller_live"]),
536
+ "stale_resident_controllers": sum(1 for item in controllers if item.get("controller_stale", False)),
425
537
  "resident_workers": len(resident_workers),
426
538
  "queued_issues": len(queue["pending"]),
427
539
  "claimed_issues": len(queue["claims"]),
428
540
  "provider_cooldowns": sum(1 for item in cooldowns if item["active"]),
541
+ "active_retries": sum(1 for item in retries if not item.get("ready", True)),
429
542
  "scheduled_issues": len(scheduled),
543
+ "alerts": len(alerts),
430
544
  },
431
545
  "runs": runs,
546
+ "alerts": alerts,
432
547
  "resident_controllers": controllers,
433
548
  "resident_workers": resident_workers,
434
549
  "provider_cooldowns": cooldowns,
435
550
  "scheduled_issues": scheduled,
551
+ "issue_retries": retries,
436
552
  "issue_queue": queue,
437
553
  }
438
554
 
@@ -440,11 +556,14 @@ def build_profile_snapshot(profile_id: str, registry_root: Path) -> dict[str, An
440
556
  def build_snapshot() -> dict[str, Any]:
441
557
  registry_root = profile_registry_root()
442
558
  profiles = [build_profile_snapshot(profile_id, registry_root) for profile_id in list_profile_ids(registry_root)]
559
+ alerts = [alert for profile in profiles for alert in profile.get("alerts", [])]
443
560
  return {
444
561
  "generated_at": utc_now_iso(),
445
562
  "flow_skill_dir": str(ROOT_DIR),
446
563
  "profile_registry_root": str(registry_root),
447
564
  "profile_count": len(profiles),
565
+ "alert_count": len(alerts),
566
+ "alerts": alerts,
448
567
  "profiles": profiles,
449
568
  }
450
569