agent-control-plane 0.1.8 → 0.1.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/pr-risk.sh +54 -10
- package/hooks/heartbeat-hooks.sh +166 -13
- package/package.json +8 -2
- package/references/commands.md +1 -0
- package/tools/bin/agent-project-cleanup-session +143 -2
- package/tools/bin/agent-project-heartbeat-loop +29 -2
- package/tools/bin/agent-project-publish-issue-pr +178 -62
- package/tools/bin/agent-project-reconcile-issue-session +230 -5
- package/tools/bin/agent-project-reconcile-pr-session +104 -13
- package/tools/bin/agent-project-run-claude-session +19 -1
- package/tools/bin/agent-project-run-codex-resilient +121 -16
- package/tools/bin/agent-project-run-codex-session +61 -11
- package/tools/bin/agent-project-run-openclaw-session +274 -7
- package/tools/bin/agent-project-sync-anchor-repo +13 -2
- package/tools/bin/agent-project-worker-status +19 -14
- package/tools/bin/cleanup-worktree.sh +4 -1
- package/tools/bin/dashboard-launchd-bootstrap.sh +16 -4
- package/tools/bin/ensure-runtime-sync.sh +182 -0
- package/tools/bin/flow-config-lib.sh +76 -30
- package/tools/bin/flow-resident-worker-lib.sh +28 -2
- package/tools/bin/flow-shell-lib.sh +28 -8
- package/tools/bin/heartbeat-safe-auto.sh +32 -0
- package/tools/bin/issue-publish-localization-guard.sh +142 -0
- package/tools/bin/prepare-worktree.sh +3 -1
- package/tools/bin/project-launchd-bootstrap.sh +17 -4
- package/tools/bin/project-runtime-supervisor.sh +7 -1
- package/tools/bin/project-runtimectl.sh +78 -15
- package/tools/bin/provider-cooldown-state.sh +1 -1
- package/tools/bin/render-flow-config.sh +16 -1
- package/tools/bin/reuse-issue-worktree.sh +46 -0
- package/tools/bin/run-codex-task.sh +2 -2
- package/tools/bin/scaffold-profile.sh +2 -2
- package/tools/bin/start-issue-worker.sh +118 -16
- package/tools/bin/start-resident-issue-loop.sh +1 -0
- package/tools/bin/sync-shared-agent-home.sh +26 -0
- package/tools/bin/test-smoke.sh +6 -1
- package/tools/dashboard/app.js +91 -3
- package/tools/dashboard/dashboard_snapshot.py +119 -0
- package/tools/dashboard/styles.css +43 -0
- package/tools/templates/issue-prompt-template.md +18 -66
- package/tools/templates/legacy/issue-prompt-template-pre-slim.md +109 -0
- package/bin/audit-issue-routing.sh +0 -74
- package/tools/bin/audit-agent-worktrees.sh +0 -310
- package/tools/bin/audit-issue-routing.sh +0 -11
- package/tools/bin/audit-retained-layout.sh +0 -58
- package/tools/bin/audit-retained-overlap.sh +0 -135
- package/tools/bin/audit-retained-worktrees.sh +0 -228
- package/tools/bin/check-skill-contracts.sh +0 -324
|
@@ -153,6 +153,54 @@ reap_stale_run_dir() {
|
|
|
153
153
|
mv "$RUN_DIR" "${HISTORY_ROOT}/${SESSION}-stale-$(date +%Y%m%d-%H%M%S)"
|
|
154
154
|
}
|
|
155
155
|
|
|
156
|
+
find_archived_issue_session_dir() {
|
|
157
|
+
local root="${1:-}"
|
|
158
|
+
local target_session="${2:-}"
|
|
159
|
+
[[ -n "$root" && -d "$root" && -n "$target_session" ]] || return 1
|
|
160
|
+
|
|
161
|
+
find "$root" -mindepth 1 -maxdepth 1 -type d -name "${target_session}-*" ! -name "${target_session}-stale-*" 2>/dev/null \
|
|
162
|
+
| sort -r \
|
|
163
|
+
| head -n 1
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
issue_retry_state_value() {
|
|
167
|
+
local key="${1:?retry-state key required}"
|
|
168
|
+
awk -F= -v target_key="$key" '$1 == target_key { print substr($0, index($0, "=") + 1); exit }' <<<"${ISSUE_RETRY_STATE:-}"
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
issue_host_publish_replay_dir() {
|
|
172
|
+
local last_reason=""
|
|
173
|
+
local archived_dir=""
|
|
174
|
+
local runner_state=""
|
|
175
|
+
local result_outcome=""
|
|
176
|
+
local result_action=""
|
|
177
|
+
|
|
178
|
+
last_reason="$(issue_retry_state_value LAST_REASON)"
|
|
179
|
+
case "${last_reason}" in
|
|
180
|
+
host-publish-failed|issue-worker-blocked) ;;
|
|
181
|
+
*) return 1 ;;
|
|
182
|
+
esac
|
|
183
|
+
|
|
184
|
+
archived_dir="$(find_archived_issue_session_dir "$HISTORY_ROOT" "$SESSION" || true)"
|
|
185
|
+
[[ -n "${archived_dir}" && -f "${archived_dir}/run.env" && -f "${archived_dir}/runner.env" && -f "${archived_dir}/result.env" ]] || return 1
|
|
186
|
+
|
|
187
|
+
runner_state="$(awk -F= '/^RUNNER_STATE=/{print $2; exit}' "${archived_dir}/runner.env")"
|
|
188
|
+
result_outcome="$(awk -F= '/^OUTCOME=/{print $2; exit}' "${archived_dir}/result.env")"
|
|
189
|
+
result_action="$(awk -F= '/^ACTION=/{print $2; exit}' "${archived_dir}/result.env")"
|
|
190
|
+
|
|
191
|
+
[[ "${runner_state}" == "succeeded" ]] || return 1
|
|
192
|
+
[[ "${result_outcome}" == "implemented" ]] || return 1
|
|
193
|
+
[[ "${result_action}" == "host-publish-issue-pr" ]] || return 1
|
|
194
|
+
|
|
195
|
+
printf '%s\n' "${archived_dir}"
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
replay_issue_host_publish_retry() {
|
|
199
|
+
local archived_dir="${1:?archived dir required}"
|
|
200
|
+
printf 'ISSUE_HOST_PUBLISH_REPLAY=session=%s archived_run_dir=%s\n' "${SESSION}" "${archived_dir}" >&2
|
|
201
|
+
bash "${WORKSPACE_DIR}/bin/reconcile-issue-worker.sh" "${SESSION}"
|
|
202
|
+
}
|
|
203
|
+
|
|
156
204
|
if tmux has-session -t "$SESSION" 2>/dev/null; then
|
|
157
205
|
echo "worker session already exists: $SESSION" >&2
|
|
158
206
|
exit 1
|
|
@@ -180,6 +228,9 @@ EOF
|
|
|
180
228
|
ISSUE_REQUIRES_LOCAL_WORKSPACE_INSTALL="$(
|
|
181
229
|
ISSUE_BODY="$ISSUE_BODY" bash "$LOCAL_INSTALL_POLICY_BIN"
|
|
182
230
|
)"
|
|
231
|
+
ISSUE_RETRY_STATE="$(
|
|
232
|
+
bash "${WORKSPACE_DIR}/bin/retry-state.sh" issue "$ISSUE_ID" get 2>/dev/null || true
|
|
233
|
+
)"
|
|
183
234
|
if [[ "${ISSUE_SCHEDULE_INTERVAL_SECONDS}" =~ ^[1-9][0-9]*$ ]]; then
|
|
184
235
|
TEMPLATE_FILE="${SCHEDULED_TEMPLATE_FILE}"
|
|
185
236
|
fi
|
|
@@ -227,6 +278,16 @@ if [[ -d "$RUN_DIR" ]]; then
|
|
|
227
278
|
reap_stale_run_dir
|
|
228
279
|
fi
|
|
229
280
|
|
|
281
|
+
ISSUE_HOST_PUBLISH_REPLAY_DIR="$(issue_host_publish_replay_dir || true)"
|
|
282
|
+
if [[ -n "${ISSUE_HOST_PUBLISH_REPLAY_DIR}" ]]; then
|
|
283
|
+
if ! replay_issue_host_publish_retry "${ISSUE_HOST_PUBLISH_REPLAY_DIR}"; then
|
|
284
|
+
echo "host publish replay failed for session ${SESSION}" >&2
|
|
285
|
+
exit 1
|
|
286
|
+
fi
|
|
287
|
+
launch_success="yes"
|
|
288
|
+
exit 0
|
|
289
|
+
fi
|
|
290
|
+
|
|
230
291
|
block_if_recurring_checklist_complete
|
|
231
292
|
|
|
232
293
|
mkdir -p "$RUN_DIR"
|
|
@@ -348,9 +409,6 @@ if (completedPrs.length > 0) {
|
|
|
348
409
|
process.stdout.write(`${lines.join('\n')}\n`);
|
|
349
410
|
EOF
|
|
350
411
|
ISSUE_RECURRING_CONTEXT="$(cat "$ISSUE_RECURRING_CONTEXT_FILE")"
|
|
351
|
-
ISSUE_RETRY_STATE="$(
|
|
352
|
-
bash "${WORKSPACE_DIR}/bin/retry-state.sh" issue "$ISSUE_ID" get 2>/dev/null || true
|
|
353
|
-
)"
|
|
354
412
|
ISSUE_BLOCKER_CONTEXT="$(
|
|
355
413
|
ISSUE_JSON="$ISSUE_JSON" ISSUE_RETRY_STATE="$ISSUE_RETRY_STATE" node <<'EOF'
|
|
356
414
|
const issue = JSON.parse(process.env.ISSUE_JSON || '{}');
|
|
@@ -386,6 +444,41 @@ const blockerComment = [...(issue.comments || [])]
|
|
|
386
444
|
),
|
|
387
445
|
);
|
|
388
446
|
|
|
447
|
+
const inferCommentReason = (bodyText) => {
|
|
448
|
+
const body = String(bodyText || '');
|
|
449
|
+
const marker = 'Failure reason:';
|
|
450
|
+
const markerIndex = body.search(/Failure reason:/i);
|
|
451
|
+
if (markerIndex !== -1) {
|
|
452
|
+
const backtick = String.fromCharCode(96);
|
|
453
|
+
const tail = body.slice(markerIndex + marker.length);
|
|
454
|
+
const firstQuoted = tail.split(backtick)[1];
|
|
455
|
+
if (firstQuoted) {
|
|
456
|
+
return firstQuoted.trim();
|
|
457
|
+
}
|
|
458
|
+
}
|
|
459
|
+
if (/^# Blocker: Verification requirements were not satisfied$/im.test(body)) {
|
|
460
|
+
return 'verification-guard-blocked';
|
|
461
|
+
}
|
|
462
|
+
if (/^# Blocker: Localization requirements were not satisfied$/im.test(body)) {
|
|
463
|
+
return 'localization-guard-blocked';
|
|
464
|
+
}
|
|
465
|
+
if (/^# Blocker: (All checklist items already completed|Worker produced no publishable delta)$/im.test(body)) {
|
|
466
|
+
return 'no-publishable-commits';
|
|
467
|
+
}
|
|
468
|
+
if (/scope guard/i.test(body)) {
|
|
469
|
+
return 'scope-guard-blocked';
|
|
470
|
+
}
|
|
471
|
+
if (/^# Blocker: Provider quota is currently exhausted$/im.test(body)) {
|
|
472
|
+
return 'provider-quota-limit';
|
|
473
|
+
}
|
|
474
|
+
return '';
|
|
475
|
+
};
|
|
476
|
+
|
|
477
|
+
const effectiveLastReason =
|
|
478
|
+
lastReason && lastReason !== 'issue-worker-blocked'
|
|
479
|
+
? lastReason
|
|
480
|
+
: inferCommentReason(blockerComment?.body || '') || lastReason;
|
|
481
|
+
|
|
389
482
|
if (!blockerComment || !blockerComment.body) {
|
|
390
483
|
const fallbackLines = [
|
|
391
484
|
'',
|
|
@@ -393,13 +486,13 @@ if (!blockerComment || !blockerComment.body) {
|
|
|
393
486
|
'This issue is being retried after an `agent-blocked` stop.',
|
|
394
487
|
'- First resolve the prior blocker instead of repeating the same broad implementation path.',
|
|
395
488
|
];
|
|
396
|
-
if (
|
|
397
|
-
fallbackLines.push(
|
|
489
|
+
if (effectiveLastReason) {
|
|
490
|
+
fallbackLines.push('- Last recorded blocker: `' + effectiveLastReason + '`.');
|
|
398
491
|
}
|
|
399
492
|
if (attempts > 0) {
|
|
400
|
-
fallbackLines.push(
|
|
493
|
+
fallbackLines.push('- Blocked retries so far: ' + attempts + '.');
|
|
401
494
|
}
|
|
402
|
-
if (
|
|
495
|
+
if (effectiveLastReason === 'scope-guard-blocked' && attempts >= 2) {
|
|
403
496
|
fallbackLines.push(
|
|
404
497
|
'- This issue has already hit the scope guard multiple times. Do not attempt another broad multi-surface patch.',
|
|
405
498
|
`- Either ship one focused slice that stays under the scope guard, or create focused follow-up issues with \`bash "$FLOW_TOOLS_DIR/create-follow-up-issue.sh" --parent ${issue.number} --title "..." --body-file /tmp/follow-up.md\` and supersede the umbrella.`,
|
|
@@ -420,22 +513,24 @@ const lines = [
|
|
|
420
513
|
'- Address the blocker below before attempting a new implementation/publish cycle.',
|
|
421
514
|
];
|
|
422
515
|
|
|
423
|
-
if (
|
|
424
|
-
lines.push(
|
|
516
|
+
if (effectiveLastReason) {
|
|
517
|
+
lines.push('- Last recorded blocker: `' + effectiveLastReason + '`.');
|
|
425
518
|
}
|
|
426
519
|
if (attempts > 0) {
|
|
427
|
-
lines.push(
|
|
520
|
+
lines.push('- Blocked retries so far: ' + attempts + '.');
|
|
428
521
|
}
|
|
429
522
|
if (nextAttemptAt) {
|
|
430
|
-
lines.push(
|
|
523
|
+
lines.push('- Last scheduled retry target was ' + nextAttemptAt + '.');
|
|
431
524
|
}
|
|
432
|
-
if (
|
|
525
|
+
if (effectiveLastReason === 'scope-guard-blocked') {
|
|
433
526
|
lines.push('- Treat this as a scope problem first: narrow to one safe slice or decompose into focused follow-up issues.');
|
|
434
527
|
if (attempts >= 2) {
|
|
435
528
|
lines.push(`- Because the scope guard has already fired multiple times, do not retry the same umbrella patch. Use \`bash "$FLOW_TOOLS_DIR/create-follow-up-issue.sh" --parent ${issue.number} --title "..." --body-file /tmp/follow-up.md\` for the remaining slices, then supersede the umbrella if you covered the full decomposition.`);
|
|
436
529
|
}
|
|
437
|
-
} else if (
|
|
530
|
+
} else if (effectiveLastReason === 'verification-guard-blocked') {
|
|
438
531
|
lines.push('- Add the missing verification or shrink the touched surface before attempting another publish cycle.');
|
|
532
|
+
} else if (effectiveLastReason === 'localization-guard-blocked') {
|
|
533
|
+
lines.push('- Finish moving the remaining user-facing literals behind translation keys before attempting another publish cycle.');
|
|
439
534
|
}
|
|
440
535
|
|
|
441
536
|
lines.push('', clippedBody);
|
|
@@ -547,10 +642,11 @@ open_or_reuse_issue_worktree() {
|
|
|
547
642
|
RESIDENT_OPENCLAW_CONFIG_PATH="${current_resident_openclaw_config_path}"
|
|
548
643
|
RESIDENT_TASK_COUNT="$(( ${TASK_COUNT:-0} + 1 ))"
|
|
549
644
|
RESIDENT_WORKTREE_REUSED="yes"
|
|
550
|
-
if [[ "${CODING_WORKER}" == "openclaw"
|
|
645
|
+
if [[ "${CODING_WORKER}" == "openclaw" ]]; then
|
|
551
646
|
# Keep the resident lane's warm workspace/agent files, but rotate the
|
|
552
|
-
# OpenClaw conversation thread
|
|
553
|
-
|
|
647
|
+
# OpenClaw conversation thread every cycle so a new task does not inherit
|
|
648
|
+
# stale conversational context from the previous one.
|
|
649
|
+
RESIDENT_OPENCLAW_SESSION_ID="$(flow_resident_issue_openclaw_session_id "${CONFIG_YAML}" "${current_issue_id}" "${RESIDENT_TASK_COUNT}")"
|
|
554
650
|
fi
|
|
555
651
|
if reuse_output="$("${WORKSPACE_DIR}/bin/reuse-issue-worktree.sh" "${WORKTREE}" "${ISSUE_ID}" "${ISSUE_SLUG}" 2>&1)"; then
|
|
556
652
|
WORKTREE_OUT="${reuse_output}"
|
|
@@ -558,6 +654,9 @@ open_or_reuse_issue_worktree() {
|
|
|
558
654
|
printf 'RESIDENT_REUSE_FALLBACK=issue-%s reason=%s\n' "${ISSUE_ID}" "$(printf '%s' "${reuse_output}" | tr '\n' ' ' | sed 's/ */ /g')" >&2
|
|
559
655
|
RESIDENT_TASK_COUNT="1"
|
|
560
656
|
RESIDENT_WORKTREE_REUSED="no"
|
|
657
|
+
if [[ "${CODING_WORKER}" == "openclaw" ]]; then
|
|
658
|
+
RESIDENT_OPENCLAW_SESSION_ID="$(flow_resident_issue_openclaw_session_id "${CONFIG_YAML}" "${current_issue_id}" "${RESIDENT_TASK_COUNT}")"
|
|
659
|
+
fi
|
|
561
660
|
if [[ "$ISSUE_REQUIRES_LOCAL_WORKSPACE_INSTALL" == "yes" ]]; then
|
|
562
661
|
WORKTREE_OUT="$(ACP_WORKTREE_LOCAL_INSTALL=true F_LOSNING_WORKTREE_LOCAL_INSTALL=true "${WORKSPACE_DIR}/bin/new-worktree.sh" "$ISSUE_ID" "$ISSUE_SLUG")"
|
|
563
662
|
else
|
|
@@ -567,6 +666,9 @@ open_or_reuse_issue_worktree() {
|
|
|
567
666
|
else
|
|
568
667
|
RESIDENT_TASK_COUNT="1"
|
|
569
668
|
RESIDENT_WORKTREE_REUSED="no"
|
|
669
|
+
if [[ "${CODING_WORKER}" == "openclaw" ]]; then
|
|
670
|
+
RESIDENT_OPENCLAW_SESSION_ID="$(flow_resident_issue_openclaw_session_id "${CONFIG_YAML}" "${current_issue_id}" "${RESIDENT_TASK_COUNT}")"
|
|
671
|
+
fi
|
|
570
672
|
if [[ "$ISSUE_REQUIRES_LOCAL_WORKSPACE_INSTALL" == "yes" ]]; then
|
|
571
673
|
WORKTREE_OUT="$(ACP_WORKTREE_LOCAL_INSTALL=true F_LOSNING_WORKTREE_LOCAL_INSTALL=true "${WORKSPACE_DIR}/bin/new-worktree.sh" "$ISSUE_ID" "$ISSUE_SLUG")"
|
|
572
674
|
else
|
|
@@ -785,6 +785,7 @@ while true; do
|
|
|
785
785
|
controller_refresh_execution_context
|
|
786
786
|
controller_refresh_issue_lane_context "${is_scheduled}" "${schedule_interval_seconds}"
|
|
787
787
|
controller_track_provider_selection "provider-selection"
|
|
788
|
+
controller_write_state "starting" ""
|
|
788
789
|
|
|
789
790
|
if controller_yield_to_live_lane_peer; then
|
|
790
791
|
break
|
|
@@ -69,6 +69,31 @@ sync_skill_copies() {
|
|
|
69
69
|
fi
|
|
70
70
|
}
|
|
71
71
|
|
|
72
|
+
refresh_legacy_profile_templates() {
|
|
73
|
+
local profiles_root=""
|
|
74
|
+
local current_issue_template=""
|
|
75
|
+
local legacy_issue_template=""
|
|
76
|
+
local profile_dir=""
|
|
77
|
+
local profile_issue_template=""
|
|
78
|
+
|
|
79
|
+
profiles_root="$(resolve_flow_profile_registry_root)"
|
|
80
|
+
current_issue_template="${FLOW_SKILL_SOURCE}/tools/templates/issue-prompt-template.md"
|
|
81
|
+
legacy_issue_template="${FLOW_SKILL_SOURCE}/tools/templates/legacy/issue-prompt-template-pre-slim.md"
|
|
82
|
+
|
|
83
|
+
[[ -d "${profiles_root}" ]] || return 0
|
|
84
|
+
[[ -f "${current_issue_template}" ]] || return 0
|
|
85
|
+
[[ -f "${legacy_issue_template}" ]] || return 0
|
|
86
|
+
|
|
87
|
+
while IFS= read -r profile_dir; do
|
|
88
|
+
[[ -n "${profile_dir}" ]] || continue
|
|
89
|
+
profile_issue_template="${profile_dir}/templates/issue-prompt-template.md"
|
|
90
|
+
[[ -f "${profile_issue_template}" ]] || continue
|
|
91
|
+
if cmp -s "${profile_issue_template}" "${legacy_issue_template}"; then
|
|
92
|
+
cp "${current_issue_template}" "${profile_issue_template}"
|
|
93
|
+
fi
|
|
94
|
+
done < <(find "${profiles_root}" -mindepth 2 -maxdepth 2 -type f -name 'control-plane.yaml' -exec dirname {} \; 2>/dev/null | sort)
|
|
95
|
+
}
|
|
96
|
+
|
|
72
97
|
remove_repo_local_profile_dirs() {
|
|
73
98
|
local candidate=""
|
|
74
99
|
|
|
@@ -210,5 +235,6 @@ fi
|
|
|
210
235
|
sync_skill_copies
|
|
211
236
|
remove_repo_local_profile_dirs
|
|
212
237
|
normalize_script_permissions
|
|
238
|
+
refresh_legacy_profile_templates
|
|
213
239
|
|
|
214
240
|
printf 'SHARED_AGENT_HOME=%s\n' "${TARGET_HOME}"
|
package/tools/bin/test-smoke.sh
CHANGED
|
@@ -58,7 +58,12 @@ run_step() {
|
|
|
58
58
|
return "${status}"
|
|
59
59
|
}
|
|
60
60
|
|
|
61
|
-
|
|
61
|
+
if [[ -f "${check_contracts_script}" ]]; then
|
|
62
|
+
run_step "check-skill-contracts" bash "${check_contracts_script}"
|
|
63
|
+
else
|
|
64
|
+
printf 'SMOKE_STEP=%s\n' "check-skill-contracts"
|
|
65
|
+
printf 'SMOKE_STEP_STATUS=%s\n' "skipped"
|
|
66
|
+
fi
|
|
62
67
|
|
|
63
68
|
run_profile_smoke_fixture() (
|
|
64
69
|
set -euo pipefail
|
package/tools/dashboard/app.js
CHANGED
|
@@ -2,6 +2,8 @@ const refreshButton = document.querySelector("#refresh-button");
|
|
|
2
2
|
const generatedAtNode = document.querySelector("#generated-at");
|
|
3
3
|
const overviewNode = document.querySelector("#overview");
|
|
4
4
|
const profilesNode = document.querySelector("#profiles");
|
|
5
|
+
const seenAlertIds = new Set();
|
|
6
|
+
let notificationPermissionRequested = false;
|
|
5
7
|
|
|
6
8
|
function relativeTime(input) {
|
|
7
9
|
if (!input) return "n/a";
|
|
@@ -45,7 +47,7 @@ function renderResult(row) {
|
|
|
45
47
|
|
|
46
48
|
function renderControllerState(row) {
|
|
47
49
|
const state = row.state || "n/a";
|
|
48
|
-
const stale = state !== "stopped" && row.controller_live === false;
|
|
50
|
+
const stale = row.controller_stale === true || (state !== "stopped" && row.controller_live === false);
|
|
49
51
|
const label = stale ? `${state} (stale)` : state;
|
|
50
52
|
return `<span class="status-pill ${statusClass(stale ? "stale" : state)}">${label}</span>`;
|
|
51
53
|
}
|
|
@@ -61,9 +63,10 @@ function renderOverview(snapshot) {
|
|
|
61
63
|
acc.controllers += profile.counts.live_resident_controllers;
|
|
62
64
|
acc.cooldowns += profile.counts.provider_cooldowns;
|
|
63
65
|
acc.queue += profile.counts.queued_issues;
|
|
66
|
+
acc.alerts += profile.counts.alerts || 0;
|
|
64
67
|
return acc;
|
|
65
68
|
},
|
|
66
|
-
{ activeRuns: 0, runningRuns: 0, implementedRuns: 0, reportedRuns: 0, blockedRuns: 0, controllers: 0, cooldowns: 0, queue: 0 },
|
|
69
|
+
{ activeRuns: 0, runningRuns: 0, implementedRuns: 0, reportedRuns: 0, blockedRuns: 0, controllers: 0, cooldowns: 0, queue: 0, alerts: 0 },
|
|
67
70
|
);
|
|
68
71
|
|
|
69
72
|
overviewNode.innerHTML = [
|
|
@@ -75,6 +78,7 @@ function renderOverview(snapshot) {
|
|
|
75
78
|
["Blocked", totals.blockedRuns],
|
|
76
79
|
["Live Controllers", totals.controllers],
|
|
77
80
|
["Provider Cooldowns", totals.cooldowns],
|
|
81
|
+
["Alerts", totals.alerts],
|
|
78
82
|
["Queued Issues", totals.queue],
|
|
79
83
|
]
|
|
80
84
|
.map(
|
|
@@ -104,6 +108,36 @@ function renderTable(columns, rows, emptyMessage = "No data right now.") {
|
|
|
104
108
|
return `<div class="table-wrap"><table><thead><tr>${headers}</tr></thead><tbody>${body}</tbody></table></div>`;
|
|
105
109
|
}
|
|
106
110
|
|
|
111
|
+
function renderAlerts(alerts) {
|
|
112
|
+
if (!alerts.length) {
|
|
113
|
+
return `<div class="empty-state">No active alerts for this profile.</div>`;
|
|
114
|
+
}
|
|
115
|
+
return `
|
|
116
|
+
<div class="alert-list">
|
|
117
|
+
${alerts
|
|
118
|
+
.map(
|
|
119
|
+
(alert) => `
|
|
120
|
+
<article class="alert-card ${statusClass(alert.severity || "warn")}">
|
|
121
|
+
<div class="alert-header">
|
|
122
|
+
<div>
|
|
123
|
+
<h4>${alert.title}</h4>
|
|
124
|
+
<div class="muted mono">${alert.session || "n/a"} · ${alert.task_kind || "task"} ${alert.task_id || ""}</div>
|
|
125
|
+
</div>
|
|
126
|
+
<span class="badge warn">${alert.kind}</span>
|
|
127
|
+
</div>
|
|
128
|
+
<p>${alert.message}</p>
|
|
129
|
+
<div class="alert-meta">
|
|
130
|
+
<span>${alert.reset_at ? `Reset: ${alert.reset_at}` : "Reset: n/a"}</span>
|
|
131
|
+
<span>${alert.updated_at ? `${relativeTime(alert.updated_at)} · ${alert.updated_at}` : "updated n/a"}</span>
|
|
132
|
+
</div>
|
|
133
|
+
</article>
|
|
134
|
+
`,
|
|
135
|
+
)
|
|
136
|
+
.join("")}
|
|
137
|
+
</div>
|
|
138
|
+
`;
|
|
139
|
+
}
|
|
140
|
+
|
|
107
141
|
function renderProfile(profile) {
|
|
108
142
|
const providerBadges = [
|
|
109
143
|
profile.coding_worker ? `<span class="badge good">${profile.coding_worker}</span>` : "",
|
|
@@ -123,7 +157,10 @@ function renderProfile(profile) {
|
|
|
123
157
|
["Reported", profile.counts.reported_runs],
|
|
124
158
|
["Blocked", profile.counts.blocked_runs],
|
|
125
159
|
["Live controllers", profile.counts.live_resident_controllers],
|
|
160
|
+
["Stale controllers", profile.counts.stale_resident_controllers],
|
|
126
161
|
["Provider cooldowns", profile.counts.provider_cooldowns],
|
|
162
|
+
["Alerts", profile.counts.alerts || 0],
|
|
163
|
+
["Issue retries", profile.counts.active_retries],
|
|
127
164
|
["Queued issues", profile.counts.queued_issues],
|
|
128
165
|
["Scheduled", profile.counts.scheduled_issues],
|
|
129
166
|
]
|
|
@@ -165,6 +202,18 @@ function renderProfile(profile) {
|
|
|
165
202
|
"No resident controllers recorded for this profile.",
|
|
166
203
|
);
|
|
167
204
|
|
|
205
|
+
const retryTable = renderTable(
|
|
206
|
+
[
|
|
207
|
+
{ label: "Issue", key: "issue_id" },
|
|
208
|
+
{ label: "Status", render: (row) => `<span class="status-pill ${row.ready ? "" : "waiting-provider"}">${row.ready ? "ready" : "retrying"}</span>` },
|
|
209
|
+
{ label: "Reason", render: (row) => row.last_reason || "n/a" },
|
|
210
|
+
{ label: "Attempts", key: "attempts" },
|
|
211
|
+
{ label: "Next attempt", render: (row) => row.next_attempt_at ? `${relativeTime(row.next_attempt_at)}<div class="muted">${row.next_attempt_at}</div>` : "n/a" },
|
|
212
|
+
],
|
|
213
|
+
profile.issue_retries || [],
|
|
214
|
+
"No issue retries recorded.",
|
|
215
|
+
);
|
|
216
|
+
|
|
168
217
|
const workerTable = renderTable(
|
|
169
218
|
[
|
|
170
219
|
{ label: "Key", render: (row) => `<div class="mono">${row.key}</div>` },
|
|
@@ -226,6 +275,11 @@ function renderProfile(profile) {
|
|
|
226
275
|
</header>
|
|
227
276
|
<section class="overview">${summaryCards}</section>
|
|
228
277
|
<section class="profile-grid">
|
|
278
|
+
<section class="panel">
|
|
279
|
+
<h3>Host Alerts</h3>
|
|
280
|
+
<p class="panel-subtitle">High-signal operational blockers surfaced from active run logs and comment artifacts.</p>
|
|
281
|
+
${renderAlerts(profile.alerts || [])}
|
|
282
|
+
</section>
|
|
229
283
|
<section class="panel">
|
|
230
284
|
<h3>Active Runs</h3>
|
|
231
285
|
<p class="panel-subtitle">Lifecycle shows technical session completion. Result shows what the run achieved: implemented, reported, or blocked.</p>
|
|
@@ -233,9 +287,13 @@ function renderProfile(profile) {
|
|
|
233
287
|
</section>
|
|
234
288
|
<section class="panel">
|
|
235
289
|
<h3>Resident Controllers</h3>
|
|
236
|
-
<p class="panel-subtitle">Includes provider wait and failover telemetry.</p>
|
|
290
|
+
<p class="panel-subtitle">Includes provider wait and failover telemetry. Stale controllers show a warning.</p>
|
|
237
291
|
${controllerTable}
|
|
238
292
|
</section>
|
|
293
|
+
<section class="panel half">
|
|
294
|
+
<h3>Issue Retries</h3>
|
|
295
|
+
${retryTable}
|
|
296
|
+
</section>
|
|
239
297
|
<section class="panel">
|
|
240
298
|
<h3>Resident Worker Metadata</h3>
|
|
241
299
|
${workerTable}
|
|
@@ -257,6 +315,35 @@ function renderProfile(profile) {
|
|
|
257
315
|
`;
|
|
258
316
|
}
|
|
259
317
|
|
|
318
|
+
async function maybeNotifyAlerts(snapshot) {
|
|
319
|
+
const alerts = (snapshot.alerts || []).filter((alert) => alert && alert.id);
|
|
320
|
+
if (!alerts.length || typeof window.Notification === "undefined") return;
|
|
321
|
+
|
|
322
|
+
if (window.Notification.permission === "default" && !notificationPermissionRequested) {
|
|
323
|
+
notificationPermissionRequested = true;
|
|
324
|
+
try {
|
|
325
|
+
await window.Notification.requestPermission();
|
|
326
|
+
} catch (_error) {
|
|
327
|
+
return;
|
|
328
|
+
}
|
|
329
|
+
}
|
|
330
|
+
|
|
331
|
+
if (window.Notification.permission !== "granted") return;
|
|
332
|
+
|
|
333
|
+
for (const alert of alerts) {
|
|
334
|
+
if (seenAlertIds.has(alert.id)) continue;
|
|
335
|
+
seenAlertIds.add(alert.id);
|
|
336
|
+
const bodyParts = [];
|
|
337
|
+
if (alert.session) bodyParts.push(alert.session);
|
|
338
|
+
if (alert.reset_at) bodyParts.push(`reset ${alert.reset_at}`);
|
|
339
|
+
if (alert.message) bodyParts.push(alert.message);
|
|
340
|
+
new window.Notification(alert.title || "ACP alert", {
|
|
341
|
+
body: bodyParts.join(" · ").slice(0, 240),
|
|
342
|
+
tag: alert.id,
|
|
343
|
+
});
|
|
344
|
+
}
|
|
345
|
+
}
|
|
346
|
+
|
|
260
347
|
async function loadSnapshot() {
|
|
261
348
|
refreshButton.disabled = true;
|
|
262
349
|
try {
|
|
@@ -268,6 +355,7 @@ async function loadSnapshot() {
|
|
|
268
355
|
generatedAtNode.textContent = `Snapshot: ${snapshot.generated_at}`;
|
|
269
356
|
renderOverview(snapshot);
|
|
270
357
|
profilesNode.innerHTML = snapshot.profiles.map(renderProfile).join("");
|
|
358
|
+
await maybeNotifyAlerts(snapshot);
|
|
271
359
|
} catch (error) {
|
|
272
360
|
generatedAtNode.textContent = `Snapshot load failed: ${error.message}`;
|
|
273
361
|
profilesNode.innerHTML = `<article class="profile"><div class="empty-state">${error.message}</div></article>`;
|
|
@@ -143,6 +143,19 @@ def file_mtime_iso(path: Path) -> str:
|
|
|
143
143
|
return datetime.fromtimestamp(path.stat().st_mtime, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
144
144
|
|
|
145
145
|
|
|
146
|
+
def read_tail_text(path: Path, max_bytes: int = 65536) -> str:
|
|
147
|
+
if not path.is_file():
|
|
148
|
+
return ""
|
|
149
|
+
try:
|
|
150
|
+
with path.open("rb") as handle:
|
|
151
|
+
size = path.stat().st_size
|
|
152
|
+
if size > max_bytes:
|
|
153
|
+
handle.seek(size - max_bytes)
|
|
154
|
+
return handle.read().decode("utf-8", errors="replace")
|
|
155
|
+
except OSError:
|
|
156
|
+
return ""
|
|
157
|
+
|
|
158
|
+
|
|
146
159
|
def classify_run_result(status: str, outcome: str, failure_reason: str) -> tuple[str, str]:
|
|
147
160
|
normalized_status = (status or "").strip().upper()
|
|
148
161
|
normalized_outcome = (outcome or "").strip()
|
|
@@ -167,6 +180,59 @@ def classify_run_result(status: str, outcome: str, failure_reason: str) -> tuple
|
|
|
167
180
|
return ("unknown", normalized_status or "Unknown")
|
|
168
181
|
|
|
169
182
|
|
|
183
|
+
GITHUB_RATE_LIMIT_PATTERNS = [
|
|
184
|
+
re.compile(
|
|
185
|
+
r"GitHub core API[^\n]*?rate limit[^\n]*(?:reset(?:s| into)?(?: at)?\s+(?P<reset>[^.\n]+))?",
|
|
186
|
+
re.IGNORECASE,
|
|
187
|
+
),
|
|
188
|
+
re.compile(
|
|
189
|
+
r"gh:\s*API rate limit exceeded[^\n]*(?:reset(?:s| into)?(?: at)?\s+(?P<reset>[^.\n]+))?",
|
|
190
|
+
re.IGNORECASE,
|
|
191
|
+
),
|
|
192
|
+
]
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def summarize_whitespace(text: str) -> str:
|
|
196
|
+
return re.sub(r"\s+", " ", text).strip()
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def extract_github_rate_limit_alert(run_dir: Path, run: dict[str, Any]) -> dict[str, Any] | None:
|
|
200
|
+
candidate_files = [
|
|
201
|
+
run_dir / "issue-comment.md",
|
|
202
|
+
run_dir / "pr-comment.md",
|
|
203
|
+
run_dir / f"{run['session']}.log",
|
|
204
|
+
]
|
|
205
|
+
for path in candidate_files:
|
|
206
|
+
text = read_tail_text(path)
|
|
207
|
+
if not text:
|
|
208
|
+
continue
|
|
209
|
+
for pattern in GITHUB_RATE_LIMIT_PATTERNS:
|
|
210
|
+
match = pattern.search(text)
|
|
211
|
+
if not match:
|
|
212
|
+
continue
|
|
213
|
+
summary = summarize_whitespace(match.group(0))
|
|
214
|
+
reset_match = re.search(r"reset(?:s| into)?(?: at)?\s+([^.\n]+)", summary, re.IGNORECASE)
|
|
215
|
+
reset_at = summarize_whitespace((reset_match.group(1) if reset_match else "") or match.groupdict().get("reset") or "")
|
|
216
|
+
if not summary:
|
|
217
|
+
summary = "GitHub core API rate limit is blocking host-side actions."
|
|
218
|
+
if reset_at and reset_at not in summary:
|
|
219
|
+
summary = f"{summary} Reset: {reset_at}."
|
|
220
|
+
return {
|
|
221
|
+
"id": f"github-core-rate-limit:{run['session']}:{reset_at or path.name}",
|
|
222
|
+
"kind": "github-core-rate-limit",
|
|
223
|
+
"severity": "warn",
|
|
224
|
+
"title": "GitHub core API rate limit blocks host actions",
|
|
225
|
+
"message": summary,
|
|
226
|
+
"session": run.get("session", ""),
|
|
227
|
+
"task_kind": run.get("task_kind", ""),
|
|
228
|
+
"task_id": run.get("task_id", ""),
|
|
229
|
+
"reset_at": reset_at,
|
|
230
|
+
"updated_at": run.get("updated_at", "") or file_mtime_iso(path),
|
|
231
|
+
"source_file": str(path),
|
|
232
|
+
}
|
|
233
|
+
return None
|
|
234
|
+
|
|
235
|
+
|
|
170
236
|
def collect_runs(runs_root: Path) -> list[dict[str, Any]]:
|
|
171
237
|
if not runs_root.is_dir():
|
|
172
238
|
return []
|
|
@@ -221,10 +287,26 @@ def collect_runs(runs_root: Path) -> list[dict[str, Any]]:
|
|
|
221
287
|
"provider_pool_name": run_env.get("ACTIVE_PROVIDER_POOL_NAME", ""),
|
|
222
288
|
"run_dir": str(run_dir),
|
|
223
289
|
}
|
|
290
|
+
alert = extract_github_rate_limit_alert(run_dir, item)
|
|
291
|
+
item["alerts"] = [alert] if alert else []
|
|
224
292
|
runs.append(item)
|
|
225
293
|
return runs
|
|
226
294
|
|
|
227
295
|
|
|
296
|
+
def controller_is_stale(env: dict[str, str], controller_path: Path) -> bool:
|
|
297
|
+
"""A controller is stale if it claims to be running but its PID is dead or its
|
|
298
|
+
UPDATED_AT file mtime is older than 10 minutes."""
|
|
299
|
+
if env.get("CONTROLLER_STATE", "") in {"stopped", ""}:
|
|
300
|
+
return False
|
|
301
|
+
if not pid_alive(env.get("CONTROLLER_PID", "")):
|
|
302
|
+
return True
|
|
303
|
+
try:
|
|
304
|
+
file_age = datetime.now(timezone.utc).timestamp() - controller_path.stat().st_mtime
|
|
305
|
+
return file_age > 600
|
|
306
|
+
except Exception:
|
|
307
|
+
return False
|
|
308
|
+
|
|
309
|
+
|
|
228
310
|
def collect_resident_controllers(state_root: Path) -> list[dict[str, Any]]:
|
|
229
311
|
controllers_root = state_root / "resident-workers" / "issues"
|
|
230
312
|
if not controllers_root.is_dir():
|
|
@@ -240,6 +322,7 @@ def collect_resident_controllers(state_root: Path) -> list[dict[str, Any]]:
|
|
|
240
322
|
"session": env.get("SESSION", ""),
|
|
241
323
|
"controller_pid": controller_pid,
|
|
242
324
|
"controller_live": pid_alive(controller_pid),
|
|
325
|
+
"controller_stale": controller_is_stale(env, path),
|
|
243
326
|
"mode": env.get("CONTROLLER_MODE", ""),
|
|
244
327
|
"loop_count": safe_int(env.get("CONTROLLER_LOOP_COUNT")),
|
|
245
328
|
"state": env.get("CONTROLLER_STATE", ""),
|
|
@@ -346,6 +429,32 @@ def collect_scheduled_issues(state_root: Path) -> list[dict[str, Any]]:
|
|
|
346
429
|
return items
|
|
347
430
|
|
|
348
431
|
|
|
432
|
+
def collect_issue_retries(state_root: Path) -> list[dict[str, Any]]:
|
|
433
|
+
"""Collect retry/backoff state for issues tracked by agent-project-retry-state."""
|
|
434
|
+
retries_root = state_root / "retries" / "issues"
|
|
435
|
+
if not retries_root.is_dir():
|
|
436
|
+
return []
|
|
437
|
+
|
|
438
|
+
now_epoch = int(datetime.now(timezone.utc).timestamp())
|
|
439
|
+
items: list[dict[str, Any]] = []
|
|
440
|
+
for path in sorted(retries_root.glob("*.env"), key=lambda item: item.stat().st_mtime, reverse=True):
|
|
441
|
+
env = read_env_file(path)
|
|
442
|
+
next_attempt_epoch = safe_int(env.get("NEXT_ATTEMPT_EPOCH"))
|
|
443
|
+
items.append(
|
|
444
|
+
{
|
|
445
|
+
"issue_id": path.stem,
|
|
446
|
+
"attempts": safe_int(env.get("ATTEMPTS")) or 0,
|
|
447
|
+
"next_attempt_epoch": next_attempt_epoch,
|
|
448
|
+
"next_attempt_at": env.get("NEXT_ATTEMPT_AT", ""),
|
|
449
|
+
"last_reason": env.get("LAST_REASON", ""),
|
|
450
|
+
"updated_at": env.get("UPDATED_AT", "") or file_mtime_iso(path),
|
|
451
|
+
"ready": not bool(next_attempt_epoch and next_attempt_epoch > now_epoch),
|
|
452
|
+
"state_file": str(path),
|
|
453
|
+
}
|
|
454
|
+
)
|
|
455
|
+
return items
|
|
456
|
+
|
|
457
|
+
|
|
349
458
|
def collect_issue_queue(state_root: Path) -> dict[str, list[dict[str, Any]]]:
|
|
350
459
|
queue_root = state_root / "resident-workers" / "issue-queue"
|
|
351
460
|
pending_root = queue_root / "pending"
|
|
@@ -387,7 +496,9 @@ def build_profile_snapshot(profile_id: str, registry_root: Path) -> dict[str, An
|
|
|
387
496
|
resident_workers = collect_resident_workers(state_root)
|
|
388
497
|
cooldowns = collect_provider_cooldowns(state_root)
|
|
389
498
|
scheduled = collect_scheduled_issues(state_root)
|
|
499
|
+
retries = collect_issue_retries(state_root)
|
|
390
500
|
queue = collect_issue_queue(state_root)
|
|
501
|
+
alerts = [alert for run in runs for alert in run.get("alerts", [])]
|
|
391
502
|
|
|
392
503
|
return {
|
|
393
504
|
"id": profile_id,
|
|
@@ -422,17 +533,22 @@ def build_profile_snapshot(profile_id: str, registry_root: Path) -> dict[str, An
|
|
|
422
533
|
),
|
|
423
534
|
"resident_controllers": len(controllers),
|
|
424
535
|
"live_resident_controllers": sum(1 for item in controllers if item["state"] != "stopped" and item["controller_live"]),
|
|
536
|
+
"stale_resident_controllers": sum(1 for item in controllers if item.get("controller_stale", False)),
|
|
425
537
|
"resident_workers": len(resident_workers),
|
|
426
538
|
"queued_issues": len(queue["pending"]),
|
|
427
539
|
"claimed_issues": len(queue["claims"]),
|
|
428
540
|
"provider_cooldowns": sum(1 for item in cooldowns if item["active"]),
|
|
541
|
+
"active_retries": sum(1 for item in retries if not item.get("ready", True)),
|
|
429
542
|
"scheduled_issues": len(scheduled),
|
|
543
|
+
"alerts": len(alerts),
|
|
430
544
|
},
|
|
431
545
|
"runs": runs,
|
|
546
|
+
"alerts": alerts,
|
|
432
547
|
"resident_controllers": controllers,
|
|
433
548
|
"resident_workers": resident_workers,
|
|
434
549
|
"provider_cooldowns": cooldowns,
|
|
435
550
|
"scheduled_issues": scheduled,
|
|
551
|
+
"issue_retries": retries,
|
|
436
552
|
"issue_queue": queue,
|
|
437
553
|
}
|
|
438
554
|
|
|
@@ -440,11 +556,14 @@ def build_profile_snapshot(profile_id: str, registry_root: Path) -> dict[str, An
|
|
|
440
556
|
def build_snapshot() -> dict[str, Any]:
|
|
441
557
|
registry_root = profile_registry_root()
|
|
442
558
|
profiles = [build_profile_snapshot(profile_id, registry_root) for profile_id in list_profile_ids(registry_root)]
|
|
559
|
+
alerts = [alert for profile in profiles for alert in profile.get("alerts", [])]
|
|
443
560
|
return {
|
|
444
561
|
"generated_at": utc_now_iso(),
|
|
445
562
|
"flow_skill_dir": str(ROOT_DIR),
|
|
446
563
|
"profile_registry_root": str(registry_root),
|
|
447
564
|
"profile_count": len(profiles),
|
|
565
|
+
"alert_count": len(alerts),
|
|
566
|
+
"alerts": alerts,
|
|
448
567
|
"profiles": profiles,
|
|
449
568
|
}
|
|
450
569
|
|