@yemi33/minions 0.1.2121 → 0.1.2123
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dashboard/js/settings.js +4 -0
- package/dashboard.js +3 -0
- package/docs/harness-mode.md +92 -0
- package/engine/ado.js +142 -21
- package/engine/github.js +4 -1
- package/engine/harness.js +592 -0
- package/engine/lifecycle.js +91 -0
- package/engine/scheduler.js +40 -3
- package/engine/shared.js +16 -0
- package/engine/timeout.js +286 -21
- package/engine.js +66 -15
- package/package.json +1 -1
package/engine/lifecycle.js
CHANGED
|
@@ -14,6 +14,7 @@ const { trackEngineUsage } = require('./llm');
|
|
|
14
14
|
const { resolveRuntime } = require('./runtimes');
|
|
15
15
|
const adoGitAuth = require('./ado-git-auth');
|
|
16
16
|
const queries = require('./queries');
|
|
17
|
+
const harness = require('./harness');
|
|
17
18
|
const { isBranchActive } = require('./cooldown');
|
|
18
19
|
const { worktreeMatchesBranch, getWorktreeBranch, cleanupMergedPrLocalBranch } = require('./cleanup');
|
|
19
20
|
const { getConfig, getInboxFiles, getNotes, getPrs, getDispatch,
|
|
@@ -4040,6 +4041,82 @@ function handleDecompositionResult(stdout, meta, config, runtimeName) {
|
|
|
4040
4041
|
return 0;
|
|
4041
4042
|
}
|
|
4042
4043
|
|
|
4044
|
+
/**
 * Tri-agent harness mode (W-mq07a9gf000jbc2b): when an evaluator completes,
 * parse its verdict against the configured rubric/threshold and — if the
 * artifact didn't pass and the iteration cap hasn't been hit — append a
 * fresh Generator+Evaluator pair so the harness can iterate on its own
 * artifact.
 *
 * Called from runPostCompletionHooks after a successful run when the
 * dispatched item carries _harness.role === 'evaluator'.
 *
 * @param {string} stdout - raw evaluator output handed to the verdict parser.
 * @param {object|null} structuredCompletion - structured completion, if any.
 * @param {object} meta - dispatch metadata; `meta.item` is the evaluator work item.
 * @param {object} config - engine config, used to enumerate project work-item files.
 * @returns {number} the number of work items actually written. 0 means nothing
 *   was appended: not an evaluator item, verdict parse/build failure, terminal
 *   stop (pass, cap reached or inconclusive), evaluator not found in any
 *   work-items.json, or the next pair was already present.
 */
function handleHarnessIterationResult(stdout, structuredCompletion, meta, config) {
  const evaluatorItem = meta?.item;
  if (!evaluatorItem?._harness || evaluatorItem._harness.role !== harness.HARNESS_ROLE.EVALUATOR) return 0;

  let verdict;
  try {
    verdict = harness.parseEvaluatorVerdict(stdout || '', structuredCompletion || null);
  } catch (err) {
    log('warn', `Harness ${evaluatorItem._harness.missionId}: verdict parse failed — ${err.message}; treating as terminal stop`);
    return 0;
  }

  if (!harness.shouldIterateAgain(evaluatorItem._harness, verdict)) {
    const reason = verdict.pass === true ? 'passed' :
      (evaluatorItem._harness.iteration >= evaluatorItem._harness.maxIterations ? 'max iterations reached' :
        'inconclusive verdict');
    log('info', `Harness mission ${evaluatorItem._harness.missionId} terminal stop (iteration ${evaluatorItem._harness.iteration}, ${reason}, score=${verdict.score ?? 'n/a'})`);
    return 0;
  }

  let nextItems;
  try {
    nextItems = harness.createIterationWorkItems(evaluatorItem, verdict, {});
  } catch (err) {
    log('warn', `Harness ${evaluatorItem._harness.missionId}: iteration build failed — ${err.message}`);
    return 0;
  }
  if (!Array.isArray(nextItems) || nextItems.length === 0) return 0;

  // Mirror handleDecompositionResult: scan central + per-project work-items.json
  // and append into the file that owns the evaluator (the trio always lands in
  // the central file in practice — scheduler.discoverScheduledWork writes
  // directly to engine/work-items.json via engine.js — but iterate defensively).
  const projects = shared.getProjects(config);
  const allPaths = [path.join(MINIONS_DIR, 'work-items.json')];
  for (const p of projects) allPaths.push(shared.projectWorkItemsPath(p));

  let appendedTo = null;
  let appendedCount = 0;
  for (const wiPath of allPaths) {
    let found = false;
    mutateJsonFileLocked(wiPath, data => {
      // Reset on entry so the counters stay accurate even if the lock helper
      // invokes this callback more than once.
      found = false;
      appendedCount = 0;
      if (!Array.isArray(data)) return data;
      if (!data.some(i => i?.id === evaluatorItem.id)) return data;
      found = true;
      // De-dupe by id in case a previous tick already appended the next pair.
      const existingIds = new Set(data.map(i => i?.id));
      for (const it of nextItems) {
        if (existingIds.has(it.id)) continue;
        data.push(it);
        existingIds.add(it.id);
        appendedCount += 1;
      }
      return data;
    }, { defaultValue: [] });
    if (found) { appendedTo = wiPath; break; }
  }

  if (!appendedTo) {
    log('warn', `Harness ${evaluatorItem._harness.missionId}: evaluator ${evaluatorItem.id} not found in any work-items.json — iteration skipped`);
    return 0;
  }

  // Report what was really written: the de-dupe above may have skipped some or
  // all of the pair, and claiming "appended N" in that case would be misleading.
  if (appendedCount === 0) {
    log('info', `Harness mission ${evaluatorItem._harness.missionId}: next iteration already queued — nothing appended`);
    return 0;
  }

  log('info', `Harness mission ${evaluatorItem._harness.missionId} iterating: appended ${appendedCount} work items (next iteration: ${nextItems[0]?._harness?.iteration ?? 'n/a'}, score=${verdict.score ?? 'n/a'})`);
  return appendedCount;
}
|
|
4119
|
+
|
|
4043
4120
|
/**
|
|
4044
4121
|
* W-mpg58wv3 — auto-dispatch a re-review WI when a fix-WI born from a minion
|
|
4045
4122
|
* REQUEST_CHANGES marks done. Closure-loop for the shared Yemi reviewer slot:
|
|
@@ -4386,6 +4463,19 @@ async function runPostCompletionHooks(dispatchItem, agentId, code, stdout, confi
|
|
|
4386
4463
|
}
|
|
4387
4464
|
}
|
|
4388
4465
|
|
|
4466
|
+
// Tri-agent harness iteration (W-mq07a9gf000jbc2b): if the evaluator just
|
|
4467
|
+
// completed successfully and verdict says retry, append the next Gen+Eval
|
|
4468
|
+
// pair into the same work-items.json. Engine will dispatch them on the
|
|
4469
|
+
// next tick. No interaction with skipDoneStatus — the evaluator itself
|
|
4470
|
+
// still marks DONE; iteration is a sibling write, not a parent decomp.
|
|
4471
|
+
if (effectiveSuccess && meta?.item?._harness?.role === harness.HARNESS_ROLE.EVALUATOR) {
|
|
4472
|
+
try {
|
|
4473
|
+
handleHarnessIterationResult(stdout, structuredCompletion, meta, config);
|
|
4474
|
+
} catch (err) {
|
|
4475
|
+
log('warn', `Harness iteration hook failed for ${meta.item.id}: ${err.message}`);
|
|
4476
|
+
}
|
|
4477
|
+
}
|
|
4478
|
+
|
|
4389
4479
|
// Verify review work items include a verdict — must run BEFORE updateWorkItemStatus(DONE),
|
|
4390
4480
|
// same pattern as plan-to-prd (#893): updateWorkItemStatus deletes _retryCount, so the check
|
|
4391
4481
|
// must read/increment it before that happens. Also sets skipDoneStatus so completedAt isn't
|
|
@@ -5204,6 +5294,7 @@ module.exports = {
|
|
|
5204
5294
|
isPrAttachmentRequired,
|
|
5205
5295
|
extractDecompositionJson,
|
|
5206
5296
|
handleDecompositionResult,
|
|
5297
|
+
handleHarnessIterationResult,
|
|
5207
5298
|
processCompletionFollowups,
|
|
5208
5299
|
// W-mpg58wv3 — closure-loop dispatch helpers (exported for testing).
|
|
5209
5300
|
dispatchReReviewForFix,
|
package/engine/scheduler.js
CHANGED
|
@@ -25,7 +25,8 @@ const fs = require('fs');
|
|
|
25
25
|
const path = require('path');
|
|
26
26
|
const shared = require('./shared');
|
|
27
27
|
const routing = require('./routing');
|
|
28
|
-
const
|
|
28
|
+
const harness = require('./harness');
|
|
29
|
+
const { safeJson, safeWrite, mutateJsonFileLocked, mutateScheduleRuns, ts, dateStamp, log, WI_STATUS, WORK_TYPE } = shared;
|
|
29
30
|
|
|
30
31
|
const SCHEDULE_RUNS_PATH = path.join(shared.MINIONS_DIR, 'engine', 'schedule-runs.json');
|
|
31
32
|
|
|
@@ -186,9 +187,9 @@ function createScheduledWorkItem(sched) {
|
|
|
186
187
|
};
|
|
187
188
|
}
|
|
188
189
|
|
|
189
|
-
function writeScheduleRunEntry(runs, scheduleId, workItemId) {
|
|
190
|
+
function writeScheduleRunEntry(runs, scheduleId, workItemId, extra) {
|
|
190
191
|
const existing = typeof runs[scheduleId] === 'object' && runs[scheduleId] ? runs[scheduleId] : {};
|
|
191
|
-
runs[scheduleId] = { ...existing, lastRun: ts(), lastWorkItemId: workItemId };
|
|
192
|
+
runs[scheduleId] = { ...existing, lastRun: ts(), lastWorkItemId: workItemId, ...(extra || {}) };
|
|
192
193
|
return runs[scheduleId];
|
|
193
194
|
}
|
|
194
195
|
|
|
@@ -222,6 +223,42 @@ function discoverScheduledWork(config) {
|
|
|
222
223
|
const lastRun = typeof runEntry === 'string' ? runEntry : (runEntry?.lastRun || null);
|
|
223
224
|
if (!shouldRunNow(sched, lastRun)) continue;
|
|
224
225
|
|
|
226
|
+
// Tri-agent harness mode (W-mq07a9gf000jbc2b): a single schedule firing
|
|
227
|
+
// produces a coordinated Planner → Generator → Evaluator trio rather than
|
|
228
|
+
// a single work item. Validate config first — on bad config, skip this
|
|
229
|
+
// tick WITHOUT recording a schedule run so the operator can fix the
|
|
230
|
+
// config and the next tick will pick it up.
|
|
231
|
+
if (sched.harness_mode === harness.HARNESS_MODE.TRI_AGENT) {
|
|
232
|
+
const validation = harness.validateHarnessConfig(sched);
|
|
233
|
+
if (!validation.valid) {
|
|
234
|
+
log('warn', `Scheduler: harness config invalid for ${sched.id} — skipping (errors: ${validation.errors.join('; ')})`);
|
|
235
|
+
continue;
|
|
236
|
+
}
|
|
237
|
+
try {
|
|
238
|
+
// Resolve schedule-time template variables on the title/description
|
|
239
|
+
// BEFORE handing the schedule to the harness builder so subtask
|
|
240
|
+
// prompts inherit the same substitutions as regular schedules.
|
|
241
|
+
const resolvedSched = {
|
|
242
|
+
...sched,
|
|
243
|
+
title: resolveScheduleTemplateVars(sched.title),
|
|
244
|
+
description: resolveScheduleTemplateVars(sched.description || sched.title),
|
|
245
|
+
harness_rubric: resolveScheduleTemplateVars(sched.harness_rubric),
|
|
246
|
+
};
|
|
247
|
+
const mission = harness.createTriAgentMission(resolvedSched);
|
|
248
|
+
for (const it of mission.items) work.push(it);
|
|
249
|
+
// Record the mission's planner id as lastWorkItemId for compatibility
|
|
250
|
+
// with the existing schedule-runs shape, plus lastMissionId so the
|
|
251
|
+
// dashboard and consolidation tooling can join across the trio.
|
|
252
|
+
writeScheduleRunEntry(runs, sched.id, mission.items[0].id, {
|
|
253
|
+
lastMissionId: mission.missionId,
|
|
254
|
+
harnessMode: harness.HARNESS_MODE.TRI_AGENT,
|
|
255
|
+
});
|
|
256
|
+
} catch (err) {
|
|
257
|
+
log('warn', `Scheduler: tri-agent mission build failed for ${sched.id}: ${err.message}`);
|
|
258
|
+
}
|
|
259
|
+
continue;
|
|
260
|
+
}
|
|
261
|
+
|
|
225
262
|
// Substitute schedule-time template vars (e.g. {{date}}) before the work
|
|
226
263
|
// item is written — single-pass playbook rendering can't reach placeholders
|
|
227
264
|
// embedded inside task_description, so they must be resolved up front.
|
package/engine/shared.js
CHANGED
|
@@ -2255,6 +2255,22 @@ const ENGINE_DEFAULTS = {
|
|
|
2255
2255
|
allowedDashboardOrigins: [],
|
|
2256
2256
|
meetingRoundTimeout: 900000, // 15min per meeting round — soft signal; logs a "still waiting" warning each tick
|
|
2257
2257
|
meetingRoundHardTimeout: 3600000, // 60min hard backstop — non-terminal participants are marked failed and the round advances. Prevents permanent stalls if an agent's dispatch never spawns or its completion gets dropped.
|
|
2258
|
+
// W-mq066js7000fff1f-c (steering Gap B): max wall-clock a steering message may
|
|
2259
|
+
// sit deferred (runtime hasn't emitted a resumable checkpoint yet — Copilot
|
|
2260
|
+
// pre-first-checkpoint, etc.). Past this window the message is flagged
|
|
2261
|
+
// stranded: `[steering-warn]` line on live-output, `_steeringStranded=true` on
|
|
2262
|
+
// the active dispatch row, and the steering store (when present) marked
|
|
2263
|
+
// `status='stranded'`. Default 15min; clamp 60_000..14_400_000 (1 min..4 h).
|
|
2264
|
+
steeringDeferredMaxMs: 900000,
|
|
2265
|
+
// W-mq066js7000fff1f-c (steering Gap C): cap on graceful+escalation kill
|
|
2266
|
+
// attempts after a steering kill is issued. Ladder (between attempts): 30s →
|
|
2267
|
+
// 60s → 120s, last interval reused. attempt 1 = killGracefully; attempts 2..cap
|
|
2268
|
+
// = platform-specific hard kill (taskkill /F /T on Windows, descendant tree +
|
|
2269
|
+
// pkill on Unix); after cap is reached and the process is still alive, the
|
|
2270
|
+
// engine gives up with a `[steering-stuck]` log + non-actionable inbox notice
|
|
2271
|
+
// so the agent surfaces in the dashboard for operator intervention. Default 3;
|
|
2272
|
+
// clamp 1..5.
|
|
2273
|
+
steeringMaxKillRetries: 3,
|
|
2258
2274
|
evalLoop: true, // enable review→fix loop after implementation completes
|
|
2259
2275
|
evalMaxCost: null, // USD ceiling per work item across all eval iterations; null = no limit (gather baseline data first)
|
|
2260
2276
|
maxRetries: 3, // max dispatch retries before marking work item as failed
|
package/engine/timeout.js
CHANGED
|
@@ -54,8 +54,87 @@ function checkIdleThreshold(config) {
|
|
|
54
54
|
|
|
55
55
|
// ─── Steering Checker ────────────────────────────────────────────────────────
|
|
56
56
|
|
|
57
|
-
//
|
|
58
|
-
|
|
57
|
+
// W-mq066js7000fff1f-c (Gap C): kill-retry escalation ladder. Intervals between
|
|
58
|
+
// successive kill attempts after a steering kill issues. Attempt i (1-indexed)
|
|
59
|
+
// uses STEERING_KILL_INTERVALS_MS[min(i-1, len-1)]; last value repeats so cap=5
|
|
60
|
+
// keeps stepping every 120s. cap=1 = one graceful retry only (then give-up).
|
|
61
|
+
const STEERING_KILL_INTERVALS_MS = [30000, 60000, 120000];
|
|
62
|
+
// Set of steering-store statuses still "in flight" — Gap G ignores entries that
|
|
63
|
+
// have already terminated (delivered/dropped/etc).
|
|
64
|
+
const STEERING_ACTIVE_STATUSES = new Set(['queued', 'live_kill', 'deferred', 're_spawning']);
|
|
65
|
+
|
|
66
|
+
// W-mq066js7000fff1f-c (Gap G): lazy require for engine/steering-store.js.
|
|
67
|
+
// Sibling branch -d ships the store; my branch ships independently and no-ops
|
|
68
|
+
// when the module is absent. Only swallows MODULE_NOT_FOUND for the exact
|
|
69
|
+
// resolution — other load errors (syntax/runtime) are logged as a warning so a
|
|
70
|
+
// broken store is visible in the logs, though the store still resolves to null.
|
|
71
|
+
let _steeringStoreLoaded = false;
|
|
72
|
+
let _steeringStore = null;
|
|
73
|
+
function getSteeringStore() {
|
|
74
|
+
if (_steeringStoreLoaded) return _steeringStore;
|
|
75
|
+
_steeringStoreLoaded = true;
|
|
76
|
+
try {
|
|
77
|
+
_steeringStore = require('./steering-store');
|
|
78
|
+
} catch (err) {
|
|
79
|
+
const isMissing = err && err.code === 'MODULE_NOT_FOUND' && /steering-store/.test(String(err.message || ''));
|
|
80
|
+
if (!isMissing) {
|
|
81
|
+
try { log('warn', `Steering: failed to load engine/steering-store.js: ${err.message}`); } catch {}
|
|
82
|
+
}
|
|
83
|
+
_steeringStore = null;
|
|
84
|
+
}
|
|
85
|
+
return _steeringStore;
|
|
86
|
+
}
|
|
87
|
+
// Test-only reset.
|
|
88
|
+
/**
 * Test-only: clears the cached steering-store lookup so the next
 * getSteeringStore() call performs the lazy require again.
 */
function _resetSteeringStoreCacheForTest() {
  _steeringStore = null;
  _steeringStoreLoaded = false;
}
|
|
89
|
+
// Test-only direct injection — bypasses the lazy require entirely. Pass `null`
|
|
90
|
+
// to simulate the module-absent case without touching require.cache.
|
|
91
|
+
/**
 * Test-only: installs `store` as the cached steering store and marks the
 * cache as loaded, so getSteeringStore() returns it without requiring anything.
 *
 * @param {object|null} store - fake store to serve, or null for "module absent".
 */
function _setSteeringStoreForTest(store) {
  _steeringStore = store;
  _steeringStoreLoaded = true;
}
|
|
95
|
+
|
|
96
|
+
/**
 * Resolves the steering id for an entry.
 *
 * A truthy `entry.id` wins and is returned as a string. Otherwise the id is
 * the digit run right after a `steering-` prefix in the file's basename,
 * taken from `entry.path` or from `entry` itself when it is a path string.
 *
 * @param {object|string|null|undefined} entry - steering entry or file path.
 * @returns {string|null} the id, or null when none can be derived.
 */
function _steeringIdForEntry(entry) {
  if (!entry) return null;
  if (entry.id) return String(entry.id);

  const candidatePath = entry.path || entry;
  if (typeof candidatePath !== 'string') return null;

  const match = /^steering-(\d+)/.exec(path.basename(candidatePath));
  return match === null ? null : match[1];
}
|
|
104
|
+
|
|
105
|
+
/**
 * Invokes a method on the steering store without ever throwing.
 *
 * @param {string} fn - name of the store method to call.
 * @param {...*} args - arguments forwarded to that method.
 * @returns {*} the method's return value, or null when the store is
 *   unavailable, the method is not a function, or the call threw (the failure
 *   is logged as a warning).
 */
function _safeStoreCall(fn, ...args) {
  const store = getSteeringStore();
  const callable = Boolean(store) && typeof store[fn] === 'function';
  if (!callable) return null;

  try {
    return store[fn](...args);
  } catch (err) {
    // Logging is itself best-effort: a broken logger must not turn this into a throw.
    try {
      log('warn', `Steering store ${fn} failed: ${err.message}`);
    } catch {}
    return null;
  }
}
|
|
114
|
+
|
|
115
|
+
/**
 * Reads `config.engine.steeringDeferredMaxMs` (falling back to the
 * ENGINE_DEFAULTS value when unset) and clamps it to 60000..14400000 ms.
 *
 * @param {object} [config] - engine config.
 * @returns {number} the clamped window in milliseconds; the ENGINE_DEFAULTS
 *   value when the configured value is not a finite number.
 */
function _clampSteeringDeferredMaxMs(config) {
  const LOWER_MS = 60000;
  const UPPER_MS = 14400000;

  const configured = config?.engine?.steeringDeferredMaxMs ?? ENGINE_DEFAULTS.steeringDeferredMaxMs;
  const value = Number(configured);
  if (!Number.isFinite(value)) return ENGINE_DEFAULTS.steeringDeferredMaxMs;

  if (value < LOWER_MS) return LOWER_MS;
  if (value > UPPER_MS) return UPPER_MS;
  return value;
}
|
|
120
|
+
|
|
121
|
+
/**
 * Reads `config.engine.steeringMaxKillRetries` (falling back to the
 * ENGINE_DEFAULTS value when unset), rounds it to the nearest integer and
 * clamps it to 1..5.
 *
 * @param {object} [config] - engine config.
 * @returns {number} the retry cap; the ENGINE_DEFAULTS value when the
 *   configured value is not a finite number.
 */
function _clampSteeringMaxKillRetries(config) {
  const MIN_RETRIES = 1;
  const MAX_RETRIES = 5;

  const configured = config?.engine?.steeringMaxKillRetries ?? ENGINE_DEFAULTS.steeringMaxKillRetries;
  const value = Number(configured);
  if (!Number.isFinite(value)) return ENGINE_DEFAULTS.steeringMaxKillRetries;

  const rounded = Math.round(value);
  if (rounded < MIN_RETRIES) return MIN_RETRIES;
  if (rounded > MAX_RETRIES) return MAX_RETRIES;
  return rounded;
}
|
|
126
|
+
|
|
127
|
+
/**
 * Returns the wait (ms) that must elapse before kill attempt `attemptIdx`
 * (0-based) may run, taken from STEERING_KILL_INTERVALS_MS. Indexes past the
 * end reuse the last interval; negative indexes use the first.
 *
 * Fractional indexes are truncated and non-numeric ones fall back to the
 * first rung. Without that guard the array lookup yields `undefined`, and a
 * caller comparing `elapsed <= undefined` would never wait at all.
 *
 * @param {number} attemptIdx - number of kill attempts already made.
 * @returns {number} interval in milliseconds.
 */
function _steeringKillIntervalMs(attemptIdx) {
  const lastIdx = STEERING_KILL_INTERVALS_MS.length - 1;
  const requested = Math.trunc(Number(attemptIdx));
  const idx = Number.isNaN(requested) ? 0 : Math.max(0, Math.min(lastIdx, requested));
  return STEERING_KILL_INTERVALS_MS[idx];
}
|
|
131
|
+
|
|
132
|
+
/**
 * Best-effort append of one line to `<AGENTS_DIR>/<agentId>/live-output.log`.
 * A trailing newline is added when `line` does not already end with one.
 * Every failure (bad arguments, missing directory, I/O error) is swallowed.
 *
 * @param {string} agentId - agent whose live-output log receives the line.
 * @param {string} line - text to append.
 * @returns {void}
 */
function _appendLiveOutputLine(agentId, line) {
  try {
    const target = path.join(AGENTS_DIR, agentId, 'live-output.log');
    const terminated = line.endsWith('\n');
    fs.appendFileSync(target, terminated ? line : `${line}\n`);
  } catch {
    /* optional */
  }
}
|
|
59
138
|
|
|
60
139
|
function runtimeSupportsMidRunSessionId(info) {
|
|
61
140
|
if (typeof info?.midRunSessionId === 'boolean') return info.midRunSessionId;
|
|
@@ -76,6 +155,16 @@ function rememberDeferredSteering(info, steerEntry) {
|
|
|
76
155
|
const existing = new Set(Array.isArray(info._deferredSteeringFiles) ? info._deferredSteeringFiles : []);
|
|
77
156
|
if (steerEntry?.path) existing.add(steerEntry.path);
|
|
78
157
|
info._deferredSteeringFiles = Array.from(existing);
|
|
158
|
+
// Stamp per-entry deferred timestamp for the Gap B stranded-sweep. Map keyed
|
|
159
|
+
// by file path so multiple deferred messages each track their own clock.
|
|
160
|
+
if (steerEntry?.path) {
|
|
161
|
+
if (!info._deferredSteeringQueuedAt || typeof info._deferredSteeringQueuedAt !== 'object') {
|
|
162
|
+
info._deferredSteeringQueuedAt = {};
|
|
163
|
+
}
|
|
164
|
+
if (!info._deferredSteeringQueuedAt[steerEntry.path]) {
|
|
165
|
+
info._deferredSteeringQueuedAt[steerEntry.path] = Date.now();
|
|
166
|
+
}
|
|
167
|
+
}
|
|
79
168
|
}
|
|
80
169
|
|
|
81
170
|
function deferSteeringUntilCheckpoint(id, info, steerEntry) {
|
|
@@ -97,14 +186,157 @@ function deferSteeringUntilCheckpoint(id, info, steerEntry) {
|
|
|
97
186
|
} catch { /* optional */ }
|
|
98
187
|
}
|
|
99
188
|
|
|
189
|
+
// W-mq066js7000fff1f-c (Gap B): per-tick sweep over `_deferredSteeringQueuedAt`
|
|
190
|
+
// entries that are still in `_deferredSteeringFiles` (source-of-truth) and have
|
|
191
|
+
// no sessionId after `engine.steeringDeferredMaxMs`. Each match: append
|
|
192
|
+
// `[steering-warn]` to live-output, mark `_steeringStranded: true` on the matching
|
|
193
|
+
// dispatch active row, and `store.updateStatus(id, 'stranded')` (no-op if store
|
|
194
|
+
// absent). `info._deferredSteeringStrandedFiles` (Set) guards against re-firing
|
|
195
|
+
// the same warning every tick while the file remains stranded.
|
|
196
|
+
/**
 * Flags deferred steering messages that have waited longer than the clamped
 * `engine.steeringDeferredMaxMs` while their process still has no sessionId.
 *
 * For each such file, once per file (tracked in
 * `info._deferredSteeringStrandedFiles`): a `[steering-warn]` line is written
 * to the agent's live output, a warning is logged, the matching active
 * dispatch row gets `_steeringStranded = true`, and — when an id can be
 * derived from the file name — the steering store entry is marked `stranded`.
 *
 * @param {Iterable<[string, object]>} activeProcesses - [dispatchId, info] pairs.
 * @param {object} config - engine config.
 * @returns {void}
 */
function checkDeferredStranded(activeProcesses, config) {
  const thresholdMs = _clampSteeringDeferredMaxMs(config);
  const sweepTime = Date.now();

  for (const [dispatchId, info] of activeProcesses) {
    // Only processes that still lack a sessionId can have stranded messages.
    if (!info || info.sessionId) continue;

    const queuedAtByFile = info._deferredSteeringQueuedAt;
    if (!queuedAtByFile || typeof queuedAtByFile !== 'object') continue;

    // `_deferredSteeringFiles` decides which files are still deferred.
    const pendingFiles = Array.isArray(info._deferredSteeringFiles)
      ? new Set(info._deferredSteeringFiles)
      : new Set();
    if (pendingFiles.size === 0) continue;

    if (!info._deferredSteeringStrandedFiles) {
      info._deferredSteeringStrandedFiles = new Set();
    }
    const alreadyWarned = info._deferredSteeringStrandedFiles;

    for (const filePath of pendingFiles) {
      if (alreadyWarned.has(filePath)) continue;

      const queuedMs = Number(queuedAtByFile[filePath]);
      const overdue = Number.isFinite(queuedMs) && sweepTime - queuedMs > thresholdMs;
      if (!overdue) continue;

      const entryId = _steeringIdForEntry({ path: filePath });
      const ageMin = Math.round((sweepTime - queuedMs) / 60000);

      _appendLiveOutputLine(
        info.agentId,
        `\n[steering-warn] Steering message ${entryId || path.basename(filePath)} has been queued for ${ageMin}m without a resumable checkpoint (no sessionId emitted). It will be delivered on the next dispatch.`,
      );
      log('warn', `Steering: ${info.agentId} (${dispatchId}) deferred steering ${entryId || filePath} stranded after ${ageMin}m — no sessionId`);
      alreadyWarned.add(filePath);

      try {
        dispatch().mutateDispatch((data) => {
          const rows = data.active || [];
          for (const row of rows) {
            if (row && row.id === dispatchId) row._steeringStranded = true;
          }
          return data;
        });
      } catch (err) {
        try {
          log('warn', `Steering: mutateDispatch _steeringStranded failed for ${dispatchId}: ${err.message}`);
        } catch {}
      }

      if (entryId) {
        _safeStoreCall('updateStatus', entryId, 'stranded', { last_error: 'no-session-after-max-defer' });
      }
    }
  }
}
|
|
231
|
+
|
|
232
|
+
// W-mq066js7000fff1f-c (Gap C): runs the kill-retry escalation ladder for an
|
|
233
|
+
// already-steered process whose `_steeringAt` is set but which hasn't exited.
|
|
234
|
+
// Caller is `checkSteering` per-tick loop; `info` is the `activeProcesses`
|
|
235
|
+
// record. Returns true if any rung fired (caller should `continue` and not
|
|
236
|
+
// scan for new steering messages for this dispatch).
|
|
237
|
+
/**
 * Advances the steering kill-retry ladder for one process by at most one rung.
 *
 * Does nothing unless a steering kill is in flight (`info._steeringAt` set,
 * `info._steeringGaveUp` not set) and the interval for the current attempt
 * count has elapsed. The interval is measured from `_steeringAt` before the
 * first retry and from `_steeringLastRetryAt` afterwards.
 *
 * While attempts are below the clamped `engine.steeringMaxKillRetries`, the
 * first rung calls `shared.killGracefully` and later rungs call
 * `_escalatePlatformKill`. Once the cap is reached it gives up: error log,
 * `[steering-stuck]` live-output line, an `[engine-system]` steering message,
 * and `info._steeringGaveUp = true`.
 *
 * @param {string} id - dispatch id (used in log lines).
 * @param {object} info - active-process record; mutated with ladder bookkeeping.
 * @param {object} config - engine config.
 * @returns {boolean} true when a rung (retry or give-up) ran on this call.
 */
function _runSteeringKillLadder(id, info, config) {
  const killInFlight = Boolean(info._steeringAt) && !info._steeringGaveUp;
  if (!killInFlight) return false;

  const cap = _clampSteeringMaxKillRetries(config);
  const attempts = Number(info._steeringKillAttempts) || 0;

  // Attempt 0 is timed from the original kill; later rungs from the last retry.
  let anchor = info._steeringAt;
  if (attempts !== 0 && info._steeringLastRetryAt) {
    anchor = info._steeringLastRetryAt;
  }
  const wait = _steeringKillIntervalMs(attempts);
  if (Date.now() - anchor <= wait) return false;

  if (attempts >= cap) {
    // Cap reached and the process is still around: stop escalating.
    const minutes = Math.round((Date.now() - info._steeringAt) / 60000);
    log('error', `Steering: ${info.agentId} (${id}) did not exit after ${cap} kill attempts (${minutes}m since kill) — giving up`);
    _appendLiveOutputLine(
      info.agentId,
      `\n[steering-stuck] Process did not exit after ${cap} kill attempts over ~${minutes}m. The engine is giving up automatic escalation; operator intervention may be required.`,
    );
    try {
      // The `[engine-system]` prefix frames the inbox body as a non-actionable
      // notification; the live-output line and error log are the main signal.
      steering.writeSteeringMessage(
        info.agentId,
        `[engine-system] Steering kill escalation gave up after ${cap} attempts (~${minutes}m). This is an automated notification — do not act on it; the engine has surfaced this dispatch for operator attention.`,
        { source: 'engine' },
      );
    } catch (err) {
      try {
        log('warn', `Steering: writeSteeringMessage for give-up failed: ${err.message}`);
      } catch {}
    }
    info._steeringGaveUp = true;
    info._steeringKillAttempts = attempts + 1;
    info._steeringLastRetryAt = Date.now();
    return true;
  }

  if (attempts === 0) {
    log('warn', `Steering: ${info.agentId} (${id}) didn't exit ${Math.round(wait / 1000)}s after kill — retrying gracefully`);
    try {
      shared.killGracefully(info.proc, 5000);
    } catch (err) {
      try {
        log('warn', `Steering kill retry (graceful) failed for ${info.agentId}: ${err.message}`);
      } catch {}
    }
  } else {
    log('warn', `Steering: ${info.agentId} (${id}) survived attempt ${attempts} — escalating (attempt ${attempts + 1}/${cap})`);
    _escalatePlatformKill(info);
  }

  info._steeringKillAttempts = attempts + 1;
  info._steeringLastRetryAt = Date.now();
  return true;
}
|
|
283
|
+
|
|
284
|
+
/**
 * Hard-kills the process tree rooted at `info.proc.pid`; no-op without a pid.
 *
 * Windows: a single `taskkill /F /T /PID <pid>`.
 * Elsewhere: SIGKILL each descendant returned by `_collectDescendantPids`,
 * then `pkill -KILL -P <pid>` for any remaining direct children, then SIGKILL
 * the root itself. Every step ignores failures, since the target may already
 * be gone.
 *
 * @param {object} info - active-process record carrying `proc.pid`.
 * @returns {void}
 */
function _escalatePlatformKill(info) {
  const pid = info?.proc?.pid;
  if (!pid) return;

  const onWindows = process.platform === 'win32';
  if (onWindows) {
    try {
      shared.exec(`taskkill /F /T /PID ${pid}`, { timeout: 3000 });
    } catch {
      /* may already be dead */
    }
    return;
  }

  // Unix: descendants first (in the order the collector returns them), then a
  // `pkill -P` sweep, and finally the root process.
  _collectDescendantPids(pid).forEach((childPid) => {
    try {
      process.kill(childPid, 'SIGKILL');
    } catch {
      /* gone */
    }
  });

  try {
    shared.exec(`pkill -KILL -P ${pid}`, { timeout: 3000 });
  } catch {
    /* children may already be dead */
  }

  try {
    process.kill(pid, 'SIGKILL');
  } catch {
    /* parent may already be dead */
  }
}
|
|
302
|
+
|
|
303
|
+
/**
 * Lists the descendant PIDs of `rootPid` (the root itself is excluded) by
 * walking the tree breadth-first with `pgrep -P <pid>` and returning the
 * discovery order reversed, so the most recently discovered (deepest)
 * processes come first. Killing leaves before their parents keeps
 * grandchildren from being re-parented out of reach of `pgrep -P`.
 *
 * A failing `pgrep` call is treated as "no children" for that pid.
 *
 * @param {number} rootPid - pid whose descendants should be collected.
 * @returns {number[]} descendant pids, last-discovered first.
 */
function _collectDescendantPids(rootPid) {
  const expanded = new Set();
  const frontier = [rootPid];
  const discovered = [];

  // `frontier` grows while we scan it; `head` plays the role of a queue cursor.
  for (let head = 0; head < frontier.length; head += 1) {
    const parentPid = frontier[head];
    if (expanded.has(parentPid)) continue;
    expanded.add(parentPid);

    let listing;
    try {
      listing = String(shared.exec(`pgrep -P ${parentPid}`, { timeout: 2000 }) || '');
    } catch {
      /* no children or pgrep missing */
      continue;
    }

    for (const token of listing.split(/\s+/)) {
      const childPid = Number(token);
      const usable = Number.isFinite(childPid) && childPid > 0;
      if (!usable || expanded.has(childPid)) continue;
      discovered.push(childPid);
      frontier.push(childPid);
    }
  }

  return discovered.reverse();
}
|
|
326
|
+
|
|
100
327
|
function checkSteering(config) {
|
|
101
328
|
const activeProcesses = engine().activeProcesses;
|
|
329
|
+
// Gap B: stranded-deferred sweep runs BEFORE the per-dispatch scan so a
|
|
330
|
+
// stranded info still records its warning even if it has no new inbox file.
|
|
331
|
+
try { checkDeferredStranded(activeProcesses, config); }
|
|
332
|
+
catch (err) { try { log('warn', `Steering: checkDeferredStranded failed: ${err.message}`); } catch {} }
|
|
333
|
+
|
|
102
334
|
for (const [id, info] of activeProcesses) {
|
|
103
|
-
// Gap A (W-mq066js7000fff1f-b): scan agents/<id>/steering-ack/
|
|
104
|
-
// ack files the agent has dropped since the last tick. Each
|
|
105
|
-
// removes its matching inbox file (lookup via frontmatter
|
|
106
|
-
// unread/pending iteration below naturally skips messages
|
|
107
|
-
// acknowledged via the explicit contract.
|
|
335
|
+
// Gap A (W-mq066js7000fff1f-b, master): scan agents/<id>/steering-ack/
|
|
336
|
+
// for any ack files the agent has dropped since the last tick. Each
|
|
337
|
+
// <id>.ack removes its matching inbox file (lookup via frontmatter
|
|
338
|
+
// steerId), so unread/pending iteration below naturally skips messages
|
|
339
|
+
// already acknowledged via the explicit contract.
|
|
108
340
|
let ackedFromDir = [];
|
|
109
341
|
try {
|
|
110
342
|
ackedFromDir = steering.ackSteeringFromAckDir(info.agentId);
|
|
@@ -140,20 +372,12 @@ function checkSteering(config) {
|
|
|
140
372
|
}
|
|
141
373
|
}
|
|
142
374
|
|
|
143
|
-
//
|
|
144
|
-
//
|
|
145
|
-
//
|
|
146
|
-
if (info._steeringAt &&
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
shared.killImmediate(info.proc);
|
|
150
|
-
// On Unix, also try to kill children that may have been orphaned
|
|
151
|
-
if (process.platform !== 'win32' && info.proc?.pid) {
|
|
152
|
-
try { shared.exec(`pkill -KILL -P ${info.proc.pid}`, { timeout: 3000 }); } catch { /* children may already be dead */ }
|
|
153
|
-
}
|
|
154
|
-
info._steeringRetried = true;
|
|
155
|
-
}
|
|
156
|
-
continue;
|
|
375
|
+
// Gap C (this PR): if a steering kill is in-flight, run the escalation
|
|
376
|
+
// ladder. The ladder owns all retry side effects and the give-up exit,
|
|
377
|
+
// and replaces the old one-shot STEERING_KILL_RETRY_MS retry path.
|
|
378
|
+
if (info._steeringAt && !info._steeringGaveUp) {
|
|
379
|
+
const fired = _runSteeringKillLadder(id, info, config);
|
|
380
|
+
if (fired) continue;
|
|
157
381
|
}
|
|
158
382
|
|
|
159
383
|
// Skip if already being steered (prevents double-kill race)
|
|
@@ -217,6 +441,43 @@ function checkSteering(config) {
|
|
|
217
441
|
}
|
|
218
442
|
}
|
|
219
443
|
|
|
444
|
+
// W-mq066js7000fff1f-c (Gap G): when engine.js#onAgentClose's "No conversation
|
|
445
|
+
// found" branch is about to unlink the session.json, any steering-store entries
|
|
446
|
+
// still in flight against that purged sessionId would silently strand. This
|
|
447
|
+
// helper drops them with `status='dropped'` + a `[steering-failed]` live-output
|
|
448
|
+
// line so the human knows to re-send. Caller MUST invoke this BEFORE the unlink.
|
|
449
|
+
// No-ops cleanly when the steering-store module is absent (-d branch unmerged).
|
|
450
|
+
/**
 * Drops every in-flight steering-store entry of `agentId` that is bound to
 * `sessionId`: the entry is marked `dropped` in the store, a
 * `[steering-failed]` line is appended to live output (to `liveOutputPath`
 * when given, otherwise to the agent's default live-output log), and a
 * warning is logged.
 *
 * Entries whose status is not in STEERING_ACTIVE_STATUSES, whose session id
 * differs, or for which no id can be determined are left untouched.
 *
 * @param {string} agentId - agent whose store entries are inspected.
 * @param {string} sessionId - the purged session id.
 * @param {string} [liveOutputPath] - explicit live-output file to append to.
 * @returns {{dropped: Array, skipped: boolean}} ids that were dropped;
 *   `skipped` is true when nothing was inspected (missing arguments, store
 *   unavailable, or listing failed).
 */
function dropSteeringForPurgedSession(agentId, sessionId, liveOutputPath) {
  const nothingInspected = () => ({ dropped: [], skipped: true });

  if (!agentId || !sessionId) return nothingInspected();

  const store = getSteeringStore();
  if (!store || typeof store.listForAgent !== 'function') return nothingInspected();

  let entries = [];
  try {
    entries = store.listForAgent(agentId) || [];
  } catch (err) {
    try {
      log('warn', `Steering: listForAgent for ${agentId} failed: ${err.message}`);
    } catch {}
    return nothingInspected();
  }

  const dropped = [];
  for (const entry of entries) {
    if (!entry) continue;

    const inFlight = STEERING_ACTIVE_STATUSES.has(String(entry.status || ''));
    if (!inFlight) continue;

    const boundSession = entry._steeringSessionId || entry.sessionId || entry.session_id;
    if (boundSession !== sessionId) continue;

    const entryId = entry.id || _steeringIdForEntry(entry);
    if (!entryId) continue;

    _safeStoreCall('updateStatus', entryId, 'dropped', { last_error: 'session-purged' });

    const notice = `\n[steering-failed] Session ${sessionId} was purged by runtime; message ${entryId} dropped, please re-send.\n`;
    if (!liveOutputPath) {
      _appendLiveOutputLine(agentId, notice);
    } else {
      try {
        fs.appendFileSync(liveOutputPath, notice);
      } catch {
        /* optional */
      }
    }

    log('warn', `Steering: dropped message ${entryId} for ${agentId} (session ${sessionId} purged)`);
    dropped.push(entryId);
  }

  return { dropped, skipped: false };
}
|
|
480
|
+
|
|
220
481
|
// ─── Timeout Checker ─────────────────────────────────────────────────────────
|
|
221
482
|
|
|
222
483
|
function trackedProcessPid(procInfo) {
|
|
@@ -676,7 +937,11 @@ module.exports = {
|
|
|
676
937
|
checkTimeouts,
|
|
677
938
|
checkSteering,
|
|
678
939
|
checkIdleThreshold,
|
|
940
|
+
dropSteeringForPurgedSession,
|
|
679
941
|
isOsPidAliveForDispatch,
|
|
680
942
|
parseProcessExitCode, terminalResultIndicatesError, parseTerminalResultFallbackExitCode, // exported for testing
|
|
681
943
|
readFileTail, runtimeSupportsMidRunSessionId, // exported for testing
|
|
944
|
+
// exported for testing
|
|
945
|
+
rememberDeferredSteering, checkDeferredStranded, _runSteeringKillLadder, _collectDescendantPids,
|
|
946
|
+
_resetSteeringStoreCacheForTest, _setSteeringStoreForTest,
|
|
682
947
|
};
|