loreli 0.0.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +710 -97
- package/bin/loreli.js +89 -0
- package/package.json +77 -14
- package/packages/README.md +101 -0
- package/packages/action/README.md +98 -0
- package/packages/action/prompts/action.md +172 -0
- package/packages/action/src/index.js +684 -0
- package/packages/agent/README.md +606 -0
- package/packages/agent/src/backends/claude.js +387 -0
- package/packages/agent/src/backends/codex.js +351 -0
- package/packages/agent/src/backends/cursor.js +371 -0
- package/packages/agent/src/backends/index.js +486 -0
- package/packages/agent/src/base.js +138 -0
- package/packages/agent/src/cli.js +275 -0
- package/packages/agent/src/discover.js +396 -0
- package/packages/agent/src/factory.js +124 -0
- package/packages/agent/src/index.js +12 -0
- package/packages/agent/src/models.js +159 -0
- package/packages/agent/src/output.js +62 -0
- package/packages/agent/src/session.js +162 -0
- package/packages/agent/src/trace.js +186 -0
- package/packages/classify/README.md +136 -0
- package/packages/classify/prompts/blocker.md +12 -0
- package/packages/classify/prompts/feedback.md +14 -0
- package/packages/classify/prompts/pane-state.md +20 -0
- package/packages/classify/src/index.js +81 -0
- package/packages/config/README.md +898 -0
- package/packages/config/src/defaults.js +145 -0
- package/packages/config/src/index.js +223 -0
- package/packages/config/src/schema.js +291 -0
- package/packages/config/src/validate.js +160 -0
- package/packages/context/README.md +165 -0
- package/packages/context/src/index.js +198 -0
- package/packages/hub/README.md +338 -0
- package/packages/hub/src/base.js +154 -0
- package/packages/hub/src/github.js +1597 -0
- package/packages/hub/src/index.js +79 -0
- package/packages/hub/src/labels.js +48 -0
- package/packages/identity/README.md +288 -0
- package/packages/identity/src/index.js +620 -0
- package/packages/identity/src/themes/avatar.js +217 -0
- package/packages/identity/src/themes/digimon.js +217 -0
- package/packages/identity/src/themes/dragonball.js +217 -0
- package/packages/identity/src/themes/lotr.js +217 -0
- package/packages/identity/src/themes/marvel.js +217 -0
- package/packages/identity/src/themes/pokemon.js +217 -0
- package/packages/identity/src/themes/starwars.js +217 -0
- package/packages/identity/src/themes/transformers.js +217 -0
- package/packages/identity/src/themes/zelda.js +217 -0
- package/packages/knowledge/README.md +217 -0
- package/packages/knowledge/src/index.js +243 -0
- package/packages/log/README.md +93 -0
- package/packages/log/src/index.js +252 -0
- package/packages/marker/README.md +200 -0
- package/packages/marker/src/index.js +184 -0
- package/packages/mcp/README.md +323 -0
- package/packages/mcp/instructions.md +126 -0
- package/packages/mcp/scaffolding/.agents/skills/loreli-context/SKILL.md +89 -0
- package/packages/mcp/scaffolding/ISSUE_TEMPLATE/config.yml +2 -0
- package/packages/mcp/scaffolding/ISSUE_TEMPLATE/loreli.yml +83 -0
- package/packages/mcp/scaffolding/loreli.yml +491 -0
- package/packages/mcp/scaffolding/mcp-configs/.codex/config.toml +4 -0
- package/packages/mcp/scaffolding/mcp-configs/.cursor/mcp.json +14 -0
- package/packages/mcp/scaffolding/mcp-configs/.mcp.json +14 -0
- package/packages/mcp/scaffolding/pull-request.md +23 -0
- package/packages/mcp/src/index.js +600 -0
- package/packages/mcp/src/tools/agent-context.js +44 -0
- package/packages/mcp/src/tools/agents.js +450 -0
- package/packages/mcp/src/tools/context.js +200 -0
- package/packages/mcp/src/tools/github.js +1163 -0
- package/packages/mcp/src/tools/hitl.js +162 -0
- package/packages/mcp/src/tools/index.js +18 -0
- package/packages/mcp/src/tools/refactor.js +227 -0
- package/packages/mcp/src/tools/repo.js +44 -0
- package/packages/mcp/src/tools/start.js +904 -0
- package/packages/mcp/src/tools/status.js +149 -0
- package/packages/mcp/src/tools/work.js +134 -0
- package/packages/orchestrator/README.md +192 -0
- package/packages/orchestrator/src/index.js +1492 -0
- package/packages/planner/README.md +251 -0
- package/packages/planner/prompts/plan-reviewer.md +109 -0
- package/packages/planner/prompts/planner.md +191 -0
- package/packages/planner/prompts/tiebreaker-reviewer.md +71 -0
- package/packages/planner/src/index.js +1381 -0
- package/packages/review/README.md +129 -0
- package/packages/review/prompts/reviewer.md +158 -0
- package/packages/review/src/index.js +1403 -0
- package/packages/risk/README.md +178 -0
- package/packages/risk/prompts/risk.md +272 -0
- package/packages/risk/src/index.js +439 -0
- package/packages/session/README.md +165 -0
- package/packages/session/src/index.js +215 -0
- package/packages/test-utils/README.md +96 -0
- package/packages/test-utils/src/index.js +354 -0
- package/packages/tmux/README.md +261 -0
- package/packages/tmux/src/index.js +501 -0
- package/packages/workflow/README.md +317 -0
- package/packages/workflow/prompts/preamble.md +14 -0
- package/packages/workflow/src/index.js +660 -0
- package/packages/workflow/src/proof-of-life.js +74 -0
- package/packages/workspace/README.md +143 -0
- package/packages/workspace/src/index.js +1127 -0
- package/index.js +0 -8
|
@@ -0,0 +1,1492 @@
|
|
|
1
|
+
import { rm, writeFile, mkdir } from 'node:fs/promises';
|
|
2
|
+
import { join } from 'node:path';
|
|
3
|
+
import { createHash } from 'node:crypto';
|
|
4
|
+
import { EventEmitter } from 'node:events';
|
|
5
|
+
import { Factory, Session, output } from 'loreli/agent';
|
|
6
|
+
import { Tmux } from 'loreli/tmux';
|
|
7
|
+
import { prepare } from 'loreli/workspace';
|
|
8
|
+
import { pick, side, capability } from 'loreli/identity';
|
|
9
|
+
import { classify } from 'loreli/classify';
|
|
10
|
+
import { logger } from 'loreli/log';
|
|
11
|
+
|
|
12
|
+
// Module-scoped logger; all orchestrator messages share this namespace.
const log = logger('orchestrator');
|
|
13
|
+
|
|
14
|
+
/**
 * Fatal error patterns that indicate a backend infrastructure failure
 * (not a task failure). When these appear in an agent's pane output
 * shortly after spawn, the backend is broken and should be degraded.
 *
 * Matched against raw pane text by hasFatalError(); all patterns are
 * case-insensitive.
 *
 * @type {RegExp[]}
 */
const FATAL_PATTERNS = [
  // Spend / usage exhaustion
  /budget[_ ]*(has been )?exceeded/i,
  /rate[_ ]?limit[_ ]*exceeded/i,
  /hit your usage[_ ]*limit/i,
  // Credential problems
  /authentication[_ ]*(error|failed)/i,
  /invalid[_ ]*api[_ ]*key/i,
  // Quota exhaustion
  /quota[_ ]*exceeded/i,
  /insufficient[_ ]*quota/i,
  // Misconfiguration / connectivity
  /invalid model name/i,
  /unable to connect to api/i,
  /connection\s*refused/i
];
|
|
33
|
+
|
|
34
|
+
/**
 * Maximum pane characters logged in diagnostic debug output.
 * Pane captures longer than this are truncated by paneDebug().
 *
 * @type {number}
 */
const PANE_DEBUG_LIMIT = 4000;
|
|
40
|
+
|
|
41
|
+
/**
 * Test pane output against the known fatal API error patterns.
 *
 * @param {string} output - Pane content from agent.capture().
 * @returns {boolean} True if a fatal error pattern is found.
 */
function hasFatalError(output) {
  if (!output) {
    return false;
  }
  for (const pattern of FATAL_PATTERNS) {
    if (pattern.test(output)) {
      return true;
    }
  }
  return false;
}
|
|
51
|
+
|
|
52
|
+
/**
 * Format captured pane output for debug logging.
 *
 * Keeps logs readable while still preserving enough context to validate
 * classifier and fallback decisions during stall/rapid-death diagnosis.
 *
 * @param {string} output - Raw pane output.
 * @returns {string} Pane text, truncated when necessary.
 */
function paneDebug(output) {
  if (!output) {
    return '[empty pane output]';
  }
  const overflow = output.length - PANE_DEBUG_LIMIT;
  if (overflow <= 0) {
    return output;
  }
  const head = output.slice(0, PANE_DEBUG_LIMIT);
  return `${head}\n… [truncated ${overflow} chars]`;
}
|
|
67
|
+
|
|
68
|
+
/**
 * Normalize remedy instructions into tmux key names.
 *
 * Classifier prompts return remedies as space-delimited strings
 * (`"Down Enter"`), while backend fallback diagnose methods return
 * string arrays (`['Down', 'Enter']`). The orchestrator accepts both.
 * Anything else (null, undefined, non-strings) falls back to Enter.
 *
 * @param {string|string[]|null|undefined} value - Remedy from diagnosis.
 * @returns {string[]} Tmux key sequence — never empty; defaults to ['Enter'].
 */
function remedy(value) {
  // Fix: the parameter previously shadowed the function's own name
  // (`function remedy(remedy)`), which obscured the body and would break
  // any self-reference. Renamed to `value`; callers are unaffected.
  let keys = [];
  if (Array.isArray(value)) {
    keys = value.filter(Boolean);
  } else if (typeof value === 'string') {
    keys = value.split(/\s+/).filter(Boolean);
  }
  return keys.length > 0 ? keys : ['Enter'];
}
|
|
91
|
+
|
|
92
|
+
/**
 * Generic agent lifecycle coordinator via EventEmitter.
 *
 * Manages spawn/shutdown/kill, reactor polling, stall detection,
 * and activity tracking. Contains zero role-specific logic — all
 * planner/action/review behavior lives in the role packages that
 * subscribe to lifecycle events.
 *
 * @extends EventEmitter
 * @fires Orchestrator#spawned
 * @fires Orchestrator#removed
 * @fires Orchestrator#stall
 */
export class Orchestrator extends EventEmitter {
  /**
   * @param {object} opts
   * @param {object} opts.hub - Hub instance for git hosting operations.
   * @param {object} opts.identityRegistry - Registry for agent identities.
   * @param {object} opts.backendRegistry - Registry for agent backends.
   * @param {object} opts.storage - Persistent storage instance.
   * @param {object} [opts.config] - Config instance from loreli/config.
   */
  constructor({ hub, identityRegistry, backendRegistry, storage, config }) {
    super();

    /** @type {object} Hub for git hosting. */
    this.hub = hub;

    /** @type {object} Identity registry. */
    this.identityRegistry = identityRegistry;

    /** @type {object} Backend registry. */
    this.backendRegistry = backendRegistry;

    /** @type {Factory} Agent factory — centralizes the create+spawn pipeline. */
    this.factory = new Factory({ backends: backendRegistry, identities: identityRegistry, config });

    /** @type {object} Storage for session persistence. */
    this.storage = storage;

    /** @type {object|null} Config instance from loreli/config. */
    this.cfg = config ?? null;

    /** @type {Map<string, object>} Active agents by name. */
    this.agents = new Map();

    /** @type {string|null} Current session ID. */
    this.sessionId = null;

    /** @type {object|null} MCP client identity. */
    this.clientIdentity = null;

    /** @type {string|null} Target repository in "owner/name" format. */
    this.repo = null;

    // Timeout lookups tolerate a missing config via optional chaining;
    // the literals are the defaults when no config is provided.
    /** @type {number} Stall timeout in ms (default 600000 = 10 minutes). */
    this.stallTimeout = this.cfg?.get?.('timeouts.stall') ?? 600000;

    /** @type {number} Delay before checking if a freshly spawned agent died (default 15s). */
    this.rapidDeathDelay = this.cfg?.get?.('timeouts.rapidDeath') ?? 15000;

    /** @type {NodeJS.Timeout|null} Stall detection interval handle. */
    this._monitorHandle = null;

    /** @type {Map<string, string>} Last known activity timestamp per agent. */
    this._lastActivity = new Map();

    /** @type {Map<string, string>} MD5 hash of last captured pane output per agent for tmux-based activity detection. */
    this._lastPaneHash = new Map();

    /** @type {Map<string, number>} Consecutive classify failures per agent — safety net kill after threshold. */
    this._classifyFails = new Map();

    /** @type {NodeJS.Timeout|null} Reactor polling interval handle. */
    this._watchHandle = null;

    /** @type {Map<string, function>} Registered reactor handlers by name. */
    this._handlers = new Map();

    /**
     * Names already claimed by other participants, discovered from
     * GitHub claim comments and PR branches during reactor ticks.
     * Populated as a zero-cost side effect of data the reactor
     * already fetches — no additional API calls needed.
     *
     * @type {Set<string>}
     */
    this.takenNames = new Set();

    /**
     * Agents we've removed locally (killed or shut down). Used to
     * distinguish "we killed this agent" from "a foreign orchestrator
     * owns this agent" during proof-of-life decisions.
     *
     * @type {Set<string>}
     */
    this._removed = new Set();

    /**
     * Registered workflows by role. Populated during start so
     * scale() can collect demand signals from each workflow.
     *
     * @type {Map<string, object>}
     */
    this.workflows = new Map();

    /**
     * Last spawn timestamp per role for cooldown enforcement.
     * Prevents thrashing when demand fluctuates between ticks.
     *
     * @type {Map<string, number>}
     */
    this._lastSpawn = new Map();
  }
|
|
206
|
+
|
|
207
|
+
// ── Seed (Identity Discovery) ────────────────────────
|
|
208
|
+
|
|
209
|
+
/**
|
|
210
|
+
* One-time seed of `takenNames` from open PR branch names.
|
|
211
|
+
*
|
|
212
|
+
* Called before the first `acquire()` — before any reactor tick has
|
|
213
|
+
* populated the set. After the first tick, the reactor keeps the
|
|
214
|
+
* set current as a side effect of its normal data flow.
|
|
215
|
+
*
|
|
216
|
+
* Idempotent: only runs once per orchestrator lifetime.
|
|
217
|
+
*
|
|
218
|
+
* @returns {Promise<void>}
|
|
219
|
+
*/
|
|
220
|
+
async seed() {
|
|
221
|
+
if (this._seeded) return;
|
|
222
|
+
this._seeded = true;
|
|
223
|
+
|
|
224
|
+
if (!this.hub || !this.repo) return;
|
|
225
|
+
|
|
226
|
+
try {
|
|
227
|
+
const prs = await this.hub.pulls(this.repo, { state: 'open' });
|
|
228
|
+
for (const pr of prs) {
|
|
229
|
+
const slash = pr.head?.indexOf('/');
|
|
230
|
+
if (slash > 0) this.takenNames.add(pr.head.slice(0, slash));
|
|
231
|
+
}
|
|
232
|
+
log.debug(`seed: discovered ${this.takenNames.size} taken names from open PRs`);
|
|
233
|
+
} catch (err) {
|
|
234
|
+
log.debug(`seed: skipped — ${err.message}`);
|
|
235
|
+
}
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
// ── Lifecycle ─────────────────────────────────────────

/**
 * Spawn and register an agent with comprehensive rollback.
 *
 * Tracks each step of the spawn process and undoes them in reverse
 * order on failure. Each completed step is tracked and unwound
 * individually on error.
 *
 * Also arms a one-shot rapid-death timer: an agent that dies or shows
 * fatal/stuck pane output within `rapidDeathDelay` ms of spawning is
 * diagnosed, remediated, or torn down (see inline comments below).
 *
 * @param {object} agent - Agent instance.
 * @returns {Promise<void>}
 * @fires Orchestrator#spawned
 */
async spawn(agent) {
  log.info(`spawning agent: ${agent.identity.name} (${agent.role})`);

  /** @type {Array<{step: string, undo: function}>} Completed steps for rollback. */
  const completed = [];

  try {
    // Step 1: Spawn the agent process (tmux pane, API session, etc.)
    await agent.spawn();
    completed.push({
      step: 'spawn',
      undo: async function undoSpawn() {
        try { await agent.stop(); } catch (e) { log.debug(`rollback stop failed: ${e.message}`); }
      }
    });

    // Step 2: Register in the agents map
    this.agents.set(agent.identity.name, agent);
    completed.push({
      step: 'register',
      undo: () => { this.agents.delete(agent.identity.name); }
    });

    // Step 3: Record activity timestamp
    this._lastActivity.set(agent.identity.name, new Date().toISOString());
    completed.push({
      step: 'activity',
      undo: () => { this._lastActivity.delete(agent.identity.name); }
    });

    log.info(`agent spawned: ${agent.identity.name}`);

    // Schedule rapid-death detection: if the agent dies or shows
    // fatal API errors within rapidDeathDelay of spawning, the
    // backend is likely broken (budget exhaustion, API outage).
    // Mark it as degraded so scale() falls back to cursor-agent.
    //
    // Uses the pane-state classifier when pane output is available
    // (remain-on-exit keeps dead panes capturable). Falls back to
    // raw alive() when capture fails.
    if (agent.backend && agent.alive) {
      const backend = agent.backend;
      const name = agent.identity.name;
      const registry = this.backendRegistry;
      // Captured for use inside the timer callback — a named plain
      // function, so `this` would not be the orchestrator there.
      const self = this;
      const timer = setTimeout(async function rapidDeathCheck() {
        // Dormant means the agent was deliberately put to rest; skip.
        if (agent.state === 'dormant') return;

        try {
          const alive = await agent.alive();

          // Agent is alive and healthy — no rapid death
          if (alive && !agent.capture) return;

          let output;
          try {
            output = agent.capture
              ? await agent.capture(self.cfg?.get?.('classify.maxLines') ?? 100)
              : null;
          } catch { output = null; }
          if (output !== null) {
            log.debug(`rapid-death pane ${name} (${backend}, alive=${alive}):\n${paneDebug(output)}`);
          } else {
            log.debug(`rapid-death pane ${name} (${backend}, alive=${alive}): [capture unavailable]`);
          }

          // Classify the pane output to determine why the agent
          // died or what error it hit while still alive.
          let diagnosis;
          if (output) {
            try {
              diagnosis = await classify('pane-state', output, {
                backends: self.backendRegistry,
                config: self.cfg,
                vars: { model: agent.model, backend, role: agent.role }
              });
              log.info(`rapid-death classify ${name}: ${diagnosis.category} — ${diagnosis.reasoning}`);
            } catch (err) {
              log.warn(`rapid-death classify failed for ${name}: ${err.message}`);
            }
          }

          // When LLM classify fails, fall back to backend-specific
          // regex detection. Each backend knows its CLI's dialog patterns.
          let category = diagnosis?.category;
          if (alive && output) {
            const fallback = registry?.diagnose?.(backend, output);
            // Categories that trigger a concrete action below; the
            // fallback diagnosis wins only when it is actionable and
            // the LLM's is not.
            const actionable = new Set(['option_dialog', 'waiting_for_input', 'fatal', 'dead']);
            const fallbackActionable = actionable.has(fallback?.category);
            const llmActionable = actionable.has(category);
            const llmCategory = category;

            if (!category && fallback) {
              category = fallback.category;
              diagnosis = fallback;
              log.info(`rapid-death fallback diagnose ${name}: ${category} — ${fallback.reasoning}`);
            } else if (fallbackActionable && !llmActionable) {
              category = fallback.category;
              diagnosis = fallback;
              log.info(`rapid-death fallback override ${name}: ${fallback.category} over ${llmCategory ?? 'unknown'} — ${fallback.reasoning}`);
            }
          }

          // Hard rapid death: process gone. Degrade the backend,
          // reap the agent, and notify listeners.
          if (!alive) {
            log.warn(`rapid death: ${name} died within ${self.rapidDeathDelay}ms of spawn (${category ?? 'unknown'}) — marking ${backend} degraded`);
            registry?.recordFailure(backend);
            try { await self.kill(name); } catch { /* already dead */ }
            self.emit('rapid-death', { name, backend, diagnosis });
            return;
          }

          // Alive with recoverable dialog — send the appropriate
          // input to dismiss it. Record a soft warning instead of a
          // hard failure so the backend isn't blacklisted for a
          // transient issue. Repeated warnings promote to failure.
          if (category === 'option_dialog') {
            const keys = remedy(diagnosis?.remedy);
            log.info(`rapid-death remediation: ${name} has option dialog — sending ${keys.join('+')}`);
            try {
              const tmux = new Tmux();
              await tmux.keys(agent.paneId, ...keys);
            } catch (err) { log.debug(`rapid-death: keys failed for ${name}: ${err.message}`); }
            registry?.recordWarning?.(backend);
            self.emit('rapid-death', { name, backend, reason: 'remediated', diagnosis });
            return;
          }

          if (category === 'waiting_for_input') {
            log.info(`rapid-death remediation: ${name} waiting for input — sending continuation`);
            try {
              await agent.send('Please continue working or report your status.');
            } catch (err) { log.debug(`rapid-death: send failed for ${name}: ${err.message}`); }
            registry?.recordWarning?.(backend);
            self.emit('rapid-death', { name, backend, reason: 'remediated', diagnosis });
            return;
          }

          // Alive but classifier detected fatal state
          if (category === 'fatal' || category === 'dead') {
            log.warn(`stuck-alive: ${name} classified as ${category} — marking ${backend} degraded`);
            registry?.recordFailure(backend);
            try { await agent.stop(); } catch { /* stop can fail */ }
            self.emit('rapid-death', { name, backend, reason: 'stuck-alive', diagnosis });
            return;
          }

          // Alive but regex fallback for when classifier didn't detect fatal
          if (alive && output && hasFatalError(output)) {
            log.warn(`stuck-alive: ${name} shows fatal API error (regex) — marking ${backend} degraded`);
            registry?.recordFailure(backend);
            try { await agent.stop(); } catch { /* stop can fail */ }
            self.emit('rapid-death', { name, backend, reason: 'stuck-alive' });
          }
        } catch { /* pane check can fail when session is torn down */ }
      }, this.rapidDeathDelay);
      // Never let a pending rapid-death check keep the process alive.
      timer.unref();
    }

    /**
     * @event Orchestrator#spawned
     * @type {object}
     * @property {string} name - Agent identity name.
     * @property {string} role - Agent role.
     * @property {string} provider - AI provider.
     */
    this.emit('spawned', {
      name: agent.identity.name,
      role: agent.role,
      provider: agent.identity.provider
    });
  } catch (err) {
    log.error(`spawn failed: ${agent.identity.name} — ${err.message}`);

    // Rollback completed steps in reverse order
    for (let i = completed.length - 1; i >= 0; i--) {
      try {
        await completed[i].undo();
        log.debug(`rollback: undid "${completed[i].step}" for ${agent.identity.name}`);
      } catch (rollbackErr) {
        // Never mask the original error with a rollback failure
        log.warn(`rollback "${completed[i].step}" failed: ${rollbackErr.message}`);
      }
    }

    // Identity release is the caller's responsibility (add_agent tool
    // acquired it, add_agent tool releases it). Spawn only rolls back
    // the steps it owns: process, registration, and activity tracking.
    throw err;
  }
}
|
|
441
|
+
|
|
442
|
+
/**
 * Gracefully shut down an agent using a 3-phase protocol:
 *
 * 1. Send a structured shutdown request with a unique `requestId`
 * 2. Poll for acknowledgment — the agent can accept or continue working
 * 3. On acknowledgment or timeout, force stop and clean up
 *
 * An agent that goes dormant or whose process exits during polling is
 * treated as having acknowledged.
 *
 * @param {string} name - Agent identity name.
 * @param {number} [timeout] - Timeout before force kill.
 * @param {string} [reason] - Reason for shutdown (passed to agent).
 * @returns {Promise<{acknowledged: boolean}>} Whether the agent acknowledged.
 * @fires Orchestrator#removed
 */
async shutdown(name, timeout, reason) {
  const agent = this.agents.get(name);
  if (!agent) throw new Error(`Agent "${name}" not found`);

  const shutdownTimeout = timeout ?? this.cfg?.get?.('timeouts.shutdown') ?? 60000;
  const pollInterval = this.cfg?.get?.('timeouts.poll') ?? 2000;
  // Timestamped id included in the shutdown message and in logs so
  // this request can be correlated later.
  const requestId = `shutdown-${Date.now()}@${name}`;

  log.info(`shutting down agent: ${name} (timeout: ${shutdownTimeout}ms, requestId: ${requestId})`);

  // Phase 1: Send structured shutdown request
  const shutdownMessage = [
    `**Shutdown Request** (id: ${requestId})`,
    reason ? `Reason: ${reason}` : '',
    'Please finish your current work and shut down gracefully.',
    'Post a comment or signal when ready.'
  ].filter(Boolean).join('\n');

  // Best-effort: a dead pane can make send() throw; polling below
  // will then observe the dead agent and treat it as acknowledged.
  try {
    await agent.send(shutdownMessage);
  } catch (err) { log.debug(`shutdown message failed for ${name}: ${err.message}`); }

  // Phase 2: Poll for acknowledgment or timeout
  let acknowledged = false;
  const deadline = Date.now() + shutdownTimeout;

  while (Date.now() < deadline) {
    if (agent.state === 'dormant') {
      acknowledged = true;
      break;
    }

    // Check if the agent is still alive — if it exited on its own,
    // treat that as implicit acknowledgment
    try {
      const alive = await agent.alive?.();
      if (alive === false) {
        acknowledged = true;
        break;
      }
    } catch (err) { log.debug(`alive check during shutdown polling failed: ${err.message}`); }

    // Sleep between polls.
    await new Promise(function wait(r) { setTimeout(r, pollInterval); });
  }

  if (acknowledged) {
    log.info(`agent ${name} acknowledged shutdown`);
  } else {
    log.warn(`agent ${name} did not acknowledge shutdown within ${shutdownTimeout}ms, force stopping`);
  }

  // Phase 3: Force stop and clean up
  // Capture the tmux session name before stop() so the empty-session
  // prune below still knows where the agent lived.
  const session = agent.session;
  await this.snapshot(name, agent);
  await agent.stop();

  // Clean up the agent's workspace directory when configured
  if (this.cfg?.get?.('workspace.cleanup') && agent.cwd) {
    try {
      await rm(agent.cwd, { recursive: true, force: true });
      log.info(`workspace cleaned: ${name} (${agent.cwd})`);
    } catch (err) { log.debug(`workspace cleanup failed for ${name}: ${err.message}`); }
  }

  // Drop every per-agent tracking entry and record the removal so
  // proof-of-life logic knows this removal was ours.
  this.agents.delete(name);
  this._lastActivity.delete(name);
  this._lastPaneHash.delete(name);
  this._classifyFails.delete(name);
  this._removed.add(name);
  this.identityRegistry.release(agent.identity);
  log.info(`agent shut down: ${name}`);

  // Last agent gone — destroy the (possibly orphaned) tmux session.
  if (this.agents.size === 0 && session) await this._pruneSession(session);

  /**
   * @event Orchestrator#removed
   * @type {object}
   * @property {string} name - Agent identity name.
   * @property {string} reason - 'shutdown' or 'killed'.
   * @property {object} agent - The removed agent instance.
   */
  this.emit('removed', { name, reason: 'shutdown', agent });

  return { acknowledged };
}
|
|
540
|
+
|
|
541
|
+
/**
|
|
542
|
+
* Force kill an agent immediately.
|
|
543
|
+
* Releases any claimed issues so they can be re-claimed.
|
|
544
|
+
*
|
|
545
|
+
* @param {string} name - Agent identity name.
|
|
546
|
+
* @returns {Promise<void>}
|
|
547
|
+
* @fires Orchestrator#removed
|
|
548
|
+
*/
|
|
549
|
+
async kill(name) {
|
|
550
|
+
const agent = this.agents.get(name);
|
|
551
|
+
if (!agent) throw new Error(`Agent "${name}" not found`);
|
|
552
|
+
|
|
553
|
+
log.warn(`force killing agent: ${name}`);
|
|
554
|
+
const session = agent.session;
|
|
555
|
+
await this.snapshot(name, agent);
|
|
556
|
+
await agent.stop();
|
|
557
|
+
|
|
558
|
+
// Clean up the agent's workspace directory when configured
|
|
559
|
+
if (this.cfg?.get?.('workspace.cleanup') && agent.cwd) {
|
|
560
|
+
try {
|
|
561
|
+
await rm(agent.cwd, { recursive: true, force: true });
|
|
562
|
+
log.info(`workspace cleaned: ${name} (${agent.cwd})`);
|
|
563
|
+
} catch (err) { log.debug(`workspace cleanup failed for ${name}: ${err.message}`); }
|
|
564
|
+
}
|
|
565
|
+
|
|
566
|
+
this.agents.delete(name);
|
|
567
|
+
this._lastActivity.delete(name);
|
|
568
|
+
this._lastPaneHash.delete(name);
|
|
569
|
+
this._classifyFails.delete(name);
|
|
570
|
+
this._removed.add(name);
|
|
571
|
+
this.identityRegistry.release(agent.identity);
|
|
572
|
+
|
|
573
|
+
if (this.agents.size === 0 && session) await this._pruneSession(session);
|
|
574
|
+
|
|
575
|
+
this.emit('removed', { name, reason: 'killed', agent });
|
|
576
|
+
}
|
|
577
|
+
|
|
578
|
+
// ── Tmux Cleanup ─────────────────────────────────────
|
|
579
|
+
|
|
580
|
+
/**
|
|
581
|
+
* Resolve the tmux session name from registered agents or config.
|
|
582
|
+
*
|
|
583
|
+
* Agents carry their own session name (defaults to 'loreli' but
|
|
584
|
+
* overridable in tests). This avoids hardcoding the session name
|
|
585
|
+
* and keeps cleanup aligned with wherever agents actually live.
|
|
586
|
+
*
|
|
587
|
+
* @returns {string} The tmux session name.
|
|
588
|
+
*/
|
|
589
|
+
_session() {
|
|
590
|
+
for (const agent of this.agents.values()) {
|
|
591
|
+
if (agent.session) return agent.session;
|
|
592
|
+
}
|
|
593
|
+
return this.cfg?.get?.('tmux.session') ?? 'loreli';
|
|
594
|
+
}
|
|
595
|
+
|
|
596
|
+
/**
|
|
597
|
+
* Destroy the tmux session when no agents remain.
|
|
598
|
+
*
|
|
599
|
+
* Safety net that runs after the last agent is removed via
|
|
600
|
+
* {@link kill} or {@link shutdown}. Individual agent `stop()` calls
|
|
601
|
+
* kill their own panes, but orphaned panes (from `remain-on-exit`,
|
|
602
|
+
* crashed backends, or timing gaps) can keep the session alive.
|
|
603
|
+
* This ensures a clean slate.
|
|
604
|
+
*
|
|
605
|
+
* @param {string} session - The tmux session name to prune.
|
|
606
|
+
* @returns {Promise<void>}
|
|
607
|
+
*/
|
|
608
|
+
async _pruneSession(session) {
|
|
609
|
+
if (!Tmux.available()) return;
|
|
610
|
+
const tmux = new Tmux();
|
|
611
|
+
|
|
612
|
+
try {
|
|
613
|
+
if (await tmux.has(session)) {
|
|
614
|
+
let paneCount = 0;
|
|
615
|
+
try {
|
|
616
|
+
const panes = await tmux.allPanes(session);
|
|
617
|
+
paneCount = panes.length;
|
|
618
|
+
} catch { /* session may be in a bad state */ }
|
|
619
|
+
await tmux.kill(session);
|
|
620
|
+
log.info(`pruned tmux session "${session}" (${paneCount} orphaned panes) — no agents remain`);
|
|
621
|
+
}
|
|
622
|
+
} catch (err) {
|
|
623
|
+
log.debug(`session prune failed: ${err.message}`);
|
|
624
|
+
}
|
|
625
|
+
}
|
|
626
|
+
|
|
627
|
+
/**
|
|
628
|
+
* Garbage-collect orphaned tmux panes not tracked by any agent.
|
|
629
|
+
*
|
|
630
|
+
* Lists all panes in the loreli tmux session and kills any that
|
|
631
|
+
* do not belong to a registered agent. Safe to call at any time —
|
|
632
|
+
* tracked agent panes are preserved.
|
|
633
|
+
*
|
|
634
|
+
* When all panes are orphaned, the session is destroyed entirely.
|
|
635
|
+
*
|
|
636
|
+
* @returns {Promise<{killed: number}>} Count of orphaned panes killed.
|
|
637
|
+
*/
|
|
638
|
+
async gc() {
|
|
639
|
+
if (!Tmux.available()) return { killed: 0 };
|
|
640
|
+
const session = this._session();
|
|
641
|
+
const tmux = new Tmux();
|
|
642
|
+
|
|
643
|
+
if (!await tmux.has(session)) return { killed: 0 };
|
|
644
|
+
|
|
645
|
+
const tracked = new Set();
|
|
646
|
+
for (const agent of this.agents.values()) {
|
|
647
|
+
if (agent.paneId) tracked.add(agent.paneId);
|
|
648
|
+
}
|
|
649
|
+
|
|
650
|
+
let all;
|
|
651
|
+
try {
|
|
652
|
+
all = await tmux.allPanes(session);
|
|
653
|
+
} catch {
|
|
654
|
+
return { killed: 0 };
|
|
655
|
+
}
|
|
656
|
+
|
|
657
|
+
let killed = 0;
|
|
658
|
+
for (const pane of all) {
|
|
659
|
+
if (!tracked.has(pane.id)) {
|
|
660
|
+
try {
|
|
661
|
+
await tmux.killPane(pane.id);
|
|
662
|
+
killed++;
|
|
663
|
+
log.info(`gc: killed orphaned pane ${pane.id}`);
|
|
664
|
+
} catch { /* pane may have died between list and kill */ }
|
|
665
|
+
}
|
|
666
|
+
}
|
|
667
|
+
|
|
668
|
+
// Destroy the session when every pane was orphaned
|
|
669
|
+
if (killed > 0 && killed === all.length) {
|
|
670
|
+
try {
|
|
671
|
+
if (await tmux.has(session)) await tmux.kill(session);
|
|
672
|
+
} catch { /* session may auto-destroy */ }
|
|
673
|
+
}
|
|
674
|
+
|
|
675
|
+
return { killed };
|
|
676
|
+
}
|
|
677
|
+
|
|
678
|
+
// ── Coordination ──────────────────────────────────────
|
|
679
|
+
|
|
680
|
+
/**
|
|
681
|
+
* Auto-spawn an agent for a given provider and role when one is not
|
|
682
|
+
* already available. Cross-provider review is the core value proposition;
|
|
683
|
+
* silently skipping it defeats the purpose, so the orchestrator
|
|
684
|
+
* proactively enlists the opposing side.
|
|
685
|
+
*
|
|
686
|
+
* Delegates to {@link Factory#spawn} for the full creation pipeline.
|
|
687
|
+
*
|
|
688
|
+
* @param {string} provider - AI provider to spawn for.
|
|
689
|
+
* @param {string} role - Agent role ('reviewer', 'action', etc.).
|
|
690
|
+
* @param {object} [opts] - Additional options.
|
|
691
|
+
* @returns {Promise<object>} The spawned agent instance.
|
|
692
|
+
*/
|
|
693
|
+
async enlist(provider, role, opts = {}) {
|
|
694
|
+
log.info(`enlisting ${role} agent for ${provider} — cross-provider pairing requires it`);
|
|
695
|
+
|
|
696
|
+
// Seed taken names before first acquire — one-time cost, zero
|
|
697
|
+
// ongoing overhead because reactor ticks keep the set current.
|
|
698
|
+
await this.seed();
|
|
699
|
+
|
|
700
|
+
// Build context for the factory so prepare() writes session env vars.
|
|
701
|
+
// Include home and token so agent subprocesses use the same storage
|
|
702
|
+
// location and can create a hub for stamped GitHub operations.
|
|
703
|
+
const context = this.sessionId ? {
|
|
704
|
+
session: this.sessionId,
|
|
705
|
+
agent: null, // set after identity is acquired inside factory
|
|
706
|
+
repo: this.repo,
|
|
707
|
+
home: this.storage?.home,
|
|
708
|
+
token: process.env.GITHUB_TOKEN
|
|
709
|
+
} : undefined;
|
|
710
|
+
|
|
711
|
+
// Theme coherence: inherit from an existing agent so antagonist
|
|
712
|
+
// pairs always share the same theme universe. Only pick from
|
|
713
|
+
// config when no agents exist yet (first enlistment).
|
|
714
|
+
const existing = [...this.agents.values()].find(function hasTheme(a) { return a.identity?.theme; });
|
|
715
|
+
const theme = existing?.identity?.theme ?? pick(this.cfg?.get?.('theme'));
|
|
716
|
+
|
|
717
|
+
const agent = await this.factory.create(provider, role, {
|
|
718
|
+
theme,
|
|
719
|
+
model: this.cfg?.get?.(`workflows.${role}.model`) ?? this.cfg?.get?.('model'),
|
|
720
|
+
config: this.cfg,
|
|
721
|
+
context,
|
|
722
|
+
taken: this.takenNames,
|
|
723
|
+
...opts
|
|
724
|
+
});
|
|
725
|
+
|
|
726
|
+
// Persist session data BEFORE spawn so the agent's MCP server
|
|
727
|
+
// subprocess can hydrate from storage on startup. Without this,
|
|
728
|
+
// the agent's _hydrate() call races against the host's save.
|
|
729
|
+
if (this.sessionId && this.storage && agent.identity?.name) {
|
|
730
|
+
const session = new Session({
|
|
731
|
+
identity: agent.identity.toJSON?.() ?? agent.identity,
|
|
732
|
+
role,
|
|
733
|
+
backend: agent.constructor.name,
|
|
734
|
+
paneId: null, // not yet known
|
|
735
|
+
repo: this.repo
|
|
736
|
+
});
|
|
737
|
+
await this.storage.save(this.sessionId, agent.identity.name, session.toJSON());
|
|
738
|
+
}
|
|
739
|
+
|
|
740
|
+
try {
|
|
741
|
+
await this.spawn(agent);
|
|
742
|
+
} catch (err) {
|
|
743
|
+
if (this.sessionId && this.storage && agent.identity?.name) {
|
|
744
|
+
try { await this.storage.remove(this.sessionId, agent.identity.name); } catch { /* best-effort */ }
|
|
745
|
+
}
|
|
746
|
+
throw err;
|
|
747
|
+
}
|
|
748
|
+
|
|
749
|
+
if (this.sessionId && this.storage && agent.identity?.name && agent.paneId) {
|
|
750
|
+
const data = await this.storage.load(this.sessionId, agent.identity.name);
|
|
751
|
+
if (data) {
|
|
752
|
+
data.paneId = agent.paneId;
|
|
753
|
+
await this.storage.save(this.sessionId, agent.identity.name, data);
|
|
754
|
+
}
|
|
755
|
+
}
|
|
756
|
+
|
|
757
|
+
return agent;
|
|
758
|
+
}
|
|
759
|
+
|
|
760
|
+
/**
|
|
761
|
+
* Record a heartbeat timestamp for an agent. Resets the stall timer.
|
|
762
|
+
*
|
|
763
|
+
* @param {string} name - Agent identity name.
|
|
764
|
+
*/
|
|
765
|
+
activity(name) {
|
|
766
|
+
this._lastActivity.set(name, new Date().toISOString());
|
|
767
|
+
}
|
|
768
|
+
|
|
769
|
+
/**
|
|
770
|
+
* Check whether an agent's tmux pane has new output since the last
|
|
771
|
+
* check. When output changes, the agent is provably active — update
|
|
772
|
+
* `_lastActivity` and return `true`. This is the ground truth signal
|
|
773
|
+
* that feeds into `health()` and, transitively, the proof-of-life
|
|
774
|
+
* responder.
|
|
775
|
+
*
|
|
776
|
+
* @param {string} name - Agent identity name.
|
|
777
|
+
* @returns {Promise<boolean>} True when pane output changed (agent is active).
|
|
778
|
+
*/
|
|
779
|
+
async refresh(name) {
|
|
780
|
+
const agent = this.agents.get(name);
|
|
781
|
+
if (!agent?.capture) return false;
|
|
782
|
+
|
|
783
|
+
try {
|
|
784
|
+
const output = await agent.capture(50);
|
|
785
|
+
const digest = createHash('md5').update(output ?? '').digest('hex');
|
|
786
|
+
const prev = this._lastPaneHash.get(name);
|
|
787
|
+
this._lastPaneHash.set(name, digest);
|
|
788
|
+
if (prev && prev !== digest) {
|
|
789
|
+
if (hasFatalError(output)) return false;
|
|
790
|
+
this._lastActivity.set(name, new Date().toISOString());
|
|
791
|
+
return true;
|
|
792
|
+
}
|
|
793
|
+
return false;
|
|
794
|
+
} catch { return false; }
|
|
795
|
+
}
|
|
796
|
+
|
|
797
|
+
/**
|
|
798
|
+
* Update the pane hash after an orchestrator-initiated interaction.
|
|
799
|
+
*
|
|
800
|
+
* Must be called after any action that changes the pane content
|
|
801
|
+
* (sending keys, messages, etc.) to prevent `refresh()` from
|
|
802
|
+
* misinterpreting the orchestrator's own output as agent activity
|
|
803
|
+
* on the next monitor cycle. The orchestrator also resets the
|
|
804
|
+
* stall timer here because a remediation attempt should buy the
|
|
805
|
+
* agent time to react before another nudge is sent.
|
|
806
|
+
*
|
|
807
|
+
* @param {string} name - Agent identity name.
|
|
808
|
+
* @param {object} agent - Agent instance with a `capture()` method.
|
|
809
|
+
* @returns {Promise<void>}
|
|
810
|
+
*/
|
|
811
|
+
async _rehash(name, agent) {
|
|
812
|
+
try {
|
|
813
|
+
const content = await agent.capture(50);
|
|
814
|
+
const digest = createHash('md5').update(content ?? '').digest('hex');
|
|
815
|
+
this._lastPaneHash.set(name, digest);
|
|
816
|
+
this._lastActivity.set(name, new Date().toISOString());
|
|
817
|
+
} catch { /* capture can fail if pane died */ }
|
|
818
|
+
}
|
|
819
|
+
|
|
820
|
+
// ── Death Snapshot ──────────────────────────────────
|
|
821
|
+
|
|
822
|
+
/**
|
|
823
|
+
* Capture a dying agent's pane output and write it to the session
|
|
824
|
+
* logs directory as `<name>.death.log`. Requires `remain-on-exit`
|
|
825
|
+
* on the pane so output survives after the process exits.
|
|
826
|
+
*
|
|
827
|
+
* Non-fatal: silently skips when session or storage is unavailable,
|
|
828
|
+
* and logs a warning when capture or write fails.
|
|
829
|
+
*
|
|
830
|
+
* @param {string} name - Agent identity name.
|
|
831
|
+
* @param {object} agent - Agent instance with a `capture()` method.
|
|
832
|
+
* @returns {Promise<void>}
|
|
833
|
+
*/
|
|
834
|
+
async snapshot(name, agent) {
|
|
835
|
+
if (!this.sessionId || !this.storage?.home) return;
|
|
836
|
+
try {
|
|
837
|
+
const raw = await agent.capture();
|
|
838
|
+
const cleaned = output.clean(raw);
|
|
839
|
+
const dir = join(this.storage.home, 'sessions', this.sessionId, 'logs');
|
|
840
|
+
await mkdir(dir, { recursive: true });
|
|
841
|
+
await writeFile(join(dir, `${name}.death.log`), cleaned, 'utf8');
|
|
842
|
+
log.info(`death snapshot written: ${name}`);
|
|
843
|
+
} catch (err) {
|
|
844
|
+
log.warn(`death snapshot failed for ${name}: ${err.message}`);
|
|
845
|
+
}
|
|
846
|
+
}
|
|
847
|
+
|
|
848
|
+
// ── Reconcile (Liveness Sweep) ───────────────────────
|
|
849
|
+
|
|
850
|
+
/**
|
|
851
|
+
* Synchronize registry state with actual tmux pane liveness.
|
|
852
|
+
*
|
|
853
|
+
* Iterates all registered agents and checks whether their underlying
|
|
854
|
+
* process is still alive. Dead agents are stopped, removed from the
|
|
855
|
+
* registry, and their identities released — closing the gap where
|
|
856
|
+
* team_status reports agents as "working" after their processes have
|
|
857
|
+
* exited.
|
|
858
|
+
*
|
|
859
|
+
* Dormant agents are skipped because they are already in a terminal
|
|
860
|
+
* state and are handled by the stall monitor's dormant cleanup.
|
|
861
|
+
*
|
|
862
|
+
* @returns {Promise<string[]>} Names of agents that were reconciled.
|
|
863
|
+
* @fires Orchestrator#removed
|
|
864
|
+
*/
|
|
865
|
+
async reconcile() {
|
|
866
|
+
const reconciled = [];
|
|
867
|
+
const entries = [...this.agents.entries()];
|
|
868
|
+
|
|
869
|
+
for (const [name, agent] of entries) {
|
|
870
|
+
if (agent.state === 'dormant') continue;
|
|
871
|
+
|
|
872
|
+
let alive;
|
|
873
|
+
try {
|
|
874
|
+
alive = await agent.alive();
|
|
875
|
+
} catch (err) {
|
|
876
|
+
log.warn(`reconcile: alive() threw for ${name}: ${err.message} — treating as dead`);
|
|
877
|
+
alive = false;
|
|
878
|
+
}
|
|
879
|
+
|
|
880
|
+
if (alive) continue;
|
|
881
|
+
|
|
882
|
+
log.warn(`reconcile: ${name} pane is dead (state was "${agent.state}") — removing`);
|
|
883
|
+
|
|
884
|
+
await this.snapshot(name, agent);
|
|
885
|
+
try { await agent.stop(); } catch { /* pane already dead */ }
|
|
886
|
+
|
|
887
|
+
this.agents.delete(name);
|
|
888
|
+
this._lastActivity.delete(name);
|
|
889
|
+
this._lastPaneHash.delete(name);
|
|
890
|
+
this._classifyFails.delete(name);
|
|
891
|
+
this._removed.add(name);
|
|
892
|
+
this.identityRegistry.release(agent.identity);
|
|
893
|
+
|
|
894
|
+
this.emit('removed', { name, reason: 'reconciled', agent });
|
|
895
|
+
reconciled.push(name);
|
|
896
|
+
}
|
|
897
|
+
|
|
898
|
+
return reconciled;
|
|
899
|
+
}
|
|
900
|
+
|
|
901
|
+
// ── Health ───────────────────────────────────────────
|
|
902
|
+
|
|
903
|
+
/**
|
|
904
|
+
* Multi-signal health assessment for a named agent.
|
|
905
|
+
*
|
|
906
|
+
* Evaluates: tmux/process liveness, agent state machine, activity
|
|
907
|
+
* recency (last orchestrator interaction), and captured output length.
|
|
908
|
+
*
|
|
909
|
+
* @param {string} name - Agent identity name.
|
|
910
|
+
* @returns {Promise<{alive: boolean, status: string, details: string, outputLength?: number}>}
|
|
911
|
+
*/
|
|
912
|
+
async health(name) {
|
|
913
|
+
const agent = this.agents.get(name);
|
|
914
|
+
if (!agent) return { alive: false, status: 'not-found', details: `agent ${name} not registered` };
|
|
915
|
+
|
|
916
|
+
if (agent.state === 'dormant')
|
|
917
|
+
return { alive: false, status: 'unhealthy', details: `agent ${name} is dormant` };
|
|
918
|
+
|
|
919
|
+
const paneAlive = await agent.alive();
|
|
920
|
+
if (!paneAlive)
|
|
921
|
+
return { alive: false, status: 'unhealthy', details: `agent ${name} pane is dead` };
|
|
922
|
+
|
|
923
|
+
const output = await agent.capture().catch(function noop() { return ''; });
|
|
924
|
+
const outputLength = output.length;
|
|
925
|
+
|
|
926
|
+
// Local proof-of-life: check tmux pane for real activity before
|
|
927
|
+
// declaring staleness. Agent-side MCP tool calls don't update
|
|
928
|
+
// _lastActivity, but they DO produce terminal output.
|
|
929
|
+
await this.refresh(name);
|
|
930
|
+
|
|
931
|
+
const lastTs = this._lastActivity.get(name);
|
|
932
|
+
if (lastTs) {
|
|
933
|
+
const elapsed = Date.now() - new Date(lastTs).getTime();
|
|
934
|
+
if (elapsed > this.stallTimeout)
|
|
935
|
+
return { alive: true, status: 'unhealthy', details: `agent ${name} activity is stale (${Math.round(elapsed / 1000)}s)`, outputLength };
|
|
936
|
+
}
|
|
937
|
+
|
|
938
|
+
return { alive: true, status: 'healthy', details: `agent ${name} is active`, outputLength };
|
|
939
|
+
}
|
|
940
|
+
|
|
941
|
+
// ── Scaling ──────────────────────────────────────────
|
|
942
|
+
|
|
943
|
+
/**
|
|
944
|
+
* Demand-driven scaling: collect demand signals from all registered
|
|
945
|
+
* workflows, then spawn agents to fill deficits — respecting global
|
|
946
|
+
* caps, per-role caps, rate limits, and cooldowns.
|
|
947
|
+
*
|
|
948
|
+
* Runs after all workflow handlers in the reactor chain so demand
|
|
949
|
+
* signals reflect the latest hydrated state. Spawns are capped by
|
|
950
|
+
* `maxPerTick` to avoid resource spikes.
|
|
951
|
+
*
|
|
952
|
+
* Priority order: reviewer > risk > action > planner. Reviewers
|
|
953
|
+
* unblock merges, risk unblocks reviewers, so they are filled first
|
|
954
|
+
* when at the global cap.
|
|
955
|
+
*
|
|
956
|
+
* @param {string} repo - Repository in "owner/name" format.
|
|
957
|
+
* @returns {Promise<Array<{role: string, agent: string}>>} Spawned agents.
|
|
958
|
+
*/
|
|
959
|
+
async scale(repo) {
|
|
960
|
+
if (!this.workflows.size) return [];
|
|
961
|
+
|
|
962
|
+
const maxAgents = this.cfg?.get?.('scaling.maxAgents') ?? 8;
|
|
963
|
+
const maxPerTick = this.cfg?.get?.('scaling.maxPerTick') ?? 2;
|
|
964
|
+
const cooldown = this.cfg?.get?.('scaling.cooldown') ?? 30000;
|
|
965
|
+
|
|
966
|
+
// Collect demand signals from each workflow.
|
|
967
|
+
// Use the workflow's static `role` for enlist() — the map key may
|
|
968
|
+
// differ (e.g. map key 'review' vs static role 'reviewer'). The
|
|
969
|
+
// static role is what agents, demand(), and pair() filter on.
|
|
970
|
+
const signals = [];
|
|
971
|
+
for (const [key, workflow] of this.workflows) {
|
|
972
|
+
const role = workflow.constructor.role ?? key;
|
|
973
|
+
try {
|
|
974
|
+
const signal = await workflow.demand(repo);
|
|
975
|
+
signals.push({ role, ...signal });
|
|
976
|
+
log.debug(`scale: ${role} { workload: ${signal.workload}, supply: ${signal.supply}, deficit: ${signal.deficit} }`);
|
|
977
|
+
} catch (err) {
|
|
978
|
+
log.warn(`scale: demand() failed for ${role}: ${err.message}`);
|
|
979
|
+
}
|
|
980
|
+
}
|
|
981
|
+
|
|
982
|
+
// Global cap — only count live agents. Dormant agents stay in
|
|
983
|
+
// the map after exit but consume no resources.
|
|
984
|
+
const live = [...this.agents.values()]
|
|
985
|
+
.filter(function alive(a) { return a.state !== 'dormant'; }).length;
|
|
986
|
+
if (live >= maxAgents) {
|
|
987
|
+
log.debug(`scale: at global cap (${live}/${maxAgents}) — skipping`);
|
|
988
|
+
return [];
|
|
989
|
+
}
|
|
990
|
+
|
|
991
|
+
// Sort by priority: reviewer > risk > action > planner
|
|
992
|
+
const priority = { reviewer: 0, risk: 1, action: 2, planner: 3 };
|
|
993
|
+
signals.sort(function byPriority(a, b) {
|
|
994
|
+
return (priority[a.role] ?? 99) - (priority[b.role] ?? 99);
|
|
995
|
+
});
|
|
996
|
+
|
|
997
|
+
const spawned = [];
|
|
998
|
+
let budget = maxPerTick;
|
|
999
|
+
const now = Date.now();
|
|
1000
|
+
|
|
1001
|
+
for (const signal of signals) {
|
|
1002
|
+
if (budget <= 0) break;
|
|
1003
|
+
if (signal.deficit <= 0) continue;
|
|
1004
|
+
|
|
1005
|
+
const { role } = signal;
|
|
1006
|
+
const roleCap = this.cfg?.get?.(`workflows.${role}.maxAgents`) ?? Infinity;
|
|
1007
|
+
const current = [...this.agents.values()]
|
|
1008
|
+
.filter(function liveRole(a) { return a.role === role && a.state !== 'dormant'; }).length;
|
|
1009
|
+
|
|
1010
|
+
// Per-role cap
|
|
1011
|
+
if (current >= roleCap) {
|
|
1012
|
+
log.debug(`scale: ${role} at role cap (${current}/${roleCap}) — skipping`);
|
|
1013
|
+
continue;
|
|
1014
|
+
}
|
|
1015
|
+
|
|
1016
|
+
// Cooldown check
|
|
1017
|
+
const last = this._lastSpawn.get(role);
|
|
1018
|
+
if (last && now - last < cooldown) {
|
|
1019
|
+
log.debug(`scale: ${role} in cooldown (${Math.round((now - last) / 1000)}s < ${Math.round(cooldown / 1000)}s) — skipping`);
|
|
1020
|
+
continue;
|
|
1021
|
+
}
|
|
1022
|
+
|
|
1023
|
+
// How many to spawn: min of deficit, role headroom, global headroom, tick budget
|
|
1024
|
+
const roleHeadroom = roleCap - current;
|
|
1025
|
+
const globalHeadroom = maxAgents - live;
|
|
1026
|
+
const count = Math.min(signal.deficit, roleHeadroom, globalHeadroom, budget);
|
|
1027
|
+
|
|
1028
|
+
if (count <= 0) continue;
|
|
1029
|
+
|
|
1030
|
+
await this.backendRegistry.discover();
|
|
1031
|
+
|
|
1032
|
+
for (let i = 0; i < count; i++) {
|
|
1033
|
+
try {
|
|
1034
|
+
const providers = this.backendRegistry.providers();
|
|
1035
|
+
const info = capability(providers);
|
|
1036
|
+
let provider;
|
|
1037
|
+
|
|
1038
|
+
// Reviewer and risk agents must oppose the action agents they
|
|
1039
|
+
// pair with — pair() finds the cross-provider match, so
|
|
1040
|
+
// spawning on the same side means scan()/assess() can never
|
|
1041
|
+
// dispatch them. Pick the opposite of existing action agents.
|
|
1042
|
+
//
|
|
1043
|
+
// When no live action agent exists (dead/foreign), fall back
|
|
1044
|
+
// to PR label metadata via demand().actionProviders so the
|
|
1045
|
+
// correct opposing side is still selected.
|
|
1046
|
+
let actionProvider = (role === 'reviewer' || role === 'risk')
|
|
1047
|
+
? [...this.agents.values()]
|
|
1048
|
+
.find(function hasProvider(a) { return a.role === 'action' && a.identity?.provider; })
|
|
1049
|
+
?.identity?.provider
|
|
1050
|
+
: null;
|
|
1051
|
+
|
|
1052
|
+
if (!actionProvider && signal.actionProviders?.length) {
|
|
1053
|
+
actionProvider = signal.actionProviders[0];
|
|
1054
|
+
}
|
|
1055
|
+
|
|
1056
|
+
if (actionProvider) {
|
|
1057
|
+
const opposite = side(actionProvider) === 'yin' ? 'yang' : 'yin';
|
|
1058
|
+
provider = providers.find(function isOpp(p) { return side(p) === opposite; });
|
|
1059
|
+
if (!provider) {
|
|
1060
|
+
if (info.mode === 'single') {
|
|
1061
|
+
provider = providers[0] ?? null;
|
|
1062
|
+
if (!provider) {
|
|
1063
|
+
log.warn(`scale: no providers available for ${role} — skipping`);
|
|
1064
|
+
break;
|
|
1065
|
+
}
|
|
1066
|
+
log.info(`scale: no ${opposite}-side provider available for ${role} — using single-side fallback (${provider})`);
|
|
1067
|
+
} else {
|
|
1068
|
+
log.warn(`scale: no ${opposite}-side provider available for ${role} — skipping`);
|
|
1069
|
+
break;
|
|
1070
|
+
}
|
|
1071
|
+
}
|
|
1072
|
+
} else if (role === 'reviewer' || role === 'risk') {
|
|
1073
|
+
provider = providers[0];
|
|
1074
|
+
} else {
|
|
1075
|
+
// Action / planner: balance yin/yang within their own role
|
|
1076
|
+
const yinCount = [...this.agents.values()]
|
|
1077
|
+
.filter(function isRole(a) { return a.role === role && side(a.identity?.provider) === 'yin'; }).length;
|
|
1078
|
+
const yangCount = [...this.agents.values()]
|
|
1079
|
+
.filter(function isRole(a) { return a.role === role && side(a.identity?.provider) === 'yang'; }).length;
|
|
1080
|
+
|
|
1081
|
+
if (yinCount <= yangCount) {
|
|
1082
|
+
provider = providers.find(function isYin(p) { return side(p) === 'yin'; }) ?? providers[0];
|
|
1083
|
+
} else {
|
|
1084
|
+
provider = providers.find(function isYang(p) { return side(p) === 'yang'; }) ?? providers[0];
|
|
1085
|
+
}
|
|
1086
|
+
}
|
|
1087
|
+
// Final defensive default for non-pairing roles and mixed
|
|
1088
|
+
// provider sets: if discovery returned providers but side
|
|
1089
|
+
// selection yielded none, use the first discovered provider.
|
|
1090
|
+
if (!provider && providers.length) provider = providers[0];
|
|
1091
|
+
|
|
1092
|
+
if (!provider) {
|
|
1093
|
+
log.warn(`scale: no provider resolved for ${role} — skipping`);
|
|
1094
|
+
break;
|
|
1095
|
+
}
|
|
1096
|
+
|
|
1097
|
+
// Backend degradation fallback: when the native backend for
|
|
1098
|
+
// the selected provider is degraded (repeated rapid failures
|
|
1099
|
+
// like budget exhaustion), switch to the cursor-* virtual
|
|
1100
|
+
// provider. Same yin/yang side, different API path.
|
|
1101
|
+
if (!provider.startsWith('cursor-') && this.backendRegistry.degraded) {
|
|
1102
|
+
let nativeBackend = null;
|
|
1103
|
+
for (const info of this.backendRegistry.discovered.values()) {
|
|
1104
|
+
if (info.provider === provider) { nativeBackend = info.name; break; }
|
|
1105
|
+
}
|
|
1106
|
+
if (nativeBackend && this.backendRegistry.degraded(nativeBackend)) {
|
|
1107
|
+
const variant = `cursor-${provider}`;
|
|
1108
|
+
if (providers.includes(variant)) {
|
|
1109
|
+
log.info(`scale: ${nativeBackend} degraded, falling back to ${variant}`);
|
|
1110
|
+
provider = variant;
|
|
1111
|
+
}
|
|
1112
|
+
}
|
|
1113
|
+
}
|
|
1114
|
+
|
|
1115
|
+
const agent = await this.enlist(provider, role);
|
|
1116
|
+
spawned.push({ role, agent: agent.identity.name });
|
|
1117
|
+
log.info(`scale: spawned ${agent.identity.name} as ${role} (${provider})`);
|
|
1118
|
+
} catch (err) {
|
|
1119
|
+
log.warn(`scale: failed to spawn ${role} agent: ${err.message}`);
|
|
1120
|
+
break;
|
|
1121
|
+
}
|
|
1122
|
+
}
|
|
1123
|
+
|
|
1124
|
+
if (spawned.length) {
|
|
1125
|
+
this._lastSpawn.set(role, now);
|
|
1126
|
+
budget -= count;
|
|
1127
|
+
}
|
|
1128
|
+
}
|
|
1129
|
+
|
|
1130
|
+
if (spawned.length) {
|
|
1131
|
+
log.info(`scale: spawned ${spawned.length} agents — ${spawned.map(function fmt(s) { return `${s.role}:${s.agent}`; }).join(', ')}`);
|
|
1132
|
+
}
|
|
1133
|
+
|
|
1134
|
+
return spawned;
|
|
1135
|
+
}
|
|
1136
|
+
|
|
1137
|
+
// ── Reap (Global Safety Net) ─────────────────────────
|
|
1138
|
+
|
|
1139
|
+
/**
|
|
1140
|
+
* Global safety net — stop dormant agents when no work remains.
|
|
1141
|
+
*
|
|
1142
|
+
* Role-specific reaping is handled by each workflow's `reap()` method
|
|
1143
|
+
* (planner-reap, review-reap, action-reap). This global reap runs
|
|
1144
|
+
* last in the reactor chain and catches anything the workflow-level
|
|
1145
|
+
* reaps missed.
|
|
1146
|
+
*
|
|
1147
|
+
* Only **dormant** agents are eligible for global reaping. Agents in
|
|
1148
|
+
* any other state (spawned, working, reviewing, etc.) are actively
|
|
1149
|
+
* doing something and must not be interrupted. Stall detection is
|
|
1150
|
+
* the separate mechanism that handles truly stuck agents.
|
|
1151
|
+
*
|
|
1152
|
+
* All of these conditions must be true before reaping:
|
|
1153
|
+
* 1. No open issues with the `loreli` label
|
|
1154
|
+
* 2. No open pull requests (PRs in flight = work not done)
|
|
1155
|
+
* 3. Every remaining agent is dormant
|
|
1156
|
+
*
|
|
1157
|
+
* Registered as the last reactor handler so it runs after all
|
|
1158
|
+
* workflow-specific reaps have completed.
|
|
1159
|
+
*
|
|
1160
|
+
* @param {string} repo - Repository in "owner/name" format.
|
|
1161
|
+
* @returns {Promise<void>}
|
|
1162
|
+
*/
|
|
1163
|
+
async reap(repo) {
|
|
1164
|
+
if (!this.hub || !this.agents.size) return;
|
|
1165
|
+
|
|
1166
|
+
try {
|
|
1167
|
+
// Only dormant agents are candidates — anything else is active work
|
|
1168
|
+
const all = [...this.agents.values()];
|
|
1169
|
+
const dormant = all.filter(function idle(a) { return a.state === 'dormant'; });
|
|
1170
|
+
if (dormant.length === 0) {
|
|
1171
|
+
log.debug(`reap: ${all.length} agents still active — skipping`);
|
|
1172
|
+
return;
|
|
1173
|
+
}
|
|
1174
|
+
|
|
1175
|
+
const open = await this.hub.issues(repo, { state: 'open' });
|
|
1176
|
+
const loreli = open.filter(function tagged(i) {
|
|
1177
|
+
return i.labels?.some?.(function isLoreli(l) {
|
|
1178
|
+
const name = typeof l === 'string' ? l : l.name;
|
|
1179
|
+
return name === 'loreli';
|
|
1180
|
+
});
|
|
1181
|
+
});
|
|
1182
|
+
|
|
1183
|
+
if (loreli.length > 0) return;
|
|
1184
|
+
|
|
1185
|
+
// Check for open PRs — work is still in flight if PRs exist
|
|
1186
|
+
const prs = await this.hub.pulls(repo, { state: 'open' });
|
|
1187
|
+
if (prs.length > 0) return;
|
|
1188
|
+
|
|
1189
|
+
log.info(`reap: no open loreli issues or PRs — stopping ${dormant.length} dormant agents`);
|
|
1190
|
+
|
|
1191
|
+
for (const agent of dormant) {
|
|
1192
|
+
try {
|
|
1193
|
+
await this.kill(agent.identity.name);
|
|
1194
|
+
log.info(`reap: stopped ${agent.identity.name}`);
|
|
1195
|
+
} catch (err) {
|
|
1196
|
+
log.warn(`reap: failed to stop ${agent.identity.name}: ${err.message}`);
|
|
1197
|
+
}
|
|
1198
|
+
}
|
|
1199
|
+
} catch (err) {
|
|
1200
|
+
log.debug(`reap: skipped — ${err.message}`);
|
|
1201
|
+
}
|
|
1202
|
+
}
|
|
1203
|
+
|
|
1204
|
+
// ── Reactor (Polling Loop) ────────────────────────────
|
|
1205
|
+
|
|
1206
|
+
/**
|
|
1207
|
+
* Register a reactor handler called on every tick.
|
|
1208
|
+
* Role packages register their scan/forward/land handlers here.
|
|
1209
|
+
*
|
|
1210
|
+
* @param {string} name - Handler name (for logging/debugging).
|
|
1211
|
+
* @param {function(string): Promise<void>} handler - Async function receiving repo.
|
|
1212
|
+
*/
|
|
1213
|
+
register(name, handler) {
|
|
1214
|
+
this._handlers.set(name, handler);
|
|
1215
|
+
log.info(`reactor handler registered: ${name}`);
|
|
1216
|
+
}
|
|
1217
|
+
|
|
1218
|
+
/**
|
|
1219
|
+
* Execute one polling iteration: call all registered handlers.
|
|
1220
|
+
* Each handler receives the repo string. Errors in one handler
|
|
1221
|
+
* do not prevent subsequent handlers from running.
|
|
1222
|
+
*
|
|
1223
|
+
* @param {string} repo - Repository in "owner/name" format.
|
|
1224
|
+
* @returns {Promise<void>}
|
|
1225
|
+
*/
|
|
1226
|
+
async tick(repo) {
|
|
1227
|
+
log.debug('tick start');
|
|
1228
|
+
|
|
1229
|
+
for (const [name, handler] of this._handlers) {
|
|
1230
|
+
try {
|
|
1231
|
+
await handler(repo);
|
|
1232
|
+
} catch (err) {
|
|
1233
|
+
log.error(`reactor handler "${name}" failed: ${err.message}`);
|
|
1234
|
+
}
|
|
1235
|
+
}
|
|
1236
|
+
|
|
1237
|
+
log.debug('tick end');
|
|
1238
|
+
}
|
|
1239
|
+
|
|
1240
|
+
/**
|
|
1241
|
+
* Start the polling reactor loop using a self-scheduling setTimeout
|
|
1242
|
+
* chain. Each tick runs to completion before the next is scheduled,
|
|
1243
|
+
* eliminating overlap risk without a reentrant guard. This pattern
|
|
1244
|
+
* is more reliable than setInterval for async callbacks — setInterval
|
|
1245
|
+
* does not await its callback, and combined with unref() it can
|
|
1246
|
+
* silently stop firing after heavy async operations (observed in
|
|
1247
|
+
* production after agent spawn via tmux).
|
|
1248
|
+
*
|
|
1249
|
+
* @param {string} repo - Repository in "owner/name" format.
|
|
1250
|
+
*/
|
|
1251
|
+
watch(repo) {
|
|
1252
|
+
if (this._watchHandle) return;
|
|
1253
|
+
this.repo = repo;
|
|
1254
|
+
|
|
1255
|
+
const interval = this.cfg?.get?.('watch.interval') ?? 60000;
|
|
1256
|
+
const self = this;
|
|
1257
|
+
|
|
1258
|
+
log.info(`watcher started for ${repo} (interval: ${interval}ms)`);
|
|
1259
|
+
|
|
1260
|
+
function schedule() {
|
|
1261
|
+
self._watchHandle = setTimeout(async function cycle() {
|
|
1262
|
+
await self.tick(repo);
|
|
1263
|
+
if (self._watchHandle) schedule();
|
|
1264
|
+
}, interval);
|
|
1265
|
+
|
|
1266
|
+
self._watchHandle.unref();
|
|
1267
|
+
}
|
|
1268
|
+
|
|
1269
|
+
schedule();
|
|
1270
|
+
}
|
|
1271
|
+
|
|
1272
|
+
/**
|
|
1273
|
+
* Stop the polling reactor loop.
|
|
1274
|
+
*/
|
|
1275
|
+
unwatch() {
|
|
1276
|
+
if (this._watchHandle) {
|
|
1277
|
+
clearTimeout(this._watchHandle);
|
|
1278
|
+
this._watchHandle = null;
|
|
1279
|
+
log.info('watcher stopped');
|
|
1280
|
+
}
|
|
1281
|
+
}
|
|
1282
|
+
|
|
1283
|
+
// ── Monitor (Stall Detection) ─────────────────────────
|
|
1284
|
+
|
|
1285
|
+
/**
|
|
1286
|
+
* Start the stall detection monitor with LLM-powered classification.
|
|
1287
|
+
*
|
|
1288
|
+
* When an agent's pane output has not changed for longer than the
|
|
1289
|
+
* stall timeout, the monitor captures the pane content, classifies
|
|
1290
|
+
* it via `loreli/classify`, and dispatches the appropriate action:
|
|
1291
|
+
*
|
|
1292
|
+
* - `working` — reset activity timer, leave the agent alone
|
|
1293
|
+
* - `waiting_for_input` — send a continuation prompt
|
|
1294
|
+
* - `option_dialog` — send the appropriate keystroke (Enter)
|
|
1295
|
+
* - `error_loop` — emit 'stall' with diagnostic context
|
|
1296
|
+
* - `idle` — transition the agent to dormant
|
|
1297
|
+
* - `fatal` — kill the agent and mark the backend degraded
|
|
1298
|
+
*
|
|
1299
|
+
* Falls back to regex heuristics when no LLM backend is available.
|
|
1300
|
+
* Consecutive classification failures trigger a safety-net kill
|
|
1301
|
+
* (replaces the old tier 3 fixed-time kill).
|
|
1302
|
+
*
|
|
1303
|
+
* @fires Orchestrator#stall
|
|
1304
|
+
*/
|
|
1305
|
+
monitor() {
|
|
1306
|
+
if (this._monitorHandle) return;
|
|
1307
|
+
log.info('stall detection monitor started');
|
|
1308
|
+
|
|
1309
|
+
const stallTimeout = this.stallTimeout;
|
|
1310
|
+
const maxClassifyFails = this.cfg?.get?.('classify.maxRetries') ?? 5;
|
|
1311
|
+
const self = this;
|
|
1312
|
+
|
|
1313
|
+
/** @type {boolean} Re-entrancy guard for the monitor callback. */
|
|
1314
|
+
this._monitoring = false;
|
|
1315
|
+
|
|
1316
|
+
this._monitorHandle = setInterval(async function checkStalls() {
|
|
1317
|
+
if (self._monitoring) return;
|
|
1318
|
+
self._monitoring = true;
|
|
1319
|
+
|
|
1320
|
+
try {
|
|
1321
|
+
await self.reconcile();
|
|
1322
|
+
|
|
1323
|
+
const now = Date.now();
|
|
1324
|
+
const snapshot = [...self.agents.entries()];
|
|
1325
|
+
|
|
1326
|
+
for (const [name, agent] of snapshot) {
|
|
1327
|
+
if (agent.state === 'dormant') {
|
|
1328
|
+
const last = self._lastActivity.get(name);
|
|
1329
|
+
if (!last) continue;
|
|
1330
|
+
const elapsed = now - new Date(last).getTime();
|
|
1331
|
+
if (elapsed > stallTimeout * 3) {
|
|
1332
|
+
self.agents.delete(name);
|
|
1333
|
+
self._lastActivity.delete(name);
|
|
1334
|
+
self._lastPaneHash.delete(name);
|
|
1335
|
+
self._classifyFails.delete(name);
|
|
1336
|
+
log.info(`stall: cleaned up dormant agent ${name}`);
|
|
1337
|
+
}
|
|
1338
|
+
continue;
|
|
1339
|
+
}
|
|
1340
|
+
|
|
1341
|
+
const last = self._lastActivity.get(name);
|
|
1342
|
+
if (!last) continue;
|
|
1343
|
+
|
|
1344
|
+
if (await self.refresh(name)) {
|
|
1345
|
+
self._classifyFails.set(name, 0);
|
|
1346
|
+
continue;
|
|
1347
|
+
}
|
|
1348
|
+
|
|
1349
|
+
const elapsed = now - new Date(last).getTime();
|
|
1350
|
+
if (elapsed <= stallTimeout) continue;
|
|
1351
|
+
|
|
1352
|
+
// Stall detected — classify the pane content
|
|
1353
|
+
const maxLines = self.cfg?.get?.('classify.maxLines') ?? 100;
|
|
1354
|
+
let result;
|
|
1355
|
+
|
|
1356
|
+
try {
|
|
1357
|
+
const pane = await agent.capture(maxLines);
|
|
1358
|
+
log.debug(`monitor pane ${name} (${agent.backend}, stale=${Math.round(elapsed / 1000)}s):\n${paneDebug(pane)}`);
|
|
1359
|
+
result = await classify('pane-state', pane, {
|
|
1360
|
+
backends: self.backendRegistry,
|
|
1361
|
+
config: self.cfg,
|
|
1362
|
+
vars: { model: agent.model, backend: agent.backend, role: agent.role }
|
|
1363
|
+
});
|
|
1364
|
+
self._classifyFails.set(name, 0);
|
|
1365
|
+
log.info(`classify ${name}: ${result.category} — ${result.reasoning}`);
|
|
1366
|
+
} catch (err) {
|
|
1367
|
+
const fails = (self._classifyFails.get(name) ?? 0) + 1;
|
|
1368
|
+
self._classifyFails.set(name, fails);
|
|
1369
|
+
log.warn(`classify failed for ${name} (${fails}/${maxClassifyFails}): ${err.message}`);
|
|
1370
|
+
|
|
1371
|
+
if (fails >= maxClassifyFails) {
|
|
1372
|
+
log.error(`agent ${name} unclassifiable after ${fails} attempts — killing as safety net`);
|
|
1373
|
+
self.emit('stall', { name, elapsed, severity: 'critical' });
|
|
1374
|
+
try { await self.kill(name); } catch (e) { log.error(`safety kill failed for ${name}: ${e.message}`); }
|
|
1375
|
+
}
|
|
1376
|
+
continue;
|
|
1377
|
+
}
|
|
1378
|
+
|
|
1379
|
+
switch (result.category) {
|
|
1380
|
+
case 'working':
|
|
1381
|
+
self._lastActivity.set(name, new Date().toISOString());
|
|
1382
|
+
break;
|
|
1383
|
+
|
|
1384
|
+
case 'waiting_for_input':
|
|
1385
|
+
try {
|
|
1386
|
+
await agent.send('Please continue working or report your status.');
|
|
1387
|
+
await self._rehash(name, agent);
|
|
1388
|
+
} catch (err) { log.debug(`monitor: send failed for ${name}: ${err.message}`); }
|
|
1389
|
+
self.emit('stall', { name, elapsed, severity: 'nudge', diagnosis: result });
|
|
1390
|
+
break;
|
|
1391
|
+
|
|
1392
|
+
case 'option_dialog': {
|
|
1393
|
+
const keys = remedy(result.remedy);
|
|
1394
|
+
try {
|
|
1395
|
+
const tmux = new Tmux();
|
|
1396
|
+
await tmux.keys(agent.paneId, ...keys);
|
|
1397
|
+
await self._rehash(name, agent);
|
|
1398
|
+
} catch (err) { log.debug(`monitor: keys failed for ${name}: ${err.message}`); }
|
|
1399
|
+
self.emit('stall', { name, elapsed, severity: 'nudge', diagnosis: result });
|
|
1400
|
+
break;
|
|
1401
|
+
}
|
|
1402
|
+
|
|
1403
|
+
case 'error_loop':
|
|
1404
|
+
self.emit('stall', { name, elapsed, severity: 'warning', diagnosis: result });
|
|
1405
|
+
break;
|
|
1406
|
+
|
|
1407
|
+
case 'idle':
|
|
1408
|
+
agent.transition?.('dormant');
|
|
1409
|
+
self.emit('stall', { name, elapsed, severity: 'nudge', diagnosis: result });
|
|
1410
|
+
break;
|
|
1411
|
+
|
|
1412
|
+
case 'fatal':
|
|
1413
|
+
log.error(`agent ${name} hit fatal error — killing`);
|
|
1414
|
+
self.emit('stall', { name, elapsed, severity: 'critical', diagnosis: result });
|
|
1415
|
+
try {
|
|
1416
|
+
await self.kill(name);
|
|
1417
|
+
self.backendRegistry?.recordFailure?.(agent.backend);
|
|
1418
|
+
} catch (err) { log.error(`fatal kill failed for ${name}: ${err.message}`); }
|
|
1419
|
+
break;
|
|
1420
|
+
|
|
1421
|
+
default:
|
|
1422
|
+
log.warn(`classify ${name}: unknown category "${result.category}"`);
|
|
1423
|
+
break;
|
|
1424
|
+
}
|
|
1425
|
+
}
|
|
1426
|
+
} finally {
|
|
1427
|
+
self._monitoring = false;
|
|
1428
|
+
}
|
|
1429
|
+
}, Math.min(stallTimeout / 2, 60000));
|
|
1430
|
+
|
|
1431
|
+
this._monitorHandle.unref();
|
|
1432
|
+
}
|
|
1433
|
+
|
|
1434
|
+
/**
|
|
1435
|
+
* Stop the stall detection monitor.
|
|
1436
|
+
*/
|
|
1437
|
+
stopMonitor() {
|
|
1438
|
+
if (this._monitorHandle) {
|
|
1439
|
+
clearInterval(this._monitorHandle);
|
|
1440
|
+
this._monitorHandle = null;
|
|
1441
|
+
log.info('stall detection monitor stopped');
|
|
1442
|
+
}
|
|
1443
|
+
}
|
|
1444
|
+
|
|
1445
|
+
// ── Halt (Full System Stop) ──────────────────────────
|
|
1446
|
+
|
|
1447
|
+
/**
|
|
1448
|
+
* Stop the entire orchestrator: reactor loop, stall monitor, and
|
|
1449
|
+
* all registered agents. The MCP server process stays alive so the
|
|
1450
|
+
* user can call `start` again to resume.
|
|
1451
|
+
*
|
|
1452
|
+
* Composes {@link unwatch}, {@link stopMonitor}, and {@link kill}
|
|
1453
|
+
* into a single atomic operation. Idempotent — safe to call when
|
|
1454
|
+
* already halted.
|
|
1455
|
+
*
|
|
1456
|
+
* @returns {Promise<{reactor: boolean, monitor: boolean, agents: string[]}>}
|
|
1457
|
+
* Summary of what was stopped.
|
|
1458
|
+
* @fires Orchestrator#halted
|
|
1459
|
+
*/
|
|
1460
|
+
async halt() {
|
|
1461
|
+
const reactor = Boolean(this._watchHandle);
|
|
1462
|
+
const monitor = Boolean(this._monitorHandle);
|
|
1463
|
+
|
|
1464
|
+
this.unwatch();
|
|
1465
|
+
this.stopMonitor();
|
|
1466
|
+
|
|
1467
|
+
const killed = [];
|
|
1468
|
+
const entries = [...this.agents.entries()];
|
|
1469
|
+
|
|
1470
|
+
for (const [name] of entries) {
|
|
1471
|
+
try {
|
|
1472
|
+
await this.kill(name);
|
|
1473
|
+
killed.push(name);
|
|
1474
|
+
} catch (err) {
|
|
1475
|
+
log.warn(`halt: failed to kill ${name}: ${err.message}`);
|
|
1476
|
+
}
|
|
1477
|
+
}
|
|
1478
|
+
|
|
1479
|
+
log.info(`halt: reactor=${reactor} monitor=${monitor} agents=${killed.length}`);
|
|
1480
|
+
|
|
1481
|
+
/**
|
|
1482
|
+
* @event Orchestrator#halted
|
|
1483
|
+
* @type {object}
|
|
1484
|
+
* @property {boolean} reactor - Whether the reactor was running.
|
|
1485
|
+
* @property {boolean} monitor - Whether the monitor was running.
|
|
1486
|
+
* @property {string[]} agents - Names of agents that were killed.
|
|
1487
|
+
*/
|
|
1488
|
+
this.emit('halted', { reactor, monitor, agents: killed });
|
|
1489
|
+
|
|
1490
|
+
return { reactor, monitor, agents: killed };
|
|
1491
|
+
}
|
|
1492
|
+
}
|