loreli 0.0.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +670 -97
- package/bin/loreli.js +89 -0
- package/package.json +74 -14
- package/packages/README.md +101 -0
- package/packages/action/README.md +98 -0
- package/packages/action/src/index.js +656 -0
- package/packages/agent/README.md +517 -0
- package/packages/agent/src/backends/claude.js +287 -0
- package/packages/agent/src/backends/codex.js +278 -0
- package/packages/agent/src/backends/cursor.js +294 -0
- package/packages/agent/src/backends/index.js +329 -0
- package/packages/agent/src/base.js +138 -0
- package/packages/agent/src/cli.js +198 -0
- package/packages/agent/src/factory.js +119 -0
- package/packages/agent/src/index.js +12 -0
- package/packages/agent/src/models.js +141 -0
- package/packages/agent/src/output.js +62 -0
- package/packages/agent/src/session.js +162 -0
- package/packages/agent/src/trace.js +186 -0
- package/packages/config/README.md +833 -0
- package/packages/config/src/defaults.js +134 -0
- package/packages/config/src/index.js +192 -0
- package/packages/config/src/schema.js +273 -0
- package/packages/config/src/validate.js +160 -0
- package/packages/context/README.md +165 -0
- package/packages/context/src/index.js +198 -0
- package/packages/hub/README.md +338 -0
- package/packages/hub/src/base.js +154 -0
- package/packages/hub/src/github.js +1558 -0
- package/packages/hub/src/index.js +79 -0
- package/packages/hub/src/labels.js +48 -0
- package/packages/identity/README.md +288 -0
- package/packages/identity/src/index.js +620 -0
- package/packages/identity/src/themes/avatar.js +217 -0
- package/packages/identity/src/themes/digimon.js +217 -0
- package/packages/identity/src/themes/dragonball.js +217 -0
- package/packages/identity/src/themes/lotr.js +217 -0
- package/packages/identity/src/themes/marvel.js +217 -0
- package/packages/identity/src/themes/pokemon.js +217 -0
- package/packages/identity/src/themes/starwars.js +217 -0
- package/packages/identity/src/themes/transformers.js +217 -0
- package/packages/identity/src/themes/zelda.js +217 -0
- package/packages/knowledge/README.md +237 -0
- package/packages/knowledge/src/index.js +412 -0
- package/packages/log/README.md +93 -0
- package/packages/log/src/index.js +252 -0
- package/packages/marker/README.md +200 -0
- package/packages/marker/src/index.js +184 -0
- package/packages/mcp/README.md +279 -0
- package/packages/mcp/instructions.md +121 -0
- package/packages/mcp/scaffolding/.agents/skills/loreli-context/SKILL.md +89 -0
- package/packages/mcp/scaffolding/ISSUE_TEMPLATE/config.yml +2 -0
- package/packages/mcp/scaffolding/ISSUE_TEMPLATE/loreli.yml +83 -0
- package/packages/mcp/scaffolding/loreli.yml +453 -0
- package/packages/mcp/scaffolding/mcp-configs/.codex/config.toml +3 -0
- package/packages/mcp/scaffolding/mcp-configs/.cursor/mcp.json +11 -0
- package/packages/mcp/scaffolding/mcp-configs/.mcp.json +11 -0
- package/packages/mcp/scaffolding/pull-request.md +23 -0
- package/packages/mcp/src/index.js +571 -0
- package/packages/mcp/src/tools/agents.js +429 -0
- package/packages/mcp/src/tools/context.js +199 -0
- package/packages/mcp/src/tools/github.js +1199 -0
- package/packages/mcp/src/tools/hitl.js +149 -0
- package/packages/mcp/src/tools/index.js +17 -0
- package/packages/mcp/src/tools/start.js +835 -0
- package/packages/mcp/src/tools/status.js +146 -0
- package/packages/mcp/src/tools/work.js +124 -0
- package/packages/orchestrator/README.md +192 -0
- package/packages/orchestrator/src/index.js +1226 -0
- package/packages/planner/README.md +168 -0
- package/packages/planner/src/index.js +1166 -0
- package/packages/review/README.md +129 -0
- package/packages/review/src/index.js +1283 -0
- package/packages/risk/README.md +119 -0
- package/packages/risk/src/index.js +428 -0
- package/packages/session/README.md +165 -0
- package/packages/session/src/index.js +215 -0
- package/packages/test-utils/README.md +96 -0
- package/packages/test-utils/src/index.js +354 -0
- package/packages/tmux/README.md +261 -0
- package/packages/tmux/src/index.js +452 -0
- package/packages/workflow/README.md +313 -0
- package/packages/workflow/src/index.js +481 -0
- package/packages/workflow/src/proof-of-life.js +74 -0
- package/packages/workspace/README.md +143 -0
- package/packages/workspace/src/index.js +1076 -0
- package/index.js +0 -8
package/packages/orchestrator/src/index.js
@@ -0,0 +1,1226 @@
+import { rm, writeFile, mkdir } from 'node:fs/promises';
+import { join } from 'node:path';
+import { createHash } from 'node:crypto';
+import { EventEmitter } from 'node:events';
+import { Factory, Session, output } from 'loreli/agent';
+import { Tmux } from 'loreli/tmux';
+import { prepare } from 'loreli/workspace';
+import { pick, side, capability } from 'loreli/identity';
+import { logger } from 'loreli/log';
+
+const log = logger('orchestrator');
+
+/**
+ * Fatal error patterns that indicate a backend infrastructure failure
+ * (not a task failure). When these appear in an agent's pane output
+ * shortly after spawn, the backend is broken and should be degraded.
+ *
+ * @type {RegExp[]}
+ */
+const FATAL_PATTERNS = [
+  /budget[_ ]*(has been )?exceeded/i,
+  /rate[_ ]?limit[_ ]*exceeded/i,
+  /hit your usage[_ ]*limit/i,
+  /authentication[_ ]*(error|failed)/i,
+  /invalid[_ ]*api[_ ]*key/i,
+  /quota[_ ]*exceeded/i,
+  /insufficient[_ ]*quota/i
+];
+
+/**
+ * Check if pane output contains fatal API error patterns.
+ *
+ * @param {string} output - Pane content from agent.capture().
+ * @returns {boolean} True if a fatal error pattern is found.
+ */
+function hasFatalError(output) {
+  if (!output) return false;
+  return FATAL_PATTERNS.some(function match(p) { return p.test(output); });
+}
+
+/**
+ * Generic agent lifecycle coordinator via EventEmitter.
+ *
+ * Manages spawn/shutdown/kill, reactor polling, stall detection,
+ * and activity tracking. Contains zero role-specific logic — all
+ * planner/action/review behavior lives in the role packages that
+ * subscribe to lifecycle events.
+ *
+ * @extends EventEmitter
+ * @fires Orchestrator#spawned
+ * @fires Orchestrator#removed
+ * @fires Orchestrator#stall
+ */
+export class Orchestrator extends EventEmitter {
+  /**
+   * @param {object} opts
+   * @param {object} opts.hub - Hub instance for git hosting operations.
+   * @param {object} opts.identityRegistry - Registry for agent identities.
+   * @param {object} opts.backendRegistry - Registry for agent backends.
+   * @param {object} opts.storage - Persistent storage instance.
+   * @param {object} [opts.config] - Config instance from loreli/config.
+   */
+  constructor({ hub, identityRegistry, backendRegistry, storage, config }) {
+    super();
+
+    /** @type {object} Hub for git hosting. */
+    this.hub = hub;
+
+    /** @type {object} Identity registry. */
+    this.identityRegistry = identityRegistry;
+
+    /** @type {object} Backend registry. */
+    this.backendRegistry = backendRegistry;
+
+    /** @type {Factory} Agent factory — centralizes the create+spawn pipeline. */
+    this.factory = new Factory({ backends: backendRegistry, identities: identityRegistry, config });
+
+    /** @type {object} Storage for session persistence. */
+    this.storage = storage;
+
+    /** @type {object|null} Config instance from loreli/config. */
+    this.cfg = config ?? null;
+
+    /** @type {Map<string, object>} Active agents by name. */
+    this.agents = new Map();
+
+    /** @type {string|null} Current session ID. */
+    this.sessionId = null;
+
+    /** @type {object|null} MCP client identity. */
+    this.clientIdentity = null;
+
+    /** @type {string|null} Target repository in "owner/name" format. */
+    this.repo = null;
+
+    /** @type {number} Stall timeout in ms. */
+    this.stallTimeout = this.cfg?.get?.('timeouts.stall') ?? 600000;
+
+    /** @type {number} Delay before checking if a freshly spawned agent died. */
+    this.rapidDeathDelay = this.cfg?.get?.('timeouts.rapidDeath') ?? 15000;
+
+    /** @type {NodeJS.Timeout|null} Stall detection interval handle. */
+    this._monitorHandle = null;
+
+    /** @type {Map<string, string>} Last known activity timestamp per agent. */
+    this._lastActivity = new Map();
+
+    /** @type {Map<string, string>} MD5 hash of last captured pane output per agent for tmux-based activity detection. */
+    this._lastPaneHash = new Map();
+
+    /** @type {NodeJS.Timeout|null} Reactor polling interval handle. */
+    this._watchHandle = null;
+
+    /** @type {Map<string, function>} Registered reactor handlers by name. */
+    this._handlers = new Map();
+
+    /**
+     * Names already claimed by other participants, discovered from
+     * GitHub claim comments and PR branches during reactor ticks.
+     * Populated as a zero-cost side effect of data the reactor
+     * already fetches — no additional API calls needed.
+     *
+     * @type {Set<string>}
+     */
+    this.takenNames = new Set();
+
+    /**
+     * Agents we've removed locally (killed or shut down). Used to
+     * distinguish "we killed this agent" from "a foreign orchestrator
+     * owns this agent" during proof-of-life decisions.
+     *
+     * @type {Set<string>}
+     */
+    this._removed = new Set();
+
+    /**
+     * Registered workflows by role. Populated during start so
+     * scale() can collect demand signals from each workflow.
+     *
+     * @type {Map<string, object>}
+     */
+    this.workflows = new Map();
+
+    /**
+     * Last spawn timestamp per role for cooldown enforcement.
+     * Prevents thrashing when demand fluctuates between ticks.
+     *
+     * @type {Map<string, number>}
+     */
+    this._lastSpawn = new Map();
+  }
+
+  // ── Seed (Identity Discovery) ────────────────────────
+
+  /**
+   * One-time seed of `takenNames` from open PR branch names.
+   *
+   * Called before the first `acquire()` — before any reactor tick has
+   * populated the set. After the first tick, the reactor keeps the
+   * set current as a side effect of its normal data flow.
+   *
+   * Idempotent: only runs once per orchestrator lifetime.
+   *
+   * @returns {Promise<void>}
+   */
+  async seed() {
+    if (this._seeded) return;
+    this._seeded = true;
+
+    if (!this.hub || !this.repo) return;
+
+    try {
+      const prs = await this.hub.pulls(this.repo, { state: 'open' });
+      for (const pr of prs) {
+        const slash = pr.head?.indexOf('/');
+        if (slash > 0) this.takenNames.add(pr.head.slice(0, slash));
+      }
+      log.debug(`seed: discovered ${this.takenNames.size} taken names from open PRs`);
+    } catch (err) {
+      log.debug(`seed: skipped — ${err.message}`);
+    }
+  }
+
+  // ── Lifecycle ─────────────────────────────────────────
+
+  /**
+   * Spawn and register an agent with comprehensive rollback.
+   *
+   * Tracks each step of the spawn process and undoes them in reverse
+   * order on failure. Each completed step is tracked and unwound
+   * individually on error.
+   *
+   * @param {object} agent - Agent instance.
+   * @returns {Promise<void>}
+   * @fires Orchestrator#spawned
+   */
+  async spawn(agent) {
+    log.info(`spawning agent: ${agent.identity.name} (${agent.role})`);
+
+    /** @type {Array<{step: string, undo: function}>} Completed steps for rollback. */
+    const completed = [];
+
+    try {
+      // Step 1: Spawn the agent process (tmux pane, API session, etc.)
+      await agent.spawn();
+      completed.push({
+        step: 'spawn',
+        undo: async function undoSpawn() {
+          try { await agent.stop(); } catch (e) { log.debug(`rollback stop failed: ${e.message}`); }
+        }
+      });
+
+      // Step 2: Register in the agents map
+      this.agents.set(agent.identity.name, agent);
+      completed.push({
+        step: 'register',
+        undo: () => { this.agents.delete(agent.identity.name); }
+      });
+
+      // Step 3: Record activity timestamp
+      this._lastActivity.set(agent.identity.name, new Date().toISOString());
+      completed.push({
+        step: 'activity',
+        undo: () => { this._lastActivity.delete(agent.identity.name); }
+      });
+
+      log.info(`agent spawned: ${agent.identity.name}`);
+
+      // Schedule rapid-death detection: if the agent dies or shows
+      // fatal API errors within rapidDeathDelay of spawning, the
+      // backend is likely broken (budget exhaustion, API outage).
+      // Mark it as degraded so scale() falls back to cursor-agent.
+      //
+      // Two checks:
+      //   1. Dead pane → agent exited on error
+      //   2. Stuck-alive → agent stays alive but shows budget/rate-limit
+      //      errors in its pane output
+      if (agent.backend && agent.alive) {
+        const backend = agent.backend;
+        const name = agent.identity.name;
+        const registry = this.backendRegistry;
+        const self = this;
+        const timer = setTimeout(async function rapidDeathCheck() {
+          try {
+            const alive = await agent.alive();
+            if (!alive && agent.state !== 'dormant') {
+              log.warn(`rapid death: ${name} died within ${self.rapidDeathDelay}ms of spawn — marking ${backend} degraded`);
+              registry?.recordFailure(backend);
+              if (agent.canTransition?.('dormant')) agent.transition('dormant');
+              self.emit('rapid-death', { name, backend });
+              return;
+            }
+
+            if (alive && agent.capture) {
+              const output = await agent.capture();
+              if (hasFatalError(output)) {
+                log.warn(`stuck-alive: ${name} shows fatal API error — marking ${backend} degraded`);
+                registry?.recordFailure(backend);
+                try { await agent.stop(); } catch { /* stop can fail */ }
+                self.emit('rapid-death', { name, backend, reason: 'stuck-alive' });
+              }
+            }
+          } catch { /* pane check can fail when session is torn down */ }
+        }, this.rapidDeathDelay);
+        timer.unref();
+      }
+
+      /**
+       * @event Orchestrator#spawned
+       * @type {object}
+       * @property {string} name - Agent identity name.
+       * @property {string} role - Agent role.
+       * @property {string} provider - AI provider.
+       */
+      this.emit('spawned', {
+        name: agent.identity.name,
+        role: agent.role,
+        provider: agent.identity.provider
+      });
+    } catch (err) {
+      log.error(`spawn failed: ${agent.identity.name} — ${err.message}`);
+
+      // Rollback completed steps in reverse order
+      for (let i = completed.length - 1; i >= 0; i--) {
+        try {
+          await completed[i].undo();
+          log.debug(`rollback: undid "${completed[i].step}" for ${agent.identity.name}`);
+        } catch (rollbackErr) {
+          // Never mask the original error with a rollback failure
+          log.warn(`rollback "${completed[i].step}" failed: ${rollbackErr.message}`);
+        }
+      }
+
+      // Identity release is the caller's responsibility (add_agent tool
+      // acquired it, add_agent tool releases it). Spawn only rolls back
+      // the steps it owns: process, registration, and activity tracking.
+      throw err;
+    }
+  }
+
+  /**
+   * Gracefully shut down an agent using a 3-phase protocol:
+   *
+   * 1. Send a structured shutdown request with a unique `requestId`
+   * 2. Poll for acknowledgment — the agent can accept or continue working
+   * 3. On acknowledgment or timeout, force stop and clean up
+   *
+   * @param {string} name - Agent identity name.
+   * @param {number} [timeout] - Timeout before force kill.
+   * @param {string} [reason] - Reason for shutdown (passed to agent).
+   * @returns {Promise<{acknowledged: boolean}>} Whether the agent acknowledged.
+   * @fires Orchestrator#removed
+   */
+  async shutdown(name, timeout, reason) {
+    const agent = this.agents.get(name);
+    if (!agent) throw new Error(`Agent "${name}" not found`);
+
+    const shutdownTimeout = timeout ?? this.cfg?.get?.('timeouts.shutdown') ?? 60000;
+    const pollInterval = this.cfg?.get?.('timeouts.poll') ?? 2000;
+    const requestId = `shutdown-${Date.now()}@${name}`;
+
+    log.info(`shutting down agent: ${name} (timeout: ${shutdownTimeout}ms, requestId: ${requestId})`);
+
+    // Phase 1: Send structured shutdown request
+    const shutdownMessage = [
+      `**Shutdown Request** (id: ${requestId})`,
+      reason ? `Reason: ${reason}` : '',
+      'Please finish your current work and shut down gracefully.',
+      'Post a comment or signal when ready.'
+    ].filter(Boolean).join('\n');
+
+    try {
+      await agent.send(shutdownMessage);
+    } catch (err) { log.debug(`shutdown message failed for ${name}: ${err.message}`); }
+
+    // Phase 2: Poll for acknowledgment or timeout
+    let acknowledged = false;
+    const deadline = Date.now() + shutdownTimeout;
+
+    while (Date.now() < deadline) {
+      if (agent.state === 'dormant') {
+        acknowledged = true;
+        break;
+      }
+
+      // Check if the agent is still alive — if it exited on its own,
+      // treat that as implicit acknowledgment
+      try {
+        const alive = await agent.alive?.();
+        if (alive === false) {
+          acknowledged = true;
+          break;
+        }
+      } catch (err) { log.debug(`alive check during shutdown polling failed: ${err.message}`); }
+
+      await new Promise(function wait(r) { setTimeout(r, pollInterval); });
+    }
+
+    if (acknowledged) {
+      log.info(`agent ${name} acknowledged shutdown`);
+    } else {
+      log.warn(`agent ${name} did not acknowledge shutdown within ${shutdownTimeout}ms, force stopping`);
+    }
+
+    // Phase 3: Force stop and clean up
+    const session = agent.session;
+    await this.snapshot(name, agent);
+    await agent.stop();
+
+    // Clean up the agent's workspace directory when configured
+    if (this.cfg?.get?.('workspace.cleanup') && agent.cwd) {
+      try {
+        await rm(agent.cwd, { recursive: true, force: true });
+        log.info(`workspace cleaned: ${name} (${agent.cwd})`);
+      } catch (err) { log.debug(`workspace cleanup failed for ${name}: ${err.message}`); }
+    }
+
+    this.agents.delete(name);
+    this._lastActivity.delete(name);
+    this._lastPaneHash.delete(name);
+    this._removed.add(name);
+    this.identityRegistry.release(agent.identity);
+    log.info(`agent shut down: ${name}`);
+
+    if (this.agents.size === 0 && session) await this._pruneSession(session);
+
+    /**
+     * @event Orchestrator#removed
+     * @type {object}
+     * @property {string} name - Agent identity name.
+     * @property {string} reason - 'shutdown' or 'killed'.
+     * @property {object} agent - The removed agent instance.
+     */
+    this.emit('removed', { name, reason: 'shutdown', agent });
+
+    return { acknowledged };
+  }
+
+  /**
+   * Force kill an agent immediately.
+   * Releases any claimed issues so they can be re-claimed.
+   *
+   * @param {string} name - Agent identity name.
+   * @returns {Promise<void>}
+   * @fires Orchestrator#removed
+   */
+  async kill(name) {
+    const agent = this.agents.get(name);
+    if (!agent) throw new Error(`Agent "${name}" not found`);
+
+    log.warn(`force killing agent: ${name}`);
+    const session = agent.session;
+    await this.snapshot(name, agent);
+    await agent.stop();
+
+    // Clean up the agent's workspace directory when configured
+    if (this.cfg?.get?.('workspace.cleanup') && agent.cwd) {
+      try {
+        await rm(agent.cwd, { recursive: true, force: true });
+        log.info(`workspace cleaned: ${name} (${agent.cwd})`);
+      } catch (err) { log.debug(`workspace cleanup failed for ${name}: ${err.message}`); }
+    }
+
+    this.agents.delete(name);
+    this._lastActivity.delete(name);
+    this._lastPaneHash.delete(name);
+    this._removed.add(name);
+    this.identityRegistry.release(agent.identity);
+
+    if (this.agents.size === 0 && session) await this._pruneSession(session);
+
+    this.emit('removed', { name, reason: 'killed', agent });
+  }
+
+  // ── Tmux Cleanup ─────────────────────────────────────
+
+  /**
+   * Resolve the tmux session name from registered agents or config.
+   *
+   * Agents carry their own session name (defaults to 'loreli' but
+   * overridable in tests). This avoids hardcoding the session name
+   * and keeps cleanup aligned with wherever agents actually live.
+   *
+   * @returns {string} The tmux session name.
+   */
+  _session() {
+    for (const agent of this.agents.values()) {
+      if (agent.session) return agent.session;
+    }
+    return this.cfg?.get?.('tmux.session') ?? 'loreli';
+  }
+
+  /**
+   * Destroy the tmux session when no agents remain.
+   *
+   * Safety net that runs after the last agent is removed via
+   * {@link kill} or {@link shutdown}. Individual agent `stop()` calls
+   * kill their own panes, but orphaned panes (from `remain-on-exit`,
+   * crashed backends, or timing gaps) can keep the session alive.
+   * This ensures a clean slate.
+   *
+   * @param {string} session - The tmux session name to prune.
+   * @returns {Promise<void>}
+   */
+  async _pruneSession(session) {
+    if (!Tmux.available()) return;
+    const tmux = new Tmux();
+
+    try {
+      if (await tmux.has(session)) {
+        let paneCount = 0;
+        try {
+          const panes = await tmux.allPanes(session);
+          paneCount = panes.length;
+        } catch { /* session may be in a bad state */ }
+        await tmux.kill(session);
+        log.info(`pruned tmux session "${session}" (${paneCount} orphaned panes) — no agents remain`);
+      }
+    } catch (err) {
+      log.debug(`session prune failed: ${err.message}`);
+    }
+  }
+
+  /**
+   * Garbage-collect orphaned tmux panes not tracked by any agent.
+   *
+   * Lists all panes in the loreli tmux session and kills any that
+   * do not belong to a registered agent. Safe to call at any time —
+   * tracked agent panes are preserved.
+   *
+   * When all panes are orphaned, the session is destroyed entirely.
+   *
+   * @returns {Promise<{killed: number}>} Count of orphaned panes killed.
+   */
+  async gc() {
+    if (!Tmux.available()) return { killed: 0 };
+    const session = this._session();
+    const tmux = new Tmux();
+
+    if (!await tmux.has(session)) return { killed: 0 };
+
+    const tracked = new Set();
+    for (const agent of this.agents.values()) {
+      if (agent.paneId) tracked.add(agent.paneId);
+    }
+
+    let all;
+    try {
+      all = await tmux.allPanes(session);
+    } catch {
+      return { killed: 0 };
+    }
+
+    let killed = 0;
+    for (const pane of all) {
+      if (!tracked.has(pane.id)) {
+        try {
+          await tmux.killPane(pane.id);
+          killed++;
+          log.info(`gc: killed orphaned pane ${pane.id}`);
+        } catch { /* pane may have died between list and kill */ }
+      }
+    }
+
+    // Destroy the session when every pane was orphaned
+    if (killed > 0 && killed === all.length) {
+      try {
+        if (await tmux.has(session)) await tmux.kill(session);
+      } catch { /* session may auto-destroy */ }
+    }
+
+    return { killed };
+  }
+
+  // ── Coordination ──────────────────────────────────────
+
+  /**
+   * Auto-spawn an agent for a given provider and role when one is not
+   * already available. Cross-provider review is the core value proposition;
+   * silently skipping it defeats the purpose, so the orchestrator
+   * proactively enlists the opposing side.
+   *
+   * Delegates to {@link Factory#spawn} for the full creation pipeline.
+   *
+   * @param {string} provider - AI provider to spawn for.
+   * @param {string} role - Agent role ('reviewer', 'action', etc.).
+   * @param {object} [opts] - Additional options.
+   * @returns {Promise<object>} The spawned agent instance.
+   */
+  async enlist(provider, role, opts = {}) {
+    log.info(`enlisting ${role} agent for ${provider} — cross-provider pairing requires it`);
+
+    // Seed taken names before first acquire — one-time cost, zero
+    // ongoing overhead because reactor ticks keep the set current.
+    await this.seed();
+
+    // Build context for the factory so prepare() writes session env vars.
+    // Include home and token so agent subprocesses use the same storage
+    // location and can create a hub for stamped GitHub operations.
+    const context = this.sessionId ? {
+      session: this.sessionId,
+      agent: null, // set after identity is acquired inside factory
+      repo: this.repo,
+      home: this.storage?.home,
+      token: process.env.GITHUB_TOKEN
+    } : undefined;
+
+    // Theme coherence: inherit from an existing agent so antagonist
+    // pairs always share the same theme universe. Only pick from
+    // config when no agents exist yet (first enlistment).
+    const existing = [...this.agents.values()].find(function hasTheme(a) { return a.identity?.theme; });
+    const theme = existing?.identity?.theme ?? pick(this.cfg?.get?.('theme'));
+
+    const agent = await this.factory.create(provider, role, {
+      theme,
+      model: this.cfg?.get?.('model'),
+      config: this.cfg,
+      context,
+      taken: this.takenNames,
+      ...opts
+    });
+
+    // Persist session data BEFORE spawn so the agent's MCP server
+    // subprocess can hydrate from storage on startup. Without this,
+    // the agent's _hydrate() call races against the host's save.
+    if (this.sessionId && this.storage && agent.identity?.name) {
+      const session = new Session({
+        identity: agent.identity.toJSON?.() ?? agent.identity,
+        role,
+        backend: agent.constructor.name,
+        paneId: null, // not yet known
+        repo: this.repo
+      });
+      await this.storage.save(this.sessionId, agent.identity.name, session.toJSON());
+    }
+
+    await this.spawn(agent);
+
+    if (this.sessionId && this.storage && agent.identity?.name && agent.paneId) {
+      const data = await this.storage.load(this.sessionId, agent.identity.name);
+      if (data) {
+        data.paneId = agent.paneId;
+        await this.storage.save(this.sessionId, agent.identity.name, data);
+      }
+    }
+
+    return agent;
+  }
+
+  /**
+   * Record a heartbeat timestamp for an agent. Resets the stall timer.
+   *
+   * @param {string} name - Agent identity name.
+   */
+  activity(name) {
+    this._lastActivity.set(name, new Date().toISOString());
+  }
+
+  /**
+   * Check whether an agent's tmux pane has new output since the last
+   * check. When output changes, the agent is provably active — update
+   * `_lastActivity` and return `true`. This is the ground truth signal
+   * that feeds into `health()` and, transitively, the proof-of-life
+   * responder.
+   *
+   * @param {string} name - Agent identity name.
+   * @returns {Promise<boolean>} True when pane output changed (agent is active).
+   */
+  async refresh(name) {
+    const agent = this.agents.get(name);
+    if (!agent?.capture) return false;
+
+    try {
+      const output = await agent.capture(50);
+      const digest = createHash('md5').update(output ?? '').digest('hex');
+      const prev = this._lastPaneHash.get(name);
+      this._lastPaneHash.set(name, digest);
+      if (prev && prev !== digest) {
+        this._lastActivity.set(name, new Date().toISOString());
+        return true;
+      }
+      return false;
+    } catch { return false; }
+  }
+
+  // ── Death Snapshot ──────────────────────────────────
+
+  /**
+   * Capture a dying agent's pane output and write it to the session
+   * logs directory as `<name>.death.log`. Requires `remain-on-exit`
+   * on the pane so output survives after the process exits.
+   *
+   * Non-fatal: silently skips when session or storage is unavailable,
+   * and logs a warning when capture or write fails.
+   *
+   * @param {string} name - Agent identity name.
+   * @param {object} agent - Agent instance with a `capture()` method.
+   * @returns {Promise<void>}
+   */
+  async snapshot(name, agent) {
+    if (!this.sessionId || !this.storage?.home) return;
+    try {
+      const raw = await agent.capture();
+      const cleaned = output.clean(raw);
+      const dir = join(this.storage.home, 'sessions', this.sessionId, 'logs');
+      await mkdir(dir, { recursive: true });
+      await writeFile(join(dir, `${name}.death.log`), cleaned, 'utf8');
+      log.info(`death snapshot written: ${name}`);
+    } catch (err) {
+      log.warn(`death snapshot failed for ${name}: ${err.message}`);
+    }
+  }
+
+  // ── Reconcile (Liveness Sweep) ───────────────────────
+
+  /**
+   * Synchronize registry state with actual tmux pane liveness.
+   *
+   * Iterates all registered agents and checks whether their underlying
+   * process is still alive. Dead agents are stopped, removed from the
+   * registry, and their identities released — closing the gap where
+   * team_status reports agents as "working" after their processes have
+   * exited.
+   *
+   * Dormant agents are skipped because they are already in a terminal
+   * state and are handled by the stall monitor's dormant cleanup.
+   *
+   * @returns {Promise<string[]>} Names of agents that were reconciled.
+   * @fires Orchestrator#removed
+   */
+  async reconcile() {
+    const reconciled = [];
+    const entries = [...this.agents.entries()];
+
+    for (const [name, agent] of entries) {
+      if (agent.state === 'dormant') continue;
+
+      let alive;
+      try {
+        alive = await agent.alive();
+      } catch {
+        continue;
+      }
+
+      if (alive) continue;
+
+      log.warn(`reconcile: ${name} pane is dead (state was "${agent.state}") — removing`);
+
+      await this.snapshot(name, agent);
+      try { await agent.stop(); } catch { /* pane already dead */ }
+
+      this.agents.delete(name);
+      this._lastActivity.delete(name);
+      this._lastPaneHash.delete(name);
+      this._removed.add(name);
+      this.identityRegistry.release(agent.identity);
+
+      this.emit('removed', { name, reason: 'reconciled', agent });
+      reconciled.push(name);
+    }
+
+    return reconciled;
+  }
+
+  // ── Health ───────────────────────────────────────────
+
+  /**
+   * Multi-signal health assessment for a named agent.
+   *
+   * Evaluates: tmux/process liveness, agent state machine, activity
+   * recency (last orchestrator interaction), and captured output length.
+   *
+   * @param {string} name - Agent identity name.
+   * @returns {Promise<{alive: boolean, status: string, details: string, outputLength?: number}>}
+   */
+  async health(name) {
+    const agent = this.agents.get(name);
+    if (!agent) return { alive: false, status: 'not-found', details: `agent ${name} not registered` };
+
+    if (agent.state === 'dormant')
+      return { alive: false, status: 'unhealthy', details: `agent ${name} is dormant` };
+
+    const paneAlive = await agent.alive();
+    if (!paneAlive)
+      return { alive: false, status: 'unhealthy', details: `agent ${name} pane is dead` };
+
+    const output = await agent.capture().catch(function noop() { return ''; });
+    const outputLength = output.length;
+
+    // Local proof-of-life: check tmux pane for real activity before
+    // declaring staleness. Agent-side MCP tool calls don't update
+    // _lastActivity, but they DO produce terminal output.
+    await this.refresh(name);
+
+    const lastTs = this._lastActivity.get(name);
+    if (lastTs) {
+      const elapsed = Date.now() - new Date(lastTs).getTime();
+      if (elapsed > this.stallTimeout)
+        return { alive: true, status: 'unhealthy', details: `agent ${name} activity is stale (${Math.round(elapsed / 1000)}s)`, outputLength };
+    }
+
+    return { alive: true, status: 'healthy', details: `agent ${name} is active`, outputLength };
+  }
+
+  // ── Scaling ──────────────────────────────────────────
+
+  /**
+   * Demand-driven scaling: collect demand signals from all registered
+   * workflows, then spawn agents to fill deficits — respecting global
+   * caps, per-role caps, rate limits, and cooldowns.
+   *
+   * Runs after all workflow handlers in the reactor chain so demand
+   * signals reflect the latest hydrated state. Spawns are capped by
+   * `maxPerTick` to avoid resource spikes.
+   *
+   * Priority order: reviewer > risk > action > planner. Reviewers
+   * unblock merges, risk unblocks reviewers, so they are filled first
+   * when at the global cap.
+   *
+   * @param {string} repo - Repository in "owner/name" format.
+   * @returns {Promise<Array<{role: string, agent: string}>>} Spawned agents.
+   */
+  async scale(repo) {
+    if (!this.workflows.size) return [];
+
+    const maxAgents = this.cfg?.get?.('scaling.maxAgents') ?? 8;
+    const maxPerRole = this.cfg?.get?.('scaling.maxPerRole') ?? {};
+    const maxPerTick = this.cfg?.get?.('scaling.maxPerTick') ?? 2;
+    const cooldown = this.cfg?.get?.('scaling.cooldown') ?? 30000;
+
+    // Collect demand signals from each workflow.
+    // Use the workflow's static `role` for enlist() — the map key may
+    // differ (e.g. map key 'review' vs static role 'reviewer'). The
+    // static role is what agents, demand(), and pair() filter on.
+    const signals = [];
+    for (const [key, workflow] of this.workflows) {
+      const role = workflow.constructor.role ?? key;
+      try {
+        const signal = await workflow.demand(repo);
+        signals.push({ role, ...signal });
+        log.debug(`scale: ${role} { workload: ${signal.workload}, supply: ${signal.supply}, deficit: ${signal.deficit} }`);
+      } catch (err) {
+        log.warn(`scale: demand() failed for ${role}: ${err.message}`);
+      }
+    }
+
+    // Global cap — only count live agents. Dormant agents stay in
+    // the map after exit but consume no resources.
+    const live = [...this.agents.values()]
+      .filter(function alive(a) { return a.state !== 'dormant'; }).length;
+    if (live >= maxAgents) {
+      log.debug(`scale: at global cap (${live}/${maxAgents}) — skipping`);
+      return [];
+    }
+
+    // Sort by priority: reviewer > risk > action > planner
+    const priority = { reviewer: 0, risk: 1, action: 2, planner: 3 };
+    signals.sort(function byPriority(a, b) {
+      return (priority[a.role] ?? 99) - (priority[b.role] ?? 99);
+    });
+
+    const spawned = [];
+    let budget = maxPerTick;
+    const now = Date.now();
+
+    for (const signal of signals) {
+      if (budget <= 0) break;
+      if (signal.deficit <= 0) continue;
+
+      const { role } = signal;
+      const roleCap = maxPerRole[role] ?? Infinity;
+      const current = [...this.agents.values()]
+        .filter(function liveRole(a) { return a.role === role && a.state !== 'dormant'; }).length;
+
+      // Per-role cap
+      if (current >= roleCap) {
+        log.debug(`scale: ${role} at role cap (${current}/${roleCap}) — skipping`);
+        continue;
+      }
+
+      // Cooldown check
+      const last = this._lastSpawn.get(role);
+      if (last && now - last < cooldown) {
+        log.debug(`scale: ${role} in cooldown (${Math.round((now - last) / 1000)}s < ${Math.round(cooldown / 1000)}s) — skipping`);
+        continue;
+      }
+
+      // How many to spawn: min of deficit, role headroom, global headroom, tick budget
+      const roleHeadroom = roleCap - current;
+      const globalHeadroom = maxAgents - live;
+      const count = Math.min(signal.deficit, roleHeadroom, globalHeadroom, budget);
+
+      if (count <= 0) continue;
+
+      await this.backendRegistry.discover();
+
+      for (let i = 0; i < count; i++) {
+        try {
+          const providers = this.backendRegistry.providers();
+          const info = capability(providers);
+          let provider;
+
+          // Reviewer and risk agents must oppose the action agents they
+          // pair with — pair() finds the cross-provider match, so
+          // spawning on the same side means scan()/assess() can never
+          // dispatch them. Pick the opposite of existing action agents.
+          //
+          // When no live action agent exists (dead/foreign), fall back
+          // to PR label metadata via demand().actionProviders so the
+          // correct opposing side is still selected.
+          let actionProvider = (role === 'reviewer' || role === 'risk')
+            ? [...this.agents.values()]
+              .find(function hasProvider(a) { return a.role === 'action' && a.identity?.provider; })
+              ?.identity?.provider
+            : null;
+
+          if (!actionProvider && signal.actionProviders?.length) {
+            actionProvider = signal.actionProviders[0];
+          }
+
+          if (actionProvider) {
+            const opposite = side(actionProvider) === 'yin' ? 'yang' : 'yin';
+            provider = providers.find(function isOpp(p) { return side(p) === opposite; });
+            if (!provider) {
+              if (info.mode === 'single') {
+                provider = providers[0] ?? null;
+                if (!provider) {
+                  log.warn(`scale: no providers available for ${role} — skipping`);
+                  break;
+                }
+                log.info(`scale: no ${opposite}-side provider available for ${role} — using single-side fallback (${provider})`);
+              } else {
+                log.warn(`scale: no ${opposite}-side provider available for ${role} — skipping`);
+                break;
+              }
+            }
+          } else if (role === 'reviewer' || role === 'risk') {
+            provider = providers[0];
+          } else {
+            // Action / planner: balance yin/yang within their own role
+            const yinCount = [...this.agents.values()]
+              .filter(function isRole(a) { return a.role === role && side(a.identity?.provider) === 'yin'; }).length;
+            const yangCount = [...this.agents.values()]
+              .filter(function isRole(a) { return a.role === role && side(a.identity?.provider) === 'yang'; }).length;
+
+            if (yinCount <= yangCount) {
+              provider = providers.find(function isYin(p) { return side(p) === 'yin'; }) ?? providers[0];
+            } else {
+              provider = providers.find(function isYang(p) { return side(p) === 'yang'; }) ?? providers[0];
+            }
+          }
+          // Final defensive default for non-pairing roles and mixed
+          // provider sets: if discovery returned providers but side
+          // selection yielded none, use the first discovered provider.
+          if (!provider && providers.length) provider = providers[0];
+
+          if (!provider) {
+            log.warn(`scale: no provider resolved for ${role} — skipping`);
+            break;
+          }
+
+          // Backend degradation fallback: when the native backend for
+          // the selected provider is degraded (repeated rapid failures
+          // like budget exhaustion), switch to the cursor-* virtual
+          // provider. Same yin/yang side, different API path.
+          if (!provider.startsWith('cursor-') && this.backendRegistry.degraded) {
+            let nativeBackend = null;
+            for (const info of this.backendRegistry.discovered.values()) {
+              if (info.provider === provider) { nativeBackend = info.name; break; }
+            }
+            if (nativeBackend && this.backendRegistry.degraded(nativeBackend)) {
+              const variant = `cursor-${provider}`;
+              if (providers.includes(variant)) {
+                log.info(`scale: ${nativeBackend} degraded, falling back to ${variant}`);
+                provider = variant;
+              }
+            }
+          }
+
+          const agent = await this.enlist(provider, role);
+          spawned.push({ role, agent: agent.identity.name });
+          log.info(`scale: spawned ${agent.identity.name} as ${role} (${provider})`);
+        } catch (err) {
+          log.warn(`scale: failed to spawn ${role} agent: ${err.message}`);
+          break;
+        }
+      }
+
+      if (spawned.length) {
+        this._lastSpawn.set(role, now);
+        budget -= count;
+      }
+    }
+
+    if (spawned.length) {
+      log.info(`scale: spawned ${spawned.length} agents — ${spawned.map(function fmt(s) { return `${s.role}:${s.agent}`; }).join(', ')}`);
+    }
+
+    return spawned;
+  }
+
+  // ── Reap (Global Safety Net) ─────────────────────────
+
+  /**
+   * Global safety net — stop dormant agents when no work remains.
+   *
+   * Role-specific reaping is handled by each workflow's `reap()` method
+   * (planner-reap, review-reap, action-reap). This global reap runs
+   * last in the reactor chain and catches anything the workflow-level
+   * reaps missed.
+   *
+   * Only **dormant** agents are eligible for global reaping. Agents in
+   * any other state (spawned, working, reviewing, etc.) are actively
+   * doing something and must not be interrupted. Stall detection is
+   * the separate mechanism that handles truly stuck agents.
+   *
+   * All of these conditions must be true before reaping:
+   * 1. No open issues with the `loreli` label
+   * 2. No open pull requests (PRs in flight = work not done)
+   * 3. Every remaining agent is dormant
+   *
+   * Registered as the last reactor handler so it runs after all
+   * workflow-specific reaps have completed.
+   *
+   * @param {string} repo - Repository in "owner/name" format.
+   * @returns {Promise<void>}
+   */
+  async reap(repo) {
+    if (!this.hub || !this.agents.size) return;
+
+    try {
+      // Only dormant agents are candidates — anything else is active work
+      const all = [...this.agents.values()];
+      const dormant = all.filter(function idle(a) { return a.state === 'dormant'; });
+      if (dormant.length === 0) {
+        log.debug(`reap: ${all.length} agents still active — skipping`);
+        return;
+      }
+
+      const open = await this.hub.issues(repo, { state: 'open' });
+      const loreli = open.filter(function tagged(i) {
+        return i.labels?.some?.(function isLoreli(l) {
+          const name = typeof l === 'string' ? l : l.name;
+          return name === 'loreli';
+        });
+      });
+
+      if (loreli.length > 0) return;
+
+      // Check for open PRs — work is still in flight if PRs exist
+      const prs = await this.hub.pulls(repo, { state: 'open' });
+      if (prs.length > 0) return;
+
+      log.info(`reap: no open loreli issues or PRs — stopping ${dormant.length} dormant agents`);
+
+      for (const agent of dormant) {
+        try {
+          await this.kill(agent.identity.name);
+          log.info(`reap: stopped ${agent.identity.name}`);
+        } catch (err) {
+          log.warn(`reap: failed to stop ${agent.identity.name}: ${err.message}`);
+        }
+      }
+    } catch (err) {
+      log.debug(`reap: skipped — ${err.message}`);
+    }
+  }
+
+  // ── Reactor (Polling Loop) ────────────────────────────
+
+  /**
+   * Register a reactor handler called on every tick.
+   * Role packages register their scan/forward/land handlers here.
+   *
+   * @param {string} name - Handler name (for logging/debugging).
+   * @param {function(string): Promise<void>} handler - Async function receiving repo.
+   */
+  register(name, handler) {
+    this._handlers.set(name, handler);
+    log.info(`reactor handler registered: ${name}`);
+  }
+
+  /**
+   * Execute one polling iteration: call all registered handlers.
+   * Each handler receives the repo string. Errors in one handler
+   * do not prevent subsequent handlers from running.
+   *
+   * @param {string} repo - Repository in "owner/name" format.
+   * @returns {Promise<void>}
+   */
+  async tick(repo) {
+    log.debug('tick start');
+
+    for (const [name, handler] of this._handlers) {
+      try {
+        await handler(repo);
+      } catch (err) {
+        log.error(`reactor handler "${name}" failed: ${err.message}`);
+      }
+    }
+
+    log.debug('tick end');
+  }
+
+  /**
+   * Start the polling reactor loop using a self-scheduling setTimeout
+   * chain. Each tick runs to completion before the next is scheduled,
+   * eliminating overlap risk without a reentrant guard. This pattern
+   * is more reliable than setInterval for async callbacks — setInterval
+   * does not await its callback, and combined with unref() it can
+   * silently stop firing after heavy async operations (observed in
+   * production after agent spawn via tmux).
+   *
+   * @param {string} repo - Repository in "owner/name" format.
+   */
+  watch(repo) {
+    if (this._watchHandle) return;
+    this.repo = repo;
+
+    const interval = this.cfg?.get?.('watch.interval') ?? 60000;
+    const self = this;
+
+    log.info(`watcher started for ${repo} (interval: ${interval}ms)`);
+
+    function schedule() {
+      self._watchHandle = setTimeout(async function cycle() {
+        await self.tick(repo);
+        if (self._watchHandle) schedule();
+      }, interval);
+
+      self._watchHandle.unref();
+    }
+
+    schedule();
+  }
+
+  /**
+   * Stop the polling reactor loop.
+   */
+  unwatch() {
+    if (this._watchHandle) {
+      clearTimeout(this._watchHandle);
+      this._watchHandle = null;
+      log.info('watcher stopped');
+    }
+  }
+
+  // ── Monitor (Stall Detection) ─────────────────────────
+
+  /**
+   * Start the stall detection monitor with 3-tier escalation.
+   *
+   * Tier 1 — Nudge (1x stall timeout):
+   *   Send a message to the agent asking for status. Emits 'stall'
+   *   with severity 'nudge'.
+   *
+   * Tier 2 — Warning (2x stall timeout):
+   *   Emits 'stall' with severity 'warning'. Role packages can
+   *   subscribe and post GitHub comments.
+   *
+   * Tier 3 — Critical (3x stall timeout):
+   *   Kills the agent and emits 'stall' with severity 'critical'.
+   *
+   * @fires Orchestrator#stall
+   */
+  monitor() {
+    if (this._monitorHandle) return;
+    log.info('stall detection monitor started');
+
+    const stallTimeout = this.stallTimeout;
+    const nudge = this.cfg?.get?.('timeouts.nudge') ?? true;
+    const self = this;
+
+    this._monitorHandle = setInterval(async function checkStalls() {
+      // Reconcile first: detect dead panes and clean up before
+      // running stall-escalation checks. Without this, dead agents
+      // linger until stallTimeout elapses.
+      await self.reconcile();
+
+      const now = Date.now();
+
+      // Snapshot keys to avoid mutation during iteration — Tier 3
+      // calls kill() which deletes from self.agents mid-loop.
+      const snapshot = [...self.agents.entries()];
+      for (const [name, agent] of snapshot) {
+        // Dormant agents are kept registered so downstream workflows
+        // (e.g. review scan) can still match them by identity. Skip
+        // nudge/warning, but allow Tier 3 kill for eventual cleanup.
+        if (agent.state === 'dormant') {
+          const last = self._lastActivity.get(name);
+          if (!last) continue;
+          const elapsed = now - new Date(last).getTime();
+          if (elapsed > stallTimeout * 3) {
+            self.agents.delete(name);
+            self._lastActivity.delete(name);
+            self._lastPaneHash.delete(name);
+            log.info(`stall: cleaned up dormant agent ${name}`);
+          }
+          continue;
+        }
+
+        const last = self._lastActivity.get(name);
+        if (!last) continue;
+
+        // Local proof-of-life: check tmux pane for real activity
+        // before escalating. If output changed, _lastActivity is
+        // now current and the tier checks naturally skip.
+        if (await self.refresh(name)) continue;
+
+        const elapsed = now - new Date(last).getTime();
+
+        if (elapsed > stallTimeout * 3) {
+          // Tier 3: Critically stalled — kill and emit
+          log.error(`agent ${name} critically stalled (${Math.round(elapsed / 1000)}s) — killing`);
+
+          /**
+           * @event Orchestrator#stall
+           * @type {object}
+           * @property {string} name - Agent identity name.
+           * @property {number} elapsed - Time since last activity in ms.
+           * @property {string} severity - 'nudge', 'warning', or 'critical'.
+           */
+          self.emit('stall', { name, elapsed, severity: 'critical' });
+
+          try {
+            await self.kill(name);
+            log.info(`stall tier 3: agent ${name} killed`);
+          } catch (err) {
+            log.error(`stall tier 3: kill failed for ${name}: ${err.message}`);
+          }
+        } else if (elapsed > stallTimeout * 2) {
+          // Tier 2: Warning
+          log.warn(`agent ${name} stalled tier 2 (${Math.round(elapsed / 1000)}s)`);
+          self.emit('stall', { name, elapsed, severity: 'warning' });
+        } else if (elapsed > stallTimeout) {
+          // Tier 1: Optional nudge
+          if (nudge) {
+            log.warn(`agent ${name} stalled tier 1 (${Math.round(elapsed / 1000)}s) - nudging`);
+            try {
+              await agent.send('You appear to be stalled. Please report your current status or continue working.');
+              // Activity resets only when the agent responds (via MCP tool
+              // calls or hub activity), NOT when we nudge it. Resetting
+              // here would trap agents at tier 1 forever.
+            } catch (err) { log.debug(`monitor: nudge failed for ${name}: ${err.message}`); }
+          } else {
+            log.warn(`agent ${name} stalled tier 1 (${Math.round(elapsed / 1000)}s) - nudge suppressed by config`);
+          }
+          self.emit('stall', { name, elapsed, severity: 'nudge' });
+        }
+      }
+    }, Math.min(stallTimeout / 2, 60000));
+
+    this._monitorHandle.unref();
+  }
+
+  /**
+   * Stop the stall detection monitor.
+   */
+  stopMonitor() {
+    if (this._monitorHandle) {
+      clearInterval(this._monitorHandle);
+      this._monitorHandle = null;
+      log.info('stall detection monitor stopped');
+    }
+  }
+}
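
For orientation, here is a minimal wiring sketch of how a host process might drive the `Orchestrator` class added above. The constructor options, the `spawned`/`removed`/`stall` events, and the `watch()`/`monitor()` calls mirror the diffed source; everything else (the `loreli/orchestrator` import path, the injected hub/registry/storage/config instances, and the repo value) is a hypothetical placeholder that this diff does not confirm.

```js
// Hypothetical wiring sketch. Only the Orchestrator API shown here
// (constructor options, events, watch/monitor) comes from the diffed
// source; the import path and injected dependencies are assumptions.
import { Orchestrator } from 'loreli/orchestrator';

export async function start({ hub, identityRegistry, backendRegistry, storage, config, repo }) {
  const orchestrator = new Orchestrator({ hub, identityRegistry, backendRegistry, storage, config });

  // Lifecycle events emitted by spawn(), shutdown()/kill(), and the stall monitor.
  orchestrator.on('spawned', ({ name, role, provider }) => console.log(`spawned ${name} (${role}/${provider})`));
  orchestrator.on('removed', ({ name, reason }) => console.log(`removed ${name} (${reason})`));
  orchestrator.on('stall', ({ name, severity }) => console.log(`stall: ${name} escalated to ${severity}`));

  orchestrator.watch(repo);   // start the self-scheduling reactor loop
  orchestrator.monitor();     // start 3-tier stall detection

  return orchestrator;
}
```

Note that `watch()` also records `repo` on the instance, so `seed()` and any handlers added via `register()` operate against the same repository.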