loreli 0.0.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. package/LICENSE +1 -1
  2. package/README.md +670 -97
  3. package/bin/loreli.js +89 -0
  4. package/package.json +74 -14
  5. package/packages/README.md +101 -0
  6. package/packages/action/README.md +98 -0
  7. package/packages/action/src/index.js +656 -0
  8. package/packages/agent/README.md +517 -0
  9. package/packages/agent/src/backends/claude.js +287 -0
  10. package/packages/agent/src/backends/codex.js +278 -0
  11. package/packages/agent/src/backends/cursor.js +294 -0
  12. package/packages/agent/src/backends/index.js +329 -0
  13. package/packages/agent/src/base.js +138 -0
  14. package/packages/agent/src/cli.js +198 -0
  15. package/packages/agent/src/factory.js +119 -0
  16. package/packages/agent/src/index.js +12 -0
  17. package/packages/agent/src/models.js +141 -0
  18. package/packages/agent/src/output.js +62 -0
  19. package/packages/agent/src/session.js +162 -0
  20. package/packages/agent/src/trace.js +186 -0
  21. package/packages/config/README.md +833 -0
  22. package/packages/config/src/defaults.js +134 -0
  23. package/packages/config/src/index.js +192 -0
  24. package/packages/config/src/schema.js +273 -0
  25. package/packages/config/src/validate.js +160 -0
  26. package/packages/context/README.md +165 -0
  27. package/packages/context/src/index.js +198 -0
  28. package/packages/hub/README.md +338 -0
  29. package/packages/hub/src/base.js +154 -0
  30. package/packages/hub/src/github.js +1558 -0
  31. package/packages/hub/src/index.js +79 -0
  32. package/packages/hub/src/labels.js +48 -0
  33. package/packages/identity/README.md +288 -0
  34. package/packages/identity/src/index.js +620 -0
  35. package/packages/identity/src/themes/avatar.js +217 -0
  36. package/packages/identity/src/themes/digimon.js +217 -0
  37. package/packages/identity/src/themes/dragonball.js +217 -0
  38. package/packages/identity/src/themes/lotr.js +217 -0
  39. package/packages/identity/src/themes/marvel.js +217 -0
  40. package/packages/identity/src/themes/pokemon.js +217 -0
  41. package/packages/identity/src/themes/starwars.js +217 -0
  42. package/packages/identity/src/themes/transformers.js +217 -0
  43. package/packages/identity/src/themes/zelda.js +217 -0
  44. package/packages/knowledge/README.md +237 -0
  45. package/packages/knowledge/src/index.js +412 -0
  46. package/packages/log/README.md +93 -0
  47. package/packages/log/src/index.js +252 -0
  48. package/packages/marker/README.md +200 -0
  49. package/packages/marker/src/index.js +184 -0
  50. package/packages/mcp/README.md +279 -0
  51. package/packages/mcp/instructions.md +121 -0
  52. package/packages/mcp/scaffolding/.agents/skills/loreli-context/SKILL.md +89 -0
  53. package/packages/mcp/scaffolding/ISSUE_TEMPLATE/config.yml +2 -0
  54. package/packages/mcp/scaffolding/ISSUE_TEMPLATE/loreli.yml +83 -0
  55. package/packages/mcp/scaffolding/loreli.yml +453 -0
  56. package/packages/mcp/scaffolding/mcp-configs/.codex/config.toml +3 -0
  57. package/packages/mcp/scaffolding/mcp-configs/.cursor/mcp.json +11 -0
  58. package/packages/mcp/scaffolding/mcp-configs/.mcp.json +11 -0
  59. package/packages/mcp/scaffolding/pull-request.md +23 -0
  60. package/packages/mcp/src/index.js +571 -0
  61. package/packages/mcp/src/tools/agents.js +429 -0
  62. package/packages/mcp/src/tools/context.js +199 -0
  63. package/packages/mcp/src/tools/github.js +1199 -0
  64. package/packages/mcp/src/tools/hitl.js +149 -0
  65. package/packages/mcp/src/tools/index.js +17 -0
  66. package/packages/mcp/src/tools/start.js +835 -0
  67. package/packages/mcp/src/tools/status.js +146 -0
  68. package/packages/mcp/src/tools/work.js +124 -0
  69. package/packages/orchestrator/README.md +192 -0
  70. package/packages/orchestrator/src/index.js +1226 -0
  71. package/packages/planner/README.md +168 -0
  72. package/packages/planner/src/index.js +1166 -0
  73. package/packages/review/README.md +129 -0
  74. package/packages/review/src/index.js +1283 -0
  75. package/packages/risk/README.md +119 -0
  76. package/packages/risk/src/index.js +428 -0
  77. package/packages/session/README.md +165 -0
  78. package/packages/session/src/index.js +215 -0
  79. package/packages/test-utils/README.md +96 -0
  80. package/packages/test-utils/src/index.js +354 -0
  81. package/packages/tmux/README.md +261 -0
  82. package/packages/tmux/src/index.js +452 -0
  83. package/packages/workflow/README.md +313 -0
  84. package/packages/workflow/src/index.js +481 -0
  85. package/packages/workflow/src/proof-of-life.js +74 -0
  86. package/packages/workspace/README.md +143 -0
  87. package/packages/workspace/src/index.js +1076 -0
  88. package/index.js +0 -8
@@ -0,0 +1,1226 @@
1
+ import { rm, writeFile, mkdir } from 'node:fs/promises';
2
+ import { join } from 'node:path';
3
+ import { createHash } from 'node:crypto';
4
+ import { EventEmitter } from 'node:events';
5
+ import { Factory, Session, output } from 'loreli/agent';
6
+ import { Tmux } from 'loreli/tmux';
7
+ import { prepare } from 'loreli/workspace';
8
+ import { pick, side, capability } from 'loreli/identity';
9
+ import { logger } from 'loreli/log';
10
+
11
+ const log = logger('orchestrator');
12
+
13
/**
 * Fatal error patterns that indicate a backend infrastructure failure
 * (not a task failure). When these appear in an agent's pane output
 * shortly after spawn, the backend is broken and should be degraded.
 *
 * @type {RegExp[]}
 */
const FATAL_PATTERNS = [
  /budget[_ ]*(has been )?exceeded/i,
  /rate[_ ]?limit[_ ]*exceeded/i,
  /hit your usage[_ ]*limit/i,
  /authentication[_ ]*(error|failed)/i,
  /invalid[_ ]*api[_ ]*key/i,
  /quota[_ ]*exceeded/i,
  /insufficient[_ ]*quota/i
];

/**
 * Check if pane output contains fatal API error patterns.
 *
 * Parameter renamed from `output` to `text` so it no longer shadows
 * the `output` helper imported from 'loreli/agent' at module scope.
 *
 * @param {string} text - Pane content from agent.capture().
 * @returns {boolean} True if a fatal error pattern is found.
 */
function hasFatalError(text) {
  if (!text) return false;
  return FATAL_PATTERNS.some((p) => p.test(text));
}
40
+
41
+ /**
42
+ * Generic agent lifecycle coordinator via EventEmitter.
43
+ *
44
+ * Manages spawn/shutdown/kill, reactor polling, stall detection,
45
+ * and activity tracking. Contains zero role-specific logic — all
46
+ * planner/action/review behavior lives in the role packages that
47
+ * subscribe to lifecycle events.
48
+ *
49
+ * @extends EventEmitter
50
+ * @fires Orchestrator#spawned
51
+ * @fires Orchestrator#removed
52
+ * @fires Orchestrator#stall
53
+ */
54
+ export class Orchestrator extends EventEmitter {
55
+ /**
56
+ * @param {object} opts
57
+ * @param {object} opts.hub - Hub instance for git hosting operations.
58
+ * @param {object} opts.identityRegistry - Registry for agent identities.
59
+ * @param {object} opts.backendRegistry - Registry for agent backends.
60
+ * @param {object} opts.storage - Persistent storage instance.
61
+ * @param {object} [opts.config] - Config instance from loreli/config.
62
+ */
63
+ constructor({ hub, identityRegistry, backendRegistry, storage, config }) {
64
+ super();
65
+
66
+ /** @type {object} Hub for git hosting. */
67
+ this.hub = hub;
68
+
69
+ /** @type {object} Identity registry. */
70
+ this.identityRegistry = identityRegistry;
71
+
72
+ /** @type {object} Backend registry. */
73
+ this.backendRegistry = backendRegistry;
74
+
75
+ /** @type {Factory} Agent factory — centralizes the create+spawn pipeline. */
76
+ this.factory = new Factory({ backends: backendRegistry, identities: identityRegistry, config });
77
+
78
+ /** @type {object} Storage for session persistence. */
79
+ this.storage = storage;
80
+
81
+ /** @type {object|null} Config instance from loreli/config. */
82
+ this.cfg = config ?? null;
83
+
84
+ /** @type {Map<string, object>} Active agents by name. */
85
+ this.agents = new Map();
86
+
87
+ /** @type {string|null} Current session ID. */
88
+ this.sessionId = null;
89
+
90
+ /** @type {object|null} MCP client identity. */
91
+ this.clientIdentity = null;
92
+
93
+ /** @type {string|null} Target repository in "owner/name" format. */
94
+ this.repo = null;
95
+
96
+ /** @type {number} Stall timeout in ms. */
97
+ this.stallTimeout = this.cfg?.get?.('timeouts.stall') ?? 600000;
98
+
99
+ /** @type {number} Delay before checking if a freshly spawned agent died. */
100
+ this.rapidDeathDelay = this.cfg?.get?.('timeouts.rapidDeath') ?? 15000;
101
+
102
+ /** @type {NodeJS.Timeout|null} Stall detection interval handle. */
103
+ this._monitorHandle = null;
104
+
105
+ /** @type {Map<string, string>} Last known activity timestamp per agent. */
106
+ this._lastActivity = new Map();
107
+
108
+ /** @type {Map<string, string>} MD5 hash of last captured pane output per agent for tmux-based activity detection. */
109
+ this._lastPaneHash = new Map();
110
+
111
+ /** @type {NodeJS.Timeout|null} Reactor polling interval handle. */
112
+ this._watchHandle = null;
113
+
114
+ /** @type {Map<string, function>} Registered reactor handlers by name. */
115
+ this._handlers = new Map();
116
+
117
+ /**
118
+ * Names already claimed by other participants, discovered from
119
+ * GitHub claim comments and PR branches during reactor ticks.
120
+ * Populated as a zero-cost side effect of data the reactor
121
+ * already fetches — no additional API calls needed.
122
+ *
123
+ * @type {Set<string>}
124
+ */
125
+ this.takenNames = new Set();
126
+
127
+ /**
128
+ * Agents we've removed locally (killed or shut down). Used to
129
+ * distinguish "we killed this agent" from "a foreign orchestrator
130
+ * owns this agent" during proof-of-life decisions.
131
+ *
132
+ * @type {Set<string>}
133
+ */
134
+ this._removed = new Set();
135
+
136
+ /**
137
+ * Registered workflows by role. Populated during start so
138
+ * scale() can collect demand signals from each workflow.
139
+ *
140
+ * @type {Map<string, object>}
141
+ */
142
+ this.workflows = new Map();
143
+
144
+ /**
145
+ * Last spawn timestamp per role for cooldown enforcement.
146
+ * Prevents thrashing when demand fluctuates between ticks.
147
+ *
148
+ * @type {Map<string, number>}
149
+ */
150
+ this._lastSpawn = new Map();
151
+ }
152
+
153
+ // ── Seed (Identity Discovery) ────────────────────────
154
+
155
+ /**
156
+ * One-time seed of `takenNames` from open PR branch names.
157
+ *
158
+ * Called before the first `acquire()` — before any reactor tick has
159
+ * populated the set. After the first tick, the reactor keeps the
160
+ * set current as a side effect of its normal data flow.
161
+ *
162
+ * Idempotent: only runs once per orchestrator lifetime.
163
+ *
164
+ * @returns {Promise<void>}
165
+ */
166
+ async seed() {
167
+ if (this._seeded) return;
168
+ this._seeded = true;
169
+
170
+ if (!this.hub || !this.repo) return;
171
+
172
+ try {
173
+ const prs = await this.hub.pulls(this.repo, { state: 'open' });
174
+ for (const pr of prs) {
175
+ const slash = pr.head?.indexOf('/');
176
+ if (slash > 0) this.takenNames.add(pr.head.slice(0, slash));
177
+ }
178
+ log.debug(`seed: discovered ${this.takenNames.size} taken names from open PRs`);
179
+ } catch (err) {
180
+ log.debug(`seed: skipped — ${err.message}`);
181
+ }
182
+ }
183
+
184
+ // ── Lifecycle ─────────────────────────────────────────
185
+
186
+ /**
187
+ * Spawn and register an agent with comprehensive rollback.
188
+ *
189
+ * Tracks each step of the spawn process and undoes them in reverse
190
+ * order on failure. Each completed step is tracked and unwound
191
+ * individually on error.
192
+ *
193
+ * @param {object} agent - Agent instance.
194
+ * @returns {Promise<void>}
195
+ * @fires Orchestrator#spawned
196
+ */
197
+ async spawn(agent) {
198
+ log.info(`spawning agent: ${agent.identity.name} (${agent.role})`);
199
+
200
+ /** @type {Array<{step: string, undo: function}>} Completed steps for rollback. */
201
+ const completed = [];
202
+
203
+ try {
204
+ // Step 1: Spawn the agent process (tmux pane, API session, etc.)
205
+ await agent.spawn();
206
+ completed.push({
207
+ step: 'spawn',
208
+ undo: async function undoSpawn() {
209
+ try { await agent.stop(); } catch (e) { log.debug(`rollback stop failed: ${e.message}`); }
210
+ }
211
+ });
212
+
213
+ // Step 2: Register in the agents map
214
+ this.agents.set(agent.identity.name, agent);
215
+ completed.push({
216
+ step: 'register',
217
+ undo: () => { this.agents.delete(agent.identity.name); }
218
+ });
219
+
220
+ // Step 3: Record activity timestamp
221
+ this._lastActivity.set(agent.identity.name, new Date().toISOString());
222
+ completed.push({
223
+ step: 'activity',
224
+ undo: () => { this._lastActivity.delete(agent.identity.name); }
225
+ });
226
+
227
+ log.info(`agent spawned: ${agent.identity.name}`);
228
+
229
+ // Schedule rapid-death detection: if the agent dies or shows
230
+ // fatal API errors within rapidDeathDelay of spawning, the
231
+ // backend is likely broken (budget exhaustion, API outage).
232
+ // Mark it as degraded so scale() falls back to cursor-agent.
233
+ //
234
+ // Two checks:
235
+ // 1. Dead pane → agent exited on error
236
+ // 2. Stuck-alive → agent stays alive but shows budget/rate-limit
237
+ // errors in its pane output
238
+ if (agent.backend && agent.alive) {
239
+ const backend = agent.backend;
240
+ const name = agent.identity.name;
241
+ const registry = this.backendRegistry;
242
+ const self = this;
243
+ const timer = setTimeout(async function rapidDeathCheck() {
244
+ try {
245
+ const alive = await agent.alive();
246
+ if (!alive && agent.state !== 'dormant') {
247
+ log.warn(`rapid death: ${name} died within ${self.rapidDeathDelay}ms of spawn — marking ${backend} degraded`);
248
+ registry?.recordFailure(backend);
249
+ if (agent.canTransition?.('dormant')) agent.transition('dormant');
250
+ self.emit('rapid-death', { name, backend });
251
+ return;
252
+ }
253
+
254
+ if (alive && agent.capture) {
255
+ const output = await agent.capture();
256
+ if (hasFatalError(output)) {
257
+ log.warn(`stuck-alive: ${name} shows fatal API error — marking ${backend} degraded`);
258
+ registry?.recordFailure(backend);
259
+ try { await agent.stop(); } catch { /* stop can fail */ }
260
+ self.emit('rapid-death', { name, backend, reason: 'stuck-alive' });
261
+ }
262
+ }
263
+ } catch { /* pane check can fail when session is torn down */ }
264
+ }, this.rapidDeathDelay);
265
+ timer.unref();
266
+ }
267
+
268
+ /**
269
+ * @event Orchestrator#spawned
270
+ * @type {object}
271
+ * @property {string} name - Agent identity name.
272
+ * @property {string} role - Agent role.
273
+ * @property {string} provider - AI provider.
274
+ */
275
+ this.emit('spawned', {
276
+ name: agent.identity.name,
277
+ role: agent.role,
278
+ provider: agent.identity.provider
279
+ });
280
+ } catch (err) {
281
+ log.error(`spawn failed: ${agent.identity.name} — ${err.message}`);
282
+
283
+ // Rollback completed steps in reverse order
284
+ for (let i = completed.length - 1; i >= 0; i--) {
285
+ try {
286
+ await completed[i].undo();
287
+ log.debug(`rollback: undid "${completed[i].step}" for ${agent.identity.name}`);
288
+ } catch (rollbackErr) {
289
+ // Never mask the original error with a rollback failure
290
+ log.warn(`rollback "${completed[i].step}" failed: ${rollbackErr.message}`);
291
+ }
292
+ }
293
+
294
+ // Identity release is the caller's responsibility (add_agent tool
295
+ // acquired it, add_agent tool releases it). Spawn only rolls back
296
+ // the steps it owns: process, registration, and activity tracking.
297
+ throw err;
298
+ }
299
+ }
300
+
301
+ /**
302
+ * Gracefully shut down an agent using a 3-phase protocol:
303
+ *
304
+ * 1. Send a structured shutdown request with a unique `requestId`
305
+ * 2. Poll for acknowledgment — the agent can accept or continue working
306
+ * 3. On acknowledgment or timeout, force stop and clean up
307
+ *
308
+ * @param {string} name - Agent identity name.
309
+ * @param {number} [timeout] - Timeout before force kill.
310
+ * @param {string} [reason] - Reason for shutdown (passed to agent).
311
+ * @returns {Promise<{acknowledged: boolean}>} Whether the agent acknowledged.
312
+ * @fires Orchestrator#removed
313
+ */
314
+ async shutdown(name, timeout, reason) {
315
+ const agent = this.agents.get(name);
316
+ if (!agent) throw new Error(`Agent "${name}" not found`);
317
+
318
+ const shutdownTimeout = timeout ?? this.cfg?.get?.('timeouts.shutdown') ?? 60000;
319
+ const pollInterval = this.cfg?.get?.('timeouts.poll') ?? 2000;
320
+ const requestId = `shutdown-${Date.now()}@${name}`;
321
+
322
+ log.info(`shutting down agent: ${name} (timeout: ${shutdownTimeout}ms, requestId: ${requestId})`);
323
+
324
+ // Phase 1: Send structured shutdown request
325
+ const shutdownMessage = [
326
+ `**Shutdown Request** (id: ${requestId})`,
327
+ reason ? `Reason: ${reason}` : '',
328
+ 'Please finish your current work and shut down gracefully.',
329
+ 'Post a comment or signal when ready.'
330
+ ].filter(Boolean).join('\n');
331
+
332
+ try {
333
+ await agent.send(shutdownMessage);
334
+ } catch (err) { log.debug(`shutdown message failed for ${name}: ${err.message}`); }
335
+
336
+ // Phase 2: Poll for acknowledgment or timeout
337
+ let acknowledged = false;
338
+ const deadline = Date.now() + shutdownTimeout;
339
+
340
+ while (Date.now() < deadline) {
341
+ if (agent.state === 'dormant') {
342
+ acknowledged = true;
343
+ break;
344
+ }
345
+
346
+ // Check if the agent is still alive — if it exited on its own,
347
+ // treat that as implicit acknowledgment
348
+ try {
349
+ const alive = await agent.alive?.();
350
+ if (alive === false) {
351
+ acknowledged = true;
352
+ break;
353
+ }
354
+ } catch (err) { log.debug(`alive check during shutdown polling failed: ${err.message}`); }
355
+
356
+ await new Promise(function wait(r) { setTimeout(r, pollInterval); });
357
+ }
358
+
359
+ if (acknowledged) {
360
+ log.info(`agent ${name} acknowledged shutdown`);
361
+ } else {
362
+ log.warn(`agent ${name} did not acknowledge shutdown within ${shutdownTimeout}ms, force stopping`);
363
+ }
364
+
365
+ // Phase 3: Force stop and clean up
366
+ const session = agent.session;
367
+ await this.snapshot(name, agent);
368
+ await agent.stop();
369
+
370
+ // Clean up the agent's workspace directory when configured
371
+ if (this.cfg?.get?.('workspace.cleanup') && agent.cwd) {
372
+ try {
373
+ await rm(agent.cwd, { recursive: true, force: true });
374
+ log.info(`workspace cleaned: ${name} (${agent.cwd})`);
375
+ } catch (err) { log.debug(`workspace cleanup failed for ${name}: ${err.message}`); }
376
+ }
377
+
378
+ this.agents.delete(name);
379
+ this._lastActivity.delete(name);
380
+ this._lastPaneHash.delete(name);
381
+ this._removed.add(name);
382
+ this.identityRegistry.release(agent.identity);
383
+ log.info(`agent shut down: ${name}`);
384
+
385
+ if (this.agents.size === 0 && session) await this._pruneSession(session);
386
+
387
+ /**
388
+ * @event Orchestrator#removed
389
+ * @type {object}
390
+ * @property {string} name - Agent identity name.
391
+ * @property {string} reason - 'shutdown' or 'killed'.
392
+ * @property {object} agent - The removed agent instance.
393
+ */
394
+ this.emit('removed', { name, reason: 'shutdown', agent });
395
+
396
+ return { acknowledged };
397
+ }
398
+
399
+ /**
400
+ * Force kill an agent immediately.
401
+ * Releases any claimed issues so they can be re-claimed.
402
+ *
403
+ * @param {string} name - Agent identity name.
404
+ * @returns {Promise<void>}
405
+ * @fires Orchestrator#removed
406
+ */
407
+ async kill(name) {
408
+ const agent = this.agents.get(name);
409
+ if (!agent) throw new Error(`Agent "${name}" not found`);
410
+
411
+ log.warn(`force killing agent: ${name}`);
412
+ const session = agent.session;
413
+ await this.snapshot(name, agent);
414
+ await agent.stop();
415
+
416
+ // Clean up the agent's workspace directory when configured
417
+ if (this.cfg?.get?.('workspace.cleanup') && agent.cwd) {
418
+ try {
419
+ await rm(agent.cwd, { recursive: true, force: true });
420
+ log.info(`workspace cleaned: ${name} (${agent.cwd})`);
421
+ } catch (err) { log.debug(`workspace cleanup failed for ${name}: ${err.message}`); }
422
+ }
423
+
424
+ this.agents.delete(name);
425
+ this._lastActivity.delete(name);
426
+ this._lastPaneHash.delete(name);
427
+ this._removed.add(name);
428
+ this.identityRegistry.release(agent.identity);
429
+
430
+ if (this.agents.size === 0 && session) await this._pruneSession(session);
431
+
432
+ this.emit('removed', { name, reason: 'killed', agent });
433
+ }
434
+
435
+ // ── Tmux Cleanup ─────────────────────────────────────
436
+
437
+ /**
438
+ * Resolve the tmux session name from registered agents or config.
439
+ *
440
+ * Agents carry their own session name (defaults to 'loreli' but
441
+ * overridable in tests). This avoids hardcoding the session name
442
+ * and keeps cleanup aligned with wherever agents actually live.
443
+ *
444
+ * @returns {string} The tmux session name.
445
+ */
446
+ _session() {
447
+ for (const agent of this.agents.values()) {
448
+ if (agent.session) return agent.session;
449
+ }
450
+ return this.cfg?.get?.('tmux.session') ?? 'loreli';
451
+ }
452
+
453
+ /**
454
+ * Destroy the tmux session when no agents remain.
455
+ *
456
+ * Safety net that runs after the last agent is removed via
457
+ * {@link kill} or {@link shutdown}. Individual agent `stop()` calls
458
+ * kill their own panes, but orphaned panes (from `remain-on-exit`,
459
+ * crashed backends, or timing gaps) can keep the session alive.
460
+ * This ensures a clean slate.
461
+ *
462
+ * @param {string} session - The tmux session name to prune.
463
+ * @returns {Promise<void>}
464
+ */
465
+ async _pruneSession(session) {
466
+ if (!Tmux.available()) return;
467
+ const tmux = new Tmux();
468
+
469
+ try {
470
+ if (await tmux.has(session)) {
471
+ let paneCount = 0;
472
+ try {
473
+ const panes = await tmux.allPanes(session);
474
+ paneCount = panes.length;
475
+ } catch { /* session may be in a bad state */ }
476
+ await tmux.kill(session);
477
+ log.info(`pruned tmux session "${session}" (${paneCount} orphaned panes) — no agents remain`);
478
+ }
479
+ } catch (err) {
480
+ log.debug(`session prune failed: ${err.message}`);
481
+ }
482
+ }
483
+
484
+ /**
485
+ * Garbage-collect orphaned tmux panes not tracked by any agent.
486
+ *
487
+ * Lists all panes in the loreli tmux session and kills any that
488
+ * do not belong to a registered agent. Safe to call at any time —
489
+ * tracked agent panes are preserved.
490
+ *
491
+ * When all panes are orphaned, the session is destroyed entirely.
492
+ *
493
+ * @returns {Promise<{killed: number}>} Count of orphaned panes killed.
494
+ */
495
+ async gc() {
496
+ if (!Tmux.available()) return { killed: 0 };
497
+ const session = this._session();
498
+ const tmux = new Tmux();
499
+
500
+ if (!await tmux.has(session)) return { killed: 0 };
501
+
502
+ const tracked = new Set();
503
+ for (const agent of this.agents.values()) {
504
+ if (agent.paneId) tracked.add(agent.paneId);
505
+ }
506
+
507
+ let all;
508
+ try {
509
+ all = await tmux.allPanes(session);
510
+ } catch {
511
+ return { killed: 0 };
512
+ }
513
+
514
+ let killed = 0;
515
+ for (const pane of all) {
516
+ if (!tracked.has(pane.id)) {
517
+ try {
518
+ await tmux.killPane(pane.id);
519
+ killed++;
520
+ log.info(`gc: killed orphaned pane ${pane.id}`);
521
+ } catch { /* pane may have died between list and kill */ }
522
+ }
523
+ }
524
+
525
+ // Destroy the session when every pane was orphaned
526
+ if (killed > 0 && killed === all.length) {
527
+ try {
528
+ if (await tmux.has(session)) await tmux.kill(session);
529
+ } catch { /* session may auto-destroy */ }
530
+ }
531
+
532
+ return { killed };
533
+ }
534
+
535
+ // ── Coordination ──────────────────────────────────────
536
+
537
+ /**
538
+ * Auto-spawn an agent for a given provider and role when one is not
539
+ * already available. Cross-provider review is the core value proposition;
540
+ * silently skipping it defeats the purpose, so the orchestrator
541
+ * proactively enlists the opposing side.
542
+ *
543
+ * Delegates to {@link Factory#spawn} for the full creation pipeline.
544
+ *
545
+ * @param {string} provider - AI provider to spawn for.
546
+ * @param {string} role - Agent role ('reviewer', 'action', etc.).
547
+ * @param {object} [opts] - Additional options.
548
+ * @returns {Promise<object>} The spawned agent instance.
549
+ */
550
+ async enlist(provider, role, opts = {}) {
551
+ log.info(`enlisting ${role} agent for ${provider} — cross-provider pairing requires it`);
552
+
553
+ // Seed taken names before first acquire — one-time cost, zero
554
+ // ongoing overhead because reactor ticks keep the set current.
555
+ await this.seed();
556
+
557
+ // Build context for the factory so prepare() writes session env vars.
558
+ // Include home and token so agent subprocesses use the same storage
559
+ // location and can create a hub for stamped GitHub operations.
560
+ const context = this.sessionId ? {
561
+ session: this.sessionId,
562
+ agent: null, // set after identity is acquired inside factory
563
+ repo: this.repo,
564
+ home: this.storage?.home,
565
+ token: process.env.GITHUB_TOKEN
566
+ } : undefined;
567
+
568
+ // Theme coherence: inherit from an existing agent so antagonist
569
+ // pairs always share the same theme universe. Only pick from
570
+ // config when no agents exist yet (first enlistment).
571
+ const existing = [...this.agents.values()].find(function hasTheme(a) { return a.identity?.theme; });
572
+ const theme = existing?.identity?.theme ?? pick(this.cfg?.get?.('theme'));
573
+
574
+ const agent = await this.factory.create(provider, role, {
575
+ theme,
576
+ model: this.cfg?.get?.('model'),
577
+ config: this.cfg,
578
+ context,
579
+ taken: this.takenNames,
580
+ ...opts
581
+ });
582
+
583
+ // Persist session data BEFORE spawn so the agent's MCP server
584
+ // subprocess can hydrate from storage on startup. Without this,
585
+ // the agent's _hydrate() call races against the host's save.
586
+ if (this.sessionId && this.storage && agent.identity?.name) {
587
+ const session = new Session({
588
+ identity: agent.identity.toJSON?.() ?? agent.identity,
589
+ role,
590
+ backend: agent.constructor.name,
591
+ paneId: null, // not yet known
592
+ repo: this.repo
593
+ });
594
+ await this.storage.save(this.sessionId, agent.identity.name, session.toJSON());
595
+ }
596
+
597
+ await this.spawn(agent);
598
+
599
+ if (this.sessionId && this.storage && agent.identity?.name && agent.paneId) {
600
+ const data = await this.storage.load(this.sessionId, agent.identity.name);
601
+ if (data) {
602
+ data.paneId = agent.paneId;
603
+ await this.storage.save(this.sessionId, agent.identity.name, data);
604
+ }
605
+ }
606
+
607
+ return agent;
608
+ }
609
+
610
+ /**
611
+ * Record a heartbeat timestamp for an agent. Resets the stall timer.
612
+ *
613
+ * @param {string} name - Agent identity name.
614
+ */
615
+ activity(name) {
616
+ this._lastActivity.set(name, new Date().toISOString());
617
+ }
618
+
619
+ /**
620
+ * Check whether an agent's tmux pane has new output since the last
621
+ * check. When output changes, the agent is provably active — update
622
+ * `_lastActivity` and return `true`. This is the ground truth signal
623
+ * that feeds into `health()` and, transitively, the proof-of-life
624
+ * responder.
625
+ *
626
+ * @param {string} name - Agent identity name.
627
+ * @returns {Promise<boolean>} True when pane output changed (agent is active).
628
+ */
629
+ async refresh(name) {
630
+ const agent = this.agents.get(name);
631
+ if (!agent?.capture) return false;
632
+
633
+ try {
634
+ const output = await agent.capture(50);
635
+ const digest = createHash('md5').update(output ?? '').digest('hex');
636
+ const prev = this._lastPaneHash.get(name);
637
+ this._lastPaneHash.set(name, digest);
638
+ if (prev && prev !== digest) {
639
+ this._lastActivity.set(name, new Date().toISOString());
640
+ return true;
641
+ }
642
+ return false;
643
+ } catch { return false; }
644
+ }
645
+
646
+ // ── Death Snapshot ──────────────────────────────────
647
+
648
+ /**
649
+ * Capture a dying agent's pane output and write it to the session
650
+ * logs directory as `<name>.death.log`. Requires `remain-on-exit`
651
+ * on the pane so output survives after the process exits.
652
+ *
653
+ * Non-fatal: silently skips when session or storage is unavailable,
654
+ * and logs a warning when capture or write fails.
655
+ *
656
+ * @param {string} name - Agent identity name.
657
+ * @param {object} agent - Agent instance with a `capture()` method.
658
+ * @returns {Promise<void>}
659
+ */
660
+ async snapshot(name, agent) {
661
+ if (!this.sessionId || !this.storage?.home) return;
662
+ try {
663
+ const raw = await agent.capture();
664
+ const cleaned = output.clean(raw);
665
+ const dir = join(this.storage.home, 'sessions', this.sessionId, 'logs');
666
+ await mkdir(dir, { recursive: true });
667
+ await writeFile(join(dir, `${name}.death.log`), cleaned, 'utf8');
668
+ log.info(`death snapshot written: ${name}`);
669
+ } catch (err) {
670
+ log.warn(`death snapshot failed for ${name}: ${err.message}`);
671
+ }
672
+ }
673
+
674
+ // ── Reconcile (Liveness Sweep) ───────────────────────
675
+
676
+ /**
677
+ * Synchronize registry state with actual tmux pane liveness.
678
+ *
679
+ * Iterates all registered agents and checks whether their underlying
680
+ * process is still alive. Dead agents are stopped, removed from the
681
+ * registry, and their identities released — closing the gap where
682
+ * team_status reports agents as "working" after their processes have
683
+ * exited.
684
+ *
685
+ * Dormant agents are skipped because they are already in a terminal
686
+ * state and are handled by the stall monitor's dormant cleanup.
687
+ *
688
+ * @returns {Promise<string[]>} Names of agents that were reconciled.
689
+ * @fires Orchestrator#removed
690
+ */
691
+ async reconcile() {
692
+ const reconciled = [];
693
+ const entries = [...this.agents.entries()];
694
+
695
+ for (const [name, agent] of entries) {
696
+ if (agent.state === 'dormant') continue;
697
+
698
+ let alive;
699
+ try {
700
+ alive = await agent.alive();
701
+ } catch {
702
+ continue;
703
+ }
704
+
705
+ if (alive) continue;
706
+
707
+ log.warn(`reconcile: ${name} pane is dead (state was "${agent.state}") — removing`);
708
+
709
+ await this.snapshot(name, agent);
710
+ try { await agent.stop(); } catch { /* pane already dead */ }
711
+
712
+ this.agents.delete(name);
713
+ this._lastActivity.delete(name);
714
+ this._lastPaneHash.delete(name);
715
+ this._removed.add(name);
716
+ this.identityRegistry.release(agent.identity);
717
+
718
+ this.emit('removed', { name, reason: 'reconciled', agent });
719
+ reconciled.push(name);
720
+ }
721
+
722
+ return reconciled;
723
+ }
724
+
725
+ // ── Health ───────────────────────────────────────────
726
+
727
+ /**
728
+ * Multi-signal health assessment for a named agent.
729
+ *
730
+ * Evaluates: tmux/process liveness, agent state machine, activity
731
+ * recency (last orchestrator interaction), and captured output length.
732
+ *
733
+ * @param {string} name - Agent identity name.
734
+ * @returns {Promise<{alive: boolean, status: string, details: string, outputLength?: number}>}
735
+ */
736
+ async health(name) {
737
+ const agent = this.agents.get(name);
738
+ if (!agent) return { alive: false, status: 'not-found', details: `agent ${name} not registered` };
739
+
740
+ if (agent.state === 'dormant')
741
+ return { alive: false, status: 'unhealthy', details: `agent ${name} is dormant` };
742
+
743
+ const paneAlive = await agent.alive();
744
+ if (!paneAlive)
745
+ return { alive: false, status: 'unhealthy', details: `agent ${name} pane is dead` };
746
+
747
+ const output = await agent.capture().catch(function noop() { return ''; });
748
+ const outputLength = output.length;
749
+
750
+ // Local proof-of-life: check tmux pane for real activity before
751
+ // declaring staleness. Agent-side MCP tool calls don't update
752
+ // _lastActivity, but they DO produce terminal output.
753
+ await this.refresh(name);
754
+
755
+ const lastTs = this._lastActivity.get(name);
756
+ if (lastTs) {
757
+ const elapsed = Date.now() - new Date(lastTs).getTime();
758
+ if (elapsed > this.stallTimeout)
759
+ return { alive: true, status: 'unhealthy', details: `agent ${name} activity is stale (${Math.round(elapsed / 1000)}s)`, outputLength };
760
+ }
761
+
762
+ return { alive: true, status: 'healthy', details: `agent ${name} is active`, outputLength };
763
+ }
764
+
765
+ // ── Scaling ──────────────────────────────────────────
766
+
767
+ /**
768
+ * Demand-driven scaling: collect demand signals from all registered
769
+ * workflows, then spawn agents to fill deficits — respecting global
770
+ * caps, per-role caps, rate limits, and cooldowns.
771
+ *
772
+ * Runs after all workflow handlers in the reactor chain so demand
773
+ * signals reflect the latest hydrated state. Spawns are capped by
774
+ * `maxPerTick` to avoid resource spikes.
775
+ *
776
+ * Priority order: reviewer > risk > action > planner. Reviewers
777
+ * unblock merges, risk unblocks reviewers, so they are filled first
778
+ * when at the global cap.
779
+ *
780
+ * @param {string} repo - Repository in "owner/name" format.
781
+ * @returns {Promise<Array<{role: string, agent: string}>>} Spawned agents.
782
+ */
783
+ async scale(repo) {
784
+ if (!this.workflows.size) return [];
785
+
786
+ const maxAgents = this.cfg?.get?.('scaling.maxAgents') ?? 8;
787
+ const maxPerRole = this.cfg?.get?.('scaling.maxPerRole') ?? {};
788
+ const maxPerTick = this.cfg?.get?.('scaling.maxPerTick') ?? 2;
789
+ const cooldown = this.cfg?.get?.('scaling.cooldown') ?? 30000;
790
+
791
+ // Collect demand signals from each workflow.
792
+ // Use the workflow's static `role` for enlist() — the map key may
793
+ // differ (e.g. map key 'review' vs static role 'reviewer'). The
794
+ // static role is what agents, demand(), and pair() filter on.
795
+ const signals = [];
796
+ for (const [key, workflow] of this.workflows) {
797
+ const role = workflow.constructor.role ?? key;
798
+ try {
799
+ const signal = await workflow.demand(repo);
800
+ signals.push({ role, ...signal });
801
+ log.debug(`scale: ${role} { workload: ${signal.workload}, supply: ${signal.supply}, deficit: ${signal.deficit} }`);
802
+ } catch (err) {
803
+ log.warn(`scale: demand() failed for ${role}: ${err.message}`);
804
+ }
805
+ }
806
+
807
+ // Global cap — only count live agents. Dormant agents stay in
808
+ // the map after exit but consume no resources.
809
+ const live = [...this.agents.values()]
810
+ .filter(function alive(a) { return a.state !== 'dormant'; }).length;
811
+ if (live >= maxAgents) {
812
+ log.debug(`scale: at global cap (${live}/${maxAgents}) — skipping`);
813
+ return [];
814
+ }
815
+
816
+ // Sort by priority: reviewer > risk > action > planner
817
+ const priority = { reviewer: 0, risk: 1, action: 2, planner: 3 };
818
+ signals.sort(function byPriority(a, b) {
819
+ return (priority[a.role] ?? 99) - (priority[b.role] ?? 99);
820
+ });
821
+
822
+ const spawned = [];
823
+ let budget = maxPerTick;
824
+ const now = Date.now();
825
+
826
+ for (const signal of signals) {
827
+ if (budget <= 0) break;
828
+ if (signal.deficit <= 0) continue;
829
+
830
+ const { role } = signal;
831
+ const roleCap = maxPerRole[role] ?? Infinity;
832
+ const current = [...this.agents.values()]
833
+ .filter(function liveRole(a) { return a.role === role && a.state !== 'dormant'; }).length;
834
+
835
+ // Per-role cap
836
+ if (current >= roleCap) {
837
+ log.debug(`scale: ${role} at role cap (${current}/${roleCap}) — skipping`);
838
+ continue;
839
+ }
840
+
841
+ // Cooldown check
842
+ const last = this._lastSpawn.get(role);
843
+ if (last && now - last < cooldown) {
844
+ log.debug(`scale: ${role} in cooldown (${Math.round((now - last) / 1000)}s < ${Math.round(cooldown / 1000)}s) — skipping`);
845
+ continue;
846
+ }
847
+
848
+ // How many to spawn: min of deficit, role headroom, global headroom, tick budget
849
+ const roleHeadroom = roleCap - current;
850
+ const globalHeadroom = maxAgents - live;
851
+ const count = Math.min(signal.deficit, roleHeadroom, globalHeadroom, budget);
852
+
853
+ if (count <= 0) continue;
854
+
855
+ await this.backendRegistry.discover();
856
+
857
+ for (let i = 0; i < count; i++) {
858
+ try {
859
+ const providers = this.backendRegistry.providers();
860
+ const info = capability(providers);
861
+ let provider;
862
+
863
+ // Reviewer and risk agents must oppose the action agents they
864
+ // pair with — pair() finds the cross-provider match, so
865
+ // spawning on the same side means scan()/assess() can never
866
+ // dispatch them. Pick the opposite of existing action agents.
867
+ //
868
+ // When no live action agent exists (dead/foreign), fall back
869
+ // to PR label metadata via demand().actionProviders so the
870
+ // correct opposing side is still selected.
871
+ let actionProvider = (role === 'reviewer' || role === 'risk')
872
+ ? [...this.agents.values()]
873
+ .find(function hasProvider(a) { return a.role === 'action' && a.identity?.provider; })
874
+ ?.identity?.provider
875
+ : null;
876
+
877
+ if (!actionProvider && signal.actionProviders?.length) {
878
+ actionProvider = signal.actionProviders[0];
879
+ }
880
+
881
+ if (actionProvider) {
882
+ const opposite = side(actionProvider) === 'yin' ? 'yang' : 'yin';
883
+ provider = providers.find(function isOpp(p) { return side(p) === opposite; });
884
+ if (!provider) {
885
+ if (info.mode === 'single') {
886
+ provider = providers[0] ?? null;
887
+ if (!provider) {
888
+ log.warn(`scale: no providers available for ${role} — skipping`);
889
+ break;
890
+ }
891
+ log.info(`scale: no ${opposite}-side provider available for ${role} — using single-side fallback (${provider})`);
892
+ } else {
893
+ log.warn(`scale: no ${opposite}-side provider available for ${role} — skipping`);
894
+ break;
895
+ }
896
+ }
897
+ } else if (role === 'reviewer' || role === 'risk') {
898
+ provider = providers[0];
899
+ } else {
900
+ // Action / planner: balance yin/yang within their own role
901
+ const yinCount = [...this.agents.values()]
902
+ .filter(function isRole(a) { return a.role === role && side(a.identity?.provider) === 'yin'; }).length;
903
+ const yangCount = [...this.agents.values()]
904
+ .filter(function isRole(a) { return a.role === role && side(a.identity?.provider) === 'yang'; }).length;
905
+
906
+ if (yinCount <= yangCount) {
907
+ provider = providers.find(function isYin(p) { return side(p) === 'yin'; }) ?? providers[0];
908
+ } else {
909
+ provider = providers.find(function isYang(p) { return side(p) === 'yang'; }) ?? providers[0];
910
+ }
911
+ }
912
+ // Final defensive default for non-pairing roles and mixed
913
+ // provider sets: if discovery returned providers but side
914
+ // selection yielded none, use the first discovered provider.
915
+ if (!provider && providers.length) provider = providers[0];
916
+
917
+ if (!provider) {
918
+ log.warn(`scale: no provider resolved for ${role} — skipping`);
919
+ break;
920
+ }
921
+
922
+ // Backend degradation fallback: when the native backend for
923
+ // the selected provider is degraded (repeated rapid failures
924
+ // like budget exhaustion), switch to the cursor-* virtual
925
+ // provider. Same yin/yang side, different API path.
926
+ if (!provider.startsWith('cursor-') && this.backendRegistry.degraded) {
927
+ let nativeBackend = null;
928
+ for (const info of this.backendRegistry.discovered.values()) {
929
+ if (info.provider === provider) { nativeBackend = info.name; break; }
930
+ }
931
+ if (nativeBackend && this.backendRegistry.degraded(nativeBackend)) {
932
+ const variant = `cursor-${provider}`;
933
+ if (providers.includes(variant)) {
934
+ log.info(`scale: ${nativeBackend} degraded, falling back to ${variant}`);
935
+ provider = variant;
936
+ }
937
+ }
938
+ }
939
+
940
+ const agent = await this.enlist(provider, role);
941
+ spawned.push({ role, agent: agent.identity.name });
942
+ log.info(`scale: spawned ${agent.identity.name} as ${role} (${provider})`);
943
+ } catch (err) {
944
+ log.warn(`scale: failed to spawn ${role} agent: ${err.message}`);
945
+ break;
946
+ }
947
+ }
948
+
949
+ if (spawned.length) {
950
+ this._lastSpawn.set(role, now);
951
+ budget -= count;
952
+ }
953
+ }
954
+
955
+ if (spawned.length) {
956
+ log.info(`scale: spawned ${spawned.length} agents — ${spawned.map(function fmt(s) { return `${s.role}:${s.agent}`; }).join(', ')}`);
957
+ }
958
+
959
+ return spawned;
960
+ }
961
+
962
+ // ── Reap (Global Safety Net) ─────────────────────────
963
+
964
+ /**
965
+ * Global safety net — stop dormant agents when no work remains.
966
+ *
967
+ * Role-specific reaping is handled by each workflow's `reap()` method
968
+ * (planner-reap, review-reap, action-reap). This global reap runs
969
+ * last in the reactor chain and catches anything the workflow-level
970
+ * reaps missed.
971
+ *
972
+ * Only **dormant** agents are eligible for global reaping. Agents in
973
+ * any other state (spawned, working, reviewing, etc.) are actively
974
+ * doing something and must not be interrupted. Stall detection is
975
+ * the separate mechanism that handles truly stuck agents.
976
+ *
977
+ * All of these conditions must be true before reaping:
978
+ * 1. No open issues with the `loreli` label
979
+ * 2. No open pull requests (PRs in flight = work not done)
980
+ * 3. Every remaining agent is dormant
981
+ *
982
+ * Registered as the last reactor handler so it runs after all
983
+ * workflow-specific reaps have completed.
984
+ *
985
+ * @param {string} repo - Repository in "owner/name" format.
986
+ * @returns {Promise<void>}
987
+ */
988
+ async reap(repo) {
989
+ if (!this.hub || !this.agents.size) return;
990
+
991
+ try {
992
+ // Only dormant agents are candidates — anything else is active work
993
+ const all = [...this.agents.values()];
994
+ const dormant = all.filter(function idle(a) { return a.state === 'dormant'; });
995
+ if (dormant.length === 0) {
996
+ log.debug(`reap: ${all.length} agents still active — skipping`);
997
+ return;
998
+ }
999
+
1000
+ const open = await this.hub.issues(repo, { state: 'open' });
1001
+ const loreli = open.filter(function tagged(i) {
1002
+ return i.labels?.some?.(function isLoreli(l) {
1003
+ const name = typeof l === 'string' ? l : l.name;
1004
+ return name === 'loreli';
1005
+ });
1006
+ });
1007
+
1008
+ if (loreli.length > 0) return;
1009
+
1010
+ // Check for open PRs — work is still in flight if PRs exist
1011
+ const prs = await this.hub.pulls(repo, { state: 'open' });
1012
+ if (prs.length > 0) return;
1013
+
1014
+ log.info(`reap: no open loreli issues or PRs — stopping ${dormant.length} dormant agents`);
1015
+
1016
+ for (const agent of dormant) {
1017
+ try {
1018
+ await this.kill(agent.identity.name);
1019
+ log.info(`reap: stopped ${agent.identity.name}`);
1020
+ } catch (err) {
1021
+ log.warn(`reap: failed to stop ${agent.identity.name}: ${err.message}`);
1022
+ }
1023
+ }
1024
+ } catch (err) {
1025
+ log.debug(`reap: skipped — ${err.message}`);
1026
+ }
1027
+ }
1028
+
1029
+ // ── Reactor (Polling Loop) ────────────────────────────
1030
+
1031
+ /**
1032
+ * Register a reactor handler called on every tick.
1033
+ * Role packages register their scan/forward/land handlers here.
1034
+ *
1035
+ * @param {string} name - Handler name (for logging/debugging).
1036
+ * @param {function(string): Promise<void>} handler - Async function receiving repo.
1037
+ */
1038
+ register(name, handler) {
1039
+ this._handlers.set(name, handler);
1040
+ log.info(`reactor handler registered: ${name}`);
1041
+ }
1042
+
1043
+ /**
1044
+ * Execute one polling iteration: call all registered handlers.
1045
+ * Each handler receives the repo string. Errors in one handler
1046
+ * do not prevent subsequent handlers from running.
1047
+ *
1048
+ * @param {string} repo - Repository in "owner/name" format.
1049
+ * @returns {Promise<void>}
1050
+ */
1051
+ async tick(repo) {
1052
+ log.debug('tick start');
1053
+
1054
+ for (const [name, handler] of this._handlers) {
1055
+ try {
1056
+ await handler(repo);
1057
+ } catch (err) {
1058
+ log.error(`reactor handler "${name}" failed: ${err.message}`);
1059
+ }
1060
+ }
1061
+
1062
+ log.debug('tick end');
1063
+ }
1064
+
1065
+ /**
1066
+ * Start the polling reactor loop using a self-scheduling setTimeout
1067
+ * chain. Each tick runs to completion before the next is scheduled,
1068
+ * eliminating overlap risk without a reentrant guard. This pattern
1069
+ * is more reliable than setInterval for async callbacks — setInterval
1070
+ * does not await its callback, and combined with unref() it can
1071
+ * silently stop firing after heavy async operations (observed in
1072
+ * production after agent spawn via tmux).
1073
+ *
1074
+ * @param {string} repo - Repository in "owner/name" format.
1075
+ */
1076
+ watch(repo) {
1077
+ if (this._watchHandle) return;
1078
+ this.repo = repo;
1079
+
1080
+ const interval = this.cfg?.get?.('watch.interval') ?? 60000;
1081
+ const self = this;
1082
+
1083
+ log.info(`watcher started for ${repo} (interval: ${interval}ms)`);
1084
+
1085
+ function schedule() {
1086
+ self._watchHandle = setTimeout(async function cycle() {
1087
+ await self.tick(repo);
1088
+ if (self._watchHandle) schedule();
1089
+ }, interval);
1090
+
1091
+ self._watchHandle.unref();
1092
+ }
1093
+
1094
+ schedule();
1095
+ }
1096
+
1097
+ /**
1098
+ * Stop the polling reactor loop.
1099
+ */
1100
+ unwatch() {
1101
+ if (this._watchHandle) {
1102
+ clearTimeout(this._watchHandle);
1103
+ this._watchHandle = null;
1104
+ log.info('watcher stopped');
1105
+ }
1106
+ }
1107
+
1108
+ // ── Monitor (Stall Detection) ─────────────────────────
1109
+
1110
  /**
   * Start the stall detection monitor with 3-tier escalation.
   *
   * Tier 1 — Nudge (1x stall timeout):
   *   Send a message to the agent asking for status. Emits 'stall'
   *   with severity 'nudge'. Nudging can be disabled via the
   *   'timeouts.nudge' config flag.
   *
   * Tier 2 — Warning (2x stall timeout):
   *   Emits 'stall' with severity 'warning'. Role packages can
   *   subscribe and post GitHub comments.
   *
   * Tier 3 — Critical (3x stall timeout):
   *   Kills the agent and emits 'stall' with severity 'critical'.
   *
   * Idempotent: calling monitor() while already running is a no-op.
   * The check interval is half the stall timeout, capped at 60s, and
   * the handle is unref()'d so it never keeps the process alive.
   *
   * NOTE(review): setInterval does not await this async callback, so a
   * pass slower than the interval could overlap the next — assumed
   * acceptable given the interval sizing; confirm.
   *
   * @fires Orchestrator#stall
   */
  monitor() {
    if (this._monitorHandle) return;
    log.info('stall detection monitor started');

    const stallTimeout = this.stallTimeout;
    const nudge = this.cfg?.get?.('timeouts.nudge') ?? true;
    // Captured so the interval callback (a plain function) can reach
    // the orchestrator without relying on `this` binding.
    const self = this;

    this._monitorHandle = setInterval(async function checkStalls() {
      // Reconcile first: detect dead panes and clean up before
      // running stall-escalation checks. Without this, dead agents
      // linger until stallTimeout elapses.
      await self.reconcile();

      const now = Date.now();

      // Snapshot keys to avoid mutation during iteration — Tier 3
      // calls kill() which deletes from self.agents mid-loop.
      const snapshot = [...self.agents.entries()];
      for (const [name, agent] of snapshot) {
        // Dormant agents are kept registered so downstream workflows
        // (e.g. review scan) can still match them by identity. Skip
        // nudge/warning, but allow Tier 3 kill for eventual cleanup.
        if (agent.state === 'dormant') {
          const last = self._lastActivity.get(name);
          if (!last) continue;
          const elapsed = now - new Date(last).getTime();
          if (elapsed > stallTimeout * 3) {
            // Quiet removal: no stop()/release here — just drop the
            // bookkeeping entries for the long-dormant agent.
            self.agents.delete(name);
            self._lastActivity.delete(name);
            self._lastPaneHash.delete(name);
            log.info(`stall: cleaned up dormant agent ${name}`);
          }
          continue;
        }

        const last = self._lastActivity.get(name);
        if (!last) continue;

        // Local proof-of-life: check tmux pane for real activity
        // before escalating. If output changed, _lastActivity is
        // now current and the tier checks naturally skip.
        if (await self.refresh(name)) continue;

        const elapsed = now - new Date(last).getTime();

        if (elapsed > stallTimeout * 3) {
          // Tier 3: Critically stalled — kill and emit
          log.error(`agent ${name} critically stalled (${Math.round(elapsed / 1000)}s) — killing`);

          /**
           * @event Orchestrator#stall
           * @type {object}
           * @property {string} name - Agent identity name.
           * @property {number} elapsed - Time since last activity in ms.
           * @property {string} severity - 'nudge', 'warning', or 'critical'.
           */
          self.emit('stall', { name, elapsed, severity: 'critical' });

          try {
            await self.kill(name);
            log.info(`stall tier 3: agent ${name} killed`);
          } catch (err) {
            log.error(`stall tier 3: kill failed for ${name}: ${err.message}`);
          }
        } else if (elapsed > stallTimeout * 2) {
          // Tier 2: Warning
          log.warn(`agent ${name} stalled tier 2 (${Math.round(elapsed / 1000)}s)`);
          self.emit('stall', { name, elapsed, severity: 'warning' });
        } else if (elapsed > stallTimeout) {
          // Tier 1: Optional nudge
          if (nudge) {
            log.warn(`agent ${name} stalled tier 1 (${Math.round(elapsed / 1000)}s) - nudging`);
            try {
              await agent.send('You appear to be stalled. Please report your current status or continue working.');
              // Activity resets only when the agent responds (via MCP tool
              // calls or hub activity), NOT when we nudge it. Resetting
              // here would trap agents at tier 1 forever.
            } catch (err) { log.debug(`monitor: nudge failed for ${name}: ${err.message}`); }
          } else {
            log.warn(`agent ${name} stalled tier 1 (${Math.round(elapsed / 1000)}s) - nudge suppressed by config`);
          }
          self.emit('stall', { name, elapsed, severity: 'nudge' });
        }
      }
    }, Math.min(stallTimeout / 2, 60000));

    this._monitorHandle.unref();
  }
1215
+
1216
+ /**
1217
+ * Stop the stall detection monitor.
1218
+ */
1219
+ stopMonitor() {
1220
+ if (this._monitorHandle) {
1221
+ clearInterval(this._monitorHandle);
1222
+ this._monitorHandle = null;
1223
+ log.info('stall detection monitor stopped');
1224
+ }
1225
+ }
1226
+ }