loreli 0.0.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. package/LICENSE +1 -1
  2. package/README.md +710 -97
  3. package/bin/loreli.js +89 -0
  4. package/package.json +77 -14
  5. package/packages/README.md +101 -0
  6. package/packages/action/README.md +98 -0
  7. package/packages/action/prompts/action.md +172 -0
  8. package/packages/action/src/index.js +684 -0
  9. package/packages/agent/README.md +606 -0
  10. package/packages/agent/src/backends/claude.js +387 -0
  11. package/packages/agent/src/backends/codex.js +351 -0
  12. package/packages/agent/src/backends/cursor.js +371 -0
  13. package/packages/agent/src/backends/index.js +486 -0
  14. package/packages/agent/src/base.js +138 -0
  15. package/packages/agent/src/cli.js +275 -0
  16. package/packages/agent/src/discover.js +396 -0
  17. package/packages/agent/src/factory.js +124 -0
  18. package/packages/agent/src/index.js +12 -0
  19. package/packages/agent/src/models.js +159 -0
  20. package/packages/agent/src/output.js +62 -0
  21. package/packages/agent/src/session.js +162 -0
  22. package/packages/agent/src/trace.js +186 -0
  23. package/packages/classify/README.md +136 -0
  24. package/packages/classify/prompts/blocker.md +12 -0
  25. package/packages/classify/prompts/feedback.md +14 -0
  26. package/packages/classify/prompts/pane-state.md +20 -0
  27. package/packages/classify/src/index.js +81 -0
  28. package/packages/config/README.md +898 -0
  29. package/packages/config/src/defaults.js +145 -0
  30. package/packages/config/src/index.js +223 -0
  31. package/packages/config/src/schema.js +291 -0
  32. package/packages/config/src/validate.js +160 -0
  33. package/packages/context/README.md +165 -0
  34. package/packages/context/src/index.js +198 -0
  35. package/packages/hub/README.md +338 -0
  36. package/packages/hub/src/base.js +154 -0
  37. package/packages/hub/src/github.js +1597 -0
  38. package/packages/hub/src/index.js +79 -0
  39. package/packages/hub/src/labels.js +48 -0
  40. package/packages/identity/README.md +288 -0
  41. package/packages/identity/src/index.js +620 -0
  42. package/packages/identity/src/themes/avatar.js +217 -0
  43. package/packages/identity/src/themes/digimon.js +217 -0
  44. package/packages/identity/src/themes/dragonball.js +217 -0
  45. package/packages/identity/src/themes/lotr.js +217 -0
  46. package/packages/identity/src/themes/marvel.js +217 -0
  47. package/packages/identity/src/themes/pokemon.js +217 -0
  48. package/packages/identity/src/themes/starwars.js +217 -0
  49. package/packages/identity/src/themes/transformers.js +217 -0
  50. package/packages/identity/src/themes/zelda.js +217 -0
  51. package/packages/knowledge/README.md +217 -0
  52. package/packages/knowledge/src/index.js +243 -0
  53. package/packages/log/README.md +93 -0
  54. package/packages/log/src/index.js +252 -0
  55. package/packages/marker/README.md +200 -0
  56. package/packages/marker/src/index.js +184 -0
  57. package/packages/mcp/README.md +323 -0
  58. package/packages/mcp/instructions.md +126 -0
  59. package/packages/mcp/scaffolding/.agents/skills/loreli-context/SKILL.md +89 -0
  60. package/packages/mcp/scaffolding/ISSUE_TEMPLATE/config.yml +2 -0
  61. package/packages/mcp/scaffolding/ISSUE_TEMPLATE/loreli.yml +83 -0
  62. package/packages/mcp/scaffolding/loreli.yml +491 -0
  63. package/packages/mcp/scaffolding/mcp-configs/.codex/config.toml +4 -0
  64. package/packages/mcp/scaffolding/mcp-configs/.cursor/mcp.json +14 -0
  65. package/packages/mcp/scaffolding/mcp-configs/.mcp.json +14 -0
  66. package/packages/mcp/scaffolding/pull-request.md +23 -0
  67. package/packages/mcp/src/index.js +600 -0
  68. package/packages/mcp/src/tools/agent-context.js +44 -0
  69. package/packages/mcp/src/tools/agents.js +450 -0
  70. package/packages/mcp/src/tools/context.js +200 -0
  71. package/packages/mcp/src/tools/github.js +1163 -0
  72. package/packages/mcp/src/tools/hitl.js +162 -0
  73. package/packages/mcp/src/tools/index.js +18 -0
  74. package/packages/mcp/src/tools/refactor.js +227 -0
  75. package/packages/mcp/src/tools/repo.js +44 -0
  76. package/packages/mcp/src/tools/start.js +904 -0
  77. package/packages/mcp/src/tools/status.js +149 -0
  78. package/packages/mcp/src/tools/work.js +134 -0
  79. package/packages/orchestrator/README.md +192 -0
  80. package/packages/orchestrator/src/index.js +1492 -0
  81. package/packages/planner/README.md +251 -0
  82. package/packages/planner/prompts/plan-reviewer.md +109 -0
  83. package/packages/planner/prompts/planner.md +191 -0
  84. package/packages/planner/prompts/tiebreaker-reviewer.md +71 -0
  85. package/packages/planner/src/index.js +1381 -0
  86. package/packages/review/README.md +129 -0
  87. package/packages/review/prompts/reviewer.md +158 -0
  88. package/packages/review/src/index.js +1403 -0
  89. package/packages/risk/README.md +178 -0
  90. package/packages/risk/prompts/risk.md +272 -0
  91. package/packages/risk/src/index.js +439 -0
  92. package/packages/session/README.md +165 -0
  93. package/packages/session/src/index.js +215 -0
  94. package/packages/test-utils/README.md +96 -0
  95. package/packages/test-utils/src/index.js +354 -0
  96. package/packages/tmux/README.md +261 -0
  97. package/packages/tmux/src/index.js +501 -0
  98. package/packages/workflow/README.md +317 -0
  99. package/packages/workflow/prompts/preamble.md +14 -0
  100. package/packages/workflow/src/index.js +660 -0
  101. package/packages/workflow/src/proof-of-life.js +74 -0
  102. package/packages/workspace/README.md +143 -0
  103. package/packages/workspace/src/index.js +1127 -0
  104. package/index.js +0 -8
@@ -0,0 +1,1492 @@
1
+ import { rm, writeFile, mkdir } from 'node:fs/promises';
2
+ import { join } from 'node:path';
3
+ import { createHash } from 'node:crypto';
4
+ import { EventEmitter } from 'node:events';
5
+ import { Factory, Session, output } from 'loreli/agent';
6
+ import { Tmux } from 'loreli/tmux';
7
+ import { prepare } from 'loreli/workspace';
8
+ import { pick, side, capability } from 'loreli/identity';
9
+ import { classify } from 'loreli/classify';
10
+ import { logger } from 'loreli/log';
11
+
12
+ const log = logger('orchestrator');
13
+
14
/**
 * Fatal error patterns that indicate a backend infrastructure failure
 * (not a task failure). When these appear in an agent's pane output
 * shortly after spawn, the backend is broken and should be degraded.
 *
 * @type {RegExp[]}
 */
const FATAL_PATTERNS = [
  /budget[_ ]*(has been )?exceeded/i,
  /rate[_ ]?limit[_ ]*exceeded/i,
  /hit your usage[_ ]*limit/i,
  /authentication[_ ]*(error|failed)/i,
  /invalid[_ ]*api[_ ]*key/i,
  /quota[_ ]*exceeded/i,
  /insufficient[_ ]*quota/i,
  /invalid model name/i,
  /unable to connect to api/i,
  /connection\s*refused/i
];

/**
 * Maximum pane characters logged in diagnostic debug output.
 *
 * @type {number}
 */
const PANE_DEBUG_LIMIT = 4000;

/**
 * Scan pane output for any of the known fatal API error patterns.
 *
 * @param {string} output - Pane content from agent.capture().
 * @returns {boolean} True if a fatal error pattern is found.
 */
function hasFatalError(output) {
  if (!output) return false;
  for (const pattern of FATAL_PATTERNS) {
    if (pattern.test(output)) return true;
  }
  return false;
}

/**
 * Format captured pane output for debug logging.
 *
 * Keeps logs readable while still preserving enough context to validate
 * classifier and fallback decisions during stall/rapid-death diagnosis.
 *
 * @param {string} output - Raw pane output.
 * @returns {string} Pane text, truncated when necessary.
 */
function paneDebug(output) {
  if (!output) return '[empty pane output]';
  const overflow = output.length - PANE_DEBUG_LIMIT;
  if (overflow <= 0) return output;
  return `${output.slice(0, PANE_DEBUG_LIMIT)}\n… [truncated ${overflow} chars]`;
}
67
+
68
/**
 * Normalize remedy instructions into tmux key names.
 *
 * Classifier prompts return remedies as space-delimited strings
 * (`"Down Enter"`), while backend fallback diagnose methods return
 * string arrays (`['Down', 'Enter']`). The orchestrator accepts both.
 *
 * Fix: the parameter previously shadowed the function's own name
 * (`function remedy(remedy)`), and the `['Enter']` fallback was
 * duplicated in three branches. Behavior is unchanged.
 *
 * @param {string|string[]|null|undefined} input - Remedy from diagnosis.
 * @returns {string[]} Tmux key sequence; defaults to ['Enter'] when the
 *   input is empty, whitespace-only, or an unsupported type.
 */
function remedy(input) {
  let keys = [];
  if (Array.isArray(input)) {
    // Drop falsy entries (null/undefined/'') that diagnose may emit.
    keys = input.filter(Boolean);
  } else if (typeof input === 'string') {
    // Split on any whitespace run; filter removes empty fragments.
    keys = input.split(/\s+/).filter(Boolean);
  }
  return keys.length > 0 ? keys : ['Enter'];
}
91
+
92
+ /**
93
+ * Generic agent lifecycle coordinator via EventEmitter.
94
+ *
95
+ * Manages spawn/shutdown/kill, reactor polling, stall detection,
96
+ * and activity tracking. Contains zero role-specific logic — all
97
+ * planner/action/review behavior lives in the role packages that
98
+ * subscribe to lifecycle events.
99
+ *
100
+ * @extends EventEmitter
101
+ * @fires Orchestrator#spawned
102
+ * @fires Orchestrator#removed
103
+ * @fires Orchestrator#stall
104
+ */
105
+ export class Orchestrator extends EventEmitter {
106
+ /**
107
+ * @param {object} opts
108
+ * @param {object} opts.hub - Hub instance for git hosting operations.
109
+ * @param {object} opts.identityRegistry - Registry for agent identities.
110
+ * @param {object} opts.backendRegistry - Registry for agent backends.
111
+ * @param {object} opts.storage - Persistent storage instance.
112
+ * @param {object} [opts.config] - Config instance from loreli/config.
113
+ */
114
+ constructor({ hub, identityRegistry, backendRegistry, storage, config }) {
115
+ super();
116
+
117
+ /** @type {object} Hub for git hosting. */
118
+ this.hub = hub;
119
+
120
+ /** @type {object} Identity registry. */
121
+ this.identityRegistry = identityRegistry;
122
+
123
+ /** @type {object} Backend registry. */
124
+ this.backendRegistry = backendRegistry;
125
+
126
+ /** @type {Factory} Agent factory — centralizes the create+spawn pipeline. */
127
+ this.factory = new Factory({ backends: backendRegistry, identities: identityRegistry, config });
128
+
129
+ /** @type {object} Storage for session persistence. */
130
+ this.storage = storage;
131
+
132
+ /** @type {object|null} Config instance from loreli/config. */
133
+ this.cfg = config ?? null;
134
+
135
+ /** @type {Map<string, object>} Active agents by name. */
136
+ this.agents = new Map();
137
+
138
+ /** @type {string|null} Current session ID. */
139
+ this.sessionId = null;
140
+
141
+ /** @type {object|null} MCP client identity. */
142
+ this.clientIdentity = null;
143
+
144
+ /** @type {string|null} Target repository in "owner/name" format. */
145
+ this.repo = null;
146
+
147
+ /** @type {number} Stall timeout in ms. */
148
+ this.stallTimeout = this.cfg?.get?.('timeouts.stall') ?? 600000;
149
+
150
+ /** @type {number} Delay before checking if a freshly spawned agent died. */
151
+ this.rapidDeathDelay = this.cfg?.get?.('timeouts.rapidDeath') ?? 15000;
152
+
153
+ /** @type {NodeJS.Timeout|null} Stall detection interval handle. */
154
+ this._monitorHandle = null;
155
+
156
+ /** @type {Map<string, string>} Last known activity timestamp per agent. */
157
+ this._lastActivity = new Map();
158
+
159
+ /** @type {Map<string, string>} MD5 hash of last captured pane output per agent for tmux-based activity detection. */
160
+ this._lastPaneHash = new Map();
161
+
162
+ /** @type {Map<string, number>} Consecutive classify failures per agent — safety net kill after threshold. */
163
+ this._classifyFails = new Map();
164
+
165
+ /** @type {NodeJS.Timeout|null} Reactor polling interval handle. */
166
+ this._watchHandle = null;
167
+
168
+ /** @type {Map<string, function>} Registered reactor handlers by name. */
169
+ this._handlers = new Map();
170
+
171
+ /**
172
+ * Names already claimed by other participants, discovered from
173
+ * GitHub claim comments and PR branches during reactor ticks.
174
+ * Populated as a zero-cost side effect of data the reactor
175
+ * already fetches — no additional API calls needed.
176
+ *
177
+ * @type {Set<string>}
178
+ */
179
+ this.takenNames = new Set();
180
+
181
+ /**
182
+ * Agents we've removed locally (killed or shut down). Used to
183
+ * distinguish "we killed this agent" from "a foreign orchestrator
184
+ * owns this agent" during proof-of-life decisions.
185
+ *
186
+ * @type {Set<string>}
187
+ */
188
+ this._removed = new Set();
189
+
190
+ /**
191
+ * Registered workflows by role. Populated during start so
192
+ * scale() can collect demand signals from each workflow.
193
+ *
194
+ * @type {Map<string, object>}
195
+ */
196
+ this.workflows = new Map();
197
+
198
+ /**
199
+ * Last spawn timestamp per role for cooldown enforcement.
200
+ * Prevents thrashing when demand fluctuates between ticks.
201
+ *
202
+ * @type {Map<string, number>}
203
+ */
204
+ this._lastSpawn = new Map();
205
+ }
206
+
207
+ // ── Seed (Identity Discovery) ────────────────────────
208
+
209
+ /**
210
+ * One-time seed of `takenNames` from open PR branch names.
211
+ *
212
+ * Called before the first `acquire()` — before any reactor tick has
213
+ * populated the set. After the first tick, the reactor keeps the
214
+ * set current as a side effect of its normal data flow.
215
+ *
216
+ * Idempotent: only runs once per orchestrator lifetime.
217
+ *
218
+ * @returns {Promise<void>}
219
+ */
220
+ async seed() {
221
+ if (this._seeded) return;
222
+ this._seeded = true;
223
+
224
+ if (!this.hub || !this.repo) return;
225
+
226
+ try {
227
+ const prs = await this.hub.pulls(this.repo, { state: 'open' });
228
+ for (const pr of prs) {
229
+ const slash = pr.head?.indexOf('/');
230
+ if (slash > 0) this.takenNames.add(pr.head.slice(0, slash));
231
+ }
232
+ log.debug(`seed: discovered ${this.takenNames.size} taken names from open PRs`);
233
+ } catch (err) {
234
+ log.debug(`seed: skipped — ${err.message}`);
235
+ }
236
+ }
237
+
238
  // ── Lifecycle ─────────────────────────────────────────

  /**
   * Spawn and register an agent with comprehensive rollback.
   *
   * Tracks each step of the spawn process and undoes them in reverse
   * order on failure. Each completed step is tracked and unwound
   * individually on error.
   *
   * Also arms a one-shot "rapid death" timer that re-inspects the agent
   * after `rapidDeathDelay` ms and degrades the backend (or remediates
   * a stuck dialog) when the agent died or wedged right after spawn.
   *
   * @param {object} agent - Agent instance.
   * @returns {Promise<void>}
   * @fires Orchestrator#spawned
   */
  async spawn(agent) {
    log.info(`spawning agent: ${agent.identity.name} (${agent.role})`);

    /** @type {Array<{step: string, undo: function}>} Completed steps for rollback. */
    const completed = [];

    try {
      // Step 1: Spawn the agent process (tmux pane, API session, etc.)
      await agent.spawn();
      completed.push({
        step: 'spawn',
        undo: async function undoSpawn() {
          try { await agent.stop(); } catch (e) { log.debug(`rollback stop failed: ${e.message}`); }
        }
      });

      // Step 2: Register in the agents map
      this.agents.set(agent.identity.name, agent);
      completed.push({
        step: 'register',
        undo: () => { this.agents.delete(agent.identity.name); }
      });

      // Step 3: Record activity timestamp
      this._lastActivity.set(agent.identity.name, new Date().toISOString());
      completed.push({
        step: 'activity',
        undo: () => { this._lastActivity.delete(agent.identity.name); }
      });

      log.info(`agent spawned: ${agent.identity.name}`);

      // Schedule rapid-death detection: if the agent dies or shows
      // fatal API errors within rapidDeathDelay of spawning, the
      // backend is likely broken (budget exhaustion, API outage).
      // Mark it as degraded so scale() falls back to cursor-agent.
      //
      // Uses the pane-state classifier when pane output is available
      // (remain-on-exit keeps dead panes capturable). Falls back to
      // raw alive() when capture fails.
      //
      // The closure captures `self`/`name`/`backend` up front so the
      // check still works even if the agent is mutated later.
      if (agent.backend && agent.alive) {
        const backend = agent.backend;
        const name = agent.identity.name;
        const registry = this.backendRegistry;
        const self = this;
        const timer = setTimeout(async function rapidDeathCheck() {
          // Agent was deliberately put to sleep — not a death.
          if (agent.state === 'dormant') return;

          try {
            const alive = await agent.alive();

            // Agent is alive and healthy — no rapid death
            if (alive && !agent.capture) return;

            // NOTE(review): this local `output` shadows the `output`
            // import from 'loreli/agent' — rename candidate.
            let output;
            try {
              output = agent.capture
                ? await agent.capture(self.cfg?.get?.('classify.maxLines') ?? 100)
                : null;
            } catch { output = null; }
            if (output !== null) {
              log.debug(`rapid-death pane ${name} (${backend}, alive=${alive}):\n${paneDebug(output)}`);
            } else {
              log.debug(`rapid-death pane ${name} (${backend}, alive=${alive}): [capture unavailable]`);
            }

            // Classify the pane output to determine why the agent
            // died or what error it hit while still alive.
            let diagnosis;
            if (output) {
              try {
                diagnosis = await classify('pane-state', output, {
                  backends: self.backendRegistry,
                  config: self.cfg,
                  vars: { model: agent.model, backend, role: agent.role }
                });
                log.info(`rapid-death classify ${name}: ${diagnosis.category} — ${diagnosis.reasoning}`);
              } catch (err) {
                // Classifier failure is non-fatal — the regex fallback below covers it.
                log.warn(`rapid-death classify failed for ${name}: ${err.message}`);
              }
            }

            // When LLM classify fails, fall back to backend-specific
            // regex detection. Each backend knows its CLI's dialog patterns.
            // The fallback also OVERRIDES a non-actionable LLM category
            // when the regex found something actionable.
            let category = diagnosis?.category;
            if (alive && output) {
              const fallback = registry?.diagnose?.(backend, output);
              const actionable = new Set(['option_dialog', 'waiting_for_input', 'fatal', 'dead']);
              // Set.has(undefined) is false, so a missing fallback is safely non-actionable.
              const fallbackActionable = actionable.has(fallback?.category);
              const llmActionable = actionable.has(category);
              const llmCategory = category;

              if (!category && fallback) {
                category = fallback.category;
                diagnosis = fallback;
                log.info(`rapid-death fallback diagnose ${name}: ${category} — ${fallback.reasoning}`);
              } else if (fallbackActionable && !llmActionable) {
                category = fallback.category;
                diagnosis = fallback;
                log.info(`rapid-death fallback override ${name}: ${fallback.category} over ${llmCategory ?? 'unknown'} — ${fallback.reasoning}`);
              }
            }

            // Hard death: process gone within the grace window.
            if (!alive) {
              log.warn(`rapid death: ${name} died within ${self.rapidDeathDelay}ms of spawn (${category ?? 'unknown'}) — marking ${backend} degraded`);
              registry?.recordFailure(backend);
              try { await self.kill(name); } catch { /* already dead */ }
              self.emit('rapid-death', { name, backend, diagnosis });
              return;
            }

            // Alive with recoverable dialog — send the appropriate
            // input to dismiss it. Record a soft warning instead of a
            // hard failure so the backend isn't blacklisted for a
            // transient issue. Repeated warnings promote to failure.
            if (category === 'option_dialog') {
              const keys = remedy(diagnosis?.remedy);
              log.info(`rapid-death remediation: ${name} has option dialog — sending ${keys.join('+')}`);
              try {
                const tmux = new Tmux();
                await tmux.keys(agent.paneId, ...keys);
              } catch (err) { log.debug(`rapid-death: keys failed for ${name}: ${err.message}`); }
              registry?.recordWarning?.(backend);
              self.emit('rapid-death', { name, backend, reason: 'remediated', diagnosis });
              return;
            }

            // Alive but idle at a prompt — nudge it back to work.
            if (category === 'waiting_for_input') {
              log.info(`rapid-death remediation: ${name} waiting for input — sending continuation`);
              try {
                await agent.send('Please continue working or report your status.');
              } catch (err) { log.debug(`rapid-death: send failed for ${name}: ${err.message}`); }
              registry?.recordWarning?.(backend);
              self.emit('rapid-death', { name, backend, reason: 'remediated', diagnosis });
              return;
            }

            // Alive but classifier detected fatal state
            if (category === 'fatal' || category === 'dead') {
              log.warn(`stuck-alive: ${name} classified as ${category} — marking ${backend} degraded`);
              registry?.recordFailure(backend);
              try { await agent.stop(); } catch { /* stop can fail */ }
              self.emit('rapid-death', { name, backend, reason: 'stuck-alive', diagnosis });
              return;
            }

            // Alive but regex fallback for when classifier didn't detect fatal
            if (alive && output && hasFatalError(output)) {
              log.warn(`stuck-alive: ${name} shows fatal API error (regex) — marking ${backend} degraded`);
              registry?.recordFailure(backend);
              try { await agent.stop(); } catch { /* stop can fail */ }
              self.emit('rapid-death', { name, backend, reason: 'stuck-alive' });
            }
          } catch { /* pane check can fail when session is torn down */ }
        }, this.rapidDeathDelay);
        // unref so a pending check never keeps the process alive on exit.
        timer.unref();
      }

      /**
       * @event Orchestrator#spawned
       * @type {object}
       * @property {string} name - Agent identity name.
       * @property {string} role - Agent role.
       * @property {string} provider - AI provider.
       */
      this.emit('spawned', {
        name: agent.identity.name,
        role: agent.role,
        provider: agent.identity.provider
      });
    } catch (err) {
      log.error(`spawn failed: ${agent.identity.name} — ${err.message}`);

      // Rollback completed steps in reverse order
      for (let i = completed.length - 1; i >= 0; i--) {
        try {
          await completed[i].undo();
          log.debug(`rollback: undid "${completed[i].step}" for ${agent.identity.name}`);
        } catch (rollbackErr) {
          // Never mask the original error with a rollback failure
          log.warn(`rollback "${completed[i].step}" failed: ${rollbackErr.message}`);
        }
      }

      // Identity release is the caller's responsibility (add_agent tool
      // acquired it, add_agent tool releases it). Spawn only rolls back
      // the steps it owns: process, registration, and activity tracking.
      throw err;
    }
  }
441
+
442
  /**
   * Gracefully shut down an agent using a 3-phase protocol:
   *
   * 1. Send a structured shutdown request with a unique `requestId`
   * 2. Poll for acknowledgment — the agent can accept or continue working
   * 3. On acknowledgment or timeout, force stop and clean up
   *
   * @param {string} name - Agent identity name.
   * @param {number} [timeout] - Timeout before force kill.
   * @param {string} [reason] - Reason for shutdown (passed to agent).
   * @returns {Promise<{acknowledged: boolean}>} Whether the agent acknowledged.
   * @throws {Error} When no agent named `name` is registered.
   * @fires Orchestrator#removed
   */
  async shutdown(name, timeout, reason) {
    const agent = this.agents.get(name);
    if (!agent) throw new Error(`Agent "${name}" not found`);

    const shutdownTimeout = timeout ?? this.cfg?.get?.('timeouts.shutdown') ?? 60000;
    const pollInterval = this.cfg?.get?.('timeouts.poll') ?? 2000;
    const requestId = `shutdown-${Date.now()}@${name}`;

    log.info(`shutting down agent: ${name} (timeout: ${shutdownTimeout}ms, requestId: ${requestId})`);

    // Phase 1: Send structured shutdown request
    const shutdownMessage = [
      `**Shutdown Request** (id: ${requestId})`,
      reason ? `Reason: ${reason}` : '',
      'Please finish your current work and shut down gracefully.',
      'Post a comment or signal when ready.'
    ].filter(Boolean).join('\n');

    try {
      await agent.send(shutdownMessage);
    } catch (err) { log.debug(`shutdown message failed for ${name}: ${err.message}`); }

    // Phase 2: Poll for acknowledgment or timeout
    let acknowledged = false;
    const deadline = Date.now() + shutdownTimeout;

    while (Date.now() < deadline) {
      // Dormant state is explicit acknowledgment.
      if (agent.state === 'dormant') {
        acknowledged = true;
        break;
      }

      // Check if the agent is still alive — if it exited on its own,
      // treat that as implicit acknowledgment
      try {
        const alive = await agent.alive?.();
        if (alive === false) {
          acknowledged = true;
          break;
        }
      } catch (err) { log.debug(`alive check during shutdown polling failed: ${err.message}`); }

      await new Promise(function wait(r) { setTimeout(r, pollInterval); });
    }

    if (acknowledged) {
      log.info(`agent ${name} acknowledged shutdown`);
    } else {
      log.warn(`agent ${name} did not acknowledge shutdown within ${shutdownTimeout}ms, force stopping`);
    }

    // Phase 3: Force stop and clean up.
    // Capture the session name BEFORE stop() — presumably stop() can
    // clear it, and _pruneSession needs it below. TODO confirm.
    const session = agent.session;
    await this.snapshot(name, agent);
    await agent.stop();

    // Clean up the agent's workspace directory when configured
    if (this.cfg?.get?.('workspace.cleanup') && agent.cwd) {
      try {
        await rm(agent.cwd, { recursive: true, force: true });
        log.info(`workspace cleaned: ${name} (${agent.cwd})`);
      } catch (err) { log.debug(`workspace cleanup failed for ${name}: ${err.message}`); }
    }

    // NOTE(review): this teardown sequence duplicates kill() — a shared
    // private helper would keep the two paths from drifting.
    this.agents.delete(name);
    this._lastActivity.delete(name);
    this._lastPaneHash.delete(name);
    this._classifyFails.delete(name);
    this._removed.add(name);
    this.identityRegistry.release(agent.identity);
    log.info(`agent shut down: ${name}`);

    // Last agent out destroys the tmux session (orphan-pane safety net).
    if (this.agents.size === 0 && session) await this._pruneSession(session);

    /**
     * @event Orchestrator#removed
     * @type {object}
     * @property {string} name - Agent identity name.
     * @property {string} reason - 'shutdown' or 'killed'.
     * @property {object} agent - The removed agent instance.
     */
    this.emit('removed', { name, reason: 'shutdown', agent });

    return { acknowledged };
  }
540
+
541
  /**
   * Force kill an agent immediately.
   * Releases any claimed issues so they can be re-claimed.
   *
   * Unlike {@link shutdown}, this skips the request/acknowledge phases
   * and goes straight to snapshot + stop + cleanup.
   *
   * @param {string} name - Agent identity name.
   * @returns {Promise<void>}
   * @throws {Error} When no agent named `name` is registered.
   * @fires Orchestrator#removed
   */
  async kill(name) {
    const agent = this.agents.get(name);
    if (!agent) throw new Error(`Agent "${name}" not found`);

    log.warn(`force killing agent: ${name}`);
    // Capture the session name before stop() so _pruneSession below
    // still has it after teardown.
    const session = agent.session;
    await this.snapshot(name, agent);
    await agent.stop();

    // Clean up the agent's workspace directory when configured
    if (this.cfg?.get?.('workspace.cleanup') && agent.cwd) {
      try {
        await rm(agent.cwd, { recursive: true, force: true });
        log.info(`workspace cleaned: ${name} (${agent.cwd})`);
      } catch (err) { log.debug(`workspace cleanup failed for ${name}: ${err.message}`); }
    }

    // NOTE(review): duplicated with shutdown() phase 3 — extract a
    // shared helper if these ever need to change together.
    this.agents.delete(name);
    this._lastActivity.delete(name);
    this._lastPaneHash.delete(name);
    this._classifyFails.delete(name);
    this._removed.add(name);
    this.identityRegistry.release(agent.identity);

    // Last agent out destroys the tmux session (orphan-pane safety net).
    if (this.agents.size === 0 && session) await this._pruneSession(session);

    this.emit('removed', { name, reason: 'killed', agent });
  }
577
+
578
+ // ── Tmux Cleanup ─────────────────────────────────────
579
+
580
+ /**
581
+ * Resolve the tmux session name from registered agents or config.
582
+ *
583
+ * Agents carry their own session name (defaults to 'loreli' but
584
+ * overridable in tests). This avoids hardcoding the session name
585
+ * and keeps cleanup aligned with wherever agents actually live.
586
+ *
587
+ * @returns {string} The tmux session name.
588
+ */
589
+ _session() {
590
+ for (const agent of this.agents.values()) {
591
+ if (agent.session) return agent.session;
592
+ }
593
+ return this.cfg?.get?.('tmux.session') ?? 'loreli';
594
+ }
595
+
596
  /**
   * Destroy the tmux session when no agents remain.
   *
   * Safety net that runs after the last agent is removed via
   * {@link kill} or {@link shutdown}. Individual agent `stop()` calls
   * kill their own panes, but orphaned panes (from `remain-on-exit`,
   * crashed backends, or timing gaps) can keep the session alive.
   * This ensures a clean slate.
   *
   * Best-effort: all tmux failures are swallowed and logged at debug.
   *
   * @param {string} session - The tmux session name to prune.
   * @returns {Promise<void>}
   */
  async _pruneSession(session) {
    // No tmux binary on this host — nothing to prune.
    if (!Tmux.available()) return;
    const tmux = new Tmux();

    try {
      if (await tmux.has(session)) {
        // Count panes first, purely for the log message below.
        let paneCount = 0;
        try {
          const panes = await tmux.allPanes(session);
          paneCount = panes.length;
        } catch { /* session may be in a bad state */ }
        await tmux.kill(session);
        log.info(`pruned tmux session "${session}" (${paneCount} orphaned panes) — no agents remain`);
      }
    } catch (err) {
      log.debug(`session prune failed: ${err.message}`);
    }
  }
626
+
627
+ /**
628
+ * Garbage-collect orphaned tmux panes not tracked by any agent.
629
+ *
630
+ * Lists all panes in the loreli tmux session and kills any that
631
+ * do not belong to a registered agent. Safe to call at any time —
632
+ * tracked agent panes are preserved.
633
+ *
634
+ * When all panes are orphaned, the session is destroyed entirely.
635
+ *
636
+ * @returns {Promise<{killed: number}>} Count of orphaned panes killed.
637
+ */
638
+ async gc() {
639
+ if (!Tmux.available()) return { killed: 0 };
640
+ const session = this._session();
641
+ const tmux = new Tmux();
642
+
643
+ if (!await tmux.has(session)) return { killed: 0 };
644
+
645
+ const tracked = new Set();
646
+ for (const agent of this.agents.values()) {
647
+ if (agent.paneId) tracked.add(agent.paneId);
648
+ }
649
+
650
+ let all;
651
+ try {
652
+ all = await tmux.allPanes(session);
653
+ } catch {
654
+ return { killed: 0 };
655
+ }
656
+
657
+ let killed = 0;
658
+ for (const pane of all) {
659
+ if (!tracked.has(pane.id)) {
660
+ try {
661
+ await tmux.killPane(pane.id);
662
+ killed++;
663
+ log.info(`gc: killed orphaned pane ${pane.id}`);
664
+ } catch { /* pane may have died between list and kill */ }
665
+ }
666
+ }
667
+
668
+ // Destroy the session when every pane was orphaned
669
+ if (killed > 0 && killed === all.length) {
670
+ try {
671
+ if (await tmux.has(session)) await tmux.kill(session);
672
+ } catch { /* session may auto-destroy */ }
673
+ }
674
+
675
+ return { killed };
676
+ }
677
+
678
+ // ── Coordination ──────────────────────────────────────
679
+
680
+ /**
681
+ * Auto-spawn an agent for a given provider and role when one is not
682
+ * already available. Cross-provider review is the core value proposition;
683
+ * silently skipping it defeats the purpose, so the orchestrator
684
+ * proactively enlists the opposing side.
685
+ *
686
+ * Delegates to {@link Factory#spawn} for the full creation pipeline.
687
+ *
688
+ * @param {string} provider - AI provider to spawn for.
689
+ * @param {string} role - Agent role ('reviewer', 'action', etc.).
690
+ * @param {object} [opts] - Additional options.
691
+ * @returns {Promise<object>} The spawned agent instance.
692
+ */
693
+ async enlist(provider, role, opts = {}) {
694
+ log.info(`enlisting ${role} agent for ${provider} — cross-provider pairing requires it`);
695
+
696
+ // Seed taken names before first acquire — one-time cost, zero
697
+ // ongoing overhead because reactor ticks keep the set current.
698
+ await this.seed();
699
+
700
+ // Build context for the factory so prepare() writes session env vars.
701
+ // Include home and token so agent subprocesses use the same storage
702
+ // location and can create a hub for stamped GitHub operations.
703
+ const context = this.sessionId ? {
704
+ session: this.sessionId,
705
+ agent: null, // set after identity is acquired inside factory
706
+ repo: this.repo,
707
+ home: this.storage?.home,
708
+ token: process.env.GITHUB_TOKEN
709
+ } : undefined;
710
+
711
+ // Theme coherence: inherit from an existing agent so antagonist
712
+ // pairs always share the same theme universe. Only pick from
713
+ // config when no agents exist yet (first enlistment).
714
+ const existing = [...this.agents.values()].find(function hasTheme(a) { return a.identity?.theme; });
715
+ const theme = existing?.identity?.theme ?? pick(this.cfg?.get?.('theme'));
716
+
717
+ const agent = await this.factory.create(provider, role, {
718
+ theme,
719
+ model: this.cfg?.get?.(`workflows.${role}.model`) ?? this.cfg?.get?.('model'),
720
+ config: this.cfg,
721
+ context,
722
+ taken: this.takenNames,
723
+ ...opts
724
+ });
725
+
726
+ // Persist session data BEFORE spawn so the agent's MCP server
727
+ // subprocess can hydrate from storage on startup. Without this,
728
+ // the agent's _hydrate() call races against the host's save.
729
+ if (this.sessionId && this.storage && agent.identity?.name) {
730
+ const session = new Session({
731
+ identity: agent.identity.toJSON?.() ?? agent.identity,
732
+ role,
733
+ backend: agent.constructor.name,
734
+ paneId: null, // not yet known
735
+ repo: this.repo
736
+ });
737
+ await this.storage.save(this.sessionId, agent.identity.name, session.toJSON());
738
+ }
739
+
740
+ try {
741
+ await this.spawn(agent);
742
+ } catch (err) {
743
+ if (this.sessionId && this.storage && agent.identity?.name) {
744
+ try { await this.storage.remove(this.sessionId, agent.identity.name); } catch { /* best-effort */ }
745
+ }
746
+ throw err;
747
+ }
748
+
749
+ if (this.sessionId && this.storage && agent.identity?.name && agent.paneId) {
750
+ const data = await this.storage.load(this.sessionId, agent.identity.name);
751
+ if (data) {
752
+ data.paneId = agent.paneId;
753
+ await this.storage.save(this.sessionId, agent.identity.name, data);
754
+ }
755
+ }
756
+
757
+ return agent;
758
+ }
759
+
760
+ /**
761
+ * Record a heartbeat timestamp for an agent. Resets the stall timer.
762
+ *
763
+ * @param {string} name - Agent identity name.
764
+ */
765
+ activity(name) {
766
+ this._lastActivity.set(name, new Date().toISOString());
767
+ }
768
+
769
+ /**
770
+ * Check whether an agent's tmux pane has new output since the last
771
+ * check. When output changes, the agent is provably active — update
772
+ * `_lastActivity` and return `true`. This is the ground truth signal
773
+ * that feeds into `health()` and, transitively, the proof-of-life
774
+ * responder.
775
+ *
776
+ * @param {string} name - Agent identity name.
777
+ * @returns {Promise<boolean>} True when pane output changed (agent is active).
778
+ */
779
+ async refresh(name) {
780
+ const agent = this.agents.get(name);
781
+ if (!agent?.capture) return false;
782
+
783
+ try {
784
+ const output = await agent.capture(50);
785
+ const digest = createHash('md5').update(output ?? '').digest('hex');
786
+ const prev = this._lastPaneHash.get(name);
787
+ this._lastPaneHash.set(name, digest);
788
+ if (prev && prev !== digest) {
789
+ if (hasFatalError(output)) return false;
790
+ this._lastActivity.set(name, new Date().toISOString());
791
+ return true;
792
+ }
793
+ return false;
794
+ } catch { return false; }
795
+ }
796
+
797
+ /**
798
+ * Update the pane hash after an orchestrator-initiated interaction.
799
+ *
800
+ * Must be called after any action that changes the pane content
801
+ * (sending keys, messages, etc.) to prevent `refresh()` from
802
+ * misinterpreting the orchestrator's own output as agent activity
803
+ * on the next monitor cycle. The orchestrator also resets the
804
+ * stall timer here because a remediation attempt should buy the
805
+ * agent time to react before another nudge is sent.
806
+ *
807
+ * @param {string} name - Agent identity name.
808
+ * @param {object} agent - Agent instance with a `capture()` method.
809
+ * @returns {Promise<void>}
810
+ */
811
+ async _rehash(name, agent) {
812
+ try {
813
+ const content = await agent.capture(50);
814
+ const digest = createHash('md5').update(content ?? '').digest('hex');
815
+ this._lastPaneHash.set(name, digest);
816
+ this._lastActivity.set(name, new Date().toISOString());
817
+ } catch { /* capture can fail if pane died */ }
818
+ }
819
+
820
+ // ── Death Snapshot ──────────────────────────────────
821
+
822
+ /**
823
+ * Capture a dying agent's pane output and write it to the session
824
+ * logs directory as `<name>.death.log`. Requires `remain-on-exit`
825
+ * on the pane so output survives after the process exits.
826
+ *
827
+ * Non-fatal: silently skips when session or storage is unavailable,
828
+ * and logs a warning when capture or write fails.
829
+ *
830
+ * @param {string} name - Agent identity name.
831
+ * @param {object} agent - Agent instance with a `capture()` method.
832
+ * @returns {Promise<void>}
833
+ */
834
+ async snapshot(name, agent) {
835
+ if (!this.sessionId || !this.storage?.home) return;
836
+ try {
837
+ const raw = await agent.capture();
838
+ const cleaned = output.clean(raw);
839
+ const dir = join(this.storage.home, 'sessions', this.sessionId, 'logs');
840
+ await mkdir(dir, { recursive: true });
841
+ await writeFile(join(dir, `${name}.death.log`), cleaned, 'utf8');
842
+ log.info(`death snapshot written: ${name}`);
843
+ } catch (err) {
844
+ log.warn(`death snapshot failed for ${name}: ${err.message}`);
845
+ }
846
+ }
847
+
848
+ // ── Reconcile (Liveness Sweep) ───────────────────────
849
+
850
+ /**
851
+ * Synchronize registry state with actual tmux pane liveness.
852
+ *
853
+ * Iterates all registered agents and checks whether their underlying
854
+ * process is still alive. Dead agents are stopped, removed from the
855
+ * registry, and their identities released — closing the gap where
856
+ * team_status reports agents as "working" after their processes have
857
+ * exited.
858
+ *
859
+ * Dormant agents are skipped because they are already in a terminal
860
+ * state and are handled by the stall monitor's dormant cleanup.
861
+ *
862
+ * @returns {Promise<string[]>} Names of agents that were reconciled.
863
+ * @fires Orchestrator#removed
864
+ */
865
+ async reconcile() {
866
+ const reconciled = [];
867
+ const entries = [...this.agents.entries()];
868
+
869
+ for (const [name, agent] of entries) {
870
+ if (agent.state === 'dormant') continue;
871
+
872
+ let alive;
873
+ try {
874
+ alive = await agent.alive();
875
+ } catch (err) {
876
+ log.warn(`reconcile: alive() threw for ${name}: ${err.message} — treating as dead`);
877
+ alive = false;
878
+ }
879
+
880
+ if (alive) continue;
881
+
882
+ log.warn(`reconcile: ${name} pane is dead (state was "${agent.state}") — removing`);
883
+
884
+ await this.snapshot(name, agent);
885
+ try { await agent.stop(); } catch { /* pane already dead */ }
886
+
887
+ this.agents.delete(name);
888
+ this._lastActivity.delete(name);
889
+ this._lastPaneHash.delete(name);
890
+ this._classifyFails.delete(name);
891
+ this._removed.add(name);
892
+ this.identityRegistry.release(agent.identity);
893
+
894
+ this.emit('removed', { name, reason: 'reconciled', agent });
895
+ reconciled.push(name);
896
+ }
897
+
898
+ return reconciled;
899
+ }
900
+
901
+ // ── Health ───────────────────────────────────────────
902
+
903
+ /**
904
+ * Multi-signal health assessment for a named agent.
905
+ *
906
+ * Evaluates: tmux/process liveness, agent state machine, activity
907
+ * recency (last orchestrator interaction), and captured output length.
908
+ *
909
+ * @param {string} name - Agent identity name.
910
+ * @returns {Promise<{alive: boolean, status: string, details: string, outputLength?: number}>}
911
+ */
912
+ async health(name) {
913
+ const agent = this.agents.get(name);
914
+ if (!agent) return { alive: false, status: 'not-found', details: `agent ${name} not registered` };
915
+
916
+ if (agent.state === 'dormant')
917
+ return { alive: false, status: 'unhealthy', details: `agent ${name} is dormant` };
918
+
919
+ const paneAlive = await agent.alive();
920
+ if (!paneAlive)
921
+ return { alive: false, status: 'unhealthy', details: `agent ${name} pane is dead` };
922
+
923
+ const output = await agent.capture().catch(function noop() { return ''; });
924
+ const outputLength = output.length;
925
+
926
+ // Local proof-of-life: check tmux pane for real activity before
927
+ // declaring staleness. Agent-side MCP tool calls don't update
928
+ // _lastActivity, but they DO produce terminal output.
929
+ await this.refresh(name);
930
+
931
+ const lastTs = this._lastActivity.get(name);
932
+ if (lastTs) {
933
+ const elapsed = Date.now() - new Date(lastTs).getTime();
934
+ if (elapsed > this.stallTimeout)
935
+ return { alive: true, status: 'unhealthy', details: `agent ${name} activity is stale (${Math.round(elapsed / 1000)}s)`, outputLength };
936
+ }
937
+
938
+ return { alive: true, status: 'healthy', details: `agent ${name} is active`, outputLength };
939
+ }
940
+
941
+ // ── Scaling ──────────────────────────────────────────
942
+
943
+ /**
944
+ * Demand-driven scaling: collect demand signals from all registered
945
+ * workflows, then spawn agents to fill deficits — respecting global
946
+ * caps, per-role caps, rate limits, and cooldowns.
947
+ *
948
+ * Runs after all workflow handlers in the reactor chain so demand
949
+ * signals reflect the latest hydrated state. Spawns are capped by
950
+ * `maxPerTick` to avoid resource spikes.
951
+ *
952
+ * Priority order: reviewer > risk > action > planner. Reviewers
953
+ * unblock merges, risk unblocks reviewers, so they are filled first
954
+ * when at the global cap.
955
+ *
956
+ * @param {string} repo - Repository in "owner/name" format.
957
+ * @returns {Promise<Array<{role: string, agent: string}>>} Spawned agents.
958
+ */
959
+ async scale(repo) {
960
+ if (!this.workflows.size) return [];
961
+
962
+ const maxAgents = this.cfg?.get?.('scaling.maxAgents') ?? 8;
963
+ const maxPerTick = this.cfg?.get?.('scaling.maxPerTick') ?? 2;
964
+ const cooldown = this.cfg?.get?.('scaling.cooldown') ?? 30000;
965
+
966
+ // Collect demand signals from each workflow.
967
+ // Use the workflow's static `role` for enlist() — the map key may
968
+ // differ (e.g. map key 'review' vs static role 'reviewer'). The
969
+ // static role is what agents, demand(), and pair() filter on.
970
+ const signals = [];
971
+ for (const [key, workflow] of this.workflows) {
972
+ const role = workflow.constructor.role ?? key;
973
+ try {
974
+ const signal = await workflow.demand(repo);
975
+ signals.push({ role, ...signal });
976
+ log.debug(`scale: ${role} { workload: ${signal.workload}, supply: ${signal.supply}, deficit: ${signal.deficit} }`);
977
+ } catch (err) {
978
+ log.warn(`scale: demand() failed for ${role}: ${err.message}`);
979
+ }
980
+ }
981
+
982
+ // Global cap — only count live agents. Dormant agents stay in
983
+ // the map after exit but consume no resources.
984
+ const live = [...this.agents.values()]
985
+ .filter(function alive(a) { return a.state !== 'dormant'; }).length;
986
+ if (live >= maxAgents) {
987
+ log.debug(`scale: at global cap (${live}/${maxAgents}) — skipping`);
988
+ return [];
989
+ }
990
+
991
+ // Sort by priority: reviewer > risk > action > planner
992
+ const priority = { reviewer: 0, risk: 1, action: 2, planner: 3 };
993
+ signals.sort(function byPriority(a, b) {
994
+ return (priority[a.role] ?? 99) - (priority[b.role] ?? 99);
995
+ });
996
+
997
+ const spawned = [];
998
+ let budget = maxPerTick;
999
+ const now = Date.now();
1000
+
1001
+ for (const signal of signals) {
1002
+ if (budget <= 0) break;
1003
+ if (signal.deficit <= 0) continue;
1004
+
1005
+ const { role } = signal;
1006
+ const roleCap = this.cfg?.get?.(`workflows.${role}.maxAgents`) ?? Infinity;
1007
+ const current = [...this.agents.values()]
1008
+ .filter(function liveRole(a) { return a.role === role && a.state !== 'dormant'; }).length;
1009
+
1010
+ // Per-role cap
1011
+ if (current >= roleCap) {
1012
+ log.debug(`scale: ${role} at role cap (${current}/${roleCap}) — skipping`);
1013
+ continue;
1014
+ }
1015
+
1016
+ // Cooldown check
1017
+ const last = this._lastSpawn.get(role);
1018
+ if (last && now - last < cooldown) {
1019
+ log.debug(`scale: ${role} in cooldown (${Math.round((now - last) / 1000)}s < ${Math.round(cooldown / 1000)}s) — skipping`);
1020
+ continue;
1021
+ }
1022
+
1023
+ // How many to spawn: min of deficit, role headroom, global headroom, tick budget
1024
+ const roleHeadroom = roleCap - current;
1025
+ const globalHeadroom = maxAgents - live;
1026
+ const count = Math.min(signal.deficit, roleHeadroom, globalHeadroom, budget);
1027
+
1028
+ if (count <= 0) continue;
1029
+
1030
+ await this.backendRegistry.discover();
1031
+
1032
+ for (let i = 0; i < count; i++) {
1033
+ try {
1034
+ const providers = this.backendRegistry.providers();
1035
+ const info = capability(providers);
1036
+ let provider;
1037
+
1038
+ // Reviewer and risk agents must oppose the action agents they
1039
+ // pair with — pair() finds the cross-provider match, so
1040
+ // spawning on the same side means scan()/assess() can never
1041
+ // dispatch them. Pick the opposite of existing action agents.
1042
+ //
1043
+ // When no live action agent exists (dead/foreign), fall back
1044
+ // to PR label metadata via demand().actionProviders so the
1045
+ // correct opposing side is still selected.
1046
+ let actionProvider = (role === 'reviewer' || role === 'risk')
1047
+ ? [...this.agents.values()]
1048
+ .find(function hasProvider(a) { return a.role === 'action' && a.identity?.provider; })
1049
+ ?.identity?.provider
1050
+ : null;
1051
+
1052
+ if (!actionProvider && signal.actionProviders?.length) {
1053
+ actionProvider = signal.actionProviders[0];
1054
+ }
1055
+
1056
+ if (actionProvider) {
1057
+ const opposite = side(actionProvider) === 'yin' ? 'yang' : 'yin';
1058
+ provider = providers.find(function isOpp(p) { return side(p) === opposite; });
1059
+ if (!provider) {
1060
+ if (info.mode === 'single') {
1061
+ provider = providers[0] ?? null;
1062
+ if (!provider) {
1063
+ log.warn(`scale: no providers available for ${role} — skipping`);
1064
+ break;
1065
+ }
1066
+ log.info(`scale: no ${opposite}-side provider available for ${role} — using single-side fallback (${provider})`);
1067
+ } else {
1068
+ log.warn(`scale: no ${opposite}-side provider available for ${role} — skipping`);
1069
+ break;
1070
+ }
1071
+ }
1072
+ } else if (role === 'reviewer' || role === 'risk') {
1073
+ provider = providers[0];
1074
+ } else {
1075
+ // Action / planner: balance yin/yang within their own role
1076
+ const yinCount = [...this.agents.values()]
1077
+ .filter(function isRole(a) { return a.role === role && side(a.identity?.provider) === 'yin'; }).length;
1078
+ const yangCount = [...this.agents.values()]
1079
+ .filter(function isRole(a) { return a.role === role && side(a.identity?.provider) === 'yang'; }).length;
1080
+
1081
+ if (yinCount <= yangCount) {
1082
+ provider = providers.find(function isYin(p) { return side(p) === 'yin'; }) ?? providers[0];
1083
+ } else {
1084
+ provider = providers.find(function isYang(p) { return side(p) === 'yang'; }) ?? providers[0];
1085
+ }
1086
+ }
1087
+ // Final defensive default for non-pairing roles and mixed
1088
+ // provider sets: if discovery returned providers but side
1089
+ // selection yielded none, use the first discovered provider.
1090
+ if (!provider && providers.length) provider = providers[0];
1091
+
1092
+ if (!provider) {
1093
+ log.warn(`scale: no provider resolved for ${role} — skipping`);
1094
+ break;
1095
+ }
1096
+
1097
+ // Backend degradation fallback: when the native backend for
1098
+ // the selected provider is degraded (repeated rapid failures
1099
+ // like budget exhaustion), switch to the cursor-* virtual
1100
+ // provider. Same yin/yang side, different API path.
1101
+ if (!provider.startsWith('cursor-') && this.backendRegistry.degraded) {
1102
+ let nativeBackend = null;
1103
+ for (const info of this.backendRegistry.discovered.values()) {
1104
+ if (info.provider === provider) { nativeBackend = info.name; break; }
1105
+ }
1106
+ if (nativeBackend && this.backendRegistry.degraded(nativeBackend)) {
1107
+ const variant = `cursor-${provider}`;
1108
+ if (providers.includes(variant)) {
1109
+ log.info(`scale: ${nativeBackend} degraded, falling back to ${variant}`);
1110
+ provider = variant;
1111
+ }
1112
+ }
1113
+ }
1114
+
1115
+ const agent = await this.enlist(provider, role);
1116
+ spawned.push({ role, agent: agent.identity.name });
1117
+ log.info(`scale: spawned ${agent.identity.name} as ${role} (${provider})`);
1118
+ } catch (err) {
1119
+ log.warn(`scale: failed to spawn ${role} agent: ${err.message}`);
1120
+ break;
1121
+ }
1122
+ }
1123
+
1124
+ if (spawned.length) {
1125
+ this._lastSpawn.set(role, now);
1126
+ budget -= count;
1127
+ }
1128
+ }
1129
+
1130
+ if (spawned.length) {
1131
+ log.info(`scale: spawned ${spawned.length} agents — ${spawned.map(function fmt(s) { return `${s.role}:${s.agent}`; }).join(', ')}`);
1132
+ }
1133
+
1134
+ return spawned;
1135
+ }
1136
+
1137
+ // ── Reap (Global Safety Net) ─────────────────────────
1138
+
1139
+ /**
1140
+ * Global safety net — stop dormant agents when no work remains.
1141
+ *
1142
+ * Role-specific reaping is handled by each workflow's `reap()` method
1143
+ * (planner-reap, review-reap, action-reap). This global reap runs
1144
+ * last in the reactor chain and catches anything the workflow-level
1145
+ * reaps missed.
1146
+ *
1147
+ * Only **dormant** agents are eligible for global reaping. Agents in
1148
+ * any other state (spawned, working, reviewing, etc.) are actively
1149
+ * doing something and must not be interrupted. Stall detection is
1150
+ * the separate mechanism that handles truly stuck agents.
1151
+ *
1152
+ * All of these conditions must be true before reaping:
1153
+ * 1. No open issues with the `loreli` label
1154
+ * 2. No open pull requests (PRs in flight = work not done)
1155
+ * 3. Every remaining agent is dormant
1156
+ *
1157
+ * Registered as the last reactor handler so it runs after all
1158
+ * workflow-specific reaps have completed.
1159
+ *
1160
+ * @param {string} repo - Repository in "owner/name" format.
1161
+ * @returns {Promise<void>}
1162
+ */
1163
+ async reap(repo) {
1164
+ if (!this.hub || !this.agents.size) return;
1165
+
1166
+ try {
1167
+ // Only dormant agents are candidates — anything else is active work
1168
+ const all = [...this.agents.values()];
1169
+ const dormant = all.filter(function idle(a) { return a.state === 'dormant'; });
1170
+ if (dormant.length === 0) {
1171
+ log.debug(`reap: ${all.length} agents still active — skipping`);
1172
+ return;
1173
+ }
1174
+
1175
+ const open = await this.hub.issues(repo, { state: 'open' });
1176
+ const loreli = open.filter(function tagged(i) {
1177
+ return i.labels?.some?.(function isLoreli(l) {
1178
+ const name = typeof l === 'string' ? l : l.name;
1179
+ return name === 'loreli';
1180
+ });
1181
+ });
1182
+
1183
+ if (loreli.length > 0) return;
1184
+
1185
+ // Check for open PRs — work is still in flight if PRs exist
1186
+ const prs = await this.hub.pulls(repo, { state: 'open' });
1187
+ if (prs.length > 0) return;
1188
+
1189
+ log.info(`reap: no open loreli issues or PRs — stopping ${dormant.length} dormant agents`);
1190
+
1191
+ for (const agent of dormant) {
1192
+ try {
1193
+ await this.kill(agent.identity.name);
1194
+ log.info(`reap: stopped ${agent.identity.name}`);
1195
+ } catch (err) {
1196
+ log.warn(`reap: failed to stop ${agent.identity.name}: ${err.message}`);
1197
+ }
1198
+ }
1199
+ } catch (err) {
1200
+ log.debug(`reap: skipped — ${err.message}`);
1201
+ }
1202
+ }
1203
+
1204
+ // ── Reactor (Polling Loop) ────────────────────────────
1205
+
1206
+ /**
1207
+ * Register a reactor handler called on every tick.
1208
+ * Role packages register their scan/forward/land handlers here.
1209
+ *
1210
+ * @param {string} name - Handler name (for logging/debugging).
1211
+ * @param {function(string): Promise<void>} handler - Async function receiving repo.
1212
+ */
1213
+ register(name, handler) {
1214
+ this._handlers.set(name, handler);
1215
+ log.info(`reactor handler registered: ${name}`);
1216
+ }
1217
+
1218
+ /**
1219
+ * Execute one polling iteration: call all registered handlers.
1220
+ * Each handler receives the repo string. Errors in one handler
1221
+ * do not prevent subsequent handlers from running.
1222
+ *
1223
+ * @param {string} repo - Repository in "owner/name" format.
1224
+ * @returns {Promise<void>}
1225
+ */
1226
+ async tick(repo) {
1227
+ log.debug('tick start');
1228
+
1229
+ for (const [name, handler] of this._handlers) {
1230
+ try {
1231
+ await handler(repo);
1232
+ } catch (err) {
1233
+ log.error(`reactor handler "${name}" failed: ${err.message}`);
1234
+ }
1235
+ }
1236
+
1237
+ log.debug('tick end');
1238
+ }
1239
+
1240
+ /**
1241
+ * Start the polling reactor loop using a self-scheduling setTimeout
1242
+ * chain. Each tick runs to completion before the next is scheduled,
1243
+ * eliminating overlap risk without a reentrant guard. This pattern
1244
+ * is more reliable than setInterval for async callbacks — setInterval
1245
+ * does not await its callback, and combined with unref() it can
1246
+ * silently stop firing after heavy async operations (observed in
1247
+ * production after agent spawn via tmux).
1248
+ *
1249
+ * @param {string} repo - Repository in "owner/name" format.
1250
+ */
1251
+ watch(repo) {
1252
+ if (this._watchHandle) return;
1253
+ this.repo = repo;
1254
+
1255
+ const interval = this.cfg?.get?.('watch.interval') ?? 60000;
1256
+ const self = this;
1257
+
1258
+ log.info(`watcher started for ${repo} (interval: ${interval}ms)`);
1259
+
1260
+ function schedule() {
1261
+ self._watchHandle = setTimeout(async function cycle() {
1262
+ await self.tick(repo);
1263
+ if (self._watchHandle) schedule();
1264
+ }, interval);
1265
+
1266
+ self._watchHandle.unref();
1267
+ }
1268
+
1269
+ schedule();
1270
+ }
1271
+
1272
+ /**
1273
+ * Stop the polling reactor loop.
1274
+ */
1275
+ unwatch() {
1276
+ if (this._watchHandle) {
1277
+ clearTimeout(this._watchHandle);
1278
+ this._watchHandle = null;
1279
+ log.info('watcher stopped');
1280
+ }
1281
+ }
1282
+
1283
+ // ── Monitor (Stall Detection) ─────────────────────────
1284
+
1285
+ /**
1286
+ * Start the stall detection monitor with LLM-powered classification.
1287
+ *
1288
+ * When an agent's pane output has not changed for longer than the
1289
+ * stall timeout, the monitor captures the pane content, classifies
1290
+ * it via `loreli/classify`, and dispatches the appropriate action:
1291
+ *
1292
+ * - `working` — reset activity timer, leave the agent alone
1293
+ * - `waiting_for_input` — send a continuation prompt
1294
+ * - `option_dialog` — send the appropriate keystroke (Enter)
1295
+ * - `error_loop` — emit 'stall' with diagnostic context
1296
+ * - `idle` — transition the agent to dormant
1297
+ * - `fatal` — kill the agent and mark the backend degraded
1298
+ *
1299
+ * Falls back to regex heuristics when no LLM backend is available.
1300
+ * Consecutive classification failures trigger a safety-net kill
1301
+ * (replaces the old tier 3 fixed-time kill).
1302
+ *
1303
+ * @fires Orchestrator#stall
1304
+ */
1305
+ monitor() {
1306
+ if (this._monitorHandle) return;
1307
+ log.info('stall detection monitor started');
1308
+
1309
+ const stallTimeout = this.stallTimeout;
1310
+ const maxClassifyFails = this.cfg?.get?.('classify.maxRetries') ?? 5;
1311
+ const self = this;
1312
+
1313
+ /** @type {boolean} Re-entrancy guard for the monitor callback. */
1314
+ this._monitoring = false;
1315
+
1316
+ this._monitorHandle = setInterval(async function checkStalls() {
1317
+ if (self._monitoring) return;
1318
+ self._monitoring = true;
1319
+
1320
+ try {
1321
+ await self.reconcile();
1322
+
1323
+ const now = Date.now();
1324
+ const snapshot = [...self.agents.entries()];
1325
+
1326
+ for (const [name, agent] of snapshot) {
1327
+ if (agent.state === 'dormant') {
1328
+ const last = self._lastActivity.get(name);
1329
+ if (!last) continue;
1330
+ const elapsed = now - new Date(last).getTime();
1331
+ if (elapsed > stallTimeout * 3) {
1332
+ self.agents.delete(name);
1333
+ self._lastActivity.delete(name);
1334
+ self._lastPaneHash.delete(name);
1335
+ self._classifyFails.delete(name);
1336
+ log.info(`stall: cleaned up dormant agent ${name}`);
1337
+ }
1338
+ continue;
1339
+ }
1340
+
1341
+ const last = self._lastActivity.get(name);
1342
+ if (!last) continue;
1343
+
1344
+ if (await self.refresh(name)) {
1345
+ self._classifyFails.set(name, 0);
1346
+ continue;
1347
+ }
1348
+
1349
+ const elapsed = now - new Date(last).getTime();
1350
+ if (elapsed <= stallTimeout) continue;
1351
+
1352
+ // Stall detected — classify the pane content
1353
+ const maxLines = self.cfg?.get?.('classify.maxLines') ?? 100;
1354
+ let result;
1355
+
1356
+ try {
1357
+ const pane = await agent.capture(maxLines);
1358
+ log.debug(`monitor pane ${name} (${agent.backend}, stale=${Math.round(elapsed / 1000)}s):\n${paneDebug(pane)}`);
1359
+ result = await classify('pane-state', pane, {
1360
+ backends: self.backendRegistry,
1361
+ config: self.cfg,
1362
+ vars: { model: agent.model, backend: agent.backend, role: agent.role }
1363
+ });
1364
+ self._classifyFails.set(name, 0);
1365
+ log.info(`classify ${name}: ${result.category} — ${result.reasoning}`);
1366
+ } catch (err) {
1367
+ const fails = (self._classifyFails.get(name) ?? 0) + 1;
1368
+ self._classifyFails.set(name, fails);
1369
+ log.warn(`classify failed for ${name} (${fails}/${maxClassifyFails}): ${err.message}`);
1370
+
1371
+ if (fails >= maxClassifyFails) {
1372
+ log.error(`agent ${name} unclassifiable after ${fails} attempts — killing as safety net`);
1373
+ self.emit('stall', { name, elapsed, severity: 'critical' });
1374
+ try { await self.kill(name); } catch (e) { log.error(`safety kill failed for ${name}: ${e.message}`); }
1375
+ }
1376
+ continue;
1377
+ }
1378
+
1379
+ switch (result.category) {
1380
+ case 'working':
1381
+ self._lastActivity.set(name, new Date().toISOString());
1382
+ break;
1383
+
1384
+ case 'waiting_for_input':
1385
+ try {
1386
+ await agent.send('Please continue working or report your status.');
1387
+ await self._rehash(name, agent);
1388
+ } catch (err) { log.debug(`monitor: send failed for ${name}: ${err.message}`); }
1389
+ self.emit('stall', { name, elapsed, severity: 'nudge', diagnosis: result });
1390
+ break;
1391
+
1392
+ case 'option_dialog': {
1393
+ const keys = remedy(result.remedy);
1394
+ try {
1395
+ const tmux = new Tmux();
1396
+ await tmux.keys(agent.paneId, ...keys);
1397
+ await self._rehash(name, agent);
1398
+ } catch (err) { log.debug(`monitor: keys failed for ${name}: ${err.message}`); }
1399
+ self.emit('stall', { name, elapsed, severity: 'nudge', diagnosis: result });
1400
+ break;
1401
+ }
1402
+
1403
+ case 'error_loop':
1404
+ self.emit('stall', { name, elapsed, severity: 'warning', diagnosis: result });
1405
+ break;
1406
+
1407
+ case 'idle':
1408
+ agent.transition?.('dormant');
1409
+ self.emit('stall', { name, elapsed, severity: 'nudge', diagnosis: result });
1410
+ break;
1411
+
1412
+ case 'fatal':
1413
+ log.error(`agent ${name} hit fatal error — killing`);
1414
+ self.emit('stall', { name, elapsed, severity: 'critical', diagnosis: result });
1415
+ try {
1416
+ await self.kill(name);
1417
+ self.backendRegistry?.recordFailure?.(agent.backend);
1418
+ } catch (err) { log.error(`fatal kill failed for ${name}: ${err.message}`); }
1419
+ break;
1420
+
1421
+ default:
1422
+ log.warn(`classify ${name}: unknown category "${result.category}"`);
1423
+ break;
1424
+ }
1425
+ }
1426
+ } finally {
1427
+ self._monitoring = false;
1428
+ }
1429
+ }, Math.min(stallTimeout / 2, 60000));
1430
+
1431
+ this._monitorHandle.unref();
1432
+ }
1433
+
1434
+ /**
1435
+ * Stop the stall detection monitor.
1436
+ */
1437
+ stopMonitor() {
1438
+ if (this._monitorHandle) {
1439
+ clearInterval(this._monitorHandle);
1440
+ this._monitorHandle = null;
1441
+ log.info('stall detection monitor stopped');
1442
+ }
1443
+ }
1444
+
1445
+ // ── Halt (Full System Stop) ──────────────────────────
1446
+
1447
+ /**
1448
+ * Stop the entire orchestrator: reactor loop, stall monitor, and
1449
+ * all registered agents. The MCP server process stays alive so the
1450
+ * user can call `start` again to resume.
1451
+ *
1452
+ * Composes {@link unwatch}, {@link stopMonitor}, and {@link kill}
1453
+ * into a single atomic operation. Idempotent — safe to call when
1454
+ * already halted.
1455
+ *
1456
+ * @returns {Promise<{reactor: boolean, monitor: boolean, agents: string[]}>}
1457
+ * Summary of what was stopped.
1458
+ * @fires Orchestrator#halted
1459
+ */
1460
+ async halt() {
1461
+ const reactor = Boolean(this._watchHandle);
1462
+ const monitor = Boolean(this._monitorHandle);
1463
+
1464
+ this.unwatch();
1465
+ this.stopMonitor();
1466
+
1467
+ const killed = [];
1468
+ const entries = [...this.agents.entries()];
1469
+
1470
+ for (const [name] of entries) {
1471
+ try {
1472
+ await this.kill(name);
1473
+ killed.push(name);
1474
+ } catch (err) {
1475
+ log.warn(`halt: failed to kill ${name}: ${err.message}`);
1476
+ }
1477
+ }
1478
+
1479
+ log.info(`halt: reactor=${reactor} monitor=${monitor} agents=${killed.length}`);
1480
+
1481
+ /**
1482
+ * @event Orchestrator#halted
1483
+ * @type {object}
1484
+ * @property {boolean} reactor - Whether the reactor was running.
1485
+ * @property {boolean} monitor - Whether the monitor was running.
1486
+ * @property {string[]} agents - Names of agents that were killed.
1487
+ */
1488
+ this.emit('halted', { reactor, monitor, agents: killed });
1489
+
1490
+ return { reactor, monitor, agents: killed };
1491
+ }
1492
+ }