loreli 1.0.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. package/README.md +66 -26
  2. package/package.json +17 -14
  3. package/packages/action/prompts/action.md +172 -0
  4. package/packages/action/src/index.js +33 -5
  5. package/packages/agent/README.md +107 -18
  6. package/packages/agent/src/backends/claude.js +111 -11
  7. package/packages/agent/src/backends/codex.js +78 -5
  8. package/packages/agent/src/backends/cursor.js +104 -27
  9. package/packages/agent/src/backends/index.js +162 -5
  10. package/packages/agent/src/cli.js +80 -3
  11. package/packages/agent/src/discover.js +396 -0
  12. package/packages/agent/src/factory.js +39 -34
  13. package/packages/agent/src/models.js +24 -6
  14. package/packages/classify/README.md +136 -0
  15. package/packages/classify/prompts/blocker.md +12 -0
  16. package/packages/classify/prompts/feedback.md +14 -0
  17. package/packages/classify/prompts/pane-state.md +20 -0
  18. package/packages/classify/src/index.js +81 -0
  19. package/packages/config/README.md +156 -91
  20. package/packages/config/src/defaults.js +32 -21
  21. package/packages/config/src/index.js +33 -2
  22. package/packages/config/src/schema.js +57 -39
  23. package/packages/hub/src/github.js +59 -20
  24. package/packages/identity/README.md +1 -1
  25. package/packages/identity/src/index.js +2 -2
  26. package/packages/knowledge/README.md +86 -106
  27. package/packages/knowledge/src/index.js +56 -225
  28. package/packages/mcp/README.md +51 -7
  29. package/packages/mcp/instructions.md +6 -1
  30. package/packages/mcp/scaffolding/loreli.yml +115 -77
  31. package/packages/mcp/scaffolding/mcp-configs/.codex/config.toml +1 -0
  32. package/packages/mcp/scaffolding/mcp-configs/.cursor/mcp.json +4 -1
  33. package/packages/mcp/scaffolding/mcp-configs/.mcp.json +4 -1
  34. package/packages/mcp/src/index.js +45 -16
  35. package/packages/mcp/src/tools/agent-context.js +44 -0
  36. package/packages/mcp/src/tools/agents.js +34 -13
  37. package/packages/mcp/src/tools/context.js +3 -2
  38. package/packages/mcp/src/tools/github.js +11 -47
  39. package/packages/mcp/src/tools/hitl.js +19 -6
  40. package/packages/mcp/src/tools/index.js +2 -1
  41. package/packages/mcp/src/tools/refactor.js +227 -0
  42. package/packages/mcp/src/tools/repo.js +44 -0
  43. package/packages/mcp/src/tools/start.js +159 -90
  44. package/packages/mcp/src/tools/status.js +5 -2
  45. package/packages/mcp/src/tools/work.js +18 -8
  46. package/packages/orchestrator/src/index.js +345 -79
  47. package/packages/planner/README.md +84 -1
  48. package/packages/planner/prompts/plan-reviewer.md +109 -0
  49. package/packages/planner/prompts/planner.md +191 -0
  50. package/packages/planner/prompts/tiebreaker-reviewer.md +71 -0
  51. package/packages/planner/src/index.js +326 -111
  52. package/packages/review/README.md +2 -2
  53. package/packages/review/prompts/reviewer.md +158 -0
  54. package/packages/review/src/index.js +196 -76
  55. package/packages/risk/README.md +81 -22
  56. package/packages/risk/prompts/risk.md +272 -0
  57. package/packages/risk/src/index.js +44 -33
  58. package/packages/tmux/src/index.js +61 -12
  59. package/packages/workflow/README.md +18 -14
  60. package/packages/workflow/prompts/preamble.md +14 -0
  61. package/packages/workflow/src/index.js +191 -12
  62. package/packages/workspace/README.md +2 -2
  63. package/packages/workspace/src/index.js +69 -18
@@ -6,6 +6,7 @@ import { Factory, Session, output } from 'loreli/agent';
6
6
  import { Tmux } from 'loreli/tmux';
7
7
  import { prepare } from 'loreli/workspace';
8
8
  import { pick, side, capability } from 'loreli/identity';
9
+ import { classify } from 'loreli/classify';
9
10
  import { logger } from 'loreli/log';
10
11
 
11
12
  const log = logger('orchestrator');
@@ -24,9 +25,19 @@ const FATAL_PATTERNS = [
24
25
  /authentication[_ ]*(error|failed)/i,
25
26
  /invalid[_ ]*api[_ ]*key/i,
26
27
  /quota[_ ]*exceeded/i,
27
- /insufficient[_ ]*quota/i
28
+ /insufficient[_ ]*quota/i,
29
+ /invalid model name/i,
30
+ /unable to connect to api/i,
31
+ /connection\s*refused/i
28
32
  ];
29
33
 
34
+ /**
35
+ * Maximum pane characters logged in diagnostic debug output.
36
+ *
37
+ * @type {number}
38
+ */
39
+ const PANE_DEBUG_LIMIT = 4000;
40
+
30
41
  /**
31
42
  * Check if pane output contains fatal API error patterns.
32
43
  *
@@ -38,6 +49,46 @@ function hasFatalError(output) {
38
49
  return FATAL_PATTERNS.some(function match(p) { return p.test(output); });
39
50
  }
40
51
 
52
+ /**
53
+ * Format captured pane output for debug logging.
54
+ *
55
+ * Keeps logs readable while still preserving enough context to validate
56
+ * classifier and fallback decisions during stall/rapid-death diagnosis.
57
+ *
58
+ * @param {string} output - Raw pane output.
59
+ * @returns {string} Pane text, truncated when necessary.
60
+ */
61
+ function paneDebug(output) {
62
+ if (!output) return '[empty pane output]';
63
+ if (output.length <= PANE_DEBUG_LIMIT) return output;
64
+ const rest = output.length - PANE_DEBUG_LIMIT;
65
+ return `${output.slice(0, PANE_DEBUG_LIMIT)}\n… [truncated ${rest} chars]`;
66
+ }
67
+
68
+ /**
69
+ * Normalize remedy instructions into tmux key names.
70
+ *
71
+ * Classifier prompts return remedies as space-delimited strings
72
+ * (`"Down Enter"`), while backend fallback diagnose methods return
73
+ * string arrays (`['Down', 'Enter']`). The orchestrator accepts both.
74
+ *
75
+ * @param {string|string[]|null|undefined} remedy - Remedy from diagnosis.
76
+ * @returns {string[]} Tmux key sequence.
77
+ */
78
+ function remedy(remedy) {
79
+ if (Array.isArray(remedy)) {
80
+ const keys = remedy.filter(Boolean);
81
+ if (keys.length > 0) return keys;
82
+ return ['Enter'];
83
+ }
84
+ if (typeof remedy === 'string') {
85
+ const keys = remedy.split(/\s+/).filter(Boolean);
86
+ if (keys.length > 0) return keys;
87
+ return ['Enter'];
88
+ }
89
+ return ['Enter'];
90
+ }
91
+
41
92
  /**
42
93
  * Generic agent lifecycle coordinator via EventEmitter.
43
94
  *
@@ -108,6 +159,9 @@ export class Orchestrator extends EventEmitter {
108
159
  /** @type {Map<string, string>} MD5 hash of last captured pane output per agent for tmux-based activity detection. */
109
160
  this._lastPaneHash = new Map();
110
161
 
162
+ /** @type {Map<string, number>} Consecutive classify failures per agent — safety net kill after threshold. */
163
+ this._classifyFails = new Map();
164
+
111
165
  /** @type {NodeJS.Timeout|null} Reactor polling interval handle. */
112
166
  this._watchHandle = null;
113
167
 
@@ -231,34 +285,121 @@ export class Orchestrator extends EventEmitter {
231
285
  // backend is likely broken (budget exhaustion, API outage).
232
286
  // Mark it as degraded so scale() falls back to cursor-agent.
233
287
  //
234
- // Two checks:
235
- // 1. Dead pane agent exited on error
236
- // 2. Stuck-alive → agent stays alive but shows budget/rate-limit
237
- // errors in its pane output
288
+ // Uses the pane-state classifier when pane output is available
289
+ // (remain-on-exit keeps dead panes capturable). Falls back to
290
+ // raw alive() when capture fails.
238
291
  if (agent.backend && agent.alive) {
239
292
  const backend = agent.backend;
240
293
  const name = agent.identity.name;
241
294
  const registry = this.backendRegistry;
242
295
  const self = this;
243
296
  const timer = setTimeout(async function rapidDeathCheck() {
297
+ if (agent.state === 'dormant') return;
298
+
244
299
  try {
245
300
  const alive = await agent.alive();
246
- if (!alive && agent.state !== 'dormant') {
247
- log.warn(`rapid death: ${name} died within ${self.rapidDeathDelay}ms of spawn — marking ${backend} degraded`);
301
+
302
+ // Agent is alive and healthy — no rapid death
303
+ if (alive && !agent.capture) return;
304
+
305
+ let output;
306
+ try {
307
+ output = agent.capture
308
+ ? await agent.capture(self.cfg?.get?.('classify.maxLines') ?? 100)
309
+ : null;
310
+ } catch { output = null; }
311
+ if (output !== null) {
312
+ log.debug(`rapid-death pane ${name} (${backend}, alive=${alive}):\n${paneDebug(output)}`);
313
+ } else {
314
+ log.debug(`rapid-death pane ${name} (${backend}, alive=${alive}): [capture unavailable]`);
315
+ }
316
+
317
+ // Classify the pane output to determine why the agent
318
+ // died or what error it hit while still alive.
319
+ let diagnosis;
320
+ if (output) {
321
+ try {
322
+ diagnosis = await classify('pane-state', output, {
323
+ backends: self.backendRegistry,
324
+ config: self.cfg,
325
+ vars: { model: agent.model, backend, role: agent.role }
326
+ });
327
+ log.info(`rapid-death classify ${name}: ${diagnosis.category} — ${diagnosis.reasoning}`);
328
+ } catch (err) {
329
+ log.warn(`rapid-death classify failed for ${name}: ${err.message}`);
330
+ }
331
+ }
332
+
333
+ // When LLM classify fails, fall back to backend-specific
334
+ // regex detection. Each backend knows its CLI's dialog patterns.
335
+ let category = diagnosis?.category;
336
+ if (alive && output) {
337
+ const fallback = registry?.diagnose?.(backend, output);
338
+ const actionable = new Set(['option_dialog', 'waiting_for_input', 'fatal', 'dead']);
339
+ const fallbackActionable = actionable.has(fallback?.category);
340
+ const llmActionable = actionable.has(category);
341
+ const llmCategory = category;
342
+
343
+ if (!category && fallback) {
344
+ category = fallback.category;
345
+ diagnosis = fallback;
346
+ log.info(`rapid-death fallback diagnose ${name}: ${category} — ${fallback.reasoning}`);
347
+ } else if (fallbackActionable && !llmActionable) {
348
+ category = fallback.category;
349
+ diagnosis = fallback;
350
+ log.info(`rapid-death fallback override ${name}: ${fallback.category} over ${llmCategory ?? 'unknown'} — ${fallback.reasoning}`);
351
+ }
352
+ }
353
+
354
+ if (!alive) {
355
+ log.warn(`rapid death: ${name} died within ${self.rapidDeathDelay}ms of spawn (${category ?? 'unknown'}) — marking ${backend} degraded`);
248
356
  registry?.recordFailure(backend);
249
- if (agent.canTransition?.('dormant')) agent.transition('dormant');
250
- self.emit('rapid-death', { name, backend });
357
+ try { await self.kill(name); } catch { /* already dead */ }
358
+ self.emit('rapid-death', { name, backend, diagnosis });
251
359
  return;
252
360
  }
253
361
 
254
- if (alive && agent.capture) {
255
- const output = await agent.capture();
256
- if (hasFatalError(output)) {
257
- log.warn(`stuck-alive: ${name} shows fatal API error — marking ${backend} degraded`);
258
- registry?.recordFailure(backend);
259
- try { await agent.stop(); } catch { /* stop can fail */ }
260
- self.emit('rapid-death', { name, backend, reason: 'stuck-alive' });
261
- }
362
+ // Alive with recoverable dialog — send the appropriate
363
+ // input to dismiss it. Record a soft warning instead of a
364
+ // hard failure so the backend isn't blacklisted for a
365
+ // transient issue. Repeated warnings promote to failure.
366
+ if (category === 'option_dialog') {
367
+ const keys = remedy(diagnosis?.remedy);
368
+ log.info(`rapid-death remediation: ${name} has option dialog — sending ${keys.join('+')}`);
369
+ try {
370
+ const tmux = new Tmux();
371
+ await tmux.keys(agent.paneId, ...keys);
372
+ } catch (err) { log.debug(`rapid-death: keys failed for ${name}: ${err.message}`); }
373
+ registry?.recordWarning?.(backend);
374
+ self.emit('rapid-death', { name, backend, reason: 'remediated', diagnosis });
375
+ return;
376
+ }
377
+
378
+ if (category === 'waiting_for_input') {
379
+ log.info(`rapid-death remediation: ${name} waiting for input — sending continuation`);
380
+ try {
381
+ await agent.send('Please continue working or report your status.');
382
+ } catch (err) { log.debug(`rapid-death: send failed for ${name}: ${err.message}`); }
383
+ registry?.recordWarning?.(backend);
384
+ self.emit('rapid-death', { name, backend, reason: 'remediated', diagnosis });
385
+ return;
386
+ }
387
+
388
+ // Alive but classifier detected fatal state
389
+ if (category === 'fatal' || category === 'dead') {
390
+ log.warn(`stuck-alive: ${name} classified as ${category} — marking ${backend} degraded`);
391
+ registry?.recordFailure(backend);
392
+ try { await agent.stop(); } catch { /* stop can fail */ }
393
+ self.emit('rapid-death', { name, backend, reason: 'stuck-alive', diagnosis });
394
+ return;
395
+ }
396
+
397
+ // Alive — regex fallback for when the classifier didn't detect a fatal state
398
+ if (alive && output && hasFatalError(output)) {
399
+ log.warn(`stuck-alive: ${name} shows fatal API error (regex) — marking ${backend} degraded`);
400
+ registry?.recordFailure(backend);
401
+ try { await agent.stop(); } catch { /* stop can fail */ }
402
+ self.emit('rapid-death', { name, backend, reason: 'stuck-alive' });
262
403
  }
263
404
  } catch { /* pane check can fail when session is torn down */ }
264
405
  }, this.rapidDeathDelay);
@@ -378,6 +519,7 @@ export class Orchestrator extends EventEmitter {
378
519
  this.agents.delete(name);
379
520
  this._lastActivity.delete(name);
380
521
  this._lastPaneHash.delete(name);
522
+ this._classifyFails.delete(name);
381
523
  this._removed.add(name);
382
524
  this.identityRegistry.release(agent.identity);
383
525
  log.info(`agent shut down: ${name}`);
@@ -424,6 +566,7 @@ export class Orchestrator extends EventEmitter {
424
566
  this.agents.delete(name);
425
567
  this._lastActivity.delete(name);
426
568
  this._lastPaneHash.delete(name);
569
+ this._classifyFails.delete(name);
427
570
  this._removed.add(name);
428
571
  this.identityRegistry.release(agent.identity);
429
572
 
@@ -573,7 +716,7 @@ export class Orchestrator extends EventEmitter {
573
716
 
574
717
  const agent = await this.factory.create(provider, role, {
575
718
  theme,
576
- model: this.cfg?.get?.('model'),
719
+ model: this.cfg?.get?.(`workflows.${role}.model`) ?? this.cfg?.get?.('model'),
577
720
  config: this.cfg,
578
721
  context,
579
722
  taken: this.takenNames,
@@ -594,7 +737,14 @@ export class Orchestrator extends EventEmitter {
594
737
  await this.storage.save(this.sessionId, agent.identity.name, session.toJSON());
595
738
  }
596
739
 
597
- await this.spawn(agent);
740
+ try {
741
+ await this.spawn(agent);
742
+ } catch (err) {
743
+ if (this.sessionId && this.storage && agent.identity?.name) {
744
+ try { await this.storage.remove(this.sessionId, agent.identity.name); } catch { /* best-effort */ }
745
+ }
746
+ throw err;
747
+ }
598
748
 
599
749
  if (this.sessionId && this.storage && agent.identity?.name && agent.paneId) {
600
750
  const data = await this.storage.load(this.sessionId, agent.identity.name);
@@ -636,6 +786,7 @@ export class Orchestrator extends EventEmitter {
636
786
  const prev = this._lastPaneHash.get(name);
637
787
  this._lastPaneHash.set(name, digest);
638
788
  if (prev && prev !== digest) {
789
+ if (hasFatalError(output)) return false;
639
790
  this._lastActivity.set(name, new Date().toISOString());
640
791
  return true;
641
792
  }
@@ -643,6 +794,29 @@ export class Orchestrator extends EventEmitter {
643
794
  } catch { return false; }
644
795
  }
645
796
 
797
+ /**
798
+ * Update the pane hash after an orchestrator-initiated interaction.
799
+ *
800
+ * Must be called after any action that changes the pane content
801
+ * (sending keys, messages, etc.) to prevent `refresh()` from
802
+ * misinterpreting the orchestrator's own output as agent activity
803
+ * on the next monitor cycle. The orchestrator also resets the
804
+ * stall timer here because a remediation attempt should buy the
805
+ * agent time to react before another nudge is sent.
806
+ *
807
+ * @param {string} name - Agent identity name.
808
+ * @param {object} agent - Agent instance with a `capture()` method.
809
+ * @returns {Promise<void>}
810
+ */
811
+ async _rehash(name, agent) {
812
+ try {
813
+ const content = await agent.capture(50);
814
+ const digest = createHash('md5').update(content ?? '').digest('hex');
815
+ this._lastPaneHash.set(name, digest);
816
+ this._lastActivity.set(name, new Date().toISOString());
817
+ } catch { /* capture can fail if pane died */ }
818
+ }
819
+
646
820
  // ── Death Snapshot ──────────────────────────────────
647
821
 
648
822
  /**
@@ -698,8 +872,9 @@ export class Orchestrator extends EventEmitter {
698
872
  let alive;
699
873
  try {
700
874
  alive = await agent.alive();
701
- } catch {
702
- continue;
875
+ } catch (err) {
876
+ log.warn(`reconcile: alive() threw for ${name}: ${err.message} — treating as dead`);
877
+ alive = false;
703
878
  }
704
879
 
705
880
  if (alive) continue;
@@ -712,6 +887,7 @@ export class Orchestrator extends EventEmitter {
712
887
  this.agents.delete(name);
713
888
  this._lastActivity.delete(name);
714
889
  this._lastPaneHash.delete(name);
890
+ this._classifyFails.delete(name);
715
891
  this._removed.add(name);
716
892
  this.identityRegistry.release(agent.identity);
717
893
 
@@ -784,7 +960,6 @@ export class Orchestrator extends EventEmitter {
784
960
  if (!this.workflows.size) return [];
785
961
 
786
962
  const maxAgents = this.cfg?.get?.('scaling.maxAgents') ?? 8;
787
- const maxPerRole = this.cfg?.get?.('scaling.maxPerRole') ?? {};
788
963
  const maxPerTick = this.cfg?.get?.('scaling.maxPerTick') ?? 2;
789
964
  const cooldown = this.cfg?.get?.('scaling.cooldown') ?? 30000;
790
965
 
@@ -828,7 +1003,7 @@ export class Orchestrator extends EventEmitter {
828
1003
  if (signal.deficit <= 0) continue;
829
1004
 
830
1005
  const { role } = signal;
831
- const roleCap = maxPerRole[role] ?? Infinity;
1006
+ const roleCap = this.cfg?.get?.(`workflows.${role}.maxAgents`) ?? Infinity;
832
1007
  const current = [...this.agents.values()]
833
1008
  .filter(function liveRole(a) { return a.role === role && a.state !== 'dormant'; }).length;
834
1009
 
@@ -1108,18 +1283,22 @@ export class Orchestrator extends EventEmitter {
1108
1283
  // ── Monitor (Stall Detection) ─────────────────────────
1109
1284
 
1110
1285
  /**
1111
- * Start the stall detection monitor with 3-tier escalation.
1286
+ * Start the stall detection monitor with LLM-powered classification.
1112
1287
  *
1113
- * Tier 1 — Nudge (1x stall timeout):
1114
- * Send a message to the agent asking for status. Emits 'stall'
1115
- * with severity 'nudge'.
1288
+ * When an agent's pane output has not changed for longer than the
1289
+ * stall timeout, the monitor captures the pane content, classifies
1290
+ * it via `loreli/classify`, and dispatches the appropriate action:
1116
1291
  *
1117
- * Tier 2 — Warning (2x stall timeout):
1118
- * Emits 'stall' with severity 'warning'. Role packages can
1119
- * subscribe and post GitHub comments.
1292
+ * - `working` — reset activity timer, leave the agent alone
1293
+ * - `waiting_for_input` — send a continuation prompt
1294
+ * - `option_dialog` — send the appropriate keystroke (Enter)
1295
+ * - `error_loop` — emit 'stall' with diagnostic context
1296
+ * - `idle` — transition the agent to dormant
1297
+ * - `fatal` — kill the agent and mark the backend degraded
1120
1298
  *
1121
- * Tier 3 Critical (3x stall timeout):
1122
- * Kills the agent and emits 'stall' with severity 'critical'.
1299
+ * Falls back to regex heuristics when no LLM backend is available.
1300
+ * Consecutive classification failures trigger a safety-net kill
1301
+ * (replaces the old tier 3 fixed-time kill).
1123
1302
  *
1124
1303
  * @fires Orchestrator#stall
1125
1304
  */
@@ -1128,24 +1307,23 @@ export class Orchestrator extends EventEmitter {
1128
1307
  log.info('stall detection monitor started');
1129
1308
 
1130
1309
  const stallTimeout = this.stallTimeout;
1131
- const nudge = this.cfg?.get?.('timeouts.nudge') ?? true;
1310
+ const maxClassifyFails = this.cfg?.get?.('classify.maxRetries') ?? 5;
1132
1311
  const self = this;
1133
1312
 
1313
+ /** @type {boolean} Re-entrancy guard for the monitor callback. */
1314
+ this._monitoring = false;
1315
+
1134
1316
  this._monitorHandle = setInterval(async function checkStalls() {
1135
- // Reconcile first: detect dead panes and clean up before
1136
- // running stall-escalation checks. Without this, dead agents
1137
- // linger until stallTimeout elapses.
1317
+ if (self._monitoring) return;
1318
+ self._monitoring = true;
1319
+
1320
+ try {
1138
1321
  await self.reconcile();
1139
1322
 
1140
1323
  const now = Date.now();
1141
-
1142
- // Snapshot keys to avoid mutation during iteration — Tier 3
1143
- // calls kill() which deletes from self.agents mid-loop.
1144
1324
  const snapshot = [...self.agents.entries()];
1325
+
1145
1326
  for (const [name, agent] of snapshot) {
1146
- // Dormant agents are kept registered so downstream workflows
1147
- // (e.g. review scan) can still match them by identity. Skip
1148
- // nudge/warning, but allow Tier 3 kill for eventual cleanup.
1149
1327
  if (agent.state === 'dormant') {
1150
1328
  const last = self._lastActivity.get(name);
1151
1329
  if (!last) continue;
@@ -1154,6 +1332,7 @@ export class Orchestrator extends EventEmitter {
1154
1332
  self.agents.delete(name);
1155
1333
  self._lastActivity.delete(name);
1156
1334
  self._lastPaneHash.delete(name);
1335
+ self._classifyFails.delete(name);
1157
1336
  log.info(`stall: cleaned up dormant agent ${name}`);
1158
1337
  }
1159
1338
  continue;
@@ -1162,52 +1341,91 @@ export class Orchestrator extends EventEmitter {
1162
1341
  const last = self._lastActivity.get(name);
1163
1342
  if (!last) continue;
1164
1343
 
1165
- // Local proof-of-life: check tmux pane for real activity
1166
- // before escalating. If output changed, _lastActivity is
1167
- // now current and the tier checks naturally skip.
1168
- if (await self.refresh(name)) continue;
1344
+ if (await self.refresh(name)) {
1345
+ self._classifyFails.set(name, 0);
1346
+ continue;
1347
+ }
1169
1348
 
1170
1349
  const elapsed = now - new Date(last).getTime();
1350
+ if (elapsed <= stallTimeout) continue;
1171
1351
 
1172
- if (elapsed > stallTimeout * 3) {
1173
- // Tier 3: Critically stalled — kill and emit
1174
- log.error(`agent ${name} critically stalled (${Math.round(elapsed / 1000)}s) — killing`);
1352
+ // Stall detected — classify the pane content
1353
+ const maxLines = self.cfg?.get?.('classify.maxLines') ?? 100;
1354
+ let result;
1175
1355
 
1176
- /**
1177
- * @event Orchestrator#stall
1178
- * @type {object}
1179
- * @property {string} name - Agent identity name.
1180
- * @property {number} elapsed - Time since last activity in ms.
1181
- * @property {string} severity - 'nudge', 'warning', or 'critical'.
1182
- */
1183
- self.emit('stall', { name, elapsed, severity: 'critical' });
1184
-
1185
- try {
1186
- await self.kill(name);
1187
- log.info(`stall tier 3: agent ${name} killed`);
1188
- } catch (err) {
1189
- log.error(`stall tier 3: kill failed for ${name}: ${err.message}`);
1356
+ try {
1357
+ const pane = await agent.capture(maxLines);
1358
+ log.debug(`monitor pane ${name} (${agent.backend}, stale=${Math.round(elapsed / 1000)}s):\n${paneDebug(pane)}`);
1359
+ result = await classify('pane-state', pane, {
1360
+ backends: self.backendRegistry,
1361
+ config: self.cfg,
1362
+ vars: { model: agent.model, backend: agent.backend, role: agent.role }
1363
+ });
1364
+ self._classifyFails.set(name, 0);
1365
+ log.info(`classify ${name}: ${result.category} — ${result.reasoning}`);
1366
+ } catch (err) {
1367
+ const fails = (self._classifyFails.get(name) ?? 0) + 1;
1368
+ self._classifyFails.set(name, fails);
1369
+ log.warn(`classify failed for ${name} (${fails}/${maxClassifyFails}): ${err.message}`);
1370
+
1371
+ if (fails >= maxClassifyFails) {
1372
+ log.error(`agent ${name} unclassifiable after ${fails} attempts — killing as safety net`);
1373
+ self.emit('stall', { name, elapsed, severity: 'critical' });
1374
+ try { await self.kill(name); } catch (e) { log.error(`safety kill failed for ${name}: ${e.message}`); }
1190
1375
  }
1191
- } else if (elapsed > stallTimeout * 2) {
1192
- // Tier 2: Warning
1193
- log.warn(`agent ${name} stalled tier 2 (${Math.round(elapsed / 1000)}s)`);
1194
- self.emit('stall', { name, elapsed, severity: 'warning' });
1195
- } else if (elapsed > stallTimeout) {
1196
- // Tier 1: Optional nudge
1197
- if (nudge) {
1198
- log.warn(`agent ${name} stalled tier 1 (${Math.round(elapsed / 1000)}s) - nudging`);
1376
+ continue;
1377
+ }
1378
+
1379
+ switch (result.category) {
1380
+ case 'working':
1381
+ self._lastActivity.set(name, new Date().toISOString());
1382
+ break;
1383
+
1384
+ case 'waiting_for_input':
1199
1385
  try {
1200
- await agent.send('You appear to be stalled. Please report your current status or continue working.');
1201
- // Activity resets only when the agent responds (via MCP tool
1202
- // calls or hub activity), NOT when we nudge it. Resetting
1203
- // here would trap agents at tier 1 forever.
1204
- } catch (err) { log.debug(`monitor: nudge failed for ${name}: ${err.message}`); }
1205
- } else {
1206
- log.warn(`agent ${name} stalled tier 1 (${Math.round(elapsed / 1000)}s) - nudge suppressed by config`);
1386
+ await agent.send('Please continue working or report your status.');
1387
+ await self._rehash(name, agent);
1388
+ } catch (err) { log.debug(`monitor: send failed for ${name}: ${err.message}`); }
1389
+ self.emit('stall', { name, elapsed, severity: 'nudge', diagnosis: result });
1390
+ break;
1391
+
1392
+ case 'option_dialog': {
1393
+ const keys = remedy(result.remedy);
1394
+ try {
1395
+ const tmux = new Tmux();
1396
+ await tmux.keys(agent.paneId, ...keys);
1397
+ await self._rehash(name, agent);
1398
+ } catch (err) { log.debug(`monitor: keys failed for ${name}: ${err.message}`); }
1399
+ self.emit('stall', { name, elapsed, severity: 'nudge', diagnosis: result });
1400
+ break;
1207
1401
  }
1208
- self.emit('stall', { name, elapsed, severity: 'nudge' });
1402
+
1403
+ case 'error_loop':
1404
+ self.emit('stall', { name, elapsed, severity: 'warning', diagnosis: result });
1405
+ break;
1406
+
1407
+ case 'idle':
1408
+ agent.transition?.('dormant');
1409
+ self.emit('stall', { name, elapsed, severity: 'nudge', diagnosis: result });
1410
+ break;
1411
+
1412
+ case 'fatal':
1413
+ log.error(`agent ${name} hit fatal error — killing`);
1414
+ self.emit('stall', { name, elapsed, severity: 'critical', diagnosis: result });
1415
+ try {
1416
+ await self.kill(name);
1417
+ self.backendRegistry?.recordFailure?.(agent.backend);
1418
+ } catch (err) { log.error(`fatal kill failed for ${name}: ${err.message}`); }
1419
+ break;
1420
+
1421
+ default:
1422
+ log.warn(`classify ${name}: unknown category "${result.category}"`);
1423
+ break;
1209
1424
  }
1210
1425
  }
1426
+ } finally {
1427
+ self._monitoring = false;
1428
+ }
1211
1429
  }, Math.min(stallTimeout / 2, 60000));
1212
1430
 
1213
1431
  this._monitorHandle.unref();
@@ -1223,4 +1441,52 @@ export class Orchestrator extends EventEmitter {
1223
1441
  log.info('stall detection monitor stopped');
1224
1442
  }
1225
1443
  }
1444
+
1445
+ // ── Halt (Full System Stop) ──────────────────────────
1446
+
1447
+ /**
1448
+ * Stop the entire orchestrator: reactor loop, stall monitor, and
1449
+ * all registered agents. The MCP server process stays alive so the
1450
+ * user can call `start` again to resume.
1451
+ *
1452
+ * Composes {@link unwatch}, {@link stopMonitor}, and {@link kill}
1453
+ * into a single atomic operation. Idempotent — safe to call when
1454
+ * already halted.
1455
+ *
1456
+ * @returns {Promise<{reactor: boolean, monitor: boolean, agents: string[]}>}
1457
+ * Summary of what was stopped.
1458
+ * @fires Orchestrator#halted
1459
+ */
1460
+ async halt() {
1461
+ const reactor = Boolean(this._watchHandle);
1462
+ const monitor = Boolean(this._monitorHandle);
1463
+
1464
+ this.unwatch();
1465
+ this.stopMonitor();
1466
+
1467
+ const killed = [];
1468
+ const entries = [...this.agents.entries()];
1469
+
1470
+ for (const [name] of entries) {
1471
+ try {
1472
+ await this.kill(name);
1473
+ killed.push(name);
1474
+ } catch (err) {
1475
+ log.warn(`halt: failed to kill ${name}: ${err.message}`);
1476
+ }
1477
+ }
1478
+
1479
+ log.info(`halt: reactor=${reactor} monitor=${monitor} agents=${killed.length}`);
1480
+
1481
+ /**
1482
+ * @event Orchestrator#halted
1483
+ * @type {object}
1484
+ * @property {boolean} reactor - Whether the reactor was running.
1485
+ * @property {boolean} monitor - Whether the monitor was running.
1486
+ * @property {string[]} agents - Names of agents that were killed.
1487
+ */
1488
+ this.emit('halted', { reactor, monitor, agents: killed });
1489
+
1490
+ return { reactor, monitor, agents: killed };
1491
+ }
1226
1492
  }