loreli 1.0.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +66 -26
- package/package.json +17 -14
- package/packages/action/prompts/action.md +172 -0
- package/packages/action/src/index.js +33 -5
- package/packages/agent/README.md +107 -18
- package/packages/agent/src/backends/claude.js +111 -11
- package/packages/agent/src/backends/codex.js +78 -5
- package/packages/agent/src/backends/cursor.js +104 -27
- package/packages/agent/src/backends/index.js +162 -5
- package/packages/agent/src/cli.js +80 -3
- package/packages/agent/src/discover.js +396 -0
- package/packages/agent/src/factory.js +39 -34
- package/packages/agent/src/models.js +24 -6
- package/packages/classify/README.md +136 -0
- package/packages/classify/prompts/blocker.md +12 -0
- package/packages/classify/prompts/feedback.md +14 -0
- package/packages/classify/prompts/pane-state.md +20 -0
- package/packages/classify/src/index.js +81 -0
- package/packages/config/README.md +156 -91
- package/packages/config/src/defaults.js +32 -21
- package/packages/config/src/index.js +33 -2
- package/packages/config/src/schema.js +57 -39
- package/packages/hub/src/github.js +59 -20
- package/packages/identity/README.md +1 -1
- package/packages/identity/src/index.js +2 -2
- package/packages/knowledge/README.md +86 -106
- package/packages/knowledge/src/index.js +56 -225
- package/packages/mcp/README.md +51 -7
- package/packages/mcp/instructions.md +6 -1
- package/packages/mcp/scaffolding/loreli.yml +115 -77
- package/packages/mcp/scaffolding/mcp-configs/.codex/config.toml +1 -0
- package/packages/mcp/scaffolding/mcp-configs/.cursor/mcp.json +4 -1
- package/packages/mcp/scaffolding/mcp-configs/.mcp.json +4 -1
- package/packages/mcp/src/index.js +45 -16
- package/packages/mcp/src/tools/agent-context.js +44 -0
- package/packages/mcp/src/tools/agents.js +34 -13
- package/packages/mcp/src/tools/context.js +3 -2
- package/packages/mcp/src/tools/github.js +11 -47
- package/packages/mcp/src/tools/hitl.js +19 -6
- package/packages/mcp/src/tools/index.js +2 -1
- package/packages/mcp/src/tools/refactor.js +227 -0
- package/packages/mcp/src/tools/repo.js +44 -0
- package/packages/mcp/src/tools/start.js +159 -90
- package/packages/mcp/src/tools/status.js +5 -2
- package/packages/mcp/src/tools/work.js +18 -8
- package/packages/orchestrator/src/index.js +345 -79
- package/packages/planner/README.md +84 -1
- package/packages/planner/prompts/plan-reviewer.md +109 -0
- package/packages/planner/prompts/planner.md +191 -0
- package/packages/planner/prompts/tiebreaker-reviewer.md +71 -0
- package/packages/planner/src/index.js +326 -111
- package/packages/review/README.md +2 -2
- package/packages/review/prompts/reviewer.md +158 -0
- package/packages/review/src/index.js +196 -76
- package/packages/risk/README.md +81 -22
- package/packages/risk/prompts/risk.md +272 -0
- package/packages/risk/src/index.js +44 -33
- package/packages/tmux/src/index.js +61 -12
- package/packages/workflow/README.md +18 -14
- package/packages/workflow/prompts/preamble.md +14 -0
- package/packages/workflow/src/index.js +191 -12
- package/packages/workspace/README.md +2 -2
- package/packages/workspace/src/index.js +69 -18
|
@@ -6,6 +6,7 @@ import { Factory, Session, output } from 'loreli/agent';
|
|
|
6
6
|
import { Tmux } from 'loreli/tmux';
|
|
7
7
|
import { prepare } from 'loreli/workspace';
|
|
8
8
|
import { pick, side, capability } from 'loreli/identity';
|
|
9
|
+
import { classify } from 'loreli/classify';
|
|
9
10
|
import { logger } from 'loreli/log';
|
|
10
11
|
|
|
11
12
|
const log = logger('orchestrator');
|
|
@@ -24,9 +25,19 @@ const FATAL_PATTERNS = [
|
|
|
24
25
|
/authentication[_ ]*(error|failed)/i,
|
|
25
26
|
/invalid[_ ]*api[_ ]*key/i,
|
|
26
27
|
/quota[_ ]*exceeded/i,
|
|
27
|
-
/insufficient[_ ]*quota/i
|
|
28
|
+
/insufficient[_ ]*quota/i,
|
|
29
|
+
/invalid model name/i,
|
|
30
|
+
/unable to connect to api/i,
|
|
31
|
+
/connection\s*refused/i
|
|
28
32
|
];
|
|
29
33
|
|
|
34
|
+
/**
|
|
35
|
+
* Maximum pane characters logged in diagnostic debug output.
|
|
36
|
+
*
|
|
37
|
+
* @type {number}
|
|
38
|
+
*/
|
|
39
|
+
const PANE_DEBUG_LIMIT = 4000;
|
|
40
|
+
|
|
30
41
|
/**
|
|
31
42
|
* Check if pane output contains fatal API error patterns.
|
|
32
43
|
*
|
|
@@ -38,6 +49,46 @@ function hasFatalError(output) {
|
|
|
38
49
|
return FATAL_PATTERNS.some(function match(p) { return p.test(output); });
|
|
39
50
|
}
|
|
40
51
|
|
|
52
|
+
/**
 * Format captured pane output for debug logging.
 *
 * Keeps logs readable while still preserving enough context to validate
 * classifier and fallback decisions during stall/rapid-death diagnosis.
 * Output longer than `limit` UTF-16 code units is cut to its leading
 * `limit` units and suffixed with a note stating how many were dropped.
 *
 * @param {string} output - Raw pane output.
 * @param {number} [limit=PANE_DEBUG_LIMIT] - Maximum number of characters
 *   (UTF-16 code units) of pane text to keep before truncating.
 * @returns {string} Pane text, truncated when necessary; a placeholder
 *   string when `output` is empty or missing.
 */
function paneDebug(output, limit = PANE_DEBUG_LIMIT) {
  if (!output) return '[empty pane output]';
  if (output.length <= limit) return output;

  // Never cut between the two halves of a surrogate pair (emoji and other
  // astral characters): a lone high surrogate is not valid text and shows
  // up as a replacement character in most log sinks. Back off one unit so
  // the whole character moves into the truncated remainder instead.
  let cut = limit;
  const before = output.charCodeAt(cut - 1);
  const after = output.charCodeAt(cut);
  const splitsPair = before >= 0xd800 && before <= 0xdbff
    && after >= 0xdc00 && after <= 0xdfff;
  if (splitsPair) cut -= 1;

  const rest = output.length - cut;
  return `${output.slice(0, cut)}\n… [truncated ${rest} chars]`;
}
|
|
67
|
+
|
|
68
|
+
/**
 * Normalize remedy instructions into tmux key names.
 *
 * Classifier prompts return remedies as space-delimited strings
 * (`"Down Enter"`), while backend fallback diagnose methods return
 * string arrays (`['Down', 'Enter']`). The orchestrator accepts both,
 * and both shapes go through the same tokenizer: an array entry that
 * itself holds several keys (`['Down Enter']`) or stray whitespace is
 * split and trimmed exactly like a string remedy, and non-string entries
 * are dropped rather than forwarded as key names.
 *
 * @param {string|string[]|null|undefined} remedy - Remedy from diagnosis.
 * @returns {string[]} Tmux key sequence; `['Enter']` when the remedy is
 *   missing, empty, or contains no usable key names.
 */
function remedy(remedy) {
  // Treat a lone value as a one-element list so strings and arrays share
  // a single normalization path.
  const parts = Array.isArray(remedy) ? remedy : [remedy];

  const keys = parts
    .filter(function isText(part) { return typeof part === 'string'; })
    .flatMap(function tokens(part) { return part.split(/\s+/).filter(Boolean); });

  // Enter is the fallback keystroke whenever nothing usable was supplied.
  return keys.length > 0 ? keys : ['Enter'];
}
|
|
91
|
+
|
|
41
92
|
/**
|
|
42
93
|
* Generic agent lifecycle coordinator via EventEmitter.
|
|
43
94
|
*
|
|
@@ -108,6 +159,9 @@ export class Orchestrator extends EventEmitter {
|
|
|
108
159
|
/** @type {Map<string, string>} MD5 hash of last captured pane output per agent for tmux-based activity detection. */
|
|
109
160
|
this._lastPaneHash = new Map();
|
|
110
161
|
|
|
162
|
+
/** @type {Map<string, number>} Consecutive classify failures per agent — safety net kill after threshold. */
|
|
163
|
+
this._classifyFails = new Map();
|
|
164
|
+
|
|
111
165
|
/** @type {NodeJS.Timeout|null} Reactor polling interval handle. */
|
|
112
166
|
this._watchHandle = null;
|
|
113
167
|
|
|
@@ -231,34 +285,121 @@ export class Orchestrator extends EventEmitter {
|
|
|
231
285
|
// backend is likely broken (budget exhaustion, API outage).
|
|
232
286
|
// Mark it as degraded so scale() falls back to cursor-agent.
|
|
233
287
|
//
|
|
234
|
-
//
|
|
235
|
-
//
|
|
236
|
-
//
|
|
237
|
-
// errors in its pane output
|
|
288
|
+
// Uses the pane-state classifier when pane output is available
|
|
289
|
+
// (remain-on-exit keeps dead panes capturable). Falls back to
|
|
290
|
+
// raw alive() when capture fails.
|
|
238
291
|
if (agent.backend && agent.alive) {
|
|
239
292
|
const backend = agent.backend;
|
|
240
293
|
const name = agent.identity.name;
|
|
241
294
|
const registry = this.backendRegistry;
|
|
242
295
|
const self = this;
|
|
243
296
|
const timer = setTimeout(async function rapidDeathCheck() {
|
|
297
|
+
if (agent.state === 'dormant') return;
|
|
298
|
+
|
|
244
299
|
try {
|
|
245
300
|
const alive = await agent.alive();
|
|
246
|
-
|
|
247
|
-
|
|
301
|
+
|
|
302
|
+
// Agent is alive and healthy — no rapid death
|
|
303
|
+
if (alive && !agent.capture) return;
|
|
304
|
+
|
|
305
|
+
let output;
|
|
306
|
+
try {
|
|
307
|
+
output = agent.capture
|
|
308
|
+
? await agent.capture(self.cfg?.get?.('classify.maxLines') ?? 100)
|
|
309
|
+
: null;
|
|
310
|
+
} catch { output = null; }
|
|
311
|
+
if (output !== null) {
|
|
312
|
+
log.debug(`rapid-death pane ${name} (${backend}, alive=${alive}):\n${paneDebug(output)}`);
|
|
313
|
+
} else {
|
|
314
|
+
log.debug(`rapid-death pane ${name} (${backend}, alive=${alive}): [capture unavailable]`);
|
|
315
|
+
}
|
|
316
|
+
|
|
317
|
+
// Classify the pane output to determine why the agent
|
|
318
|
+
// died or what error it hit while still alive.
|
|
319
|
+
let diagnosis;
|
|
320
|
+
if (output) {
|
|
321
|
+
try {
|
|
322
|
+
diagnosis = await classify('pane-state', output, {
|
|
323
|
+
backends: self.backendRegistry,
|
|
324
|
+
config: self.cfg,
|
|
325
|
+
vars: { model: agent.model, backend, role: agent.role }
|
|
326
|
+
});
|
|
327
|
+
log.info(`rapid-death classify ${name}: ${diagnosis.category} — ${diagnosis.reasoning}`);
|
|
328
|
+
} catch (err) {
|
|
329
|
+
log.warn(`rapid-death classify failed for ${name}: ${err.message}`);
|
|
330
|
+
}
|
|
331
|
+
}
|
|
332
|
+
|
|
333
|
+
// When LLM classify fails, fall back to backend-specific
|
|
334
|
+
// regex detection. Each backend knows its CLI's dialog patterns.
|
|
335
|
+
let category = diagnosis?.category;
|
|
336
|
+
if (alive && output) {
|
|
337
|
+
const fallback = registry?.diagnose?.(backend, output);
|
|
338
|
+
const actionable = new Set(['option_dialog', 'waiting_for_input', 'fatal', 'dead']);
|
|
339
|
+
const fallbackActionable = actionable.has(fallback?.category);
|
|
340
|
+
const llmActionable = actionable.has(category);
|
|
341
|
+
const llmCategory = category;
|
|
342
|
+
|
|
343
|
+
if (!category && fallback) {
|
|
344
|
+
category = fallback.category;
|
|
345
|
+
diagnosis = fallback;
|
|
346
|
+
log.info(`rapid-death fallback diagnose ${name}: ${category} — ${fallback.reasoning}`);
|
|
347
|
+
} else if (fallbackActionable && !llmActionable) {
|
|
348
|
+
category = fallback.category;
|
|
349
|
+
diagnosis = fallback;
|
|
350
|
+
log.info(`rapid-death fallback override ${name}: ${fallback.category} over ${llmCategory ?? 'unknown'} — ${fallback.reasoning}`);
|
|
351
|
+
}
|
|
352
|
+
}
|
|
353
|
+
|
|
354
|
+
if (!alive) {
|
|
355
|
+
log.warn(`rapid death: ${name} died within ${self.rapidDeathDelay}ms of spawn (${category ?? 'unknown'}) — marking ${backend} degraded`);
|
|
248
356
|
registry?.recordFailure(backend);
|
|
249
|
-
|
|
250
|
-
self.emit('rapid-death', { name, backend });
|
|
357
|
+
try { await self.kill(name); } catch { /* already dead */ }
|
|
358
|
+
self.emit('rapid-death', { name, backend, diagnosis });
|
|
251
359
|
return;
|
|
252
360
|
}
|
|
253
361
|
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
362
|
+
// Alive with recoverable dialog — send the appropriate
|
|
363
|
+
// input to dismiss it. Record a soft warning instead of a
|
|
364
|
+
// hard failure so the backend isn't blacklisted for a
|
|
365
|
+
// transient issue. Repeated warnings promote to failure.
|
|
366
|
+
if (category === 'option_dialog') {
|
|
367
|
+
const keys = remedy(diagnosis?.remedy);
|
|
368
|
+
log.info(`rapid-death remediation: ${name} has option dialog — sending ${keys.join('+')}`);
|
|
369
|
+
try {
|
|
370
|
+
const tmux = new Tmux();
|
|
371
|
+
await tmux.keys(agent.paneId, ...keys);
|
|
372
|
+
} catch (err) { log.debug(`rapid-death: keys failed for ${name}: ${err.message}`); }
|
|
373
|
+
registry?.recordWarning?.(backend);
|
|
374
|
+
self.emit('rapid-death', { name, backend, reason: 'remediated', diagnosis });
|
|
375
|
+
return;
|
|
376
|
+
}
|
|
377
|
+
|
|
378
|
+
if (category === 'waiting_for_input') {
|
|
379
|
+
log.info(`rapid-death remediation: ${name} waiting for input — sending continuation`);
|
|
380
|
+
try {
|
|
381
|
+
await agent.send('Please continue working or report your status.');
|
|
382
|
+
} catch (err) { log.debug(`rapid-death: send failed for ${name}: ${err.message}`); }
|
|
383
|
+
registry?.recordWarning?.(backend);
|
|
384
|
+
self.emit('rapid-death', { name, backend, reason: 'remediated', diagnosis });
|
|
385
|
+
return;
|
|
386
|
+
}
|
|
387
|
+
|
|
388
|
+
// Alive but classifier detected fatal state
|
|
389
|
+
if (category === 'fatal' || category === 'dead') {
|
|
390
|
+
log.warn(`stuck-alive: ${name} classified as ${category} — marking ${backend} degraded`);
|
|
391
|
+
registry?.recordFailure(backend);
|
|
392
|
+
try { await agent.stop(); } catch { /* stop can fail */ }
|
|
393
|
+
self.emit('rapid-death', { name, backend, reason: 'stuck-alive', diagnosis });
|
|
394
|
+
return;
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
// Still alive and not classified as fatal/dead — fall back to the FATAL_PATTERNS regex check
|
|
398
|
+
if (alive && output && hasFatalError(output)) {
|
|
399
|
+
log.warn(`stuck-alive: ${name} shows fatal API error (regex) — marking ${backend} degraded`);
|
|
400
|
+
registry?.recordFailure(backend);
|
|
401
|
+
try { await agent.stop(); } catch { /* stop can fail */ }
|
|
402
|
+
self.emit('rapid-death', { name, backend, reason: 'stuck-alive' });
|
|
262
403
|
}
|
|
263
404
|
} catch { /* pane check can fail when session is torn down */ }
|
|
264
405
|
}, this.rapidDeathDelay);
|
|
@@ -378,6 +519,7 @@ export class Orchestrator extends EventEmitter {
|
|
|
378
519
|
this.agents.delete(name);
|
|
379
520
|
this._lastActivity.delete(name);
|
|
380
521
|
this._lastPaneHash.delete(name);
|
|
522
|
+
this._classifyFails.delete(name);
|
|
381
523
|
this._removed.add(name);
|
|
382
524
|
this.identityRegistry.release(agent.identity);
|
|
383
525
|
log.info(`agent shut down: ${name}`);
|
|
@@ -424,6 +566,7 @@ export class Orchestrator extends EventEmitter {
|
|
|
424
566
|
this.agents.delete(name);
|
|
425
567
|
this._lastActivity.delete(name);
|
|
426
568
|
this._lastPaneHash.delete(name);
|
|
569
|
+
this._classifyFails.delete(name);
|
|
427
570
|
this._removed.add(name);
|
|
428
571
|
this.identityRegistry.release(agent.identity);
|
|
429
572
|
|
|
@@ -573,7 +716,7 @@ export class Orchestrator extends EventEmitter {
|
|
|
573
716
|
|
|
574
717
|
const agent = await this.factory.create(provider, role, {
|
|
575
718
|
theme,
|
|
576
|
-
model: this.cfg?.get?.('model'),
|
|
719
|
+
model: this.cfg?.get?.(`workflows.${role}.model`) ?? this.cfg?.get?.('model'),
|
|
577
720
|
config: this.cfg,
|
|
578
721
|
context,
|
|
579
722
|
taken: this.takenNames,
|
|
@@ -594,7 +737,14 @@ export class Orchestrator extends EventEmitter {
|
|
|
594
737
|
await this.storage.save(this.sessionId, agent.identity.name, session.toJSON());
|
|
595
738
|
}
|
|
596
739
|
|
|
597
|
-
|
|
740
|
+
try {
|
|
741
|
+
await this.spawn(agent);
|
|
742
|
+
} catch (err) {
|
|
743
|
+
if (this.sessionId && this.storage && agent.identity?.name) {
|
|
744
|
+
try { await this.storage.remove(this.sessionId, agent.identity.name); } catch { /* best-effort */ }
|
|
745
|
+
}
|
|
746
|
+
throw err;
|
|
747
|
+
}
|
|
598
748
|
|
|
599
749
|
if (this.sessionId && this.storage && agent.identity?.name && agent.paneId) {
|
|
600
750
|
const data = await this.storage.load(this.sessionId, agent.identity.name);
|
|
@@ -636,6 +786,7 @@ export class Orchestrator extends EventEmitter {
|
|
|
636
786
|
const prev = this._lastPaneHash.get(name);
|
|
637
787
|
this._lastPaneHash.set(name, digest);
|
|
638
788
|
if (prev && prev !== digest) {
|
|
789
|
+
if (hasFatalError(output)) return false;
|
|
639
790
|
this._lastActivity.set(name, new Date().toISOString());
|
|
640
791
|
return true;
|
|
641
792
|
}
|
|
@@ -643,6 +794,29 @@ export class Orchestrator extends EventEmitter {
|
|
|
643
794
|
} catch { return false; }
|
|
644
795
|
}
|
|
645
796
|
|
|
797
|
+
/**
|
|
798
|
+
* Update the pane hash after an orchestrator-initiated interaction.
|
|
799
|
+
*
|
|
800
|
+
* Must be called after any action that changes the pane content
|
|
801
|
+
* (sending keys, messages, etc.) to prevent `refresh()` from
|
|
802
|
+
* misinterpreting the orchestrator's own output as agent activity
|
|
803
|
+
* on the next monitor cycle. The orchestrator also resets the
|
|
804
|
+
* stall timer here because a remediation attempt should buy the
|
|
805
|
+
* agent time to react before another nudge is sent.
|
|
806
|
+
*
|
|
807
|
+
* @param {string} name - Agent identity name.
|
|
808
|
+
* @param {object} agent - Agent instance with a `capture()` method.
|
|
809
|
+
* @returns {Promise<void>}
|
|
810
|
+
*/
|
|
811
|
+
async _rehash(name, agent) {
|
|
812
|
+
try {
|
|
813
|
+
const content = await agent.capture(50);
|
|
814
|
+
const digest = createHash('md5').update(content ?? '').digest('hex');
|
|
815
|
+
this._lastPaneHash.set(name, digest);
|
|
816
|
+
this._lastActivity.set(name, new Date().toISOString());
|
|
817
|
+
} catch { /* capture can fail if pane died */ }
|
|
818
|
+
}
|
|
819
|
+
|
|
646
820
|
// ── Death Snapshot ──────────────────────────────────
|
|
647
821
|
|
|
648
822
|
/**
|
|
@@ -698,8 +872,9 @@ export class Orchestrator extends EventEmitter {
|
|
|
698
872
|
let alive;
|
|
699
873
|
try {
|
|
700
874
|
alive = await agent.alive();
|
|
701
|
-
} catch {
|
|
702
|
-
|
|
875
|
+
} catch (err) {
|
|
876
|
+
log.warn(`reconcile: alive() threw for ${name}: ${err.message} — treating as dead`);
|
|
877
|
+
alive = false;
|
|
703
878
|
}
|
|
704
879
|
|
|
705
880
|
if (alive) continue;
|
|
@@ -712,6 +887,7 @@ export class Orchestrator extends EventEmitter {
|
|
|
712
887
|
this.agents.delete(name);
|
|
713
888
|
this._lastActivity.delete(name);
|
|
714
889
|
this._lastPaneHash.delete(name);
|
|
890
|
+
this._classifyFails.delete(name);
|
|
715
891
|
this._removed.add(name);
|
|
716
892
|
this.identityRegistry.release(agent.identity);
|
|
717
893
|
|
|
@@ -784,7 +960,6 @@ export class Orchestrator extends EventEmitter {
|
|
|
784
960
|
if (!this.workflows.size) return [];
|
|
785
961
|
|
|
786
962
|
const maxAgents = this.cfg?.get?.('scaling.maxAgents') ?? 8;
|
|
787
|
-
const maxPerRole = this.cfg?.get?.('scaling.maxPerRole') ?? {};
|
|
788
963
|
const maxPerTick = this.cfg?.get?.('scaling.maxPerTick') ?? 2;
|
|
789
964
|
const cooldown = this.cfg?.get?.('scaling.cooldown') ?? 30000;
|
|
790
965
|
|
|
@@ -828,7 +1003,7 @@ export class Orchestrator extends EventEmitter {
|
|
|
828
1003
|
if (signal.deficit <= 0) continue;
|
|
829
1004
|
|
|
830
1005
|
const { role } = signal;
|
|
831
|
-
const roleCap =
|
|
1006
|
+
const roleCap = this.cfg?.get?.(`workflows.${role}.maxAgents`) ?? Infinity;
|
|
832
1007
|
const current = [...this.agents.values()]
|
|
833
1008
|
.filter(function liveRole(a) { return a.role === role && a.state !== 'dormant'; }).length;
|
|
834
1009
|
|
|
@@ -1108,18 +1283,22 @@ export class Orchestrator extends EventEmitter {
|
|
|
1108
1283
|
// ── Monitor (Stall Detection) ─────────────────────────
|
|
1109
1284
|
|
|
1110
1285
|
/**
|
|
1111
|
-
* Start the stall detection monitor with
|
|
1286
|
+
* Start the stall detection monitor with LLM-powered classification.
|
|
1112
1287
|
*
|
|
1113
|
-
*
|
|
1114
|
-
*
|
|
1115
|
-
*
|
|
1288
|
+
* When an agent's pane output has not changed for longer than the
|
|
1289
|
+
* stall timeout, the monitor captures the pane content, classifies
|
|
1290
|
+
* it via `loreli/classify`, and dispatches the appropriate action:
|
|
1116
1291
|
*
|
|
1117
|
-
*
|
|
1118
|
-
*
|
|
1119
|
-
*
|
|
1292
|
+
* - `working` — reset activity timer, leave the agent alone
|
|
1293
|
+
* - `waiting_for_input` — send a continuation prompt
|
|
1294
|
+
 * - `option_dialog` — send the remedy keystrokes from the diagnosis (Enter when none is given)
|
|
1295
|
+
* - `error_loop` — emit 'stall' with diagnostic context
|
|
1296
|
+
* - `idle` — transition the agent to dormant
|
|
1297
|
+
* - `fatal` — kill the agent and mark the backend degraded
|
|
1120
1298
|
*
|
|
1121
|
-
*
|
|
1122
|
-
*
|
|
1299
|
+
* Falls back to regex heuristics when no LLM backend is available.
|
|
1300
|
+
* Consecutive classification failures trigger a safety-net kill
|
|
1301
|
+
* (replaces the old tier 3 fixed-time kill).
|
|
1123
1302
|
*
|
|
1124
1303
|
* @fires Orchestrator#stall
|
|
1125
1304
|
*/
|
|
@@ -1128,24 +1307,23 @@ export class Orchestrator extends EventEmitter {
|
|
|
1128
1307
|
log.info('stall detection monitor started');
|
|
1129
1308
|
|
|
1130
1309
|
const stallTimeout = this.stallTimeout;
|
|
1131
|
-
const
|
|
1310
|
+
const maxClassifyFails = this.cfg?.get?.('classify.maxRetries') ?? 5;
|
|
1132
1311
|
const self = this;
|
|
1133
1312
|
|
|
1313
|
+
/** @type {boolean} Re-entrancy guard for the monitor callback. */
|
|
1314
|
+
this._monitoring = false;
|
|
1315
|
+
|
|
1134
1316
|
this._monitorHandle = setInterval(async function checkStalls() {
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1317
|
+
if (self._monitoring) return;
|
|
1318
|
+
self._monitoring = true;
|
|
1319
|
+
|
|
1320
|
+
try {
|
|
1138
1321
|
await self.reconcile();
|
|
1139
1322
|
|
|
1140
1323
|
const now = Date.now();
|
|
1141
|
-
|
|
1142
|
-
// Snapshot keys to avoid mutation during iteration — Tier 3
|
|
1143
|
-
// calls kill() which deletes from self.agents mid-loop.
|
|
1144
1324
|
const snapshot = [...self.agents.entries()];
|
|
1325
|
+
|
|
1145
1326
|
for (const [name, agent] of snapshot) {
|
|
1146
|
-
// Dormant agents are kept registered so downstream workflows
|
|
1147
|
-
// (e.g. review scan) can still match them by identity. Skip
|
|
1148
|
-
// nudge/warning, but allow Tier 3 kill for eventual cleanup.
|
|
1149
1327
|
if (agent.state === 'dormant') {
|
|
1150
1328
|
const last = self._lastActivity.get(name);
|
|
1151
1329
|
if (!last) continue;
|
|
@@ -1154,6 +1332,7 @@ export class Orchestrator extends EventEmitter {
|
|
|
1154
1332
|
self.agents.delete(name);
|
|
1155
1333
|
self._lastActivity.delete(name);
|
|
1156
1334
|
self._lastPaneHash.delete(name);
|
|
1335
|
+
self._classifyFails.delete(name);
|
|
1157
1336
|
log.info(`stall: cleaned up dormant agent ${name}`);
|
|
1158
1337
|
}
|
|
1159
1338
|
continue;
|
|
@@ -1162,52 +1341,91 @@ export class Orchestrator extends EventEmitter {
|
|
|
1162
1341
|
const last = self._lastActivity.get(name);
|
|
1163
1342
|
if (!last) continue;
|
|
1164
1343
|
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1168
|
-
|
|
1344
|
+
if (await self.refresh(name)) {
|
|
1345
|
+
self._classifyFails.set(name, 0);
|
|
1346
|
+
continue;
|
|
1347
|
+
}
|
|
1169
1348
|
|
|
1170
1349
|
const elapsed = now - new Date(last).getTime();
|
|
1350
|
+
if (elapsed <= stallTimeout) continue;
|
|
1171
1351
|
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
|
|
1352
|
+
// Stall detected — classify the pane content
|
|
1353
|
+
const maxLines = self.cfg?.get?.('classify.maxLines') ?? 100;
|
|
1354
|
+
let result;
|
|
1175
1355
|
|
|
1176
|
-
|
|
1177
|
-
|
|
1178
|
-
|
|
1179
|
-
|
|
1180
|
-
|
|
1181
|
-
|
|
1182
|
-
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
|
|
1189
|
-
|
|
1356
|
+
try {
|
|
1357
|
+
const pane = await agent.capture(maxLines);
|
|
1358
|
+
log.debug(`monitor pane ${name} (${agent.backend}, stale=${Math.round(elapsed / 1000)}s):\n${paneDebug(pane)}`);
|
|
1359
|
+
result = await classify('pane-state', pane, {
|
|
1360
|
+
backends: self.backendRegistry,
|
|
1361
|
+
config: self.cfg,
|
|
1362
|
+
vars: { model: agent.model, backend: agent.backend, role: agent.role }
|
|
1363
|
+
});
|
|
1364
|
+
self._classifyFails.set(name, 0);
|
|
1365
|
+
log.info(`classify ${name}: ${result.category} — ${result.reasoning}`);
|
|
1366
|
+
} catch (err) {
|
|
1367
|
+
const fails = (self._classifyFails.get(name) ?? 0) + 1;
|
|
1368
|
+
self._classifyFails.set(name, fails);
|
|
1369
|
+
log.warn(`classify failed for ${name} (${fails}/${maxClassifyFails}): ${err.message}`);
|
|
1370
|
+
|
|
1371
|
+
if (fails >= maxClassifyFails) {
|
|
1372
|
+
log.error(`agent ${name} unclassifiable after ${fails} attempts — killing as safety net`);
|
|
1373
|
+
self.emit('stall', { name, elapsed, severity: 'critical' });
|
|
1374
|
+
try { await self.kill(name); } catch (e) { log.error(`safety kill failed for ${name}: ${e.message}`); }
|
|
1190
1375
|
}
|
|
1191
|
-
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
|
|
1198
|
-
|
|
1376
|
+
continue;
|
|
1377
|
+
}
|
|
1378
|
+
|
|
1379
|
+
switch (result.category) {
|
|
1380
|
+
case 'working':
|
|
1381
|
+
self._lastActivity.set(name, new Date().toISOString());
|
|
1382
|
+
break;
|
|
1383
|
+
|
|
1384
|
+
case 'waiting_for_input':
|
|
1199
1385
|
try {
|
|
1200
|
-
await agent.send('
|
|
1201
|
-
|
|
1202
|
-
|
|
1203
|
-
|
|
1204
|
-
|
|
1205
|
-
|
|
1206
|
-
|
|
1386
|
+
await agent.send('Please continue working or report your status.');
|
|
1387
|
+
await self._rehash(name, agent);
|
|
1388
|
+
} catch (err) { log.debug(`monitor: send failed for ${name}: ${err.message}`); }
|
|
1389
|
+
self.emit('stall', { name, elapsed, severity: 'nudge', diagnosis: result });
|
|
1390
|
+
break;
|
|
1391
|
+
|
|
1392
|
+
case 'option_dialog': {
|
|
1393
|
+
const keys = remedy(result.remedy);
|
|
1394
|
+
try {
|
|
1395
|
+
const tmux = new Tmux();
|
|
1396
|
+
await tmux.keys(agent.paneId, ...keys);
|
|
1397
|
+
await self._rehash(name, agent);
|
|
1398
|
+
} catch (err) { log.debug(`monitor: keys failed for ${name}: ${err.message}`); }
|
|
1399
|
+
self.emit('stall', { name, elapsed, severity: 'nudge', diagnosis: result });
|
|
1400
|
+
break;
|
|
1207
1401
|
}
|
|
1208
|
-
|
|
1402
|
+
|
|
1403
|
+
case 'error_loop':
|
|
1404
|
+
self.emit('stall', { name, elapsed, severity: 'warning', diagnosis: result });
|
|
1405
|
+
break;
|
|
1406
|
+
|
|
1407
|
+
case 'idle':
|
|
1408
|
+
agent.transition?.('dormant');
|
|
1409
|
+
self.emit('stall', { name, elapsed, severity: 'nudge', diagnosis: result });
|
|
1410
|
+
break;
|
|
1411
|
+
|
|
1412
|
+
case 'fatal':
|
|
1413
|
+
log.error(`agent ${name} hit fatal error — killing`);
|
|
1414
|
+
self.emit('stall', { name, elapsed, severity: 'critical', diagnosis: result });
|
|
1415
|
+
try {
|
|
1416
|
+
await self.kill(name);
|
|
1417
|
+
self.backendRegistry?.recordFailure?.(agent.backend);
|
|
1418
|
+
} catch (err) { log.error(`fatal kill failed for ${name}: ${err.message}`); }
|
|
1419
|
+
break;
|
|
1420
|
+
|
|
1421
|
+
default:
|
|
1422
|
+
log.warn(`classify ${name}: unknown category "${result.category}"`);
|
|
1423
|
+
break;
|
|
1209
1424
|
}
|
|
1210
1425
|
}
|
|
1426
|
+
} finally {
|
|
1427
|
+
self._monitoring = false;
|
|
1428
|
+
}
|
|
1211
1429
|
}, Math.min(stallTimeout / 2, 60000));
|
|
1212
1430
|
|
|
1213
1431
|
this._monitorHandle.unref();
|
|
@@ -1223,4 +1441,52 @@ export class Orchestrator extends EventEmitter {
|
|
|
1223
1441
|
log.info('stall detection monitor stopped');
|
|
1224
1442
|
}
|
|
1225
1443
|
}
|
|
1444
|
+
|
|
1445
|
+
// ── Halt (Full System Stop) ──────────────────────────
|
|
1446
|
+
|
|
1447
|
+
/**
|
|
1448
|
+
* Stop the entire orchestrator: reactor loop, stall monitor, and
|
|
1449
|
+
* all registered agents. The MCP server process stays alive so the
|
|
1450
|
+
* user can call `start` again to resume.
|
|
1451
|
+
*
|
|
1452
|
+
* Composes {@link unwatch}, {@link stopMonitor}, and {@link kill}
|
|
1453
|
+
* into a single atomic operation. Idempotent — safe to call when
|
|
1454
|
+
* already halted.
|
|
1455
|
+
*
|
|
1456
|
+
* @returns {Promise<{reactor: boolean, monitor: boolean, agents: string[]}>}
|
|
1457
|
+
* Summary of what was stopped.
|
|
1458
|
+
* @fires Orchestrator#halted
|
|
1459
|
+
*/
|
|
1460
|
+
async halt() {
|
|
1461
|
+
const reactor = Boolean(this._watchHandle);
|
|
1462
|
+
const monitor = Boolean(this._monitorHandle);
|
|
1463
|
+
|
|
1464
|
+
this.unwatch();
|
|
1465
|
+
this.stopMonitor();
|
|
1466
|
+
|
|
1467
|
+
const killed = [];
|
|
1468
|
+
const entries = [...this.agents.entries()];
|
|
1469
|
+
|
|
1470
|
+
for (const [name] of entries) {
|
|
1471
|
+
try {
|
|
1472
|
+
await this.kill(name);
|
|
1473
|
+
killed.push(name);
|
|
1474
|
+
} catch (err) {
|
|
1475
|
+
log.warn(`halt: failed to kill ${name}: ${err.message}`);
|
|
1476
|
+
}
|
|
1477
|
+
}
|
|
1478
|
+
|
|
1479
|
+
log.info(`halt: reactor=${reactor} monitor=${monitor} agents=${killed.length}`);
|
|
1480
|
+
|
|
1481
|
+
/**
|
|
1482
|
+
* @event Orchestrator#halted
|
|
1483
|
+
* @type {object}
|
|
1484
|
+
* @property {boolean} reactor - Whether the reactor was running.
|
|
1485
|
+
* @property {boolean} monitor - Whether the monitor was running.
|
|
1486
|
+
* @property {string[]} agents - Names of agents that were killed.
|
|
1487
|
+
*/
|
|
1488
|
+
this.emit('halted', { reactor, monitor, agents: killed });
|
|
1489
|
+
|
|
1490
|
+
return { reactor, monitor, agents: killed };
|
|
1491
|
+
}
|
|
1226
1492
|
}
|