@ouro.bot/cli 0.1.0-alpha.600 → 0.1.0-alpha.602
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/changelog.json
CHANGED
|
@@ -1,6 +1,18 @@
|
|
|
1
1
|
{
|
|
2
2
|
"_note": "This changelog is maintained as part of the PR/version-bump workflow. Agent-curated, not auto-generated. Agents read this file directly via read_file to understand what changed between versions.",
|
|
3
3
|
"versions": [
|
|
4
|
+
{
|
|
5
|
+
"version": "0.1.0-alpha.602",
|
|
6
|
+
"changes": [
|
|
7
|
+
"Two-part fix for the 2026-05-11 BlueBubbles wedge that drove the user (Ari) up the wall: Slugger's BB session showed the same user message replayed 76 times, because each death-spiral cycle re-injected the inbound. Root cause was the daemon's HTTP health probe (`createHttpHealthProbe(\"bluebubbles:<agent>\", port)`) GETting the sense's /health endpoint every ~60 s with a 5 s timeout — busy BB sense (e.g. VLM image-describe at 20+ s) timed out, daemon declared 'critical', SIGTERM'd the sense mid-work, respawned, hit the same image, killed again, forever. Part 1: removed the HTTP probe entirely from `listHealthProbes()`. Process supervision (`processManager` child-process exit handler) already catches dead processes; for 'alive but hung' we now rely on the agent's own awareness via `pendingRecoveryCount` / `lastRecoveredAt` in the BB runtime state surfaced into the prompt, plus the agent's new `restart_runtime` tool (from alpha.598 / #723). Part 2: defense-in-depth respawn-loop guard in `processManager.restartAgent` — if anything triggers more than `RESPAWN_GUARD_MAX_RESTARTS = 5` orchestrated restarts in `RESPAWN_GUARD_WINDOW_MS = 10 min`, refuse further restarts (`daemon.agent_respawn_loop_tripped` nerves event, errorReason + fixHint set on the snapshot). Trip self-clears once timestamps age out of the window, and `startAgent` (= `ouro up`) bypasses the guard so the operator can always recover. Even if some other future cause re-introduces a tight respawn loop, the guard bounds it. The 2026-05-11 spiral was ~60 restarts/hr — well above 5/10min, so this would have caught it."
|
|
8
|
+
]
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"version": "0.1.0-alpha.601",
|
|
12
|
+
"changes": [
|
|
13
|
+
"Mailbox tab loading hang: listMailOutbound now downloads blobs in parallel (MESSAGE_LIST_SCAN_CONCURRENCY=32) instead of sequentially, and accepts an optional limit. The Mailbox tab passes limit=50 so the UI no longer blocks on a full-history scan for agents with many outbound records."
|
|
14
|
+
]
|
|
15
|
+
},
|
|
4
16
|
{
|
|
5
17
|
"version": "0.1.0-alpha.600",
|
|
6
18
|
"changes": [
|
|
@@ -33,7 +33,7 @@ var __importStar = (this && this.__importStar) || (function () {
|
|
|
33
33
|
};
|
|
34
34
|
})();
|
|
35
35
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
-
exports.DaemonProcessManager = void 0;
|
|
36
|
+
exports.DaemonProcessManager = exports.RESPAWN_GUARD_WINDOW_MS = exports.RESPAWN_GUARD_MAX_RESTARTS = void 0;
|
|
37
37
|
const child_process_1 = require("child_process");
|
|
38
38
|
const path = __importStar(require("path"));
|
|
39
39
|
const identity_1 = require("../identity");
|
|
@@ -41,6 +41,18 @@ const runtime_1 = require("../../nerves/runtime");
|
|
|
41
41
|
function startOfHour(ms) {
|
|
42
42
|
return ms - 60 * 60 * 1000;
|
|
43
43
|
}
|
|
44
|
+
/**
|
|
45
|
+
* Respawn-loop guard: refuse `restartAgent` if we've already orchestrated
|
|
46
|
+
* RESPAWN_GUARD_MAX_RESTARTS restarts in the past RESPAWN_GUARD_WINDOW_MS.
|
|
47
|
+
*
|
|
48
|
+
* Calibrated for the 2026-05-11 BB sense incident: a misconfigured probe
|
|
49
|
+
* was triggering `restartAgent` every ~60s for hours. Five restarts in
|
|
50
|
+
* 10 minutes is well above the rate of legitimate operational restarts
|
|
51
|
+
* (a single human-initiated `ouro down && ouro up` produces one) and well
|
|
52
|
+
* below the rate of a death spiral (60/hr ⇒ 10/10min).
|
|
53
|
+
*/
|
|
54
|
+
exports.RESPAWN_GUARD_MAX_RESTARTS = 5;
|
|
55
|
+
exports.RESPAWN_GUARD_WINDOW_MS = 10 * 60_000;
|
|
44
56
|
class DaemonProcessManager {
|
|
45
57
|
agents = new Map();
|
|
46
58
|
maxRestartsPerHour;
|
|
@@ -149,6 +161,8 @@ class DaemonProcessManager {
|
|
|
149
161
|
startAttemptId: 0,
|
|
150
162
|
restartTimer: null,
|
|
151
163
|
crashTimestamps: [],
|
|
164
|
+
orchestratedRestartTimestamps: [],
|
|
165
|
+
respawnLoopTripped: false,
|
|
152
166
|
stopRequested: false,
|
|
153
167
|
cooldownTimer: null,
|
|
154
168
|
cooldownRetryCount: 0,
|
|
@@ -370,6 +384,11 @@ class DaemonProcessManager {
|
|
|
370
384
|
this.clearRestartTimer(state);
|
|
371
385
|
this.clearCooldownTimer(state);
|
|
372
386
|
state.stopRequested = true;
|
|
387
|
+
// NOTE: do not touch state.respawnLoopTripped / orchestratedRestartTimestamps
|
|
388
|
+
// here. restartAgent calls stopAgent internally; clearing the guard here
|
|
389
|
+
// would reset the window every cycle and defeat the loop-detection. The
|
|
390
|
+
// guard self-clears when timestamps age out of the window (handled inside
|
|
391
|
+
// restartAgent at the prune step).
|
|
373
392
|
if (!state.process) {
|
|
374
393
|
state.snapshot.status = "stopped";
|
|
375
394
|
state.snapshot.pid = null;
|
|
@@ -396,6 +415,61 @@ class DaemonProcessManager {
|
|
|
396
415
|
}
|
|
397
416
|
async restartAgent(agent) {
|
|
398
417
|
const state = this.requireAgent(agent);
|
|
418
|
+
// Respawn-loop guard: prune timestamps outside the window, then check
|
|
419
|
+
// whether we've already restarted this agent too many times in it.
|
|
420
|
+
const now = this.now();
|
|
421
|
+
const windowStart = now - exports.RESPAWN_GUARD_WINDOW_MS;
|
|
422
|
+
state.orchestratedRestartTimestamps = state.orchestratedRestartTimestamps.filter((ts) => ts >= windowStart);
|
|
423
|
+
// If the window is now empty, the trip naturally self-clears. That means
|
|
424
|
+
// after RESPAWN_GUARD_WINDOW_MS with no permitted restarts (blocked attempts are not recorded), the daemon is
|
|
425
|
+
// willing to try again (e.g. for a fresh health probe failure that has
|
|
426
|
+
// nothing to do with the original loop).
|
|
427
|
+
if (state.respawnLoopTripped && state.orchestratedRestartTimestamps.length === 0) {
|
|
428
|
+
state.respawnLoopTripped = false;
|
|
429
|
+
state.snapshot.errorReason = null;
|
|
430
|
+
state.snapshot.fixHint = null;
|
|
431
|
+
(0, runtime_1.emitNervesEvent)({
|
|
432
|
+
component: "daemon",
|
|
433
|
+
event: "daemon.agent_respawn_loop_cleared",
|
|
434
|
+
message: "respawn-loop guard cleared by window-aging",
|
|
435
|
+
meta: { agent, windowMs: exports.RESPAWN_GUARD_WINDOW_MS },
|
|
436
|
+
});
|
|
437
|
+
this.notifySnapshotChange(state.snapshot);
|
|
438
|
+
}
|
|
439
|
+
if (state.respawnLoopTripped) {
|
|
440
|
+
(0, runtime_1.emitNervesEvent)({
|
|
441
|
+
level: "error",
|
|
442
|
+
component: "daemon",
|
|
443
|
+
event: "daemon.agent_respawn_loop_blocked",
|
|
444
|
+
message: "refused agent restart — respawn-loop guard tripped; manual intervention required",
|
|
445
|
+
meta: {
|
|
446
|
+
agent,
|
|
447
|
+
recentRestartCount: state.orchestratedRestartTimestamps.length,
|
|
448
|
+
windowMs: exports.RESPAWN_GUARD_WINDOW_MS,
|
|
449
|
+
},
|
|
450
|
+
});
|
|
451
|
+
return;
|
|
452
|
+
}
|
|
453
|
+
if (state.orchestratedRestartTimestamps.length >= exports.RESPAWN_GUARD_MAX_RESTARTS) {
|
|
454
|
+
state.respawnLoopTripped = true;
|
|
455
|
+
state.snapshot.errorReason = `respawn loop detected: ${exports.RESPAWN_GUARD_MAX_RESTARTS}+ restarts in ${Math.round(exports.RESPAWN_GUARD_WINDOW_MS / 60_000)}min — refusing further restarts`;
|
|
456
|
+
state.snapshot.fixHint = "investigate the root cause then run `ouro up` to resume";
|
|
457
|
+
(0, runtime_1.emitNervesEvent)({
|
|
458
|
+
level: "error",
|
|
459
|
+
component: "daemon",
|
|
460
|
+
event: "daemon.agent_respawn_loop_tripped",
|
|
461
|
+
message: "respawn-loop guard tripped; further restarts blocked",
|
|
462
|
+
meta: {
|
|
463
|
+
agent,
|
|
464
|
+
restartCount: state.orchestratedRestartTimestamps.length,
|
|
465
|
+
windowMs: exports.RESPAWN_GUARD_WINDOW_MS,
|
|
466
|
+
maxRestarts: exports.RESPAWN_GUARD_MAX_RESTARTS,
|
|
467
|
+
},
|
|
468
|
+
});
|
|
469
|
+
this.notifySnapshotChange(state.snapshot);
|
|
470
|
+
return;
|
|
471
|
+
}
|
|
472
|
+
state.orchestratedRestartTimestamps.push(now);
|
|
399
473
|
if (state.startInFlight && !state.process) {
|
|
400
474
|
const startedAt = state.startAttemptedAtMs;
|
|
401
475
|
/* v8 ignore next -- defensive: startInFlight always records a start timestamp @preserve */
|
|
@@ -44,7 +44,6 @@ const provider_credentials_1 = require("../provider-credentials");
|
|
|
44
44
|
const sense_truth_1 = require("../sense-truth");
|
|
45
45
|
const machine_identity_1 = require("../machine-identity");
|
|
46
46
|
const process_manager_1 = require("./process-manager");
|
|
47
|
-
const http_health_probe_1 = require("./http-health-probe");
|
|
48
47
|
const DEFAULT_TEAMS_PORT = 3978;
|
|
49
48
|
const DEFAULT_BLUEBUBBLES_PORT = 18790;
|
|
50
49
|
const DEFAULT_BLUEBUBBLES_WEBHOOK_PATH = "/bluebubbles-webhook";
|
|
@@ -631,13 +630,33 @@ class DaemonSenseManager {
|
|
|
631
630
|
if (!context.senses.bluebubbles.enabled || !context.facts.bluebubbles.configured || !machineRuntimeConfig.ok) {
|
|
632
631
|
continue;
|
|
633
632
|
}
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
633
|
+
// DELIBERATELY no HTTP health probe for BlueBubbles.
|
|
634
|
+
//
|
|
635
|
+
// We used to register `createHttpHealthProbe(...)` here, which GETs the
|
|
636
|
+
// sense's /health endpoint every ~60s with a 5s timeout. On 2026-05-11
|
|
637
|
+
// that caused a death spiral:
|
|
638
|
+
// 1. Sense gets busy with real work (e.g. VLM image describe → 20+s)
|
|
639
|
+
// 2. /health probe times out at 5s
|
|
640
|
+
// 3. Daemon declares the sense "critical" → SIGTERMs it mid-work
|
|
641
|
+
// 4. Sense respawns, recovery loop replays the same inbound message
|
|
642
|
+
// into the agent's BB session (visible side-effect — slugger saw
|
|
643
|
+
// the same user text injected 76 times)
|
|
644
|
+
// 5. New sense hits the same VLM call, gets killed at 5s, repeat
|
|
645
|
+
//
|
|
646
|
+
// The probe was redundant supervision: dead processes are already
|
|
647
|
+
// caught by `processManager`'s child-process exit handler. The
|
|
648
|
+
// probe specifically caught "alive but hung" cases — but the cost
|
|
649
|
+
// (killing genuinely-busy processes and replaying messages) far
|
|
650
|
+
// outweighed the benefit. For "alive but hung" detection we now
|
|
651
|
+
// rely on the agent's own awareness: BB sense's runtime.json carries
|
|
652
|
+
// pendingRecoveryCount + lastRecoveredAt, surfaced in the agent
|
|
653
|
+
// prompt. If recovery has been wedged for too long, the agent can
|
|
654
|
+
// call `restart_runtime` itself (see alpha.598 / PR #723).
|
|
655
|
+
//
|
|
656
|
+
// The respawn-loop guard in processManager is the backstop: even if
|
|
657
|
+
// something else triggers a tight respawn cycle for any reason, the
|
|
658
|
+
// guard fires and refuses further restarts after N attempts in M
|
|
659
|
+
// minutes, so we can never re-enter the 2026-05-11 spiral.
|
|
641
660
|
}
|
|
642
661
|
return probes;
|
|
643
662
|
}
|
|
@@ -8,6 +8,10 @@ const reader_1 = require("../../../mailroom/reader");
|
|
|
8
8
|
const core_1 = require("../../../mailroom/core");
|
|
9
9
|
const MAILBOX_MAIL_LIST_LIMIT = 50;
|
|
10
10
|
const MAILBOX_MAIL_SUMMARY_LIMIT = MAILBOX_MAIL_LIST_LIMIT;
|
|
11
|
+
// Cap the outbound list returned to the UI. The Mailbox tab renders a
|
|
12
|
+
// bounded recent-outbound view; surfacing the entire outbound history is
|
|
13
|
+
// what makes the tab hang on a loading spinner for large mailboxes.
|
|
14
|
+
const MAILBOX_MAIL_OUTBOUND_LIMIT = 50;
|
|
11
15
|
const MAILBOX_MAIL_BODY_LIMIT = 12_000;
|
|
12
16
|
function emptyFolders() {
|
|
13
17
|
return [
|
|
@@ -272,7 +276,11 @@ async function readMailView(agentName) {
|
|
|
272
276
|
const summaries = result.decrypted.map(mailSummary);
|
|
273
277
|
const screener = (await resolved.store.listScreenerCandidates({ agentId: agentName, status: "pending", limit: 100 }))
|
|
274
278
|
.map(screenerCandidate);
|
|
275
|
-
|
|
279
|
+
// Cap outbound records returned to the UI. The mailbox tab shows a
|
|
280
|
+
// bounded recent-outbound view; pulling the entire history blocks the
|
|
281
|
+
// Mailbox tab on a loading spinner while the blob/file store grinds
|
|
282
|
+
// through every outbound record the agent has ever written.
|
|
283
|
+
const outbound = (await resolved.store.listMailOutbound(agentName, { limit: MAILBOX_MAIL_OUTBOUND_LIMIT })).map(outboundRecord);
|
|
276
284
|
await resolved.store.recordAccess({
|
|
277
285
|
agentId: agentName,
|
|
278
286
|
tool: "mailbox_mail_list",
|
|
@@ -642,24 +642,36 @@ class AzureBlobMailroomStore {
|
|
|
642
642
|
});
|
|
643
643
|
return record;
|
|
644
644
|
}
|
|
645
|
-
async listMailOutbound(agentId) {
|
|
645
|
+
async listMailOutbound(agentId, options = {}) {
|
|
646
646
|
await this.ensureContainer();
|
|
647
|
-
|
|
647
|
+
// Stream blob NAMES first (no JSON downloads yet), then fan out the
|
|
648
|
+
// downloads with bounded concurrency. The previous loop awaited each
|
|
649
|
+
// download sequentially inside the iterator, which made every Mailbox
|
|
650
|
+
// tab open scale linearly with the total number of outbound records in
|
|
651
|
+
// the container — for an agent with many drafts/sent/failed entries,
|
|
652
|
+
// the UI sat on a loading spinner for minutes (or never finished) while
|
|
653
|
+
// the daemon ground through O(N) sequential network round-trips.
|
|
654
|
+
const outboundBlobNames = [];
|
|
648
655
|
for await (const item of this.container.listBlobsFlat({ prefix: "outbound/" })) {
|
|
649
|
-
|
|
650
|
-
if (record)
|
|
651
|
-
records.push(record);
|
|
656
|
+
outboundBlobNames.push(item.name);
|
|
652
657
|
}
|
|
653
|
-
const
|
|
658
|
+
const downloaded = await mapWithConcurrency(outboundBlobNames, MESSAGE_LIST_SCAN_CONCURRENCY, async (name) => {
|
|
659
|
+
return downloadJson(this.container.getBlockBlobClient(name), this.blobOperationTimeoutMs);
|
|
660
|
+
});
|
|
661
|
+
const filtered = downloaded
|
|
662
|
+
.filter((record) => record !== null)
|
|
654
663
|
.filter((record) => record.agentId === agentId)
|
|
655
664
|
.sort((left, right) => right.updatedAt.localeCompare(left.updatedAt));
|
|
665
|
+
const limited = typeof options.limit === "number" && options.limit > 0
|
|
666
|
+
? filtered.slice(0, options.limit)
|
|
667
|
+
: filtered;
|
|
656
668
|
(0, runtime_1.emitNervesEvent)({
|
|
657
669
|
component: "senses",
|
|
658
670
|
event: "senses.mail_blob_outbound_records_listed",
|
|
659
671
|
message: "azure blob mail outbound records listed",
|
|
660
|
-
meta: { agentId, count:
|
|
672
|
+
meta: { agentId, count: limited.length, scanned: outboundBlobNames.length },
|
|
661
673
|
});
|
|
662
|
-
return
|
|
674
|
+
return limited;
|
|
663
675
|
}
|
|
664
676
|
async recordAccess(entry) {
|
|
665
677
|
await this.ensureContainer();
|
|
@@ -327,20 +327,23 @@ class FileMailroomStore {
|
|
|
327
327
|
});
|
|
328
328
|
return record;
|
|
329
329
|
}
|
|
330
|
-
async listMailOutbound(agentId) {
|
|
330
|
+
async listMailOutbound(agentId, options = {}) {
|
|
331
331
|
const records = fs.readdirSync(this.outboundDir)
|
|
332
332
|
.filter((name) => name.endsWith(".json"))
|
|
333
333
|
.map((name) => readJson(path.join(this.outboundDir, name)))
|
|
334
334
|
.filter((record) => record !== null)
|
|
335
335
|
.filter((record) => record.agentId === agentId)
|
|
336
336
|
.sort((left, right) => right.updatedAt.localeCompare(left.updatedAt));
|
|
337
|
+
const limited = typeof options.limit === "number" && options.limit > 0
|
|
338
|
+
? records.slice(0, options.limit)
|
|
339
|
+
: records;
|
|
337
340
|
(0, runtime_1.emitNervesEvent)({
|
|
338
341
|
component: "senses",
|
|
339
342
|
event: "senses.mail_outbound_records_listed",
|
|
340
343
|
message: "mail outbound records listed",
|
|
341
|
-
meta: { agentId, count: records.length },
|
|
344
|
+
meta: { agentId, count: limited.length, scanned: records.length },
|
|
342
345
|
});
|
|
343
|
-
return
|
|
346
|
+
return limited;
|
|
344
347
|
}
|
|
345
348
|
async recordAccess(entry) {
|
|
346
349
|
const complete = {
|