@ouro.bot/cli 0.1.0-alpha.601 → 0.1.0-alpha.604

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/changelog.json CHANGED
@@ -1,6 +1,18 @@
1
1
  {
2
2
  "_note": "This changelog is maintained as part of the PR/version-bump workflow. Agent-curated, not auto-generated. Agents read this file directly via read_file to understand what changed between versions.",
3
3
  "versions": [
4
+ {
5
+ "version": "0.1.0-alpha.604",
6
+ "changes": [
7
+ "Inner tab in mailbox UI surfaces the return-obligation queue (queuedCount/runningCount/oldestActiveAt) so the panel stops claiming 'No pending inner work' when dozens of held items are actively being re-injected."
8
+ ]
9
+ },
10
+ {
11
+ "version": "0.1.0-alpha.602",
12
+ "changes": [
13
+ "Two-part fix for the 2026-05-11 BlueBubbles wedge that drove the user (Ari) up the wall: Slugger's BB session showed the same user message replayed 76 times, because each death-spiral cycle re-injected the inbound. Root cause was the daemon's HTTP health probe (`createHttpHealthProbe(\"bluebubbles:<agent>\", port)`) GETting the sense's /health endpoint every ~60 s with a 5 s timeout — busy BB sense (e.g. VLM image-describe at 20+ s) timed out, daemon declared 'critical', SIGTERM'd the sense mid-work, respawned, hit the same image, killed again, forever. Part 1: removed the HTTP probe entirely from `listHealthProbes()`. Process supervision (`processManager` child-process exit handler) already catches dead processes; for 'alive but hung' we now rely on the agent's own awareness via `pendingRecoveryCount` / `lastRecoveredAt` in the BB runtime state surfaced into the prompt, plus the agent's new `restart_runtime` tool (from alpha.598 / #723). Part 2: defense-in-depth respawn-loop guard in `processManager.restartAgent` — if anything triggers more than `RESPAWN_GUARD_MAX_RESTARTS = 5` orchestrated restarts in `RESPAWN_GUARD_WINDOW_MS = 10 min`, refuse further restarts (`daemon.agent_respawn_loop_tripped` nerves event, errorReason + fixHint set on the snapshot). Trip self-clears once timestamps age out of the window, and `startAgent` (= `ouro up`) bypasses the guard so the operator can always recover. Even if some other future cause re-introduces a tight respawn loop, the guard bounds it. The 2026-05-11 spiral was ~60 restarts/hr — well above 5/10min, so this would have caught it."
14
+ ]
15
+ },
4
16
  {
5
17
  "version": "0.1.0-alpha.601",
6
18
  "changes": [
@@ -33,7 +33,7 @@ var __importStar = (this && this.__importStar) || (function () {
33
33
  };
34
34
  })();
35
35
  Object.defineProperty(exports, "__esModule", { value: true });
36
- exports.DaemonProcessManager = void 0;
36
+ exports.DaemonProcessManager = exports.RESPAWN_GUARD_WINDOW_MS = exports.RESPAWN_GUARD_MAX_RESTARTS = void 0;
37
37
  const child_process_1 = require("child_process");
38
38
  const path = __importStar(require("path"));
39
39
  const identity_1 = require("../identity");
@@ -41,6 +41,18 @@ const runtime_1 = require("../../nerves/runtime");
41
41
  function startOfHour(ms) {
42
42
  return ms - 60 * 60 * 1000;
43
43
  }
44
+ /**
45
+ * Respawn-loop guard: refuse `restartAgent` if we've already orchestrated
46
+ * RESPAWN_GUARD_MAX_RESTARTS in the past RESPAWN_GUARD_WINDOW_MS.
47
+ *
48
+ * Calibrated for the 2026-05-11 BB sense incident: a misconfigured probe
49
+ * was triggering `restartAgent` every ~60s for hours. Five restarts in
50
+ * 10 minutes is well above the rate of legitimate operational restarts
51
+ * (a single human-initiated `ouro down && ouro up` produces one) and well
52
+ * below the rate of a death spiral (60/hr ⇒ 10/10min).
53
+ */
54
+ exports.RESPAWN_GUARD_MAX_RESTARTS = 5;
55
+ exports.RESPAWN_GUARD_WINDOW_MS = 10 * 60_000;
44
56
  class DaemonProcessManager {
45
57
  agents = new Map();
46
58
  maxRestartsPerHour;
@@ -149,6 +161,8 @@ class DaemonProcessManager {
149
161
  startAttemptId: 0,
150
162
  restartTimer: null,
151
163
  crashTimestamps: [],
164
+ orchestratedRestartTimestamps: [],
165
+ respawnLoopTripped: false,
152
166
  stopRequested: false,
153
167
  cooldownTimer: null,
154
168
  cooldownRetryCount: 0,
@@ -370,6 +384,11 @@ class DaemonProcessManager {
370
384
  this.clearRestartTimer(state);
371
385
  this.clearCooldownTimer(state);
372
386
  state.stopRequested = true;
387
+ // NOTE: do not touch state.respawnLoopTripped / orchestratedRestartTimestamps
388
+ // here. restartAgent calls stopAgent internally; clearing the guard here
389
+ // would reset the window every cycle and defeat the loop-detection. The
390
+ // guard self-clears when timestamps age out of the window (handled inside
391
+ // restartAgent at the prune step).
373
392
  if (!state.process) {
374
393
  state.snapshot.status = "stopped";
375
394
  state.snapshot.pid = null;
@@ -396,6 +415,61 @@ class DaemonProcessManager {
396
415
  }
397
416
  async restartAgent(agent) {
398
417
  const state = this.requireAgent(agent);
418
+ // Respawn-loop guard: prune timestamps outside the window, then check
419
+ // whether we've already restarted this agent too many times in it.
420
+ const now = this.now();
421
+ const windowStart = now - exports.RESPAWN_GUARD_WINDOW_MS;
422
+ state.orchestratedRestartTimestamps = state.orchestratedRestartTimestamps.filter((ts) => ts >= windowStart);
423
+ // If the window is now empty, the trip naturally self-clears. Blocked
424
+ // attempts are not recorded, so this happens once the last accepted
425
+ // restart is older than RESPAWN_GUARD_WINDOW_MS; the daemon is then willing
426
+ // to try again (e.g. for a fresh health probe failure unrelated to the loop).
427
+ if (state.respawnLoopTripped && state.orchestratedRestartTimestamps.length === 0) {
428
+ state.respawnLoopTripped = false;
429
+ state.snapshot.errorReason = null;
430
+ state.snapshot.fixHint = null;
431
+ (0, runtime_1.emitNervesEvent)({
432
+ component: "daemon",
433
+ event: "daemon.agent_respawn_loop_cleared",
434
+ message: "respawn-loop guard cleared by window-aging",
435
+ meta: { agent, windowMs: exports.RESPAWN_GUARD_WINDOW_MS },
436
+ });
437
+ this.notifySnapshotChange(state.snapshot);
438
+ }
439
+ if (state.respawnLoopTripped) {
440
+ (0, runtime_1.emitNervesEvent)({
441
+ level: "error",
442
+ component: "daemon",
443
+ event: "daemon.agent_respawn_loop_blocked",
444
+ message: "refused agent restart — respawn-loop guard tripped; manual intervention required",
445
+ meta: {
446
+ agent,
447
+ recentRestartCount: state.orchestratedRestartTimestamps.length,
448
+ windowMs: exports.RESPAWN_GUARD_WINDOW_MS,
449
+ },
450
+ });
451
+ return;
452
+ }
453
+ if (state.orchestratedRestartTimestamps.length >= exports.RESPAWN_GUARD_MAX_RESTARTS) {
454
+ state.respawnLoopTripped = true;
455
+ state.snapshot.errorReason = `respawn loop detected: ${exports.RESPAWN_GUARD_MAX_RESTARTS}+ restarts in ${Math.round(exports.RESPAWN_GUARD_WINDOW_MS / 60_000)}min — refusing further restarts`;
456
+ state.snapshot.fixHint = "investigate the root cause then run `ouro up` to resume";
457
+ (0, runtime_1.emitNervesEvent)({
458
+ level: "error",
459
+ component: "daemon",
460
+ event: "daemon.agent_respawn_loop_tripped",
461
+ message: "respawn-loop guard tripped; further restarts blocked",
462
+ meta: {
463
+ agent,
464
+ restartCount: state.orchestratedRestartTimestamps.length,
465
+ windowMs: exports.RESPAWN_GUARD_WINDOW_MS,
466
+ maxRestarts: exports.RESPAWN_GUARD_MAX_RESTARTS,
467
+ },
468
+ });
469
+ this.notifySnapshotChange(state.snapshot);
470
+ return;
471
+ }
472
+ state.orchestratedRestartTimestamps.push(now);
399
473
  if (state.startInFlight && !state.process) {
400
474
  const startedAt = state.startAttemptedAtMs;
401
475
  /* v8 ignore next -- defensive: startInFlight always records a start timestamp @preserve */
@@ -44,7 +44,6 @@ const provider_credentials_1 = require("../provider-credentials");
44
44
  const sense_truth_1 = require("../sense-truth");
45
45
  const machine_identity_1 = require("../machine-identity");
46
46
  const process_manager_1 = require("./process-manager");
47
- const http_health_probe_1 = require("./http-health-probe");
48
47
  const DEFAULT_TEAMS_PORT = 3978;
49
48
  const DEFAULT_BLUEBUBBLES_PORT = 18790;
50
49
  const DEFAULT_BLUEBUBBLES_WEBHOOK_PATH = "/bluebubbles-webhook";
@@ -631,13 +630,33 @@ class DaemonSenseManager {
631
630
  if (!context.senses.bluebubbles.enabled || !context.facts.bluebubbles.configured || !machineRuntimeConfig.ok) {
632
631
  continue;
633
632
  }
634
- const machinePayload = machineRuntimeConfig.config;
635
- const bluebubblesChannel = machinePayload.bluebubblesChannel;
636
- const port = numberField(bluebubblesChannel, "port", DEFAULT_BLUEBUBBLES_PORT);
637
- probes.push({
638
- ...(0, http_health_probe_1.createHttpHealthProbe)(`bluebubbles:${agent}`, port),
639
- managedName: `${agent}:bluebubbles`,
640
- });
633
+ // DELIBERATELY no HTTP health probe for BlueBubbles.
634
+ //
635
+ // We used to register `createHttpHealthProbe(...)` here, which GETs the
636
+ // sense's /health endpoint every ~60s with a 5s timeout. On 2026-05-11
637
+ // that caused a death spiral:
638
+ // 1. Sense gets busy with real work (e.g. VLM image describe → 20+s)
639
+ // 2. /health probe times out at 5s
640
+ // 3. Daemon declares the sense "critical" → SIGTERMs it mid-work
641
+ // 4. Sense respawns, recovery loop replays the same inbound message
642
+ // into the agent's BB session (visible side-effect — slugger saw
643
+ // the same user text injected 76 times)
644
+ // 5. New sense hits the same VLM call, gets killed at 5s, repeat
645
+ //
646
+ // The probe was redundant supervision: dead processes are already
647
+ // recaptured by `processManager`'s child-process exit handler. The
648
+ // probe specifically caught "alive but hung" cases — but the cost
649
+ // (killing genuinely-busy processes and replaying messages) far
650
+ // outweighed the benefit. For "alive but hung" detection we now
651
+ // rely on the agent's own awareness: BB sense's runtime.json carries
652
+ // pendingRecoveryCount + lastRecoveredAt, surfaced in the agent
653
+ // prompt. If recovery has been wedged for too long, the agent can
654
+ // call `restart_runtime` itself (see alpha.598 / PR #723).
655
+ //
656
+ // The respawn-loop guard in processManager is the backstop: even if
657
+ // something else triggers a tight respawn cycle for any reason, the
658
+ // guard fires and refuses further restarts after N attempts in M
659
+ // minutes, so we can never re-enter the 2026-05-11 spiral.
641
660
  }
642
661
  return probes;
643
662
  }
@@ -117,6 +117,7 @@ function buildInnerView(inner, viewer) {
117
117
  hasPending: inner.hasPending,
118
118
  origin: inner.origin,
119
119
  obligationStatus: inner.obligationStatus,
120
+ returnObligationQueue: inner.returnObligationQueue,
120
121
  };
121
122
  }
122
123
  return {
@@ -124,6 +125,7 @@ function buildInnerView(inner, viewer) {
124
125
  status: inner.status,
125
126
  summary: inner.surfacedSummary,
126
127
  hasPending: inner.hasPending,
128
+ returnObligationQueue: inner.returnObligationQueue,
127
129
  };
128
130
  }
129
131
  function buildRecentActivity(agent) {
@@ -186,6 +186,12 @@ function readInnerSummary(agentRoot) {
186
186
  ?? runtimeState?.startedAt
187
187
  ?? runtimeState?.lastCompletedAt
188
188
  ?? null;
189
+ // Read the return-obligation queue so the Inner tab can show what the
190
+ // agent is actually holding right now. Before this, the "Inner work"
191
+ // panel only consulted the pending-messages dir (inbox-style); it
192
+ // reported "No pending inner work" even when dozens of held items were
193
+ // sitting in arc/obligations/inner/ waiting to be reinjected next turn.
194
+ const returnObligationQueue = readReturnObligationQueueSummary(agentRoot);
189
195
  return {
190
196
  summary: {
191
197
  visibility: mailbox_types_1.MAILBOX_DEFAULT_INNER_VISIBILITY,
@@ -195,11 +201,48 @@ function readInnerSummary(agentRoot) {
195
201
  origin: job.origin,
196
202
  obligationStatus: job.obligationStatus,
197
203
  latestActivityAt,
204
+ returnObligationQueue,
198
205
  },
199
206
  issues: [],
200
207
  latestActivityAt,
201
208
  };
202
209
  }
210
/**
 * Summarize the return-obligation queue stored as JSON files under
 * `<agentRoot>/arc/obligations/inner`.
 *
 * Every `*.json` entry in that directory is read and parsed. Entries whose
 * `status` is `"queued"` or `"running"` are counted; every other status is
 * ignored. Among the counted entries, the smallest finite numeric
 * `createdAt` is reported as `oldestActiveAt`.
 *
 * The read is best-effort: if the directory cannot be listed the queue is
 * reported as empty, and entries that cannot be read or parsed are skipped.
 *
 * @param {string} agentRoot - root directory of the agent
 * @returns {{ queuedCount: number, runningCount: number, oldestActiveAt: (number|null) }}
 *   counts of active entries and the earliest `createdAt` among them, or
 *   `null` when no active entry carries a usable `createdAt`
 */
function readReturnObligationQueueSummary(agentRoot) {
    const dir = path.join(agentRoot, "arc", "obligations", "inner");
    let names;
    try {
        names = fs.readdirSync(dir).filter((name) => name.endsWith(".json"));
    }
    catch {
        // Directory missing or unreadable: report an empty queue rather than fail.
        return { queuedCount: 0, runningCount: 0, oldestActiveAt: null };
    }
    let queuedCount = 0;
    let runningCount = 0;
    let oldestActiveAt = null;
    for (const name of names) {
        let parsed;
        try {
            parsed = JSON.parse(fs.readFileSync(path.join(dir, name), "utf-8"));
        }
        catch {
            // Unreadable or malformed entry: skip it and keep summarizing the rest.
            continue;
        }
        if (!parsed) {
            continue;
        }
        const status = parsed.status;
        if (status === "queued") {
            queuedCount += 1;
        }
        else if (status === "running") {
            runningCount += 1;
        }
        else {
            continue;
        }
        // JSON.parse turns out-of-range literals such as 1e999 into ±Infinity,
        // which still has typeof "number"; require a finite value so one bad
        // file cannot pin the reported minimum to -Infinity.
        const createdAt = Number.isFinite(parsed.createdAt) ? parsed.createdAt : null;
        if (createdAt !== null && (oldestActiveAt === null || createdAt < oldestActiveAt)) {
            oldestActiveAt = createdAt;
        }
    }
    return { queuedCount, runningCount, oldestActiveAt };
}
203
246
  function readCodingSummary(agentRoot) {
204
247
  const stateFilePath = path.join(agentRoot, "state", "coding", "sessions.json");
205
248
  const issues = [];