bosun 0.33.8 → 0.33.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -31,6 +31,7 @@ const DEFAULT_PORT = 18432;
31
31
  const MAX_BODY_SIZE = 1024 * 1024; // 1 MB
32
32
  const REQUEST_TIMEOUT_MS = 30_000; // 30 seconds
33
33
  const ACCESS_DENIED_COOLDOWN_MS = 10 * 60 * 1000; // 10 minutes
34
+ const BOSUN_ROOT_HINT = __dirname.toLowerCase().replace(/\\/g, '/');
34
35
 
35
36
  // Valid status transitions when an agent self-reports
36
37
  const VALID_TRANSITIONS = {
@@ -175,6 +176,41 @@ function isAlreadyExitedProcessError(err) {
175
176
  );
176
177
  }
177
178
 
179
/**
 * Canonicalise a process command line so it can be matched with plain
 * substring checks.
 *
 * The value is coerced to a string (any falsy input becomes ""), lower-cased,
 * has every backslash turned into a forward slash, and is trimmed.
 *
 * @param {unknown} commandLine - Raw command line, possibly null/undefined.
 * @returns {string} The normalised command line ("" when nothing usable was given).
 */
function normalizeCommandLine(commandLine) {
  const text = String(commandLine || "");
  const lowered = text.toLowerCase();
  // Splitting on the backslash and re-joining converts every occurrence.
  return lowered.split("\\").join("/").trim();
}
182
+
183
/**
 * Heuristically decide whether a process command line belongs to bosun.
 *
 * The result gates whether a process holding the endpoint port may be
 * force-killed, so the checks are deliberately conservative:
 *   1. the command line mentions this installation's directory
 *      (BOSUN_ROOT_HINT, the lower-cased, forward-slashed module directory);
 *   2. it contains a "/bosun/" path segment together with one of the known
 *      bosun entry-point names;
 *   3. dev mode: node was started with a bare relative `monitor.mjs`
 *      (optionally `./monitor.mjs`) script argument.
 *
 * @param {unknown} commandLine - Raw command line of the candidate process.
 * @returns {boolean} true when the process looks like a bosun process.
 */
function isLikelyBosunCommandLine(commandLine) {
  const normalized = normalizeCommandLine(commandLine);
  if (!normalized) return false;

  if (normalized.includes(BOSUN_ROOT_HINT)) return true;

  const bosunEntryPoints = [
    "monitor.mjs",
    "cli.mjs",
    "agent-endpoint.mjs",
    "ve-orchestrator",
  ];
  if (
    normalized.includes("/bosun/") &&
    bosunEntryPoints.some((entryPoint) => normalized.includes(entryPoint))
  ) {
    return true;
  }

  // Dev-mode often launches monitor as `node monitor.mjs` from bosun root.
  // The script must be a standalone relative token: a plain word-boundary
  // match would also accept unrelated programs such as
  // `node /srv/other-app/monitor.mjs` or `node health-monitor.mjs`, and those
  // processes would then be force-killed.
  return (
    /\bnode(?:\.exe)?\b/.test(normalized) &&
    /(?:^|[\s"'=])(?:\.\/)?monitor\.mjs(?:$|[\s"'])/.test(normalized)
  );
}
206
+
207
/**
 * Produce a short, single-line rendering of a command line for log output.
 *
 * Runs of whitespace collapse to one space and the ends are trimmed. An empty
 * result yields a fixed placeholder; text longer than `maxLen` is cut to
 * `maxLen` characters and suffixed with "...".
 *
 * @param {unknown} commandLine - Raw command line, possibly null/undefined.
 * @param {number} [maxLen=140] - Maximum number of characters kept before the ellipsis.
 * @returns {string} The log-friendly summary.
 */
function summarizeCommandLine(commandLine, maxLen = 140) {
  const words = String(commandLine || "")
    .split(/\s+/)
    .filter((word) => word.length > 0);
  if (words.length === 0) return "command line unavailable";

  const oneLine = words.join(" ");
  return oneLine.length > maxLen ? `${oneLine.slice(0, maxLen)}...` : oneLine;
}
213
+
178
214
  // ── AgentEndpoint Class ─────────────────────────────────────────────────────
179
215
 
180
216
  export class AgentEndpoint {
@@ -311,7 +347,7 @@ export class AgentEndpoint {
311
347
  const { execSync, spawnSync } = await import("node:child_process");
312
348
  const isWindows = process.platform === "win32";
313
349
  let output;
314
- let pids = new Set();
350
+ const pids = new Set();
315
351
 
316
352
  // PIDs we must NEVER kill — ourselves, our parent (cli.mjs fork host),
317
353
  // and any ancestor in the same process tree. lsof can return these when
@@ -321,13 +357,43 @@ export class AgentEndpoint {
321
357
  String(process.ppid),
322
358
  ]);
323
359
 
360
/**
 * Look up the command line of a running process by PID.
 *
 * On Windows this queries Win32_Process through PowerShell; elsewhere it runs
 * `ps -p <pid> -o args=`. Any failure (spawn error, non-zero exit status, or
 * a thrown exception) yields "" so callers can treat the command line as
 * unavailable.
 *
 * @param {string|number} pid - Process id to inspect.
 * @returns {string} Trimmed command line, or "" when it could not be read.
 */
const readProcessCommandLine = (pid) => {
  try {
    const pipedOutput = ["ignore", "pipe", "pipe"];
    const probe = isWindows
      ? spawnSync(
          "powershell",
          [
            "-NoProfile",
            "-Command",
            `$p = Get-CimInstance Win32_Process -Filter "ProcessId=${pid}" -ErrorAction SilentlyContinue; if ($p) { $p.CommandLine }`,
          ],
          {
            stdio: pipedOutput,
            windowsHide: true,
            timeout: 5000,
            encoding: "utf8",
          },
        )
      : spawnSync("ps", ["-p", String(pid), "-o", "args="], {
          stdio: pipedOutput,
          timeout: 5000,
          encoding: "utf8",
        });

    const lookupFailed = probe.error || probe.status !== 0;
    return lookupFailed ? "" : String(probe.stdout || "").trim();
  } catch {
    // Best effort: an unreadable command line is reported as empty.
    return "";
  }
};
389
+
324
390
  if (isWindows) {
325
391
  // Windows: netstat -ano | findstr
326
392
  output = execSync(`netstat -ano | findstr ":${port}"`, {
327
393
  encoding: "utf8",
328
394
  timeout: 5000,
329
395
  }).trim();
330
- const lines = output.split("\n").filter((l) => l.includes("LISTENING"));
396
+ const lines = output.split("\n").filter((line) => line.includes("LISTENING"));
331
397
  for (const line of lines) {
332
398
  const parts = line.trim().split(/\s+/);
333
399
  const pid = parts[parts.length - 1];
@@ -342,7 +408,7 @@ export class AgentEndpoint {
342
408
  encoding: "utf8",
343
409
  timeout: 5000,
344
410
  }).trim();
345
- const pidList = output.split("\n").filter((p) => p.trim());
411
+ const pidList = output.split("\n").filter((pid) => pid.trim());
346
412
  for (const pid of pidList) {
347
413
  if (pid && /^\d+$/.test(pid) && !protectedPids.has(pid)) {
348
414
  pids.add(pid);
@@ -363,7 +429,23 @@ export class AgentEndpoint {
363
429
  }
364
430
  }
365
431
 
432
+ const killEligiblePids = new Set();
366
433
  for (const pid of pids) {
434
+ const commandLine = readProcessCommandLine(pid);
435
+ if (!isLikelyBosunCommandLine(commandLine)) {
436
+ console.warn(
437
+ `${TAG} Port ${port} held by non-bosun PID ${pid} (${summarizeCommandLine(commandLine)}); skipping forced kill`,
438
+ );
439
+ continue;
440
+ }
441
+ killEligiblePids.add(pid);
442
+ }
443
+
444
+ if (killEligiblePids.size === 0) {
445
+ return;
446
+ }
447
+
448
+ for (const pid of killEligiblePids) {
367
449
  console.log(`${TAG} Sending SIGTERM to stale process PID ${pid} on port ${port}`);
368
450
  try {
369
451
  if (isWindows) {
@@ -421,12 +503,13 @@ export class AgentEndpoint {
421
503
  );
422
504
  }
423
505
  }
506
+
424
507
  // Give the SIGTERM'd processes time to exit gracefully
425
508
  await new Promise((r) => setTimeout(r, 2000));
426
509
 
427
510
  // Escalate: check if any are still alive and SIGKILL them
428
511
  if (!isWindows) {
429
- for (const pid of pids) {
512
+ for (const pid of killEligiblePids) {
430
513
  try {
431
514
  process.kill(Number(pid), 0); // probe — throws if dead
432
515
  console.warn(`${TAG} PID ${pid} still alive after SIGTERM — sending SIGKILL`);
@@ -447,7 +530,6 @@ export class AgentEndpoint {
447
530
  }
448
531
  }
449
532
  }
450
-
451
533
  /**
452
534
  * Stop the HTTP server.
453
535
  * @returns {Promise<void>}
package/maintenance.mjs CHANGED
@@ -468,6 +468,7 @@ export function cleanupStaleBranches(repoRoot, opts = {}) {
468
468
 
469
469
  const PID_FILE_NAME = "bosun.pid";
470
470
  const MONITOR_MARKER = "bosun/monitor.mjs";
471
+ const PID_START_TIME_TOLERANCE_MS = 90_000;
471
472
 
472
473
  function parsePidFile(raw) {
473
474
  const text = String(raw || "").trim();
@@ -483,24 +484,72 @@ function parsePidFile(raw) {
483
484
  return { pid: Number(text), raw: text };
484
485
  }
485
486
 
486
- function getProcessCommandLine(pid) {
487
- if (!Number.isFinite(pid) || pid <= 0) return "";
487
/**
 * Fetch the command line and creation date recorded for a PID in the current
 * process listing.
 *
 * @param {number} pid - Process id; must be a finite number greater than zero.
 * @returns {{commandLine: string, creationDate: *}|null} Snapshot of the
 *   matching process entry (missing fields become "" / null), or null when
 *   the pid is invalid or no entry matches.
 */
function getProcessSnapshot(pid) {
  const isUsablePid = Number.isFinite(pid) && pid > 0;
  if (!isUsablePid) return null;

  const wantedPid = Number(pid);
  const match = getProcesses().find((proc) => Number(proc.pid) === wantedPid);
  if (!match) return null;

  const { commandLine, creationDate } = match;
  return {
    commandLine: commandLine || "",
    creationDate: creationDate || null,
  };
}
492
497
 
493
- function classifyMonitorProcess(pid) {
494
- const cmd = getProcessCommandLine(pid);
495
- if (!cmd) return "unknown";
496
- const normalized = cmd.toLowerCase();
498
/**
 * Classify a command line as belonging to the bosun monitor or not.
 *
 * The text is lower-cased and backslashes become forward slashes before any
 * matching takes place.
 *
 * @param {unknown} commandLine - Raw command line (may be empty/null).
 * @returns {"unknown"|"monitor"|"other"} "unknown" when no command line is
 *   available, "monitor" when any monitor heuristic matches, else "other".
 */
export function classifyMonitorCommandLine(commandLine) {
  const cmd = String(commandLine || "").toLowerCase().replace(/\\/g, "/");
  if (cmd.trim() === "") return "unknown";

  const has = (needle) => cmd.includes(needle);
  const mentionsBosun = has("bosun");

  const looksLikeMonitor =
    // Script path that contains the monitor marker.
    has(MONITOR_MARKER) ||
    // Any bosun-related command line that names monitor.mjs.
    (mentionsBosun && has("monitor.mjs")) ||
    // Dev-mode launches often use a relative script path: `node monitor.mjs`.
    (/\bnode(?:\.exe)?\b/.test(cmd) &&
      /(?:^|[\s"'=])monitor\.mjs(?:$|[\s"'])/.test(cmd)) ||
    // NOTE(review): "monitormonitor" presumably matches a lower-cased
    // monitor-monitor argument passed through cli.mjs — confirm against the CLI.
    (mentionsBosun && has("cli.mjs") && has("monitormonitor"));

  return looksLikeMonitor ? "monitor" : "other";
}
503
524
 
525
/**
 * Compare a live process's creation time with the start time recorded in the
 * PID file to judge whether they describe the same process.
 *
 * @param {{creationDate?: *}|null} processInfo - Snapshot of the live process.
 * @param {{started_at?: string}|null|undefined} pidFileData - Parsed PID-file payload.
 * @returns {boolean|null} true/false when both timestamps are usable (true
 *   means they differ by at most PID_START_TIME_TOLERANCE_MS), or null when
 *   either side is missing or cannot be read as a finite timestamp.
 */
function isLikelySameProcessFromPidFile(processInfo, pidFileData) {
  const recordedStart = pidFileData?.started_at;
  if (!processInfo || !recordedStart) return null;

  const lockStartMs = Date.parse(recordedStart);
  // Anything without a callable getTime() ends up as NaN and is rejected below.
  const processStartMs = processInfo.creationDate?.getTime?.() || NaN;

  const bothComparable =
    Number.isFinite(lockStartMs) && Number.isFinite(processStartMs);
  if (!bothComparable) return null;

  const driftMs = Math.abs(processStartMs - lockStartMs);
  return driftMs <= PID_START_TIME_TOLERANCE_MS;
}
534
+
535
/**
 * Report whether the argv recorded in a PID file describes a monitor process.
 *
 * @param {{argv?: unknown}|null|undefined} pidFileData - Parsed PID-file payload.
 * @returns {boolean} true only when `argv` is an array whose space-joined
 *   form is classified as "monitor".
 */
function pidFileLooksLikeMonitor(pidFileData) {
  const recordedArgv = pidFileData?.argv;
  if (!Array.isArray(recordedArgv)) return false;

  const recordedCommand = recordedArgv.join(" ");
  return classifyMonitorCommandLine(recordedCommand) === "monitor";
}
539
+
540
/**
 * Classify the process behind a PID as a monitor, another program, or unknown.
 *
 * The live command line is checked first. If it does not identify a monitor,
 * the PID-file payload acts as a fallback: the process counts as a monitor
 * when its start time matches the recorded one and the recorded argv looks
 * like a monitor launch.
 *
 * @param {number} pid - Process id taken from the PID file.
 * @param {object|null|undefined} pidFileData - Parsed PID-file payload, if any.
 * @returns {string} "monitor", or whatever the command-line classification
 *   returned when the fallback did not apply.
 */
function classifyMonitorProcess(pid, pidFileData) {
  const snapshot = getProcessSnapshot(pid);
  const liveCommand = snapshot?.commandLine || "";
  const verdict = classifyMonitorCommandLine(liveCommand);

  if (verdict !== "monitor") {
    // Fallback for cases where OS command-line capture omits argv.
    const startTimesAgree =
      isLikelySameProcessFromPidFile(snapshot, pidFileData) === true;
    if (startTimesAgree && pidFileLooksLikeMonitor(pidFileData)) {
      return "monitor";
    }
  }

  return verdict;
}
504
553
  /**
505
554
  * Acquire a singleton lock by writing our PID file.
506
555
  * If a stale monitor is detected (PID file exists but process dead), clean up and take over.
@@ -587,7 +636,7 @@ export function acquireMonitorLock(lockDir) {
587
636
  existingPid !== process.pid &&
588
637
  isProcessAlive(existingPid)
589
638
  ) {
590
- const classification = classifyMonitorProcess(existingPid);
639
+ const classification = classifyMonitorProcess(existingPid, parsed.data);
591
640
  if (classification === "monitor") {
592
641
  console.error(
593
642
  "[maintenance] another bosun is already running (PID " + existingPid + "). Exiting.",
package/monitor.mjs CHANGED
@@ -800,6 +800,26 @@ let githubReconcile = githubReconcileConfig || {
800
800
  mergedLookbackHours: 72,
801
801
  trackingLabels: ["tracking"],
802
802
  };
803
+ let chdirUnsupportedInRuntime = false;
804
+
805
/**
 * Detect the error raised when process.chdir() is called in a runtime that
 * does not allow it.
 *
 * @param {*} err - Value caught from a process.chdir() attempt.
 * @returns {boolean} true when `err.code` is ERR_WORKER_UNSUPPORTED_OPERATION
 *   or the message (compared case-insensitively) carries the worker chdir
 *   notice; false otherwise, including for falsy input.
 */
function isChdirUnsupportedError(err) {
  if (!err) return false;

  const { code, message } = err;
  const codeText = String(code || "");
  const messageText = String(message || "").toLowerCase();

  const unsupportedByCode = codeText === "ERR_WORKER_UNSUPPORTED_OPERATION";
  return (
    unsupportedByCode ||
    messageText.includes("process.chdir() is not supported in workers")
  );
}
816
+
817
/**
 * Reduce a filesystem path to a canonical form for equality checks.
 *
 * Runs of slashes or backslashes collapse into a single "/", trailing slashes
 * are dropped, and on win32 the result is lower-cased. Falsy input becomes "".
 *
 * @param {unknown} pathValue - Path to normalise.
 * @returns {string} Comparable path string.
 */
function normalizePathForCompare(pathValue) {
  const raw = String(pathValue || "");
  const unifiedSeparators = raw.replace(/[\\/]+/g, "/");
  const withoutTrailingSlash = unifiedSeparators.replace(/\/+$/, "");

  if (process.platform !== "win32") return withoutTrailingSlash;
  return withoutTrailingSlash.toLowerCase();
}
803
823
 
804
824
  // ── Ensure CWD is the repo root ─────────────────────────────────────────────
805
825
  // The daemon is spawned with cwd=homedir (to avoid deleted worktree paths).
@@ -807,9 +827,13 @@ let githubReconcile = githubReconcileConfig || {
807
827
  // trusted git directory, preventing "Not inside a trusted directory" errors.
808
828
  // Prefer agentRepoRoot (workspace-aware) over raw repoRoot.
809
829
  const effectiveRepoRoot = agentRepoRoot || repoRoot;
810
- if (!isMainThread) {
830
+ const needsChdir =
831
+ effectiveRepoRoot &&
832
+ normalizePathForCompare(process.cwd()) !==
833
+ normalizePathForCompare(effectiveRepoRoot);
834
+ if (!isMainThread || chdirUnsupportedInRuntime) {
811
835
  // Worker threads cannot call process.chdir(); skip to avoid noisy warnings.
812
- } else if (effectiveRepoRoot && process.cwd() !== effectiveRepoRoot) {
836
+ } else if (needsChdir) {
813
837
  try {
814
838
  process.chdir(effectiveRepoRoot);
815
839
  console.log(`[monitor] changed CWD to repo root: ${effectiveRepoRoot}`);
@@ -818,14 +842,29 @@ if (!isMainThread) {
818
842
  console.log(`[monitor] developer repo root: ${repoRoot}`);
819
843
  }
820
844
  } catch (err) {
821
- console.warn(`[monitor] could not chdir to ${effectiveRepoRoot}: ${err.message}`);
822
- // Fall back to repoRoot if agentRepoRoot failed
823
- if (agentRepoRoot && repoRoot && agentRepoRoot !== repoRoot) {
824
- try {
825
- process.chdir(repoRoot);
826
- console.log(`[monitor] fell back to developer repo root: ${repoRoot}`);
827
- } catch (e2) {
828
- console.warn(`[monitor] could not chdir to fallback ${repoRoot}: ${e2.message}`);
845
+ if (isChdirUnsupportedError(err)) {
846
+ chdirUnsupportedInRuntime = true;
847
+ console.log("[monitor] runtime does not support process.chdir(); skipping CWD re-anchor.");
848
+ } else {
849
+ console.warn(`[monitor] could not chdir to ${effectiveRepoRoot}: ${err.message}`);
850
+ // Fall back to repoRoot if agentRepoRoot failed
851
+ if (
852
+ agentRepoRoot &&
853
+ repoRoot &&
854
+ agentRepoRoot !== repoRoot &&
855
+ normalizePathForCompare(process.cwd()) !== normalizePathForCompare(repoRoot)
856
+ ) {
857
+ try {
858
+ process.chdir(repoRoot);
859
+ console.log(`[monitor] fell back to developer repo root: ${repoRoot}`);
860
+ } catch (e2) {
861
+ if (isChdirUnsupportedError(e2)) {
862
+ chdirUnsupportedInRuntime = true;
863
+ console.log("[monitor] runtime does not support process.chdir(); fallback skipped.");
864
+ } else {
865
+ console.warn(`[monitor] could not chdir to fallback ${repoRoot}: ${e2.message}`);
866
+ }
867
+ }
829
868
  }
830
869
  }
831
870
  }
@@ -13159,8 +13198,21 @@ process.on("uncaughtException", (err) => {
13159
13198
  );
13160
13199
  return;
13161
13200
  }
13201
+ // Always log the exception — even during shutdown — so the crash is traceable.
13202
+ const detail = err?.stack || msg || String(err);
13203
+ try {
13204
+ process.stderr.write("[monitor] uncaughtException: " + detail + "\n");
13205
+ } catch { /* stderr may be torn down */ }
13206
+ try {
13207
+ const crashDir = config?.logDir || resolve(__dirname, "logs");
13208
+ mkdirSync(crashDir, { recursive: true });
13209
+ appendFileSync(
13210
+ resolve(crashDir, "monitor-crash-breadcrumb.log"),
13211
+ `[${new Date().toISOString()}] uncaughtException (shuttingDown=${shuttingDown}): ${detail}\n`,
13212
+ );
13213
+ } catch { /* best effort */ }
13162
13214
  if (shuttingDown) return;
13163
- console.error("[monitor] uncaughtException: " + (err?.stack || msg));
13215
+ console.error("[monitor] uncaughtException: " + detail);
13164
13216
  handleMonitorFailure("uncaughtException", err).catch((failureErr) => {
13165
13217
  try {
13166
13218
  process.stderr.write(
@@ -13174,6 +13226,16 @@ process.on("uncaughtException", (err) => {
13174
13226
 
13175
13227
  process.on("unhandledRejection", (reason) => {
13176
13228
  const msg = reason?.message || String(reason || "");
13229
+ // Always write breadcrumb — unhandled rejections can cause exit code 1
13230
+ try {
13231
+ const crashDir = config?.logDir || resolve(__dirname, "logs");
13232
+ mkdirSync(crashDir, { recursive: true });
13233
+ const detail = reason instanceof Error ? (reason.stack || msg) : msg;
13234
+ appendFileSync(
13235
+ resolve(crashDir, "monitor-crash-breadcrumb.log"),
13236
+ `[${new Date().toISOString()}] unhandledRejection (shuttingDown=${shuttingDown}): ${detail}\n`,
13237
+ );
13238
+ } catch { /* best effort */ }
13177
13239
  // Always suppress stream noise
13178
13240
  if (isStreamNoise(msg)) {
13179
13241
  console.error(
@@ -13199,14 +13261,26 @@ process.on("unhandledRejection", (reason) => {
13199
13261
  // ── Exit diagnostic: always log the exit code so crashes are traceable ──────
13200
13262
  process.on("exit", (code) => {
13201
13263
  if (code === 0 || code === SELF_RESTART_EXIT_CODE) return;
13264
+ const ts = new Date().toISOString();
13265
+ const line = `[${ts}] process exiting with code ${code} (shuttingDown=${shuttingDown}, uptime=${Math.round(process.uptime())}s)`;
13202
13266
  // Write directly to stderr — console may already be torn down at exit time
13203
13267
  try {
13204
- process.stderr.write(
13205
- `[monitor] process exiting with code ${code} (shuttingDown=${shuttingDown})\n`,
13206
- );
13268
+ process.stderr.write("[monitor] " + line + "\n");
13207
13269
  } catch {
13208
13270
  /* best effort — stderr may be broken */
13209
13271
  }
13272
+ // Persist breadcrumb to disk so the crash is always traceable even when
13273
+ // stderr output is lost (e.g., background daemon, piped output).
13274
+ try {
13275
+ const crashDir = config?.logDir || resolve(__dirname, "logs");
13276
+ mkdirSync(crashDir, { recursive: true });
13277
+ appendFileSync(
13278
+ resolve(crashDir, "monitor-crash-breadcrumb.log"),
13279
+ line + "\n",
13280
+ );
13281
+ } catch {
13282
+ /* best effort */
13283
+ }
13210
13284
  });
13211
13285
 
13212
13286
  if (!isMonitorTestRuntime) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "bosun",
3
- "version": "0.33.8",
3
+ "version": "0.33.9",
4
4
  "description": "AI-powered orchestrator supervisor — manages AI agent executors with failover, auto-restarts on failure, analyzes crashes with Codex SDK, creates PRs via Vibe-Kanban API, and sends Telegram notifications. Supports N executors with weighted distribution, multi-repo projects, and auto-setup.",
5
5
  "type": "module",
6
6
  "license": "Apache 2.0",
package/task-store.mjs CHANGED
@@ -54,6 +54,7 @@ const ATOMIC_RENAME_FALLBACK_CODES = new Set(["EPERM", "EACCES", "EBUSY", "EXDEV
54
54
  let _store = null; // { _meta: {...}, tasks: { [id]: Task } }
55
55
  let _loaded = false;
56
56
  let _writeChain = Promise.resolve(); // simple write lock
57
+ let _didLogInitialLoad = false;
57
58
 
58
59
  export function configureTaskStore(options = {}) {
59
60
  const baseDir = options.baseDir ? resolve(options.baseDir) : null;
@@ -74,6 +75,7 @@ export function configureTaskStore(options = {}) {
74
75
  _store = null;
75
76
  _loaded = false;
76
77
  _writeChain = Promise.resolve();
78
+ _didLogInitialLoad = false;
77
79
  }
78
80
 
79
81
  return storePath;
@@ -222,10 +224,13 @@ export function loadStore() {
222
224
  _meta: { ...defaultMeta(), ...(data._meta || {}) },
223
225
  tasks: data.tasks || {},
224
226
  };
225
- console.log(
226
- TAG,
227
- `Loaded ${Object.keys(_store.tasks).length} tasks from disk`,
228
- );
227
+ if (!_didLogInitialLoad) {
228
+ _didLogInitialLoad = true;
229
+ console.log(
230
+ TAG,
231
+ `Loaded ${Object.keys(_store.tasks).length} tasks from disk`,
232
+ );
233
+ }
229
234
  } else {
230
235
  _store = { _meta: defaultMeta(), tasks: {} };
231
236
  console.log(TAG, "No store file found — initialised empty store");