bosun 0.35.0 → 0.35.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/agent-pool.mjs CHANGED
@@ -45,6 +45,11 @@ import { loadConfig } from "./config.mjs";
45
45
  import { resolveRepoRoot, resolveAgentRepoRoot } from "./repo-root.mjs";
46
46
  import { resolveCodexProfileRuntime } from "./codex-model-profiles.mjs";
47
47
  import { getGitHubToken } from "./github-auth-manager.mjs";
48
+ import {
49
+ isTransientStreamError,
50
+ streamRetryDelay,
51
+ MAX_STREAM_RETRIES,
52
+ } from "./stream-resilience.mjs";
48
53
 
49
54
  // Lazy-load MCP registry to avoid circular dependencies.
50
55
  // Cached at module scope per AGENTS.md hard rules.
@@ -1053,6 +1058,24 @@ async function launchCodexThread(prompt, cwd, timeoutMs, extra = {}) {
1053
1058
  threadId: null,
1054
1059
  };
1055
1060
  }
1061
+ // ── Transient stream / network error ─ retry without resetting thread state ─
1062
+ if (isTransientStreamError(err)) {
1063
+ const retryAttempt = (extra._streamRetryAttempt || 0) + 1;
1064
+ if (retryAttempt < MAX_STREAM_RETRIES) {
1065
+ const delay = streamRetryDelay(retryAttempt - 1);
1066
+ console.warn(
1067
+ `${TAG} codex transient stream error (attempt ${retryAttempt}/${MAX_STREAM_RETRIES}): ${err.message || err} — retrying in ${Math.round(delay)}ms`,
1068
+ );
1069
+ await new Promise((r) => setTimeout(r, delay));
1070
+ return launchCodexThread(prompt, cwd, timeoutMs, {
1071
+ ...extra,
1072
+ _streamRetryAttempt: retryAttempt,
1073
+ });
1074
+ }
1075
+ console.error(
1076
+ `${TAG} codex stream disconnection not resolved after ${MAX_STREAM_RETRIES} attempts`,
1077
+ );
1078
+ }
1056
1079
  return {
1057
1080
  success: false,
1058
1081
  output: "",
@@ -1472,6 +1495,24 @@ async function launchCopilotThread(prompt, cwd, timeoutMs, extra = {}) {
1472
1495
  threadId: resumeThreadId,
1473
1496
  };
1474
1497
  }
1498
+ // ── Transient stream / network error ─ retry with a fresh Copilot session ─
1499
+ if (!isTimeout && !isIdleWaitTimeout && isTransientStreamError(err)) {
1500
+ const retryAttempt = (extra._streamRetryAttempt || 0) + 1;
1501
+ if (retryAttempt < MAX_STREAM_RETRIES) {
1502
+ const delay = streamRetryDelay(retryAttempt - 1);
1503
+ console.warn(
1504
+ `${TAG} copilot transient stream error (attempt ${retryAttempt}/${MAX_STREAM_RETRIES}): ${errMsg} — retrying in ${Math.round(delay)}ms`,
1505
+ );
1506
+ await new Promise((r) => setTimeout(r, delay));
1507
+ return launchCopilotThread(prompt, cwd, timeoutMs, {
1508
+ ...extra,
1509
+ _streamRetryAttempt: retryAttempt,
1510
+ });
1511
+ }
1512
+ console.error(
1513
+ `${TAG} copilot stream disconnection not resolved after ${MAX_STREAM_RETRIES} attempts`,
1514
+ );
1515
+ }
1475
1516
  return {
1476
1517
  success: false,
1477
1518
  output: "",
@@ -1846,6 +1887,24 @@ async function launchClaudeThread(prompt, cwd, timeoutMs, extra = {}) {
1846
1887
  threadId: resumeThreadId,
1847
1888
  };
1848
1889
  }
1890
+ // ── Transient stream / network error ─ retry with a fresh Claude query ──
1891
+ if (isTransientStreamError(err)) {
1892
+ const retryAttempt = (extra._streamRetryAttempt || 0) + 1;
1893
+ if (retryAttempt < MAX_STREAM_RETRIES) {
1894
+ const delay = streamRetryDelay(retryAttempt - 1);
1895
+ console.warn(
1896
+ `${TAG} claude transient stream error (attempt ${retryAttempt}/${MAX_STREAM_RETRIES}): ${err.message || err} — retrying in ${Math.round(delay)}ms`,
1897
+ );
1898
+ await new Promise((r) => setTimeout(r, delay));
1899
+ return launchClaudeThread(prompt, cwd, timeoutMs, {
1900
+ ...extra,
1901
+ _streamRetryAttempt: retryAttempt,
1902
+ });
1903
+ }
1904
+ console.error(
1905
+ `${TAG} claude stream disconnection not resolved after ${MAX_STREAM_RETRIES} attempts`,
1906
+ );
1907
+ }
1849
1908
  return {
1850
1909
  success: false,
1851
1910
  output: "",
package/claude-shell.mjs CHANGED
@@ -11,6 +11,11 @@ import { resolve } from "node:path";
11
11
  import { homedir } from "node:os";
12
12
  import { fileURLToPath } from "node:url";
13
13
  import { resolveRepoRoot } from "./repo-root.mjs";
14
+ import {
15
+ isTransientStreamError,
16
+ streamRetryDelay,
17
+ MAX_STREAM_RETRIES,
18
+ } from "./stream-resilience.mjs";
14
19
 
15
20
  const __dirname = resolve(fileURLToPath(new URL(".", import.meta.url)));
16
21
 
@@ -471,7 +476,7 @@ export async function execClaudePrompt(userMessage, options = {}) {
471
476
  abortController = null,
472
477
  } = options;
473
478
 
474
- if (activeTurn) {
479
+ if (activeTurn && !options._holdActiveTurn) {
475
480
  return {
476
481
  finalResponse:
477
482
  "⏳ Agent is still executing a previous task. Please wait.",
@@ -489,7 +494,9 @@ export async function execClaudePrompt(userMessage, options = {}) {
489
494
  };
490
495
  }
491
496
 
492
- activeTurn = true;
497
+ if (!options._holdActiveTurn) activeTurn = true;
498
+ /** Sentinel: true while a retry call is pending so finally skips cleanup. */
499
+ let _retryPending = false;
493
500
  toolUseById.clear();
494
501
 
495
502
  const transport = resolveClaudeTransport();
@@ -643,6 +650,33 @@ export async function execClaudePrompt(userMessage, options = {}) {
643
650
  : `⏱️ Agent timed out after ${timeoutMs / 1000}s`;
644
651
  return { finalResponse: msg, items: [], usage: null };
645
652
  }
653
+ // ── Transient stream retry ──────────────────────────────────────────────────
654
+ // Network/stream blips are safe to retry without resetting session state.
655
+ // _retryPending keeps the finally block from releasing activeTurn early.
656
+ if (isTransientStreamError(err)) {
657
+ const retryAttempt = (options._streamRetryAttempt || 0) + 1;
658
+ if (retryAttempt < MAX_STREAM_RETRIES) {
659
+ if (activeQueue) activeQueue.close();
660
+ activeQueue = null;
661
+ activeQuery = null;
662
+ toolUseById.clear();
663
+ const delay = streamRetryDelay(retryAttempt - 1);
664
+ console.warn(
665
+ `[claude-shell] transient stream error (attempt ${retryAttempt}/${MAX_STREAM_RETRIES}): ${err.message || err} — retrying in ${Math.round(delay)}ms`,
666
+ );
667
+ _retryPending = true; // prevent outer finally from releasing activeTurn
668
+ await new Promise((r) => setTimeout(r, delay));
669
+ // _retryPending stays true through the return so outer finally skips cleanup
670
+ return execClaudePrompt(userMessage, {
671
+ ...options,
672
+ _streamRetryAttempt: retryAttempt,
673
+ _holdActiveTurn: true, // skip activeTurn guard in recursive call
674
+ });
675
+ }
676
+ console.error(
677
+ `[claude-shell] stream disconnection not resolved after ${MAX_STREAM_RETRIES} attempts`,
678
+ );
679
+ }
646
680
  const message = err?.message || String(err || "unknown error");
647
681
  return {
648
682
  finalResponse: `❌ Claude agent failed: ${message}`,
@@ -650,10 +684,14 @@ export async function execClaudePrompt(userMessage, options = {}) {
650
684
  usage: null,
651
685
  };
652
686
  } finally {
653
- if (activeQueue) activeQueue.close();
654
- activeQueue = null;
655
- activeQuery = null;
656
- activeTurn = false;
687
+ // Only the outermost invocation (or the final retry) cleans up.
688
+ // When _retryPending is true, the recursive retry call owns the activeTurn lock.
689
+ if (!_retryPending) {
690
+ if (activeQueue) activeQueue.close();
691
+ activeQueue = null;
692
+ activeQuery = null;
693
+ activeTurn = false;
694
+ }
657
695
  }
658
696
  }
659
697
 
package/codex-shell.mjs CHANGED
@@ -18,12 +18,18 @@ import { fileURLToPath } from "node:url";
18
18
  import { resolveAgentSdkConfig } from "./agent-sdk.mjs";
19
19
  import { resolveRepoRoot } from "./repo-root.mjs";
20
20
  import { resolveCodexProfileRuntime } from "./codex-model-profiles.mjs";
21
+ import {
22
+ isTransientStreamError,
23
+ streamRetryDelay,
24
+ MAX_STREAM_RETRIES,
25
+ } from "./stream-resilience.mjs";
21
26
 
22
27
  const __dirname = resolve(fileURLToPath(new URL(".", import.meta.url)));
23
28
 
24
29
  // ── Configuration ────────────────────────────────────────────────────────────
25
30
 
26
31
  const DEFAULT_TIMEOUT_MS = 60 * 60 * 1000; // 60 min for agentic tasks (matches Azure stream timeout)
32
+ // MAX_STREAM_RETRIES, isTransientStreamError, streamRetryDelay ← imported from ./stream-resilience.mjs
27
33
  const STATE_FILE = resolve(__dirname, "logs", "codex-shell-state.json");
28
34
  const SESSIONS_DIR = resolve(__dirname, "logs", "sessions");
29
35
  const MAX_PERSISTENT_TURNS = 50;
@@ -554,30 +560,32 @@ export async function execCodexPrompt(userMessage, options = {}) {
554
560
  }
555
561
  // else: persistent && same session && under limit → reuse activeThread
556
562
 
557
- for (let attempt = 0; attempt < 2; attempt += 1) {
558
- const thread = await getThread();
563
+ // Build the user prompt with optional status context (built once, reused across retries)
564
+ let prompt = userMessage;
565
+ if (statusData) {
566
+ const statusSnippet = JSON.stringify(statusData, null, 2).slice(0, 2000);
567
+ prompt = `[Orchestrator Status]\n\`\`\`json\n${statusSnippet}\n\`\`\`\n\n# YOUR TASK — EXECUTE NOW\n\n${userMessage}\n\n---\nDo NOT respond with "Ready" or ask what to do. EXECUTE this task. Read files, run commands, produce detailed output.`;
568
+ } else {
569
+ prompt = `${userMessage}\n\n\n# YOUR TASK — EXECUTE NOW\n\n\n---\nDo NOT respond with "Ready" or ask what to do. EXECUTE this task. Read files, run commands, produce detailed output & complete the user's request E2E.`;
570
+ }
571
+ // Sanitize & size-guard once — prevents invalid_request_error from oversized
572
+ // bodies (BytePositionInLine > 80 000) or unescaped control characters.
573
+ const safePrompt = sanitizeAndTruncatePrompt(prompt);
559
574
 
560
- // Build the user prompt with optional status context
561
- let prompt = userMessage;
562
- if (statusData) {
563
- const statusSnippet = JSON.stringify(statusData, null, 2).slice(
564
- 0,
565
- 2000,
566
- );
567
- prompt = `[Orchestrator Status]\n\`\`\`json\n${statusSnippet}\n\`\`\`\n\n# YOUR TASK — EXECUTE NOW\n\n${userMessage}\n\n---\nDo NOT respond with "Ready" or ask what to do. EXECUTE this task. Read files, run commands, produce detailed output.`;
568
- } else {
569
- prompt = `${userMessage}\n\n\n# YOUR TASK — EXECUTE NOW\n\n\n---\nDo NOT respond with "Ready" or ask what to do. EXECUTE this task. Read files, run commands, produce detailed output & complete the user's request E2E.`;
570
- }
575
+ let threadResetDone = false;
571
576
 
572
- // Set up timeout
577
+ for (let attempt = 0; attempt < MAX_STREAM_RETRIES; attempt += 1) {
578
+ const thread = await getThread();
579
+
580
+ // A fresh AbortController is created per attempt only when the caller did not
581
+ // pass one in; a caller-supplied abortController is reused on every retry.
582
+ // Each attempt arms its own timer for the full timeoutMs, so the total
583
+ // wall-clock time across retries can exceed timeoutMs (plus backoff delays).
584
+ // An AbortError (timeout or caller abort) returns immediately and is not retried.
573
585
  const controller = abortController || new AbortController();
574
586
  const timer = setTimeout(() => controller.abort("timeout"), timeoutMs);
575
587
 
576
588
  try {
577
- // Sanitize & size-guard before sending — prevents invalid_request_error
578
- // from oversized bodies (BytePositionInLine > 80 000) or control chars.
579
- const safePrompt = sanitizeAndTruncatePrompt(prompt);
580
-
581
589
  // Use runStreamed for real-time event streaming
582
590
  const streamedTurn = await thread.runStreamed(safePrompt, {
583
591
  signal: controller.signal,
@@ -585,6 +593,7 @@ export async function execCodexPrompt(userMessage, options = {}) {
585
593
 
586
594
  let finalResponse = "";
587
595
  const allItems = [];
596
+ let turnFailedErr = null;
588
597
 
589
598
  // Process events from the async generator
590
599
  for await (const event of streamedTurn.events) {
@@ -594,6 +603,13 @@ export async function execCodexPrompt(userMessage, options = {}) {
594
603
  await saveState();
595
604
  }
596
605
 
606
+ // turn.failed is emitted by the SDK when the server signals response.failed.
607
+ // Convert it into a retriable error so the retry loop can back off & retry.
608
+ if (event.type === "turn.failed") {
609
+ const detail = event.error?.message || "response.failed";
610
+ turnFailedErr = new Error(`stream disconnected before completion: ${detail}`);
611
+ }
612
+
597
613
  // Format and emit event
598
614
  if (onEvent) {
599
615
  const formatted = formatEvent(event);
@@ -628,6 +644,10 @@ export async function execCodexPrompt(userMessage, options = {}) {
628
644
  }
629
645
  }
630
646
 
647
+ // If a turn.failed event was seen during the stream, treat it as a
648
+ // transient stream error so the retry loop handles it correctly.
649
+ if (turnFailedErr) throw turnFailedErr;
650
+
631
651
  clearTimeout(timer);
632
652
 
633
653
  return {
@@ -638,6 +658,7 @@ export async function execCodexPrompt(userMessage, options = {}) {
638
658
  };
639
659
  } catch (err) {
640
660
  clearTimeout(timer);
661
+
641
662
  if (err.name === "AbortError") {
642
663
  const reason = controller.signal.reason;
643
664
  const msg =
@@ -646,18 +667,44 @@ export async function execCodexPrompt(userMessage, options = {}) {
646
667
  : `⏱️ Agent timed out after ${timeoutMs / 1000}s`;
647
668
  return { finalResponse: msg, items: [], usage: null };
648
669
  }
649
- if (attempt === 0 && isRecoverableThreadError(err)) {
670
+
671
+ // ── Thread corruption errors: reset thread & retry once ──────────────
672
+ if (!threadResetDone && isRecoverableThreadError(err)) {
650
673
  console.warn(
651
674
  `[codex-shell] recoverable thread error: ${err.message || err} — resetting thread`,
652
675
  );
653
676
  await resetThread();
654
- continue;
677
+ threadResetDone = true;
678
+ continue; // retry on a fresh thread (note: the for-loop still advances `attempt`, so this uses one iteration of the MAX_STREAM_RETRIES budget)
655
679
  }
680
+
681
+ // ── Transient stream/network errors: backoff & retry ─────────────────
682
+ if (isTransientStreamError(err)) {
683
+ const attemptsLeft = MAX_STREAM_RETRIES - 1 - attempt;
684
+ if (attemptsLeft > 0) {
685
+ const delay = streamRetryDelay(attempt);
686
+ console.warn(
687
+ `[codex-shell] transient stream error (attempt ${attempt + 1}/${MAX_STREAM_RETRIES}): ${err.message || err} — retrying in ${Math.round(delay)}ms`,
688
+ );
689
+ await new Promise((r) => setTimeout(r, delay));
690
+ continue;
691
+ }
692
+ // Exhausted all retries
693
+ console.error(
694
+ `[codex-shell] stream disconnection not resolved after ${MAX_STREAM_RETRIES} attempts — giving up`,
695
+ );
696
+ return {
697
+ finalResponse: `❌ Stream disconnected after ${MAX_STREAM_RETRIES} retries: ${err.message}`,
698
+ items: [],
699
+ usage: null,
700
+ };
701
+ }
702
+
656
703
  throw err;
657
704
  }
658
705
  }
659
706
  return {
660
- finalResponse: "❌ Agent failed after retry.",
707
+ finalResponse: "❌ Agent failed after all retry attempts.",
661
708
  items: [],
662
709
  usage: null,
663
710
  };
package/copilot-shell.mjs CHANGED
@@ -14,6 +14,11 @@ import { fileURLToPath } from "node:url";
14
14
  import { execSync } from "node:child_process";
15
15
  import { resolveRepoRoot } from "./repo-root.mjs";
16
16
  import { getGitHubToken } from "./github-auth-manager.mjs";
17
+ import {
18
+ isTransientStreamError,
19
+ streamRetryDelay,
20
+ MAX_STREAM_RETRIES,
21
+ } from "./stream-resilience.mjs";
17
22
 
18
23
  const __dirname = resolve(fileURLToPath(new URL(".", import.meta.url)));
19
24
 
@@ -710,7 +715,7 @@ export async function execCopilotPrompt(userMessage, options = {}) {
710
715
  persistent = false,
711
716
  } = options;
712
717
 
713
- if (activeTurn) {
718
+ if (activeTurn && !options._holdActiveTurn) {
714
719
  return {
715
720
  finalResponse:
716
721
  "⏳ Agent is still executing a previous task. Please wait.",
@@ -719,7 +724,9 @@ export async function execCopilotPrompt(userMessage, options = {}) {
719
724
  };
720
725
  }
721
726
 
722
- activeTurn = true;
727
+ if (!options._holdActiveTurn) activeTurn = true;
728
+ /** Sentinel: true while a retry call is pending so finally skips cleanup. */
729
+ let _retryPending = false;
723
730
 
724
731
  if (!persistent) {
725
732
  // Task executor path — fresh session each call
@@ -841,22 +848,54 @@ export async function execCopilotPrompt(userMessage, options = {}) {
841
848
  : `⏱️ Agent timed out after ${timeoutMs / 1000}s`;
842
849
  return { finalResponse: msg, items: [], usage: null };
843
850
  }
851
+ // ── Transient stream retry ──────────────────────────────────────────────────
852
+ // Reset session and re-run; _retryPending holds the activeTurn lock.
853
+ if (isTransientStreamError(err)) {
854
+ const retryAttempt = (options._streamRetryAttempt || 0) + 1;
855
+ if (retryAttempt < MAX_STREAM_RETRIES) {
856
+ if (typeof unsubscribe === "function") try { unsubscribe(); } catch { /* best effort */ }
857
+ unsubscribe = null;
858
+ activeSession = null; // force getSession() to create a fresh session
859
+ const delay = streamRetryDelay(retryAttempt - 1);
860
+ console.warn(
861
+ `[copilot-shell] transient stream error (attempt ${retryAttempt}/${MAX_STREAM_RETRIES}): ${err.message || err} — retrying in ${Math.round(delay)}ms`,
862
+ );
863
+ _retryPending = true; // prevent outer finally from releasing activeTurn
864
+ await new Promise((r) => setTimeout(r, delay));
865
+ return execCopilotPrompt(userMessage, {
866
+ ...options,
867
+ _streamRetryAttempt: retryAttempt,
868
+ _holdActiveTurn: true, // skip activeTurn guard in recursive call
869
+ });
870
+ }
871
+ console.error(
872
+ `[copilot-shell] stream disconnection not resolved after ${MAX_STREAM_RETRIES} attempts`,
873
+ );
874
+ return {
875
+ finalResponse: `❌ Stream disconnected after ${MAX_STREAM_RETRIES} retries: ${err.message}`,
876
+ items: [],
877
+ usage: null,
878
+ };
879
+ }
844
880
  throw err;
845
881
  } finally {
846
- if (typeof unsubscribe === "function") {
847
- try {
848
- unsubscribe();
849
- } catch {
850
- /* best effort */
851
- }
852
- } else if (typeof session.off === "function") {
853
- try {
854
- session.off(handleEvent);
855
- } catch {
856
- /* best effort */
882
+ // Only the outermost invocation (or the final retry) cleans up.
883
+ if (!_retryPending) {
884
+ if (typeof unsubscribe === "function") {
885
+ try {
886
+ unsubscribe();
887
+ } catch {
888
+ /* best effort */
889
+ }
890
+ } else if (typeof session.off === "function") {
891
+ try {
892
+ session.off(handleEvent);
893
+ } catch {
894
+ /* best effort */
895
+ }
857
896
  }
897
+ activeTurn = false;
858
898
  }
859
- activeTurn = false;
860
899
  }
861
900
  }
862
901
 
package/maintenance.mjs CHANGED
@@ -1019,6 +1019,55 @@ export function syncLocalTrackingBranches(repoRoot, branches) {
1019
1019
 
1020
1020
  if (behind === 0 && ahead === 0) continue; // Already in sync
1021
1021
 
1022
+ // 'main' must always mirror upstream — NEVER push local commits.
1023
+ // Hard-reset instead of rebase-push to prevent workspace drift.
1024
+ if (branch === "main" && ahead > 0) {
1025
+ if (branch === currentBranch) {
1026
+ const reset = spawnSync("git", ["reset", "--hard", remoteRef], {
1027
+ cwd: repoRoot,
1028
+ encoding: "utf8",
1029
+ timeout: 10_000,
1030
+ windowsHide: true,
1031
+ });
1032
+ if (reset.status === 0) {
1033
+ logThrottledBranchSync(
1034
+ `sync:${branch}:hard-reset`,
1035
+ `[maintenance] hard-reset 'main' to ${remoteRef} (was ${ahead}\u2191 ${behind}\u2193) — main must not diverge from upstream`,
1036
+ "warn",
1037
+ );
1038
+ synced++;
1039
+ } else {
1040
+ logThrottledBranchSync(
1041
+ `sync:${branch}:hard-reset-failed`,
1042
+ `[maintenance] hard-reset of 'main' to ${remoteRef} failed`,
1043
+ "error",
1044
+ );
1045
+ }
1046
+ } else {
1047
+ // Not checked out — force-update the ref directly
1048
+ const update = spawnSync(
1049
+ "git",
1050
+ ["update-ref", `refs/heads/${branch}`, `refs/remotes/${remoteRef}`],
1051
+ { cwd: repoRoot, timeout: 5000, windowsHide: true },
1052
+ );
1053
+ if (update.status === 0) {
1054
+ logThrottledBranchSync(
1055
+ `sync:${branch}:force-update-ref`,
1056
+ `[maintenance] force-updated 'main' ref to ${remoteRef} (was ${ahead}\u2191 ${behind}\u2193) — discarded local-only commits`,
1057
+ "warn",
1058
+ );
1059
+ synced++;
1060
+ } else {
1061
+ logThrottledBranchSync(
1062
+ `sync:${branch}:force-update-ref-failed`,
1063
+ `[maintenance] force-update of 'main' ref to ${remoteRef} failed`,
1064
+ "error",
1065
+ );
1066
+ }
1067
+ }
1068
+ continue;
1069
+ }
1070
+
1022
1071
  // Local is ahead of remote but not behind — try a plain push
1023
1072
  if (behind === 0 && ahead > 0) {
1024
1073
  const push = spawnSync(