bosun 0.33.7 → 0.33.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.env.example CHANGED
@@ -920,6 +920,14 @@ COPILOT_CLOUD_DISABLED=true
920
920
  # stale-task-followup. Configure/enable in bosun.config.json under triggerSystem.
921
921
  # TASK_TRIGGER_SYSTEM_ENABLED=false
922
922
 
923
+ # ─── Workflow Automation (event-driven) ──────────────────────────────────────
924
+ # Enables automatic Workflow Engine trigger evaluation from monitor events
925
+ # (task.assigned, task.completed, task.failed, pr.opened, pr.merged, etc.).
926
+ # Enabled by default. Set to false to disable event-driven automation.
927
+ # WORKFLOW_AUTOMATION_ENABLED=true
928
+ # Optional dedup window to avoid event storms (milliseconds).
929
+ # WORKFLOW_EVENT_DEDUP_WINDOW_MS=15000
930
+
923
931
  # ─── GitHub Issue Reconciler ─────────────────────────────────────────────────
924
932
  # Periodically reconciles open GitHub issues against open/merged PRs.
925
933
  # Hybrid close policy:
@@ -31,6 +31,7 @@ const DEFAULT_PORT = 18432;
31
31
  const MAX_BODY_SIZE = 1024 * 1024; // 1 MB
32
32
  const REQUEST_TIMEOUT_MS = 30_000; // 30 seconds
33
33
  const ACCESS_DENIED_COOLDOWN_MS = 10 * 60 * 1000; // 10 minutes
34
+ const BOSUN_ROOT_HINT = __dirname.toLowerCase().replace(/\\/g, '/');
34
35
 
35
36
  // Valid status transitions when an agent self-reports
36
37
  const VALID_TRANSITIONS = {
@@ -175,6 +176,41 @@ function isAlreadyExitedProcessError(err) {
175
176
  );
176
177
  }
177
178
 
179
/**
 * Canonicalize a process command line so it can be matched by substring.
 *
 * Falsy input yields an empty string. Otherwise the value is stringified,
 * lower-cased, has every backslash turned into a forward slash, and is
 * trimmed of surrounding whitespace.
 *
 * @param {unknown} commandLine - Raw command line text (may be null/undefined).
 * @returns {string} The normalized command line, possibly empty.
 */
function normalizeCommandLine(commandLine) {
  const raw = commandLine ? String(commandLine) : "";
  const withForwardSlashes = raw.toLowerCase().split("\\").join("/");
  return withForwardSlashes.trim();
}
182
+
183
/**
 * Heuristically decide whether a command line belongs to a bosun process.
 *
 * The command line is normalized first (lower-case, forward slashes) and then
 * checked against three signals, in order:
 *   1. it contains the bosun install directory (BOSUN_ROOT_HINT);
 *   2. it contains a "/bosun/" path segment together with a known bosun
 *      entry-point name;
 *   3. it looks like `node monitor.mjs` (dev-mode launch from the bosun root).
 *
 * @param {unknown} commandLine - Raw command line of the inspected process.
 * @returns {boolean} True when any of the signals above matches.
 */
function isLikelyBosunCommandLine(commandLine) {
  const normalized = normalizeCommandLine(commandLine);
  if (!normalized) return false;

  // Only trust the install-root hint when it names a real directory. A hint
  // that is empty, "/" or a bare drive root ("c:/") is a substring of almost
  // every command line and would classify unrelated processes as bosun.
  const rootHint = String(BOSUN_ROOT_HINT || "");
  const rootWithoutDrive = rootHint.replace(/\/+$/, "").replace(/^[a-z]:/, "");
  if (rootWithoutDrive.length > 0 && normalized.includes(rootHint)) return true;

  const knownEntryPoints = [
    "monitor.mjs",
    "cli.mjs",
    "agent-endpoint.mjs",
    "ve-orchestrator",
  ];
  if (
    normalized.includes("/bosun/") &&
    knownEntryPoints.some((entryPoint) => normalized.includes(entryPoint))
  ) {
    return true;
  }

  // Dev-mode often launches monitor as node monitor.mjs from bosun root.
  // NOTE(review): this also matches any unrelated Node process that runs a
  // file named monitor.mjs; kept as-is because callers rely on it.
  if (/\bnode(?:\.exe)?\b/.test(normalized) && /\bmonitor\.mjs\b/.test(normalized)) {
    return true;
  }

  return false;
}
206
+
207
/**
 * Produce a short, single-line rendering of a command line for log output.
 *
 * Runs of whitespace are collapsed to one space and the result is trimmed.
 * When the text is longer than `maxLen` characters it is cut to `maxLen`
 * characters and "..." is appended (so the result may be up to three
 * characters longer than `maxLen`).
 *
 * @param {unknown} commandLine - Raw command line text (may be null/undefined).
 * @param {number} [maxLen=140] - Maximum number of command-line characters to
 *   keep. Negative values are treated as 0 and NaN falls back to 140, so a
 *   bad limit can no longer keep almost the whole string or drop all of it.
 * @returns {string} The summary, or "command line unavailable" when empty.
 */
function summarizeCommandLine(commandLine, maxLen = 140) {
  const compact = String(commandLine || "").replace(/\s+/g, " ").trim();
  if (!compact) return "command line unavailable";

  // Clamp the limit: slice(0, negative) would count from the END of the
  // string, and comparisons against NaN are always false.
  const requested = Number(maxLen);
  const limit = Number.isNaN(requested) ? 140 : Math.max(0, Math.trunc(requested));

  if (compact.length <= limit) return compact;
  return `${compact.slice(0, limit)}...`;
}
213
+
178
214
  // ── AgentEndpoint Class ─────────────────────────────────────────────────────
179
215
 
180
216
  export class AgentEndpoint {
@@ -311,7 +347,7 @@ export class AgentEndpoint {
311
347
  const { execSync, spawnSync } = await import("node:child_process");
312
348
  const isWindows = process.platform === "win32";
313
349
  let output;
314
- let pids = new Set();
350
+ const pids = new Set();
315
351
 
316
352
  // PIDs we must NEVER kill — ourselves, our parent (cli.mjs fork host),
317
353
  // and any ancestor in the same process tree. lsof can return these when
@@ -321,13 +357,43 @@ export class AgentEndpoint {
321
357
  String(process.ppid),
322
358
  ]);
323
359
 
360
/**
 * Look up the full command line of a running process by PID.
 *
 * On Windows this queries Win32_Process through PowerShell; elsewhere it runs
 * `ps -p <pid> -o args=`. Every failure (invalid PID, spawn error, non-zero
 * exit status, thrown exception) yields an empty string.
 *
 * @param {string|number} pid - Process id to inspect.
 * @returns {string} Trimmed command line, or "" when it cannot be determined.
 */
const readProcessCommandLine = (pid) => {
  // The PID is interpolated into a PowerShell script below, so accept nothing
  // but decimal digits to rule out script injection from malformed input.
  const pidText = String(pid ?? "").trim();
  if (!/^\d+$/.test(pidText)) return "";

  const spawnOptions = {
    encoding: "utf8",
    timeout: 5000,
    stdio: ["ignore", "pipe", "pipe"],
  };

  try {
    let result;
    if (isWindows) {
      const query = `$p = Get-CimInstance Win32_Process -Filter "ProcessId=${pidText}" -ErrorAction SilentlyContinue; if ($p) { $p.CommandLine }`;
      result = spawnSync("powershell", ["-NoProfile", "-Command", query], {
        ...spawnOptions,
        windowsHide: true,
      });
    } else {
      result = spawnSync("ps", ["-p", pidText, "-o", "args="], spawnOptions);
    }

    if (result.error || result.status !== 0) return "";
    return String(result.stdout || "").trim();
  } catch {
    // Best effort: an unreadable command line is reported as "unknown".
    return "";
  }
};
389
+
324
390
  if (isWindows) {
325
391
  // Windows: netstat -ano | findstr
326
392
  output = execSync(`netstat -ano | findstr ":${port}"`, {
327
393
  encoding: "utf8",
328
394
  timeout: 5000,
329
395
  }).trim();
330
- const lines = output.split("\n").filter((l) => l.includes("LISTENING"));
396
+ const lines = output.split("\n").filter((line) => line.includes("LISTENING"));
331
397
  for (const line of lines) {
332
398
  const parts = line.trim().split(/\s+/);
333
399
  const pid = parts[parts.length - 1];
@@ -342,7 +408,7 @@ export class AgentEndpoint {
342
408
  encoding: "utf8",
343
409
  timeout: 5000,
344
410
  }).trim();
345
- const pidList = output.split("\n").filter((p) => p.trim());
411
+ const pidList = output.split("\n").filter((pid) => pid.trim());
346
412
  for (const pid of pidList) {
347
413
  if (pid && /^\d+$/.test(pid) && !protectedPids.has(pid)) {
348
414
  pids.add(pid);
@@ -363,7 +429,23 @@ export class AgentEndpoint {
363
429
  }
364
430
  }
365
431
 
432
+ const killEligiblePids = new Set();
366
433
  for (const pid of pids) {
434
+ const commandLine = readProcessCommandLine(pid);
435
+ if (!isLikelyBosunCommandLine(commandLine)) {
436
+ console.warn(
437
+ `${TAG} Port ${port} held by non-bosun PID ${pid} (${summarizeCommandLine(commandLine)}); skipping forced kill`,
438
+ );
439
+ continue;
440
+ }
441
+ killEligiblePids.add(pid);
442
+ }
443
+
444
+ if (killEligiblePids.size === 0) {
445
+ return;
446
+ }
447
+
448
+ for (const pid of killEligiblePids) {
367
449
  console.log(`${TAG} Sending SIGTERM to stale process PID ${pid} on port ${port}`);
368
450
  try {
369
451
  if (isWindows) {
@@ -421,12 +503,13 @@ export class AgentEndpoint {
421
503
  );
422
504
  }
423
505
  }
506
+
424
507
  // Give the SIGTERM'd processes time to exit gracefully
425
508
  await new Promise((r) => setTimeout(r, 2000));
426
509
 
427
510
  // Escalate: check if any are still alive and SIGKILL them
428
511
  if (!isWindows) {
429
- for (const pid of pids) {
512
+ for (const pid of killEligiblePids) {
430
513
  try {
431
514
  process.kill(Number(pid), 0); // probe — throws if dead
432
515
  console.warn(`${TAG} PID ${pid} still alive after SIGTERM — sending SIGKILL`);
@@ -447,7 +530,6 @@ export class AgentEndpoint {
447
530
  }
448
531
  }
449
532
  }
450
-
451
533
  /**
452
534
  * Stop the HTTP server.
453
535
  * @returns {Promise<void>}
package/agent-pool.mjs CHANGED
@@ -1923,7 +1923,9 @@ export async function ensureThreadRegistryLoaded() {
1923
1923
  }
1924
1924
 
1925
1925
  // Kick off async load at module init (non-blocking), callers can await explicitly.
1926
- void ensureThreadRegistryLoaded();
1926
+ ensureThreadRegistryLoaded().catch((err) => {
1927
+ console.warn(TAG + " thread registry warm-up failed: " + (err?.message || err));
1928
+ });
1927
1929
 
1928
1930
  // ---------------------------------------------------------------------------
1929
1931
  // Per-SDK Resume Launchers
@@ -2667,7 +2669,11 @@ export function invalidateThread(taskKey) {
2667
2669
  }
2668
2670
  // If registry hasn't loaded yet, defer invalidation until load completes.
2669
2671
  if (!threadRegistryLoaded) {
2670
- void invalidateThreadAsync(taskKey);
2672
+ invalidateThreadAsync(taskKey).catch((err) => {
2673
+ console.warn(
2674
+ TAG + " deferred invalidateThreadAsync failed for \"" + taskKey + "\": " + (err?.message || err),
2675
+ );
2676
+ });
2671
2677
  }
2672
2678
  }
2673
2679
 
package/agent-prompts.mjs CHANGED
@@ -150,6 +150,24 @@ You are an autonomous task orchestrator agent. You receive implementation tasks
150
150
  4. Run relevant verification (tests/lint/build) before finalizing.
151
151
  5. Use conventional commit messages.
152
152
 
153
+ ## Code Quality — Hard Rules
154
+
155
+ These rules are non-negotiable. Violations cause real production crashes.
156
+
157
+ - **Module-scope caching:** Variables that cache state (lazy singletons, loaded
158
+ flags, memoization maps) MUST be at module scope, never inside a function body
159
+ that runs repeatedly.
160
+ - **Async safety:** NEVER use bare \`void asyncFn()\`. Every async call must be
161
+ \`await\`-ed or have a \`.catch()\` handler. Unhandled rejections crash Node.js.
162
+ - **Error boundaries:** HTTP handlers, timers, and event callbacks MUST wrap async
163
+ work in try/catch so one failure doesn't kill the process.
164
+ - **No over-mocking in tests:** Mock only external boundaries (network, disk, clock).
165
+ Never mock the module under test. If a test needs > 3 mocks, refactor the code.
166
+ - **Deterministic tests:** No \`Math.random()\`, real network calls, or \`setTimeout\`
167
+ for synchronization. Tests must be reproducible and order-independent.
168
+ - **Dynamic \`import()\` must be cached:** Never place \`import()\` inside a
169
+ frequently-called function without caching the result at module scope.
170
+
153
171
  ## Completion Criteria
154
172
 
155
173
  - Implementation matches requested behavior.
@@ -270,6 +288,27 @@ Check for relevant skills before implementing:
270
288
  - No placeholders/stubs/TODO-only output.
271
289
  - Keep behavior stable and production-safe.
272
290
 
291
+ ## Code Quality — Mandatory Checks
292
+
293
+ These patterns have caused real production crashes. Treat them as hard rules:
294
+
295
+ 1. **Module-scope caching:** If you declare variables that cache state (lazy
296
+ singletons, init flags, memoization), place them at **module scope** — never
297
+ inside a function body that runs per-request or per-event.
298
+ 2. **Async fire-and-forget:** Never use bare \`void asyncFn()\`. Always \`await\`
299
+ or append \`.catch()\`. Unhandled promise rejections crash Node.js (exit 1).
300
+ 3. **Error boundaries:** Wrap HTTP handlers, timers, and event callbacks in
301
+ top-level try/catch. One unguarded throw must not kill the process.
302
+ 4. **Dynamic imports:** Cache \`import()\` results at module scope. Never call
303
+ \`import()\` inside a hot path without caching — it causes repeated I/O.
304
+ 5. **Test quality:** Mock only external boundaries (network, disk, clock). Never
305
+ mock the module under test. No \`setTimeout\`/\`sleep\` for synchronization.
306
+ Tests must be deterministic and order-independent. Assert on behavior, not
307
+ implementation details.
308
+ 6. **No architectural shortcuts:** Don't force-enable feature flags inline. Don't
309
+ add config overrides that bypass safety checks. If a feature is behind a flag,
310
+ respect it.
311
+
273
312
  ## Bosun Task Agent — Git & PR Workflow
274
313
 
275
314
  You are running as a **Bosun-managed task agent**. Environment variables
@@ -375,6 +414,13 @@ Review the following PR diff for CRITICAL issues ONLY.
375
414
  2. Bugs / correctness regressions
376
415
  3. Missing implementations
377
416
  4. Broken functionality
417
+ 5. Cache/singleton variables declared inside function bodies instead of module scope
418
+ 6. Bare \`void asyncFn()\` or async calls without \`await\` / \`.catch()\`
419
+ 7. HTTP handlers, timers, or event callbacks missing try/catch error boundaries
420
+ 8. Dynamic \`import()\` inside hot paths without module-scope caching
421
+ 9. Tests that over-mock (mocking the module under test, > 3 mocks per test)
422
+ 10. Flaky test patterns: \`setTimeout\`/sleep for sync, \`Math.random()\`, real network
423
+ 11. Force-enabled feature flags or config overrides that bypass safety checks
378
424
 
379
425
  ## What to ignore
380
426
  - Style-only concerns
@@ -399,7 +445,7 @@ Respond with JSON only:
399
445
  "issues": [
400
446
  {
401
447
  "severity": "critical" | "major",
402
- "category": "security" | "bug" | "missing_impl" | "broken",
448
+ "category": "security" | "bug" | "missing_impl" | "broken" | "anti_pattern" | "flaky_test",
403
449
  "file": "path/to/file",
404
450
  "line": 123,
405
451
  "description": "..."
@@ -434,7 +434,11 @@ async function runStuckSweep() {
434
434
  function startStuckSweep() {
435
435
  if (stuckSweepTimer) return;
436
436
  stuckSweepTimer = setInterval(() => {
437
- void runStuckSweep();
437
+ runStuckSweep().catch((err) => {
438
+ console.error(
439
+ "[agent-work-analyzer] Stuck sweep failed: " + (err?.message || err),
440
+ );
441
+ });
438
442
  }, STUCK_SWEEP_INTERVAL_MS);
439
443
  stuckSweepTimer.unref?.();
440
444
  }
package/bosun-skills.mjs CHANGED
@@ -529,6 +529,207 @@ curl -sX POST http://127.0.0.1:$BOSUN_ENDPOINT_PORT/api/tasks/$BOSUN_TASK_ID/err
529
529
  | \`BOSUN_ENDPOINT_PORT\` | \`VE_ENDPOINT_PORT\` | API server port |
530
530
  | \`BOSUN_SDK\` | \`VE_SDK\` | SDK/executor type (COPILOT/CODEX/CLAUDE_CODE) |
531
531
  | \`BOSUN_MANAGED\` | \`VE_MANAGED\` | Set to "1" when running under bosun |
532
+ `,
533
+ },
534
+ {
535
+ filename: "code-quality-anti-patterns.md",
536
+ title: "Code Quality Anti-Patterns",
537
+ tags: ["quality", "code", "architecture", "async", "testing", "reliability", "bug", "crash", "scope", "caching", "promise", "module"],
538
+ scope: "global",
539
+ content: `# Skill: Code Quality Anti-Patterns
540
+
541
+ ## Purpose
542
+ Prevent common coding mistakes that cause crashes, flaky behavior, memory leaks,
543
+ and hard-to-diagnose production failures. Every pattern below has caused real
544
+ outages — treat each as a hard rule, not a suggestion.
545
+
546
+ ---
547
+
548
+ ## 1. Module-Scope vs Function-Scope — Caching & Singletons
549
+
550
+ **Rule:** Variables that cache module-level state (lazy singletons, loaded
551
+ configs, memoized results) MUST be declared at **module scope**, never inside
552
+ a function that runs repeatedly.
553
+
554
+ ### Bad — re-initializes on every call
555
+ \`\`\`js
556
+ export async function handleRequest(req, res) {
557
+ let _engine; // ← reset to undefined on EVERY call
558
+ let _loaded = false; // ← never stays true across calls
559
+ if (!_loaded) {
560
+ _engine = await loadEngine();
561
+ _loaded = true;
562
+ }
563
+ // ...
564
+ }
565
+ \`\`\`
566
+
567
+ ### Good — persists across calls
568
+ \`\`\`js
569
+ let _engine;
570
+ let _loaded = false;
571
+
572
+ export async function handleRequest(req, res) {
573
+ if (!_loaded) {
574
+ _engine = await loadEngine();
575
+ _loaded = true;
576
+ }
577
+ // ...
578
+ }
579
+ \`\`\`
580
+
581
+ **Why:** Placing cache variables inside a function body causes:
582
+ - Repeated expensive initialization (import, parse, connect) on every call
583
+ - Log spam from repeated init messages
584
+ - Potential memory leaks from orphaned resources
585
+ - Race conditions when multiple concurrent calls all see \`_loaded === false\`
586
+
587
+ **Checklist:**
588
+ - [ ] Lazy singletons: module scope
589
+ - [ ] Memoization caches: module scope (or a \`Map\`/\`WeakMap\` at module scope)
590
+ - [ ] "loaded" / "initialized" flags: module scope
591
+ - [ ] Config objects read once from disk: module scope
592
+
593
+ ---
594
+
595
+ ## 2. Async Fire-and-Forget — Always Handle Rejections
596
+
597
+ **Rule:** NEVER use bare \`void asyncFn()\` or call an async function without
598
+ either \`await\`-ing or chaining \`.catch()\`. Unhandled promise rejections crash
599
+ Node.js processes.
600
+
601
+ ### Bad — unhandled rejection → crash
602
+ \`\`\`js
603
+ void dispatchEvent(data); // if dispatchEvent is async and throws → crash
604
+ asyncCleanup(); // no await, no catch → crash
605
+ \`\`\`
606
+
607
+ ### Good — always handle the rejection
608
+ \`\`\`js
609
+ await dispatchEvent(data); // preferred: await it
610
+ dispatchEvent(data).catch(() => {}); // fire-and-forget OK
611
+ dispatchEvent(data).catch(err => log.warn(err)); // fire-and-forget with logging
612
+ \`\`\`
613
+
614
+ **Why:** Since Node.js 15, unhandled promise rejections terminate the process
615
+ with exit code 1. A single \`void asyncFn()\` in a hot path can cause a
616
+ crash → restart → crash loop that takes down the entire system.
617
+
618
+ **Checklist:**
619
+ - [ ] Every async call is \`await\`-ed OR has a \`.catch()\` handler
620
+ - [ ] No bare \`void asyncFn()\` patterns
621
+ - [ ] Event dispatch functions wrapped in try/catch at the top level
622
+ - [ ] setInterval/setTimeout callbacks that call async functions use \`.catch()\`
623
+
624
+ ---
625
+
626
+ ## 3. Error Boundaries & Defensive Coding
627
+
628
+ **Rule:** Any function called from a hot path (HTTP handlers, event loops,
629
+ timers) MUST have a top-level try/catch that prevents a single failure from
630
+ crashing the entire process.
631
+
632
+ ### Bad — one bad event kills the server
633
+ \`\`\`js
634
+ router.post('/webhook', async (req, res) => {
635
+ const data = parsePayload(req.body);
636
+ await processAllWebhooks(data);
637
+ res.json({ ok: true });
638
+ });
639
+ \`\`\`
640
+
641
+ ### Good — contained failure
642
+ \`\`\`js
643
+ router.post('/webhook', async (req, res) => {
644
+ try {
645
+ const data = parsePayload(req.body);
646
+ await processAllWebhooks(data);
647
+ res.json({ ok: true });
648
+ } catch (err) {
649
+ log.error('webhook handler failed', err);
650
+ res.status(500).json({ error: 'internal' });
651
+ }
652
+ });
653
+ \`\`\`
654
+
655
+ ---
656
+
657
+ ## 4. Testing Anti-Patterns
658
+
659
+ ### Over-Mocking
660
+ **Rule:** Tests should validate real behavior, not just confirm that mocks
661
+ return what you told them to return.
662
+
663
+ - Mock only external boundaries (network, filesystem, clock).
664
+ - Never mock the module under test.
665
+ - If you need > 3 mocks for a single test, the code under test probably needs
666
+ refactoring, not more mocks.
667
+ - Prefer integration tests with real instances over unit tests with heavy mocking.
668
+
669
+ ### Flaky Tests
670
+ **Rule:** Tests must be deterministic and reproducible.
671
+
672
+ - No \`Math.random()\` or \`Date.now()\` without mocking.
673
+ - No network calls to real servers.
674
+ - No \`setTimeout\`/\`sleep\` for synchronization — use proper async patterns.
675
+ - No implicit ordering dependencies between tests.
676
+ - If a test creates global state, clean it up in \`afterEach\`.
677
+
678
+ ### Assertion Quality
679
+ - Test ONE behavior per test case.
680
+ - Assert on observable outputs, not internal state.
681
+ - Check error cases, not just happy paths.
682
+ - Use descriptive test names: \`parseDate_invalidInput_throwsError\`
683
+ not \`test parseDate 3\`.
684
+
685
+ ---
686
+
687
+ ## 5. Architectural Patterns
688
+
689
+ ### Initialization Guards
690
+ When a module has expensive async initialization, use a promise-based
691
+ deduplication pattern to prevent multiple concurrent initializations:
692
+
693
+ \`\`\`js
694
+ let _initPromise = null;
695
+
696
+ async function ensureInit() {
697
+ if (!_initPromise) {
698
+ _initPromise = doExpensiveInit(); // called ONCE
699
+ }
700
+ return _initPromise;
701
+ }
702
+ \`\`\`
703
+
704
+ ### Import/Require in Module Scope
705
+ Dynamic \`import()\` calls should be cached at module scope.
706
+ Never put \`import()\` inside a frequently-called function without caching.
707
+
708
+ ### Guard Clauses for Optional Features
709
+ When calling into optional subsystems (plugins, workflow engines, etc.),
710
+ always check that the subsystem is enabled before invoking:
711
+
712
+ \`\`\`js
713
+ if (!config.featureEnabled) return;
714
+ const engine = await getEngine();
715
+ if (!engine) return;
716
+ await engine.process(data);
717
+ \`\`\`
718
+
719
+ ---
720
+
721
+ ## Quick Reference: Red Flags in Code Review
722
+
723
+ | Pattern | Risk | Fix |
724
+ |---------|------|-----|
725
+ | \`let x\` inside function body used as cache | Re-init every call | Hoist to module scope |
726
+ | \`void asyncFn()\` | Unhandled rejection → crash | \`await\` or \`.catch()\` |
727
+ | Async callback without try/catch | Uncaught exception → crash | Wrap in try/catch |
728
+ | \`import()\` inside hot function, no cache | Repeated I/O, log spam | Cache at module scope |
729
+ | Test mocking the module under test | Test proves nothing | Mock only boundaries |
730
+ | \`setTimeout\`/\`sleep\` in tests | Flaky | Use async events/mocks |
731
+ | No error case tests | False confidence | Add negative test cases |
732
+ | \`git add .\` | Stages unrelated files | Stage files individually |
532
733
  `,
533
734
  },
534
735
  ];
@@ -241,6 +241,43 @@ function parseBooleanEnv(value, fallback = false) {
241
241
  if (["0", "false", "no", "off"].includes(key)) return false;
242
242
  return fallback;
243
243
  }
244
/**
 * Patterns that mark a gh failure as transient (worth retrying).
 *
 * Covers gateway/availability HTTP failures, low-level socket errors, and
 * JSON decoding errors caused by a response body that starts with "<".
 * All patterns are case-insensitive and non-global, so `.test()` is stateless.
 */
const GH_TRANSIENT_ERROR_PATTERNS = [
  /bad gateway/i,
  /service unavailable/i,
  /gateway timeout/i,
  /http\s*502/i,
  /http\s*503/i,
  /http\s*504/i,
  /econnreset/i,
  /econnrefused/i,
  /etimedout/i,
  /socket hang up/i,
  /network error/i,
  /temporarily unavailable/i,
  // Presumably emitted by gh itself when it receives HTML instead of JSON.
  /invalid character '<' looking for beginning of value/i,
  // JSON.parse wording differs by engine version:
  //   older:  Unexpected token < in JSON at position 0
  //   newer:  Unexpected token '<', "<html>..." is not valid JSON
  // The optional quote makes both forms match.
  /unexpected token '?</i,
];
260
+
261
/**
 * Pause for the given number of milliseconds.
 *
 * @param {number} ms - Delay passed straight to setTimeout.
 * @returns {Promise<void>} Promise that resolves (with no value) after the delay.
 */
function sleepMs(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
264
+
265
/**
 * Detect whether gh error text describes a rate-limit failure.
 *
 * Matching is case-insensitive. The text counts as a rate-limit error when it
 * mentions "rate limit" / "api rate limit exceeded", or when it contains both
 * "403" and "limit".
 *
 * @param {unknown} text - Error text (message/stderr); falsy values are "no".
 * @returns {boolean} True when the text looks like a rate-limit error.
 */
function isGhRateLimitError(text) {
  const haystack = String(text || "").toLowerCase();
  if (haystack.length === 0) {
    return false;
  }

  const mentions = (needle) => haystack.includes(needle);
  if (mentions("rate limit") || mentions("api rate limit exceeded")) {
    return true;
  }
  return mentions("403") && mentions("limit");
}
274
+
275
/**
 * Detect whether gh error text describes a transient (retryable) failure.
 *
 * Empty text and anything classified by isGhRateLimitError() is never
 * transient; otherwise the lower-cased text is transient when at least one
 * entry of GH_TRANSIENT_ERROR_PATTERNS matches it.
 *
 * @param {unknown} text - Error text (message/stderr/stdout).
 * @returns {boolean} True when the failure looks transient.
 */
function isGhTransientError(text) {
  const haystack = String(text || "").toLowerCase();
  if (haystack === "" || isGhRateLimitError(haystack)) {
    return false;
  }

  for (const pattern of GH_TRANSIENT_ERROR_PATTERNS) {
    if (pattern.test(haystack)) {
      return true;
    }
  }
  return false;
}
244
281
 
245
282
  function parseRepoSlug(raw) {
246
283
  const text = String(raw || "").trim().replace(/^https?:\/\/github\.com\//i, "");
@@ -1115,6 +1152,14 @@ class GitHubIssuesAdapter {
1115
1152
  // Rate limit retry delay (ms) — configurable for tests
1116
1153
  this._rateLimitRetryDelayMs =
1117
1154
  Number(process.env.GH_RATE_LIMIT_RETRY_MS) || 60_000;
1155
+ this._transientRetryDelayMs = Math.max(
1156
+ 250,
1157
+ Number(process.env.GH_TRANSIENT_RETRY_MS) || 2_000,
1158
+ );
1159
+ this._transientRetryMax = Math.max(
1160
+ 0,
1161
+ Number(process.env.GH_TRANSIENT_RETRY_MAX) || 2,
1162
+ );
1118
1163
  }
1119
1164
 
1120
1165
  /**
@@ -1891,46 +1936,84 @@ class GitHubIssuesAdapter {
1891
1936
  const execFileAsync = promisify(execFile);
1892
1937
 
1893
1938
  const attempt = async () => {
1894
- const { stdout, stderr } = await execFileAsync("gh", args, {
1895
- maxBuffer: 10 * 1024 * 1024,
1896
- timeout: 30_000,
1897
- });
1898
- return { stdout, stderr };
1939
+ try {
1940
+ const { stdout, stderr } = await execFileAsync("gh", args, {
1941
+ maxBuffer: 10 * 1024 * 1024,
1942
+ timeout: 30_000,
1943
+ });
1944
+ return { stdout, stderr };
1945
+ } catch (err) {
1946
+ const message = String(err?.message || err);
1947
+ const stdout = String(err?.stdout || "");
1948
+ const stderr = String(err?.stderr || "");
1949
+ const ghError = new Error(message);
1950
+ ghError.stdout = stdout;
1951
+ ghError.stderr = stderr;
1952
+ ghError.fullText = [message, stderr, stdout].filter(Boolean).join("\n");
1953
+ ghError.isRateLimit = isGhRateLimitError([message, stderr].join("\n"));
1954
+ ghError.isTransient = isGhTransientError([message, stderr, stdout].join("\n"));
1955
+ throw ghError;
1956
+ }
1899
1957
  };
1900
1958
 
1901
- let result;
1902
- try {
1903
- result = await attempt();
1904
- } catch (err) {
1905
- const errText = String(err?.message || err?.stderr || err).toLowerCase();
1906
- // Rate limit detection: "API rate limit exceeded" or HTTP 403
1907
- if (
1908
- errText.includes("rate limit") ||
1909
- errText.includes("api rate limit exceeded") ||
1910
- (errText.includes("403") && errText.includes("limit"))
1911
- ) {
1912
- console.warn(`${TAG} rate limit detected, waiting 60s before retry...`);
1913
- await new Promise((resolve) =>
1914
- setTimeout(resolve, this._rateLimitRetryDelayMs),
1915
- );
1916
- try {
1917
- result = await attempt();
1918
- } catch (retryErr) {
1919
- throw new Error(
1920
- `gh CLI failed (after rate limit retry): ${retryErr.message}`,
1959
+ let usedRateLimitRetry = false;
1960
+ let transientRetries = 0;
1961
+ const maxTransientRetries = Math.max(0, Number(this._transientRetryMax) || 0);
1962
+
1963
+ while (true) {
1964
+ let result;
1965
+ try {
1966
+ result = await attempt();
1967
+ } catch (err) {
1968
+ const message = String(err?.message || err);
1969
+ if (err?.isRateLimit && !usedRateLimitRetry) {
1970
+ usedRateLimitRetry = true;
1971
+ console.warn(
1972
+ `${TAG} rate limit detected, waiting ${this._rateLimitRetryDelayMs}ms before retry...`,
1921
1973
  );
1974
+ await sleepMs(this._rateLimitRetryDelayMs);
1975
+ continue;
1922
1976
  }
1923
- } else {
1924
- throw new Error(`gh CLI failed: ${err.message}`);
1977
+ if (err?.isTransient && transientRetries < maxTransientRetries) {
1978
+ transientRetries += 1;
1979
+ console.warn(
1980
+ `${TAG} transient gh failure (attempt ${transientRetries}/${maxTransientRetries}), retrying in ${this._transientRetryDelayMs}ms...`,
1981
+ );
1982
+ await sleepMs(this._transientRetryDelayMs);
1983
+ continue;
1984
+ }
1985
+ if (err?.isRateLimit && usedRateLimitRetry) {
1986
+ throw new Error(`gh CLI failed (after rate limit retry): ${message}`);
1987
+ }
1988
+ throw new Error(`gh CLI failed: ${message}`);
1925
1989
  }
1926
- }
1927
1990
 
1928
- const text = String(result.stdout || "").trim();
1929
- if (!parseJson) return text;
1930
- if (!text) return null;
1931
- return JSON.parse(text);
1932
- }
1991
+ const text = String(result?.stdout || "").trim();
1992
+ if (!parseJson) return text;
1993
+ if (!text) return null;
1933
1994
 
1995
+ try {
1996
+ return JSON.parse(text);
1997
+ } catch (err) {
1998
+ const parseMessage = String(err?.message || err);
1999
+ const parseContext = [parseMessage, result?.stderr || "", text.slice(0, 512)]
2000
+ .filter(Boolean)
2001
+ .join("\n");
2002
+ if (
2003
+ isGhTransientError(parseContext) &&
2004
+ transientRetries < maxTransientRetries
2005
+ ) {
2006
+ transientRetries += 1;
2007
+ console.warn(
2008
+ `${TAG} transient gh JSON parse failure (attempt ${transientRetries}/${maxTransientRetries}), retrying in ${this._transientRetryDelayMs}ms...`,
2009
+ );
2010
+ await sleepMs(this._transientRetryDelayMs);
2011
+ continue;
2012
+ }
2013
+ throw new Error(`gh CLI returned invalid JSON: ${parseMessage}`);
2014
+ }
2015
+ }
2016
+ }
1934
2017
  async _ensureLabelExists(label) {
1935
2018
  const name = String(label || "").trim();
1936
2019
  if (!name) return;
@@ -4865,3 +4948,5 @@ export async function unmarkTaskIgnored(taskId) {
4865
4948
  );
4866
4949
  return false;
4867
4950
  }
4951
+
4952
+