bosun 0.40.16 → 0.40.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -40,6 +40,8 @@
40
40
  */
41
41
 
42
42
  import { resolve, dirname } from "node:path";
43
+ import { existsSync, readFileSync } from "node:fs";
44
+ import { homedir } from "node:os";
43
45
  import { fileURLToPath } from "node:url";
44
46
  import { loadConfig } from "../config/config.mjs";
45
47
  import { resolveRepoRoot, resolveAgentRepoRoot } from "../config/repo-root.mjs";
@@ -566,16 +568,28 @@ function hasSdkPrerequisites(name, runtimeEnv = process.env) {
566
568
  }
567
569
 
568
570
  if (name === "codex") {
569
- // Codex needs an OpenAI API key (or Azure key, or profile-specific key)
571
+ // Codex needs an OpenAI API key (or Azure key, or profile-specific key),
572
+ // OR a valid ~/.codex/config.toml where an env_key reference is satisfied.
570
573
  const hasKey =
571
574
  runtimeEnv.OPENAI_API_KEY ||
572
575
  runtimeEnv.AZURE_OPENAI_API_KEY ||
573
576
  runtimeEnv.CODEX_MODEL_PROFILE_XL_API_KEY ||
574
577
  runtimeEnv.CODEX_MODEL_PROFILE_M_API_KEY;
575
- if (!hasKey) {
576
- return { ok: false, reason: "no API key (OPENAI_API_KEY / AZURE_OPENAI_API_KEY)" };
578
+ if (hasKey) return { ok: true, reason: null };
579
+ // Check ~/.codex/config.toml — the Codex CLI SDK reads auth env_key refs from there
580
+ try {
581
+ const configToml = resolve(homedir(), ".codex", "config.toml");
582
+ if (existsSync(configToml)) {
583
+ const tomlText = readFileSync(configToml, "utf8");
584
+ // Extract all env_key = "VAR_NAME" entries and check if any are set
585
+ for (const match of tomlText.matchAll(/env_key\s*=\s*"([^"]+)"/g)) {
586
+ if (runtimeEnv[match[1]]) return { ok: true, reason: null };
587
+ }
588
+ }
589
+ } catch {
590
+ // best effort — fall through to failure
577
591
  }
578
- return { ok: true, reason: null };
592
+ return { ok: false, reason: "no API key (OPENAI_API_KEY / AZURE_OPENAI_API_KEY) and no satisfied env_key in ~/.codex/config.toml" };
579
593
  }
580
594
  if (name === "copilot") {
581
595
  // Copilot auth can come from multiple sources (OAuth manager, gh auth,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "bosun",
3
- "version": "0.40.16",
3
+ "version": "0.40.18",
4
4
  "description": "Bosun Autonomous Engineering — manages AI agent executors with failover, extremely powerful workflow builder, and a massive amount of included default workflow templates for autonomous engineering, creates PRs via Vibe-Kanban API, and sends Telegram notifications. Supports N executors with weighted distribution, multi-repo projects, and auto-setup.",
5
5
  "type": "module",
6
6
  "license": "Apache-2.0",
@@ -2675,7 +2675,59 @@ export class WorkflowEngine extends EventEmitter {
2675
2675
 
2676
2676
  console.log(`${TAG} Resuming ${runs.length} interrupted run(s)...`);
2677
2677
 
2678
+ // ── Deduplicate by taskId: keep only the most recent run per task ────
2679
+ // After N crash/restart cycles, N run entries accumulate for the same
2680
+ // taskId. Resuming all of them causes competing workflow runs that race
2681
+ // to claim the task → "claim was stolen" errors on every restart.
2682
+ // Solution: pre-scan detail files, keep latest startedAt per taskId,
2683
+ // and mark older duplicates as not-resumable before we even try them.
2684
+ const runDetailCache = new Map(); // runId → parsed detail
2685
+ const latestByTaskId = new Map(); // taskId → run entry (highest startedAt)
2686
+
2687
+ for (const run of runs) {
2688
+ const dp = resolve(this.runsDir, `${run.runId}.json`);
2689
+ if (!existsSync(dp)) continue;
2690
+ try {
2691
+ const d = JSON.parse(readFileSync(dp, "utf8"));
2692
+ runDetailCache.set(run.runId, d);
2693
+ const tid = d.data?.taskId || d.inputData?.taskId;
2694
+ if (!tid) continue;
2695
+ const prev = latestByTaskId.get(tid);
2696
+ if (!prev || (run.startedAt || 0) >= (prev.startedAt || 0)) {
2697
+ latestByTaskId.set(tid, run);
2698
+ }
2699
+ } catch {
2700
+ /* unreadable detail — handled in the main loop below */
2701
+ }
2702
+ }
2703
+
2704
+ // Mark older duplicate runs as not-resumable before entering the loop
2705
+ let dedupedCount = 0;
2706
+ for (const run of runs) {
2707
+ const d = runDetailCache.get(run.runId);
2708
+ const tid = d?.data?.taskId || d?.inputData?.taskId;
2709
+ if (!tid) continue;
2710
+ const latest = latestByTaskId.get(tid);
2711
+ if (latest && latest.runId !== run.runId) {
2712
+ this._markRunUnresumable(run.runId, "duplicate_task_run");
2713
+ dedupedCount++;
2714
+ }
2715
+ }
2716
+ if (dedupedCount > 0) {
2717
+ console.log(
2718
+ `${TAG} Skipped ${dedupedCount} duplicate interrupted run(s) (kept latest per taskId)`,
2719
+ );
2720
+ }
2721
+
2678
2722
  for (const run of runs) {
2723
+ // Skip runs that were marked as duplicates above
2724
+ const _runDetail = runDetailCache.get(run.runId);
2725
+ const _tid = _runDetail?.data?.taskId || _runDetail?.inputData?.taskId;
2726
+ if (_tid) {
2727
+ const latest = latestByTaskId.get(_tid);
2728
+ if (latest && latest.runId !== run.runId) continue;
2729
+ }
2730
+
2679
2731
  try {
2680
2732
  // Check if the workflow definition still exists
2681
2733
  const def = this.get(run.workflowId);
@@ -2693,7 +2745,8 @@ export class WorkflowEngine extends EventEmitter {
2693
2745
  continue;
2694
2746
  }
2695
2747
 
2696
- const detail = JSON.parse(readFileSync(detailPath, "utf8"));
2748
+ // Reuse cached detail if available (already parsed above)
2749
+ const detail = runDetailCache.get(run.runId) ?? JSON.parse(readFileSync(detailPath, "utf8"));
2697
2750
  const nodeStatuses = detail.nodeStatuses || {};
2698
2751
  const hasCompletedNodes = Object.values(nodeStatuses).some(
2699
2752
  (s) => s === NodeStatus.COMPLETED,
@@ -131,15 +131,22 @@ async function loadRegistry(registryPath) {
131
131
  const registry = JSON.parse(content);
132
132
 
133
133
  // Validate structure
134
- if (
135
- !registry.version ||
136
- !registry.tasks ||
137
- typeof registry.tasks !== "object"
138
- ) {
134
+ // Repair instead of wipe: preserve any valid task entries while fixing
135
+ // missing/invalid structural fields. Wiping on minor corruption was causing
136
+ // active claims to be lost, leading to cascading "claim was stolen" failures.
137
+ let repaired = false;
138
+ if (!registry.version) {
139
+ registry.version = REGISTRY_VERSION;
140
+ repaired = true;
141
+ }
142
+ if (!registry.tasks || typeof registry.tasks !== "object" || Array.isArray(registry.tasks)) {
143
+ registry.tasks = {};
144
+ repaired = true;
145
+ }
146
+ if (repaired) {
139
147
  console.warn(
140
- "[SharedStateManager] Invalid registry structure, resetting",
148
+ "[SharedStateManager] Invalid registry structure, repaired (preserved existing task entries)",
141
149
  );
142
- return createEmptyRegistry();
143
150
  }
144
151
 
145
152
  return registry;