npm - @os-eco/overstory-cli - Versions diffs - 0.7.7 → 0.7.9 - Mend

@os-eco/overstory-cli 0.7.7 → 0.7.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

package/README.md +105 -3
package/package.json +1 -1
package/src/agents/manifest.test.ts +168 -1
package/src/agents/manifest.ts +23 -2
package/src/commands/agents.ts +1 -0
package/src/commands/coordinator.test.ts +131 -2
package/src/commands/coordinator.ts +40 -9
package/src/commands/costs.test.ts +5 -0
package/src/commands/costs.ts +1 -1
package/src/commands/init.test.ts +1 -0
package/src/commands/init.ts +1 -0
package/src/commands/log.ts +2 -0
package/src/commands/prime.test.ts +1 -0
package/src/commands/sling.test.ts +63 -1
package/src/commands/sling.ts +37 -2
package/src/config.test.ts +68 -0
package/src/config.ts +16 -0
package/src/doctor/structure.test.ts +1 -0
package/src/doctor/structure.ts +1 -0
package/src/index.ts +2 -1
package/src/metrics/pricing.test.ts +258 -0
package/src/metrics/store.test.ts +227 -0
package/src/metrics/store.ts +40 -5
package/src/runtimes/gemini.test.ts +537 -0
package/src/runtimes/gemini.ts +235 -0
package/src/runtimes/registry.test.ts +15 -1
package/src/runtimes/registry.ts +2 -0
package/src/schema-consistency.test.ts +1 -0
package/src/types.ts +8 -0
package/src/worktree/tmux.test.ts +49 -0
package/src/worktree/tmux.ts +33 -0

package/README.md CHANGED Viewed

@@ -6,7 +6,7 @@ Multi-agent orchestration for AI coding agents.
 [![CI](https://github.com/jayminwest/overstory/actions/workflows/ci.yml/badge.svg)](https://github.com/jayminwest/overstory/actions/workflows/ci.yml)
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
-Overstory turns a single coding session into a multi-agent team by spawning worker agents in git worktrees via tmux, coordinating them through a custom SQLite mail system, and merging their work back with tiered conflict resolution. A pluggable `AgentRuntime` interface lets you swap between runtimes — Claude Code, [Pi](https://github.com/badlogic/pi-mono/tree/main/packages/coding-agent), or your own adapter.
+Overstory turns a single coding session into a multi-agent team by spawning worker agents in git worktrees via tmux, coordinating them through a custom SQLite mail system, and merging their work back with tiered conflict resolution. A pluggable `AgentRuntime` interface lets you swap between runtimes — Claude Code, [Pi](https://github.com/badlogic/pi-mono/tree/main/packages/coding-agent), [Gemini CLI](https://github.com/google-gemini/gemini-cli), or your own adapter.
 > **Warning: Agent swarms are not a universal solution.** Do not deploy Overstory without understanding the risks of multi-agent orchestration — compounding error rates, cost amplification, debugging complexity, and merge conflicts are the normal case, not edge cases. Read [STEELMAN.md](STEELMAN.md) for a full risk analysis and the [Agentic Engineering Book](https://github.com/jayminwest/agentic-engineering-book) ([web version](https://jayminwest.com/agentic-engineering-book)) before using this tool in production.
@@ -18,6 +18,7 @@ Requires [Bun](https://bun.sh) v1.0+, git, and tmux. At least one supported agen
 - [Pi](https://github.com/badlogic/pi-mono/tree/main/packages/coding-agent) (`pi` CLI)
 - [GitHub Copilot](https://github.com/features/copilot) (`copilot` CLI)
 - [Codex](https://github.com/openai/codex) (`codex` CLI)
+- [Gemini CLI](https://github.com/google-gemini/gemini-cli) (`gemini` CLI)
 ```bash
 bun install -g @os-eco/overstory-cli
@@ -80,7 +81,7 @@ Every command supports `--json` where noted. Global flags: `-q`/`--quiet`, `--ti
 | Command | Description |
 |---------|-------------|
 | `ov init` | Initialize `.overstory/` and bootstrap os-eco tools (`--yes`, `--name`, `--tools`, `--skip-mulch`, `--skip-seeds`, `--skip-canopy`, `--skip-onboard`, `--json`) |
-| `ov sling <task-id>` | Spawn a worker agent (`--capability`, `--name`, `--spec`, `--files`, `--parent`, `--depth`, `--skip-scout`, `--skip-review`, `--max-agents`, `--dispatch-max-agents`, `--skip-task-check`, `--no-scout-check`, `--runtime`, `--json`) |
+| `ov sling <task-id>` | Spawn a worker agent (`--capability`, `--name`, `--spec`, `--files`, `--parent`, `--depth`, `--skip-scout`, `--skip-review`, `--max-agents`, `--dispatch-max-agents`, `--skip-task-check`, `--no-scout-check`, `--runtime`, `--base-branch`, `--json`) |
 | `ov stop <agent-name>` | Terminate a running agent (`--clean-worktree`, `--json`) |
 | `ov prime` | Load context for orchestrator/agent (`--agent`, `--compact`) |
 | `ov spec write <task-id>` | Write a task specification (`--body`) |
@@ -175,6 +176,7 @@ Overstory is runtime-agnostic. The `AgentRuntime` interface (`src/runtimes/types
 | Pi | `pi` | `.pi/extensions/` guard extension | Active development |
 | Copilot | `copilot` | (none — `--allow-all-tools`) | Active development |
 | Codex | `codex` | OS-level sandbox (Seatbelt/Landlock) | Active development |
+| Gemini | `gemini` | `--sandbox` flag | Active development |
 ## How It Works
@@ -271,7 +273,7 @@ overstory/
     metrics/                      SQLite metrics + pricing + transcript parsing
     doctor/                       Health check modules (11 checks)
     insights/                     Session insight analyzer for auto-expertise
-    runtimes/                     AgentRuntime abstraction (registry + adapters: Claude, Pi, Copilot, Codex)
+    runtimes/                     AgentRuntime abstraction (registry + adapters: Claude, Pi, Copilot, Codex, Gemini)
     tracker/                      Pluggable task tracker (beads + seeds backends)
     mulch/                        mulch client (programmatic API + CLI wrapper)
     e2e/                          End-to-end lifecycle tests
@@ -279,6 +281,106 @@ overstory/
   templates/                      Templates for overlays and hooks
 ```
+## Configuration
+### Gateway Providers
+Route agent API calls through custom gateway endpoints (z.ai, OpenRouter, self-hosted proxies). Configure providers in `.overstory/config.yaml`:
+```yaml
+providers:
+  anthropic:
+    type: native
+  zai:
+    type: gateway
+    baseUrl: https://api.z.ai/v1
+    authTokenEnv: ZAI_API_KEY
+  openrouter:
+    type: gateway
+    baseUrl: https://openrouter.ai/api/v1
+    authTokenEnv: OPENROUTER_API_KEY
+models:
+  builder: zai/claude-sonnet-4-6
+  scout: openrouter/openai/gpt-4o
+```
+**How it works:** Model refs use `provider/model-id` format. Overstory sets `ANTHROPIC_BASE_URL` to the gateway `baseUrl`, `ANTHROPIC_AUTH_TOKEN` from the env var named in `authTokenEnv`, and `ANTHROPIC_API_KEY=""` to prevent direct Anthropic calls. The agent receives `"sonnet"` as a model alias and Claude Code routes via env vars.
+**Environment variable notes:**
+- `ANTHROPIC_AUTH_TOKEN` and `ANTHROPIC_API_KEY` are mutually exclusive per-agent
+- Gateway agents get `ANTHROPIC_API_KEY=""` and `ANTHROPIC_AUTH_TOKEN` from provider config
+- Direct Anthropic API calls (merge resolver, watchdog triage) still need `ANTHROPIC_API_KEY` in the orchestrator env
+**Validation:** `ov doctor --category providers` checks reachability, auth tokens, model-provider refs, and tool-use compatibility.
+**`ProviderConfig` fields:**
+| Field | Type | Required | Description |
+|-------|------|----------|-------------|
+| `type` | `native` or `gateway` | Yes | Provider type |
+| `baseUrl` | string | Gateway only | API endpoint URL |
+| `authTokenEnv` | string | Gateway only | Env var name holding auth token |
+## Troubleshooting
+### Coordinator died during startup
+This error means the coordinator tmux session exited before the TUI became ready. The most common cause is slow shell initialization.
+**Step 1: Measure shell startup time**
+```bash
+time zsh -i -c exit   # For zsh
+time bash -i -c exit  # For bash
+```
+If startup takes more than 1 second, slow shell init is likely the cause.
+**Step 2: Common slow-startup causes**
+| Cause | Typical delay | Fix |
+|-------|---------------|-----|
+| oh-my-zsh with many plugins | 1-5s | Reduce plugins, or switch to a lighter framework (e.g. zinit with lazy loading) |
+| nvm (Node Version Manager) | 1-3s | Use `--no-use` + lazy-load nvm, or switch to fnm/volta |
+| pyenv init | 0.5-2s | Lazy-load pyenv |
+| rbenv init | 0.5-1s | Lazy-load rbenv |
+| starship prompt | 0.5-1s | Check starship timings |
+| conda auto-activate | 1-3s | `auto_activate_base: false` in `.condarc` |
+| Homebrew shellenv | 0.5-1s | Cache output instead of evaluating every shell start |
+**Step 3: Configure `shellInitDelayMs`** in `.overstory/config.yaml`:
+```yaml
+runtime:
+  shellInitDelayMs: 3000
+```
+- Default: `0` (no delay)
+- Typical values: `1000`–`5000` depending on shell startup time
+- Values above `30000` (30s) trigger a warning
+- Inserts a delay between tmux session creation and TUI readiness polling
+**Step 4: Optimization examples**
+Lazy-load nvm (add to `~/.zshrc` or `~/.bashrc`):
+```bash
+# Lazy-load nvm — only activates when you first call nvm/node/npm
+export NVM_DIR="$HOME/.nvm"
+nvm() { unset -f nvm node npm npx; [ -s "$NVM_DIR/nvm.sh" ] && . "$NVM_DIR/nvm.sh"; nvm "$@"; }
+node() { unset -f nvm node npm npx; [ -s "$NVM_DIR/nvm.sh" ] && . "$NVM_DIR/nvm.sh"; node "$@"; }
+npm()  { unset -f nvm node npm npx; [ -s "$NVM_DIR/nvm.sh" ] && . "$NVM_DIR/nvm.sh"; npm  "$@"; }
+npx()  { unset -f nvm node npm npx; [ -s "$NVM_DIR/nvm.sh" ] && . "$NVM_DIR/nvm.sh"; npx  "$@"; }
+```
+Reduce oh-my-zsh plugins (edit `~/.zshrc`):
+```bash
+# Before: plugins=(git zsh-autosuggestions zsh-syntax-highlighting node npm python ruby rbenv pyenv ...)
+# After: keep only what you use regularly
+plugins=(git)
+```
 ## Part of os-eco
 Overstory is part of the [os-eco](https://github.com/jayminwest/os-eco) AI agent tooling ecosystem.

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
 	"name": "@os-eco/overstory-cli",
-	"version": "0.7.7",
+	"version": "0.7.9",
 	"description": "Multi-agent orchestration for AI coding agents — spawn workers in git worktrees via tmux, coordinate through SQLite mail, merge with tiered conflict resolution. Pluggable runtime adapters for Claude Code, Pi, and more.",
 	"author": "Jaymin West",
 	"license": "MIT",

package/src/agents/manifest.test.ts CHANGED Viewed

@@ -5,7 +5,12 @@ import { join } from "node:path";
 import { AgentError } from "../errors.ts";
 import { cleanupTempDir } from "../test-helpers.ts";
 import type { AgentManifest, OverstoryConfig } from "../types.ts";
-import { createManifestLoader, resolveModel, resolveProviderEnv } from "./manifest.ts";
+import {
+	createManifestLoader,
+	expandAliasFromEnv,
+	resolveModel,
+	resolveProviderEnv,
+} from "./manifest.ts";
 const VALID_MANIFEST = {
 	version: "1.0",
@@ -673,6 +678,168 @@ describe("resolveModel", () => {
 	});
 });
+describe("expandAliasFromEnv", () => {
+	test("returns expanded model ID when env var is set", () => {
+		expect(
+			expandAliasFromEnv("haiku", {
+				ANTHROPIC_DEFAULT_HAIKU_MODEL: "us.anthropic.claude-3-5-haiku-20241022-v1:0",
+			}),
+		).toBe("us.anthropic.claude-3-5-haiku-20241022-v1:0");
+	});
+	test("returns alias unchanged when env var is unset", () => {
+		expect(expandAliasFromEnv("haiku", {})).toBe("haiku");
+	});
+	test("expands all three aliases via their env vars", () => {
+		const env = {
+			ANTHROPIC_DEFAULT_HAIKU_MODEL: "bedrock-haiku-id",
+			ANTHROPIC_DEFAULT_SONNET_MODEL: "bedrock-sonnet-id",
+			ANTHROPIC_DEFAULT_OPUS_MODEL: "bedrock-opus-id",
+		};
+		expect(expandAliasFromEnv("haiku", env)).toBe("bedrock-haiku-id");
+		expect(expandAliasFromEnv("sonnet", env)).toBe("bedrock-sonnet-id");
+		expect(expandAliasFromEnv("opus", env)).toBe("bedrock-opus-id");
+	});
+	test("trims whitespace from env var value", () => {
+		expect(
+			expandAliasFromEnv("sonnet", {
+				ANTHROPIC_DEFAULT_SONNET_MODEL: "  bedrock-sonnet-id  ",
+			}),
+		).toBe("bedrock-sonnet-id");
+	});
+	test("returns alias when env var is empty string", () => {
+		expect(expandAliasFromEnv("sonnet", { ANTHROPIC_DEFAULT_SONNET_MODEL: "" })).toBe("sonnet");
+	});
+	test("returns alias when env var is whitespace only", () => {
+		expect(expandAliasFromEnv("sonnet", { ANTHROPIC_DEFAULT_SONNET_MODEL: "   " })).toBe("sonnet");
+	});
+	test("returns unknown alias unchanged", () => {
+		expect(expandAliasFromEnv("gpt-4", {})).toBe("gpt-4");
+	});
+});
+describe("resolveModel env var expansion", () => {
+	const baseManifest: AgentManifest = {
+		version: "1.0",
+		agents: {
+			scout: {
+				file: "scout.md",
+				model: "haiku",
+				tools: ["Read"],
+				capabilities: ["explore"],
+				canSpawn: false,
+				constraints: [],
+			},
+			builder: {
+				file: "builder.md",
+				model: "sonnet",
+				tools: ["Read", "Write"],
+				capabilities: ["implement"],
+				canSpawn: false,
+				constraints: [],
+			},
+		},
+		capabilityIndex: { explore: ["scout"], implement: ["builder"] },
+	};
+	function makeConfig(models: OverstoryConfig["models"] = {}): OverstoryConfig {
+		return {
+			project: { name: "test", root: "/tmp/test", canonicalBranch: "main" },
+			agents: {
+				manifestPath: ".overstory/agent-manifest.json",
+				baseDir: ".overstory/agent-defs",
+				maxConcurrent: 5,
+				staggerDelayMs: 1000,
+				maxDepth: 2,
+				maxSessionsPerRun: 0,
+				maxAgentsPerLead: 5,
+			},
+			worktrees: { baseDir: ".overstory/worktrees" },
+			taskTracker: { backend: "auto", enabled: false },
+			mulch: { enabled: false, domains: [], primeFormat: "markdown" },
+			merge: { aiResolveEnabled: false, reimagineEnabled: false },
+			providers: { anthropic: { type: "native" } },
+			watchdog: {
+				tier0Enabled: false,
+				tier0IntervalMs: 30000,
+				tier1Enabled: false,
+				tier2Enabled: false,
+				staleThresholdMs: 300000,
+				zombieThresholdMs: 600000,
+				nudgeIntervalMs: 60000,
+			},
+			models,
+			logging: { verbose: false, redactSecrets: true },
+		};
+	}
+	test("expands alias when env var is set", () => {
+		const saved = process.env.ANTHROPIC_DEFAULT_HAIKU_MODEL;
+		process.env.ANTHROPIC_DEFAULT_HAIKU_MODEL = "us.anthropic.claude-3-5-haiku-20241022-v1:0";
+		try {
+			const result = resolveModel(makeConfig(), baseManifest, "scout", "sonnet");
+			expect(result).toEqual({ model: "us.anthropic.claude-3-5-haiku-20241022-v1:0" });
+		} finally {
+			if (saved === undefined) {
+				delete process.env.ANTHROPIC_DEFAULT_HAIKU_MODEL;
+			} else {
+				process.env.ANTHROPIC_DEFAULT_HAIKU_MODEL = saved;
+			}
+		}
+	});
+	test("passes alias through when env var is unset", () => {
+		const saved = process.env.ANTHROPIC_DEFAULT_HAIKU_MODEL;
+		delete process.env.ANTHROPIC_DEFAULT_HAIKU_MODEL;
+		try {
+			const result = resolveModel(makeConfig(), baseManifest, "scout", "sonnet");
+			expect(result).toEqual({ model: "haiku" });
+		} finally {
+			if (saved !== undefined) {
+				process.env.ANTHROPIC_DEFAULT_HAIKU_MODEL = saved;
+			}
+		}
+	});
+	test("config override to full model ID is not affected by env vars", () => {
+		const saved = process.env.ANTHROPIC_DEFAULT_SONNET_MODEL;
+		process.env.ANTHROPIC_DEFAULT_SONNET_MODEL = "bedrock-sonnet";
+		try {
+			// Config overrides to a direct model string (not an alias)
+			const config = makeConfig({ builder: "claude-3-5-sonnet-20241022" });
+			const result = resolveModel(config, baseManifest, "builder", "haiku");
+			expect(result).toEqual({ model: "claude-3-5-sonnet-20241022" });
+		} finally {
+			if (saved === undefined) {
+				delete process.env.ANTHROPIC_DEFAULT_SONNET_MODEL;
+			} else {
+				process.env.ANTHROPIC_DEFAULT_SONNET_MODEL = saved;
+			}
+		}
+	});
+	test("config override to alias also expands via env var", () => {
+		const saved = process.env.ANTHROPIC_DEFAULT_OPUS_MODEL;
+		process.env.ANTHROPIC_DEFAULT_OPUS_MODEL = "bedrock-opus-id";
+		try {
+			const config = makeConfig({ scout: "opus" });
+			const result = resolveModel(config, baseManifest, "scout", "haiku");
+			expect(result).toEqual({ model: "bedrock-opus-id" });
+		} finally {
+			if (saved === undefined) {
+				delete process.env.ANTHROPIC_DEFAULT_OPUS_MODEL;
+			} else {
+				process.env.ANTHROPIC_DEFAULT_OPUS_MODEL = saved;
+			}
+		}
+	});
+});
 describe("resolveProviderEnv", () => {
 	test("returns null for unknown provider", () => {
 		const result = resolveProviderEnv("unknown", "some/model", {});

package/src/agents/manifest.ts CHANGED Viewed

@@ -34,6 +34,27 @@ interface RawManifest {
 const MODEL_ALIASES = new Set(["sonnet", "opus", "haiku"]);
+// Maps each model alias to the ANTHROPIC_DEFAULT_{ALIAS}_MODEL env var whose value, when set, replaces the alias.
+const ALIAS_ENV_VARS: Record<string, string> = {
+	haiku: "ANTHROPIC_DEFAULT_HAIKU_MODEL",
+	sonnet: "ANTHROPIC_DEFAULT_SONNET_MODEL",
+	opus: "ANTHROPIC_DEFAULT_OPUS_MODEL",
+};
+/**
+ * Expand a model alias via its corresponding ANTHROPIC_DEFAULT_{ALIAS}_MODEL env var.
+ *
+ * The env var value is trimmed before use; an unset, empty, or whitespace-only
+ * value counts as "not set" and the alias is returned unchanged. Strings that
+ * have no entry in ALIAS_ENV_VARS are returned as-is without consulting `env`.
+ *
+ * @param alias - Model alias to expand (e.g. "haiku", "sonnet", "opus").
+ * @param env - Environment map to read from; defaults to `process.env`.
+ * @returns The trimmed env var value when it is non-empty, otherwise `alias`.
+ */
+export function expandAliasFromEnv(
+	alias: string,
+	env: Record<string, string | undefined> = process.env as Record<string, string | undefined>,
+): string {
+	const envVar = ALIAS_ENV_VARS[alias];
+	if (!envVar) return alias;
+	const value = env[envVar];
+	return value?.trim() || alias;
+}
 /**
  * Validate that a raw parsed object conforms to the AgentDefinition shape.
  * Returns a list of error messages for any violations.
@@ -333,9 +354,9 @@ export function resolveModel(
 	const configModel = config.models[role];
 	const rawModel = configModel ?? manifest.agents[role]?.model ?? fallback;
-	// Simple alias — no provider env needed
+	// Simple alias — expand via env var if set (e.g. ANTHROPIC_DEFAULT_SONNET_MODEL)
 	if (MODEL_ALIASES.has(rawModel)) {
-		return { model: rawModel };
+		return { model: expandAliasFromEnv(rawModel) };
 	}
 	// Provider-prefixed: split on first "/" to get provider name and model ID

package/src/commands/agents.ts CHANGED Viewed

@@ -34,6 +34,7 @@ export interface DiscoveredAgent {
 const KNOWN_INSTRUCTION_PATHS = [
 	join(".claude", "CLAUDE.md"), // Claude Code, Pi
 	"AGENTS.md", // Codex (future)
+	"GEMINI.md", // Gemini CLI
 ];
 /**

package/src/commands/coordinator.test.ts CHANGED Viewed

@@ -38,6 +38,7 @@ interface TmuxCallTracker {
 		env?: Record<string, string>;
 	}>;
 	isSessionAlive: Array<{ name: string; result: boolean }>;
+	checkSessionState: Array<{ name: string; result: "alive" | "dead" | "no_server" }>;
 	killSession: Array<{ name: string }>;
 	sendKeys: Array<{ name: string; keys: string }>;
 	waitForTuiReady: Array<{ name: string }>;
@@ -68,6 +69,7 @@ function makeFakeTmux(
 	options: {
 		waitForTuiReadyResult?: boolean;
 		ensureTmuxAvailableError?: Error;
+		checkSessionStateMap?: Record<string, "alive" | "dead" | "no_server">;
 	} = {},
 ): {
 	tmux: NonNullable<CoordinatorDeps["_tmux"]>;
@@ -76,6 +78,7 @@ function makeFakeTmux(
 	const calls: TmuxCallTracker = {
 		createSession: [],
 		isSessionAlive: [],
+		checkSessionState: [],
 		killSession: [],
 		sendKeys: [],
 		waitForTuiReady: [],
@@ -97,6 +100,13 @@ function makeFakeTmux(
 			calls.isSessionAlive.push({ name, result: alive });
 			return alive;
 		},
+		checkSessionState: async (name: string): Promise<"alive" | "dead" | "no_server"> => {
+			const stateMap = options.checkSessionStateMap ?? {};
+			// Default: derive from sessionAliveMap for backwards compat
+			const state = stateMap[name] ?? (sessionAliveMap[name] ? "alive" : "dead");
+			calls.checkSessionState.push({ name, result: state });
+			return state;
+		},
 		killSession: async (name: string): Promise<void> => {
 			calls.killSession.push({ name });
 		},
@@ -325,7 +335,11 @@ function makeDeps(
 	sessionAliveMap: Record<string, boolean> = {},
 	watchdogConfig?: { running?: boolean; startSuccess?: boolean; stopSuccess?: boolean },
 	monitorConfig?: { running?: boolean; startSuccess?: boolean; stopSuccess?: boolean },
-	tmuxOptions?: { waitForTuiReadyResult?: boolean; ensureTmuxAvailableError?: Error },
+	tmuxOptions?: {
+		waitForTuiReadyResult?: boolean;
+		ensureTmuxAvailableError?: Error;
+		checkSessionStateMap?: Record<string, "alive" | "dead" | "no_server">;
+	},
 ): {
 	deps: CoordinatorDeps;
 	calls: TmuxCallTracker;
@@ -606,7 +620,7 @@ describe("startCoordinator", () => {
 	test("rejects duplicate when coordinator is already running", async () => {
 		// Write an existing active coordinator session
-		const existing = makeCoordinatorSession({ state: "working" });
+		const existing = makeCoordinatorSession({ state: "working", pid: process.pid });
 		saveSessionsToDb([existing]);
 		// Mock tmux as alive for the existing session
@@ -623,6 +637,29 @@ describe("startCoordinator", () => {
 		}
 	});
+	test("rejects duplicate when pid is null but tmux session is alive", async () => {
+		// Session has null pid (e.g. migrated from older schema) but tmux is alive.
+		// Cannot prove it's a zombie without a pid, so treat as active.
+		const existing = makeCoordinatorSession({ state: "working", pid: null });
+		saveSessionsToDb([existing]);
+		const { deps } = makeDeps(
+			{ "overstory-test-project-coordinator": true },
+			undefined,
+			undefined,
+			{ checkSessionStateMap: { "overstory-test-project-coordinator": "alive" } },
+		);
+		try {
+			await coordinatorCommand(["start"], deps);
+			expect(true).toBe(false); // Should have thrown
+		} catch (err) {
+			expect(err).toBeInstanceOf(AgentError);
+			const ae = err as AgentError;
+			expect(ae.message).toContain("already running");
+		}
+	});
 	test("cleans up dead session and starts new one", async () => {
 		// Write an existing session that claims to be working
 		const deadSession = makeCoordinatorSession({
@@ -656,6 +693,98 @@ describe("startCoordinator", () => {
 		expect(newSession?.id).not.toBe("session-dead-coordinator");
 	});
+	test("cleans up zombie session when tmux alive but PID dead", async () => {
+		// Session is "working" in DB, tmux session exists, but the PID is dead
+		const zombieSession = makeCoordinatorSession({
+			id: "session-zombie-coordinator",
+			state: "working",
+			pid: 999999, // Non-existent PID
+		});
+		saveSessionsToDb([zombieSession]);
+		// Tmux session is alive (pane exists) but PID 999999 is not running
+		const { deps } = makeDeps(
+			{ "overstory-test-project-coordinator": true },
+			undefined,
+			undefined,
+			{ checkSessionStateMap: { "overstory-test-project-coordinator": "alive" } },
+		);
+		const originalSleep = Bun.sleep;
+		Bun.sleep = (() => Promise.resolve()) as typeof Bun.sleep;
+		try {
+			await captureStdout(() => coordinatorCommand(["start"], deps));
+		} finally {
+			Bun.sleep = originalSleep;
+		}
+		// Zombie session should be cleaned up and new one created
+		const sessions = loadSessionsFromDb();
+		expect(sessions).toHaveLength(1);
+		const newSession = sessions[0];
+		expect(newSession?.state).toBe("booting");
+		expect(newSession?.id).not.toBe("session-zombie-coordinator");
+	});
+	test("cleans up stale session when tmux server is not running", async () => {
+		// Session is "booting" in DB but tmux server crashed
+		const staleSession = makeCoordinatorSession({
+			id: "session-stale-coordinator",
+			state: "booting",
+		});
+		saveSessionsToDb([staleSession]);
+		// checkSessionState returns no_server
+		const { deps } = makeDeps(
+			{ "overstory-test-project-coordinator": false },
+			undefined,
+			undefined,
+			{ checkSessionStateMap: { "overstory-test-project-coordinator": "no_server" } },
+		);
+		const originalSleep = Bun.sleep;
+		Bun.sleep = (() => Promise.resolve()) as typeof Bun.sleep;
+		try {
+			await captureStdout(() => coordinatorCommand(["start"], deps));
+		} finally {
+			Bun.sleep = originalSleep;
+		}
+		// Stale session cleaned up, new one created
+		const sessions = loadSessionsFromDb();
+		expect(sessions).toHaveLength(1);
+		const newSession = sessions[0];
+		expect(newSession?.state).toBe("booting");
+		expect(newSession?.id).not.toBe("session-stale-coordinator");
+	});
+	test("respects shellInitDelayMs config before polling TUI readiness", async () => {
+		// Append shellInitDelayMs to existing config (preserve tier2Enabled etc.)
+		const configPath = join(tempDir, ".overstory", "config.yaml");
+		const existing = await Bun.file(configPath).text();
+		await Bun.write(configPath, `${existing}\nruntime:\n  shellInitDelayMs: 500\n`);
+		const { deps } = makeDeps();
+		const sleepCalls: number[] = [];
+		const originalSleep = Bun.sleep;
+		Bun.sleep = ((ms: number | Date) => {
+			if (typeof ms === "number") sleepCalls.push(ms);
+			return Promise.resolve();
+		}) as typeof Bun.sleep;
+		try {
+			await captureStdout(() => coordinatorCommand(["start"], deps));
+		} finally {
+			Bun.sleep = originalSleep;
+		}
+		// The 500ms shell init delay should appear in the sleep calls
+		expect(sleepCalls).toContain(500);
+	});
 	test("throws AgentError when tmux is not available", async () => {
 		const { deps } = makeDeps({}, undefined, undefined, {
 			ensureTmuxAvailableError: new AgentError(

package/src/commands/coordinator.ts CHANGED Viewed

@@ -27,7 +27,9 @@ import { createRunStore } from "../sessions/store.ts";
 import { resolveBackend, trackerCliName } from "../tracker/factory.ts";
 import type { AgentSession } from "../types.ts";
 import { isProcessRunning } from "../watchdog/health.ts";
+import type { SessionState } from "../worktree/tmux.ts";
 import {
+	checkSessionState,
 	createSession,
 	ensureTmuxAvailable,
 	isSessionAlive,
@@ -58,6 +60,7 @@ export interface CoordinatorDeps {
 			env?: Record<string, string>,
 		) => Promise<number>;
 		isSessionAlive: (name: string) => Promise<boolean>;
+		checkSessionState: (name: string) => Promise<SessionState>;
 		killSession: (name: string) => Promise<void>;
 		sendKeys: (name: string, keys: string) => Promise<void>;
 		waitForTuiReady: (
@@ -275,6 +278,7 @@ async function startCoordinator(
 	const tmux = deps._tmux ?? {
 		createSession,
 		isSessionAlive,
+		checkSessionState,
 		killSession,
 		sendKeys,
 		waitForTuiReady,
@@ -308,15 +312,29 @@ async function startCoordinator(
 			existing.state !== "completed" &&
 			existing.state !== "zombie"
 		) {
-			const alive = await tmux.isSessionAlive(existing.tmuxSession);
-			if (alive) {
-				throw new AgentError(
-					`Coordinator is already running (tmux: ${existing.tmuxSession}, since: ${existing.startedAt})`,
-					{ agentName: COORDINATOR_NAME },
-				);
+			const sessionState = await tmux.checkSessionState(existing.tmuxSession);
+			if (sessionState === "alive") {
+				// Tmux session exists -- but is the process inside still running?
+				// A crashed Claude Code leaves a zombie tmux pane that blocks retries.
+				if (existing.pid !== null && !isProcessRunning(existing.pid)) {
+					// Zombie: tmux pane exists but agent process has exited.
+					// Kill the empty session and reclaim the slot.
+					await tmux.killSession(existing.tmuxSession);
+					store.updateState(COORDINATOR_NAME, "completed");
+				} else {
+					// Either the process is genuinely running (pid alive), or pid is null
+					// (e.g. sessions migrated from an older schema). In both cases we
+					// cannot prove the session is a zombie, so treat it as active.
+					throw new AgentError(
+						`Coordinator is already running (tmux: ${existing.tmuxSession}, since: ${existing.startedAt})`,
+						{ agentName: COORDINATOR_NAME },
+					);
+				}
+			} else {
+				// Session is dead or tmux server is not running -- clean up stale DB entry.
+				store.updateState(COORDINATOR_NAME, "completed");
 			}
-			// Session recorded but tmux is dead — mark as completed and continue
-			store.updateState(COORDINATOR_NAME, "completed");
 		}
 		// Resolve model and runtime early (needed for deployConfig and spawn)
@@ -413,6 +431,12 @@ async function startCoordinator(
 		store.upsert(session);
+		// Give slow shells time to finish initializing before polling for TUI readiness.
+		const shellDelay = config.runtime?.shellInitDelayMs ?? 0;
+		if (shellDelay > 0) {
+			await Bun.sleep(shellDelay);
+		}
 		// Wait for Claude Code TUI to render before sending input
 		const tuiReady = await tmux.waitForTuiReady(tmuxSession, (content) =>
 			runtime.detectReady(content),
@@ -423,8 +447,13 @@ async function startCoordinator(
 			if (!alive) {
 				// Clean up the stale session record
 				store.updateState(COORDINATOR_NAME, "completed");
+				const sessionState = await tmux.checkSessionState(tmuxSession);
+				const detail =
+					sessionState === "no_server"
+						? "The tmux server is no longer running. It may have crashed or been killed externally."
+						: "The Claude Code process may have crashed or exited immediately. Check tmux logs or try running the claude command manually.";
 				throw new AgentError(
-					`Coordinator tmux session "${tmuxSession}" died during startup. The Claude Code process may have crashed or exited immediately. Check tmux logs or try running the claude command manually.`,
+					`Coordinator tmux session "${tmuxSession}" died during startup. ${detail}`,
 					{ agentName: COORDINATOR_NAME },
 				);
 			}
@@ -512,6 +541,7 @@ async function stopCoordinator(opts: { json: boolean }, deps: CoordinatorDeps =
 	const tmux = deps._tmux ?? {
 		createSession,
 		isSessionAlive,
+		checkSessionState,
 		killSession,
 		sendKeys,
 		waitForTuiReady,
@@ -626,6 +656,7 @@ async function statusCoordinator(
 	const tmux = deps._tmux ?? {
 		createSession,
 		isSessionAlive,
+		checkSessionState,
 		killSession,
 		sendKeys,
 		waitForTuiReady,