npm - mstro-app - Versions diffs - 0.2.0 → 0.3.0 - Mend

mstro-app 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (114) hide show

package/server/cli/headless/stall-assessor.test.ts ADDED Viewed

@@ -0,0 +1,165 @@
+import { describe, expect, it } from 'vitest';
+import type { StallContext } from './stall-assessor.js';
+// quickHeuristic, parseAssessmentResponse, and parseVerdictResponse are not exported.
+// We test them via assessStall (which calls quickHeuristic first) and by testing
+// the parsing functions indirectly. Since quickHeuristic is the critical logic
+// and assessStall calls it before Haiku, we can test the heuristic paths by
+// providing contexts that match known patterns.
+//
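+// To avoid spawning Haiku (which requires `claude` CLI), every test but the last
+// uses a context that triggers the heuristic fast-path (non-null from quickHeuristic).
+// The last test deliberately misses the fast-path and passes a nonexistent binary
+// name to exercise the fallback taken when the Haiku assessment fails.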
+import { assessStall } from './stall-assessor.js';
+/**
+ * Test fixture: returns a complete StallContext populated with baseline
+ * values, with any fields supplied in `overrides` taking precedence.
+ */
+function makeContext(overrides: Partial<StallContext> = {}): StallContext {
+  const baseline: StallContext = {
+    originalPrompt: 'Fix the bug in auth.ts',
+    silenceMs: 120_000,
+    pendingToolCount: 0,
+    totalToolCalls: 5,
+    elapsedTotalMs: 300_000,
+  };
+  // Spread order matters: overrides come last so they win over the baseline.
+  return { ...baseline, ...overrides };
+}
+// Exercises assessStall end to end. Every case except the last supplies a
+// context that the quick heuristic is expected to resolve on its own, so no
+// external process is needed. Call shape used throughout:
+//   assessStall(ctx, <claude binary name>, false, <tool watchdog active?>)
+// NOTE(review): the third argument is presumably a verbose flag — confirm
+// against the assessStall signature.
+describe('assessStall - quickHeuristic paths', () => {
+  // ----- token-activity heuristic (tokenSilenceMs < 60s) -----
+  it('extends when tokens are still flowing (tokenSilenceMs < 60s)', async () => {
+    const ctx = makeContext({ tokenSilenceMs: 30_000 });
+    const verdict = await assessStall(ctx, 'claude', false, false);
+    expect(verdict.action).toBe('extend');
+    expect(verdict.extensionMs).toBe(10 * 60_000);
+    expect(verdict.reason).toContain('Tokens still flowing');
+  });
+  it('extends when tokenSilenceMs is 0', async () => {
+    // 0 is a valid "just now" reading and must not be treated as "no data".
+    const ctx = makeContext({ tokenSilenceMs: 0 });
+    const verdict = await assessStall(ctx, 'claude', false, false);
+    expect(verdict.action).toBe('extend');
+    expect(verdict.reason).toContain('Tokens still flowing');
+  });
+  it('does not use token heuristic when tokenSilenceMs >= 60s', async () => {
+    // Boundary case: exactly 60s is outside the strict "< 60s" window.
+    const ctx = makeContext({
+      tokenSilenceMs: 60_000,
+      pendingToolCount: 3, // will trigger parallel tools heuristic
+    });
+    const verdict = await assessStall(ctx, 'claude', false, false);
+    // Should NOT hit the token heuristic, should hit the 3+ parallel tools one
+    expect(verdict.action).toBe('extend');
+    expect(verdict.reason).toContain('parallel tool calls');
+  });
+  // ----- watchdog deferral (fourth argument = true, tools pending) -----
+  it('defers to watchdog when active and tools are pending', async () => {
+    const ctx = makeContext({ pendingToolCount: 1, lastToolName: 'Bash' });
+    const verdict = await assessStall(ctx, 'claude', false, true);
+    expect(verdict.action).toBe('extend');
+    expect(verdict.extensionMs).toBe(15 * 60_000);
+    expect(verdict.reason).toContain('Watchdog active');
+  });
+  it('defers to watchdog and lists pending tool names', async () => {
+    const ctx = makeContext({
+      pendingToolCount: 2,
+      pendingToolNames: new Set(['WebFetch', 'Bash']),
+    });
+    const verdict = await assessStall(ctx, 'claude', false, true);
+    expect(verdict.action).toBe('extend');
+    expect(verdict.reason).toContain('WebFetch');
+    expect(verdict.reason).toContain('Bash');
+  });
+  // ----- Task subagent heuristic -----
+  it('extends for Task subagent via pendingToolNames', async () => {
+    const ctx = makeContext({
+      pendingToolCount: 1,
+      pendingToolNames: new Set(['Task']),
+    });
+    const verdict = await assessStall(ctx, 'claude', false, false);
+    expect(verdict.action).toBe('extend');
+    expect(verdict.reason).toContain('Task subagent');
+  });
+  it('extends for Task subagent via lastToolName fallback', async () => {
+    const ctx = makeContext({
+      pendingToolCount: 1,
+      lastToolName: 'Task',
+    });
+    const verdict = await assessStall(ctx, 'claude', false, false);
+    expect(verdict.action).toBe('extend');
+    expect(verdict.reason).toContain('Task subagent');
+  });
+  it('scales Task extension with pending count', async () => {
+    const ctx1 = makeContext({
+      pendingToolCount: 1,
+      pendingToolNames: new Set(['Task']),
+    });
+    const ctx3 = makeContext({
+      pendingToolCount: 3,
+      pendingToolNames: new Set(['Task']),
+    });
+    const v1 = await assessStall(ctx1, 'claude', false, false);
+    const v3 = await assessStall(ctx3, 'claude', false, false);
+    // More pending = more extension, capped at 30 min
+    expect(v3.extensionMs).toBeGreaterThanOrEqual(v1.extensionMs);
+    expect(v3.extensionMs).toBeLessThanOrEqual(30 * 60_000);
+  });
+  // ----- parallel tool calls heuristic (3 or more pending) -----
+  it('extends for 3+ parallel tool calls', async () => {
+    const ctx = makeContext({ pendingToolCount: 3 });
+    const verdict = await assessStall(ctx, 'claude', false, false);
+    expect(verdict.action).toBe('extend');
+    expect(verdict.extensionMs).toBe(15 * 60_000);
+    expect(verdict.reason).toContain('parallel tool calls');
+  });
+  it('extends for 5 parallel tool calls', async () => {
+    const ctx = makeContext({ pendingToolCount: 5 });
+    const verdict = await assessStall(ctx, 'claude', false, false);
+    expect(verdict.action).toBe('extend');
+    expect(verdict.reason).toContain('5 parallel tool calls');
+  });
+  // ----- web tool heuristics (watchdog not active) -----
+  it('extends for WebSearch without watchdog', async () => {
+    const ctx = makeContext({ lastToolName: 'WebSearch', pendingToolCount: 1 });
+    // pendingToolCount < 3, not Task, not watchdog active, but WebSearch
+    const verdict = await assessStall(ctx, 'claude', false, false);
+    expect(verdict.action).toBe('extend');
+    expect(verdict.extensionMs).toBe(5 * 60_000);
+    expect(verdict.reason).toContain('WebSearch');
+  });
+  it('extends for WebFetch without watchdog', async () => {
+    const ctx = makeContext({ lastToolName: 'WebFetch', pendingToolCount: 1 });
+    const verdict = await assessStall(ctx, 'claude', false, false);
+    expect(verdict.action).toBe('extend');
+    expect(verdict.extensionMs).toBe(5 * 60_000);
+    expect(verdict.reason).toContain('WebFetch');
+  });
+  // The verdict here is still 'extend'; what this case pins down is WHICH rule
+  // produced it, so the title names the rule rather than claiming "no extend".
+  it('defers to watchdog (not the WebSearch heuristic) when watchdog is active', async () => {
+    // When watchdog is active and tools are pending, the watchdog deferral
+    // takes priority over the WebSearch heuristic
+    const ctx = makeContext({
+      lastToolName: 'WebSearch',
+      pendingToolCount: 1,
+    });
+    const verdict = await assessStall(ctx, 'claude', false, true);
+    // Should defer to watchdog, not WebSearch heuristic
+    expect(verdict.action).toBe('extend');
+    expect(verdict.reason).toContain('Watchdog active');
+  });
+  // ----- fallback when no heuristic matches -----
+  it('falls back to extend when Haiku assessment fails', async () => {
+    // Context that doesn't match any heuristic → triggers Haiku →
+    // Haiku fails (no `claude` binary) → cautious extend
+    const ctx = makeContext({
+      pendingToolCount: 1,
+      lastToolName: 'Edit',
+    });
+    const verdict = await assessStall(ctx, 'nonexistent-claude-binary', false, false);
+    expect(verdict.action).toBe('extend');
+    expect(verdict.extensionMs).toBe(10 * 60_000);
+    expect(verdict.reason).toContain('unavailable');
+  });
+});

package/server/cli/headless/stall-assessor.ts CHANGED Viewed

@@ -35,6 +35,8 @@ export interface StallContext {
   totalToolCalls: number;
   /** Total wall-clock time since process started (ms) */
   elapsedTotalMs: number;
+  /** Time since the last token usage event (ms). Undefined if no token events yet. */
+  tokenSilenceMs?: number;
 }
 export interface StallVerdict {
@@ -57,6 +59,17 @@ function quickHeuristic(ctx: StallContext, toolWatchdogActive = false): StallVer
   const pendingNames = ctx.pendingToolNames ?? new Set<string>();
   const hasPendingTools = ctx.pendingToolCount > 0;
+  // Tokens still flowing = process is alive and actively processing.
+  // Extend generously when token activity is recent (< 60s), regardless
+  // of stdout silence. This covers silent thinking and tool result processing.
+  if (ctx.tokenSilenceMs !== undefined && ctx.tokenSilenceMs < 60_000) {
+    return {
+      action: 'extend',
+      extensionMs: 10 * 60_000,
+      reason: `Tokens still flowing (last activity ${Math.round(ctx.tokenSilenceMs / 1000)}s ago) — process is alive`,
+    };
+  }
   // When the watchdog is active and tools are pending, always defer.
   // The watchdog manages per-tool timeouts; the stall detector should only
   // fire when no tools are running and there's genuine silence.
@@ -156,6 +169,7 @@ export async function assessToolTimeout(
   elapsedMs: number,
   claudeCommand: string,
   verbose: boolean,
+  tokenSilenceMs?: number,
 ): Promise<StallVerdict> {
   const elapsedSec = Math.round(elapsedMs / 1000);
@@ -181,13 +195,19 @@ export async function assessToolTimeout(
   };
   const toolDesc = toolDescriptions[toolName] || `executes the ${toolName} tool`;
+  const tokenLine = tokenSilenceMs !== undefined
+    ? `Token activity: last token event ${Math.round(tokenSilenceMs / 1000)}s ago (recent tokens = process is alive and processing)`
+    : 'Token activity: no token events observed';
   const prompt = [
     `You are a process health monitor. A ${toolName} tool call has been running for ${elapsedSec}s.`,
     `${toolName} ${toolDesc}.`,
     `Tool input: ${inputSummary}`,
+    tokenLine,
     '',
     `Is this tool call likely still working, or is it hung/frozen?`,
     'Consider: network latency, server response times, anti-bot protections, large page sizes, complex operations.',
+    'IMPORTANT: If tokens were active recently (< 60s ago), the process is likely still alive and processing — strongly favor WORKING.',
     '',
     'Respond in EXACTLY this format (3 lines, no extra text):',
     'VERDICT: WORKING or STALLED',
@@ -305,6 +325,10 @@ function buildAssessmentPrompt(ctx: StallContext): string {
     ? `${ctx.originalPrompt.slice(0, 500)}...`
     : ctx.originalPrompt;
+  const tokenLine = ctx.tokenSilenceMs !== undefined
+    ? `Token activity: last token event ${Math.round(ctx.tokenSilenceMs / 1000)}s ago (tokens flowing = process alive)`
+    : 'Token activity: no token events observed';
   return [
     'You are a process health monitor. A Claude Code subprocess has been silent (no stdout) and you must determine if it is working or stalled.',
     '',
@@ -314,6 +338,7 @@ function buildAssessmentPrompt(ctx: StallContext): string {
     ctx.lastToolInputSummary ? `Last tool input: ${ctx.lastToolInputSummary}` : '',
     `Pending tool calls: ${ctx.pendingToolCount}`,
     `Total tool calls this session: ${ctx.totalToolCalls}`,
+    tokenLine,
     `Task being executed: ${promptPreview}`,
     '',
     'Respond in EXACTLY this format (3 lines, no extra text):',

package/server/cli/headless/tool-watchdog.test.ts ADDED Viewed

@@ -0,0 +1,429 @@
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
+import { DEFAULT_TOOL_TIMEOUT_PROFILES, ToolWatchdog } from './tool-watchdog.js';
+// Unit tests for ToolWatchdog. Fake timers are installed around every test so
+// that watch timeouts can be driven deterministically by advancing the clock.
+describe('ToolWatchdog', () => {
+  beforeEach(() => {
+    vi.useFakeTimers();
+  });
+  afterEach(() => {
+    vi.useRealTimers();
+  });
+  // ========== getProfile ==========
+  // Profile resolution: built-in per-tool defaults, the fallback used for
+  // unknown tools, and overrides passed to the constructor.
+  describe('getProfile', () => {
+    it('returns specific profile for known tools', () => {
+      const watchdog = new ToolWatchdog();
+      const webFetch = watchdog.getProfile('WebFetch');
+      expect(webFetch.coldStartMs).toBe(180_000);
+      expect(webFetch.floorMs).toBe(120_000);
+      expect(webFetch.ceilingMs).toBe(300_000);
+      expect(webFetch.useAdaptive).toBe(true);
+      expect(webFetch.useHaikuTiebreaker).toBe(true);
+    });
+    it('returns Task profile with long timeouts', () => {
+      const watchdog = new ToolWatchdog();
+      const task = watchdog.getProfile('Task');
+      expect(task.coldStartMs).toBe(900_000);
+      expect(task.floorMs).toBe(600_000);
+      expect(task.ceilingMs).toBe(2_700_000);
+    });
+    it('returns default profile for unknown tools', () => {
+      const watchdog = new ToolWatchdog();
+      const unknown = watchdog.getProfile('SomeNewTool');
+      expect(unknown.coldStartMs).toBe(300_000);
+      expect(unknown.floorMs).toBe(120_000);
+      expect(unknown.ceilingMs).toBe(600_000);
+      expect(unknown.useAdaptive).toBe(false);
+    });
+    it('merges custom profiles with defaults', () => {
+      const watchdog = new ToolWatchdog({
+        profiles: {
+          WebFetch: { coldStartMs: 60_000 },
+        },
+      });
+      const profile = watchdog.getProfile('WebFetch');
+      expect(profile.coldStartMs).toBe(60_000);
+      // Other fields should come from default WebFetch profile
+      expect(profile.floorMs).toBe(DEFAULT_TOOL_TIMEOUT_PROFILES.WebFetch.floorMs);
+      expect(profile.useAdaptive).toBe(true);
+    });
+    it('allows custom profiles for new tool names', () => {
+      const watchdog = new ToolWatchdog({
+        profiles: {
+          CustomTool: { coldStartMs: 10_000, floorMs: 5_000, ceilingMs: 30_000 },
+        },
+      });
+      const profile = watchdog.getProfile('CustomTool');
+      expect(profile.coldStartMs).toBe(10_000);
+      expect(profile.floorMs).toBe(5_000);
+      expect(profile.ceilingMs).toBe(30_000);
+    });
+  });
+  // ========== getTimeout ==========
+  // Effective timeout: the cold-start value until samples exist (or always,
+  // for non-adaptive tools), otherwise an adaptive value clamped between the
+  // profile's floor and ceiling.
+  describe('getTimeout', () => {
+    it('returns coldStart for non-adaptive tools', () => {
+      const watchdog = new ToolWatchdog();
+      // Bash is non-adaptive
+      expect(watchdog.getTimeout('Bash')).toBe(300_000);
+    });
+    it('returns coldStart when no samples recorded', () => {
+      const watchdog = new ToolWatchdog();
+      expect(watchdog.getTimeout('WebFetch')).toBe(180_000);
+    });
+    it('returns adaptive timeout after recording samples', () => {
+      const watchdog = new ToolWatchdog();
+      // Record a 10s completion for WebFetch
+      watchdog.recordCompletion('WebFetch', 10_000);
+      const timeout = watchdog.getTimeout('WebFetch');
+      // First sample: est = 10000, dev = 5000, timeout = 10000 + 4*5000 = 30000
+      // But floor is 120000, so should be clamped to floor
+      expect(timeout).toBe(120_000);
+    });
+    it('respects floor clamping', () => {
+      const watchdog = new ToolWatchdog();
+      // Record very fast completions
+      watchdog.recordCompletion('WebFetch', 100);
+      watchdog.recordCompletion('WebFetch', 100);
+      watchdog.recordCompletion('WebFetch', 100);
+      // Adaptive calculation would be very low, but floor prevents it
+      expect(watchdog.getTimeout('WebFetch')).toBe(DEFAULT_TOOL_TIMEOUT_PROFILES.WebFetch.floorMs);
+    });
+    it('respects ceiling clamping', () => {
+      const watchdog = new ToolWatchdog();
+      // Record very slow completions
+      watchdog.recordCompletion('WebSearch', 500_000);
+      const timeout = watchdog.getTimeout('WebSearch');
+      // Should not exceed ceiling
+      expect(timeout).toBeLessThanOrEqual(DEFAULT_TOOL_TIMEOUT_PROFILES.WebSearch.ceilingMs);
+    });
+    it('does not record completions for non-adaptive tools', () => {
+      const watchdog = new ToolWatchdog();
+      // Bash is non-adaptive (Read too)
+      watchdog.recordCompletion('Bash', 5_000);
+      // Should still return coldStart
+      expect(watchdog.getTimeout('Bash')).toBe(300_000);
+    });
+  });
+  // ========== recordCompletion ==========
+  // Feeding completion durations into the adaptive estimate, observed through
+  // getTimeout.
+  describe('recordCompletion', () => {
+    it('initializes tracker on first sample', () => {
+      const watchdog = new ToolWatchdog();
+      watchdog.recordCompletion('WebFetch', 20_000);
+      // After first sample: timeout should differ from cold start if above floor
+      const timeout = watchdog.getTimeout('WebFetch');
+      // est=20000, dev=10000, adaptive=20000+4*10000=60000, floor=120000 → 120000
+      expect(timeout).toBe(120_000);
+    });
+    it('updates EMA on subsequent samples', () => {
+      const watchdog = new ToolWatchdog();
+      // First sample
+      watchdog.recordCompletion('Glob', 10_000);
+      const timeout1 = watchdog.getTimeout('Glob');
+      // Second sample - much longer
+      watchdog.recordCompletion('Glob', 50_000);
+      const timeout2 = watchdog.getTimeout('Glob');
+      // Timeout should increase after longer sample
+      expect(timeout2).toBeGreaterThanOrEqual(timeout1);
+    });
+    it('converges toward actual duration over many samples', () => {
+      const watchdog = new ToolWatchdog();
+      // Record many similar samples for Glob (adaptive, floor=30000, ceiling=180000)
+      for (let i = 0; i < 20; i++) {
+        watchdog.recordCompletion('Glob', 45_000);
+      }
+      const timeout = watchdog.getTimeout('Glob');
+      // Should converge near 45000, with deviation near 0
+      // adaptive ≈ 45000 + 4*~0 ≈ 45000, but floor is 30000, so should be ~45000
+      expect(timeout).toBeGreaterThanOrEqual(30_000);
+      expect(timeout).toBeLessThanOrEqual(60_000);
+    });
+  });
+  // ========== startWatch / clearWatch ==========
+  // Timer lifecycle for individual watches, keyed by tool-call ID.
+  describe('startWatch / clearWatch', () => {
+    it('calls timeout callback when timer expires', async () => {
+      const watchdog = new ToolWatchdog();
+      const onTimeout = vi.fn();
+      watchdog.startWatch('tool-1', 'WebFetch', { url: 'http://example.com' }, onTimeout);
+      // Advance past WebFetch cold start (180s) — async because internal handler is async
+      await vi.advanceTimersByTimeAsync(180_001);
+      // onTimeout should fire (no tiebreaker configured)
+      expect(onTimeout).toHaveBeenCalledOnce();
+    });
+    it('does not call timeout if cleared before expiry', async () => {
+      const watchdog = new ToolWatchdog();
+      const onTimeout = vi.fn();
+      watchdog.startWatch('tool-1', 'WebFetch', {}, onTimeout);
+      watchdog.clearWatch('tool-1');
+      await vi.advanceTimersByTimeAsync(300_000);
+      expect(onTimeout).not.toHaveBeenCalled();
+    });
+    it('replaces existing watch for same ID', async () => {
+      const watchdog = new ToolWatchdog();
+      const onTimeout1 = vi.fn();
+      const onTimeout2 = vi.fn();
+      watchdog.startWatch('tool-1', 'WebFetch', {}, onTimeout1);
+      watchdog.startWatch('tool-1', 'WebSearch', {}, onTimeout2);
+      // Advance past WebSearch cold start (90s)
+      await vi.advanceTimersByTimeAsync(90_001);
+      expect(onTimeout2).toHaveBeenCalledOnce();
+      expect(onTimeout1).not.toHaveBeenCalled();
+    });
+    it('tracks multiple watches independently', async () => {
+      const watchdog = new ToolWatchdog();
+      const onTimeout1 = vi.fn();
+      const onTimeout2 = vi.fn();
+      watchdog.startWatch('tool-1', 'WebSearch', {}, onTimeout1); // 90s
+      watchdog.startWatch('tool-2', 'WebFetch', {}, onTimeout2); // 180s
+      await vi.advanceTimersByTimeAsync(90_001);
+      expect(onTimeout1).toHaveBeenCalledOnce();
+      expect(onTimeout2).not.toHaveBeenCalled();
+      // A further 90s brings the total to 180_001 ms, past WebFetch's cold start.
+      await vi.advanceTimersByTimeAsync(90_000);
+      expect(onTimeout2).toHaveBeenCalledOnce();
+    });
+  });
+  // ========== clearAll ==========
+  describe('clearAll', () => {
+    it('clears all active watches', async () => {
+      const watchdog = new ToolWatchdog();
+      const onTimeout1 = vi.fn();
+      const onTimeout2 = vi.fn();
+      watchdog.startWatch('tool-1', 'WebFetch', {}, onTimeout1);
+      watchdog.startWatch('tool-2', 'WebSearch', {}, onTimeout2);
+      watchdog.clearAll();
+      // Use the async variant, as the clearWatch test does: the timeout handler
+      // is async, so a synchronous advance could let these negative assertions
+      // run before a stray handler ever reached its onTimeout callback.
+      await vi.advanceTimersByTimeAsync(300_000);
+      expect(onTimeout1).not.toHaveBeenCalled();
+      expect(onTimeout2).not.toHaveBeenCalled();
+    });
+    it('clears active watches map', () => {
+      const watchdog = new ToolWatchdog();
+      watchdog.startWatch('tool-1', 'WebFetch', {}, vi.fn());
+      watchdog.startWatch('tool-2', 'WebSearch', {}, vi.fn());
+      watchdog.clearAll();
+      expect(watchdog.getActiveWatches().size).toBe(0);
+    });
+  });
+  // ========== getActiveWatch / getActiveWatches ==========
+  describe('getActiveWatch', () => {
+    it('returns watch for active tool', () => {
+      const watchdog = new ToolWatchdog();
+      watchdog.startWatch('tool-1', 'WebFetch', { url: 'http://test.com' }, vi.fn());
+      const watch = watchdog.getActiveWatch('tool-1');
+      expect(watch).toBeDefined();
+      expect(watch!.toolName).toBe('WebFetch');
+      expect(watch!.toolInput).toEqual({ url: 'http://test.com' });
+    });
+    it('returns undefined for cleared watch', () => {
+      const watchdog = new ToolWatchdog();
+      watchdog.startWatch('tool-1', 'WebFetch', {}, vi.fn());
+      watchdog.clearWatch('tool-1');
+      expect(watchdog.getActiveWatch('tool-1')).toBeUndefined();
+    });
+    it('returns undefined for unknown ID', () => {
+      const watchdog = new ToolWatchdog();
+      expect(watchdog.getActiveWatch('nonexistent')).toBeUndefined();
+    });
+  });
+  // ========== buildCheckpoint ==========
+  // Checkpoint assembly: the accumulated tool list is split into completed,
+  // in-progress and hung entries, keyed off the hung tool's ID.
+  describe('buildCheckpoint', () => {
+    it('returns null when hung tool ID not found', () => {
+      const watchdog = new ToolWatchdog();
+      const checkpoint = watchdog.buildCheckpoint(
+        'test prompt', '', '', [], 'missing-id', undefined, Date.now()
+      );
+      expect(checkpoint).toBeNull();
+    });
+    it('builds checkpoint with correct tool separation', () => {
+      const watchdog = new ToolWatchdog();
+      // Pin the fake clock so processStartTime is a fixed, known instant.
+      vi.setSystemTime(new Date('2025-01-01T00:00:00Z'));
+      const processStartTime = Date.now();
+      watchdog.startWatch('hung-tool', 'WebFetch', { url: 'http://slow.com' }, vi.fn());
+      const accumulatedTools = [
+        { toolId: 'tool-1', toolName: 'Read', toolInput: { path: 'a.ts' }, result: 'content', isError: false, duration: 100 },
+        { toolId: 'tool-2', toolName: 'Grep', toolInput: { pattern: 'foo' }, result: undefined, isError: false },
+        { toolId: 'hung-tool', toolName: 'WebFetch', toolInput: { url: 'http://slow.com' }, result: undefined, isError: false },
+      ];
+      const checkpoint = watchdog.buildCheckpoint(
+        'find and fix',
+        'assistant response text',
+        'thinking about it',
+        accumulatedTools,
+        'hung-tool',
+        'session-123',
+        processStartTime,
+      );
+      expect(checkpoint).not.toBeNull();
+      expect(checkpoint!.originalPrompt).toBe('find and fix');
+      expect(checkpoint!.assistantText).toBe('assistant response text');
+      expect(checkpoint!.thinkingText).toBe('thinking about it');
+      expect(checkpoint!.claudeSessionId).toBe('session-123');
+      // Completed tools: only tool-1 (has result and is not hung)
+      expect(checkpoint!.completedTools).toHaveLength(1);
+      expect(checkpoint!.completedTools[0].toolId).toBe('tool-1');
+      // In-progress tools: tool-2 (no result, not hung)
+      expect(checkpoint!.inProgressTools).toHaveLength(1);
+      expect(checkpoint!.inProgressTools[0].toolId).toBe('tool-2');
+      // Hung tool
+      expect(checkpoint!.hungTool.toolName).toBe('WebFetch');
+      expect(checkpoint!.hungTool.toolId).toBe('hung-tool');
+      expect(checkpoint!.hungTool.url).toBe('http://slow.com');
+    });
+    it('extracts URL from tool input for WebFetch', () => {
+      const watchdog = new ToolWatchdog();
+      watchdog.startWatch('t1', 'WebFetch', { url: 'http://example.com' }, vi.fn());
+      const tools = [
+        { toolId: 't1', toolName: 'WebFetch', toolInput: { url: 'http://example.com' }, result: undefined, isError: false },
+      ];
+      const cp = watchdog.buildCheckpoint('prompt', '', '', tools, 't1', undefined, Date.now());
+      expect(cp!.hungTool.url).toBe('http://example.com');
+    });
+    it('extracts query from tool input for WebSearch', () => {
+      const watchdog = new ToolWatchdog();
+      watchdog.startWatch('t1', 'WebSearch', { query: 'test search' }, vi.fn());
+      const tools = [
+        { toolId: 't1', toolName: 'WebSearch', toolInput: { query: 'test search' }, result: undefined, isError: false },
+      ];
+      const cp = watchdog.buildCheckpoint('prompt', '', '', tools, 't1', undefined, Date.now());
+      // For WebSearch the query string is surfaced through the same `url` field.
+      expect(cp!.hungTool.url).toBe('test search');
+    });
+  });
+  // ========== tiebreaker integration ==========
+  // onTiebreaker hook: consulted when a watch times out for a tool whose
+  // profile has useHaikuTiebreaker enabled; its verdict decides between
+  // extending the watch and firing onTimeout.
+  describe('tiebreaker', () => {
+    it('extends when tiebreaker returns extend', async () => {
+      const onTiebreaker = vi.fn().mockResolvedValue({
+        action: 'extend',
+        extensionMs: 60_000,
+        reason: 'still working',
+      });
+      const watchdog = new ToolWatchdog({ onTiebreaker });
+      const onTimeout = vi.fn();
+      // WebFetch has useHaikuTiebreaker=true; its cold-start timeout is 180s
+      watchdog.startWatch('t1', 'WebFetch', {}, onTimeout);
+      // Advance to trigger timeout
+      await vi.advanceTimersByTimeAsync(180_001);
+      // Tiebreaker should have been called
+      expect(onTiebreaker).toHaveBeenCalledOnce();
+      // onTimeout should NOT have fired (tiebreaker extended)
+      expect(onTimeout).not.toHaveBeenCalled();
+      // Now advance past extension
+      await vi.advanceTimersByTimeAsync(60_001);
+      // Should fire after extension
+      expect(onTimeout).toHaveBeenCalledOnce();
+    });
+    it('kills when tiebreaker returns kill', async () => {
+      const onTiebreaker = vi.fn().mockResolvedValue({
+        action: 'kill',
+        extensionMs: 0,
+        reason: 'process is hung',
+      });
+      const watchdog = new ToolWatchdog({ onTiebreaker });
+      const onTimeout = vi.fn();
+      watchdog.startWatch('t1', 'WebFetch', {}, onTimeout);
+      await vi.advanceTimersByTimeAsync(180_001);
+      expect(onTiebreaker).toHaveBeenCalledOnce();
+      expect(onTimeout).toHaveBeenCalledOnce();
+    });
+    it('kills when tiebreaker throws', async () => {
+      // A rejected tiebreaker must not leave the watch hanging: onTimeout still fires.
+      const onTiebreaker = vi.fn().mockRejectedValue(new Error('haiku failed'));
+      const watchdog = new ToolWatchdog({ onTiebreaker });
+      const onTimeout = vi.fn();
+      watchdog.startWatch('t1', 'WebFetch', {}, onTimeout);
+      await vi.advanceTimersByTimeAsync(180_001);
+      expect(onTiebreaker).toHaveBeenCalledOnce();
+      expect(onTimeout).toHaveBeenCalledOnce();
+    });
+    it('does not attempt tiebreaker for tools with useHaikuTiebreaker=false', async () => {
+      const onTiebreaker = vi.fn();
+      const watchdog = new ToolWatchdog({ onTiebreaker });
+      const onTimeout = vi.fn();
+      // WebSearch has useHaikuTiebreaker: false
+      watchdog.startWatch('t1', 'WebSearch', {}, onTimeout);
+      await vi.advanceTimersByTimeAsync(90_001);
+      expect(onTiebreaker).not.toHaveBeenCalled();
+      expect(onTimeout).toHaveBeenCalledOnce();
+    });
+  });
+});