npm - @dotsetlabs/dotclaw - Versions diffs - 2.4.0 → 2.6.0 - Mend

@dotsetlabs/dotclaw 2.4.0 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (118)

package/.env.example +9 -10
package/README.md +8 -4
package/config-examples/runtime.json +34 -8
package/config-examples/tool-policy.json +12 -2
package/container/agent-runner/package-lock.json +2 -2
package/container/agent-runner/package.json +1 -1
package/container/agent-runner/src/agent-config.ts +19 -3
package/container/agent-runner/src/container-protocol.ts +11 -0
package/container/agent-runner/src/context-overflow-recovery.ts +39 -0
package/container/agent-runner/src/index.ts +603 -165
package/container/agent-runner/src/openrouter-input.ts +159 -0
package/container/agent-runner/src/system-prompt.ts +13 -3
package/container/agent-runner/src/tool-loop-policy.ts +741 -0
package/container/agent-runner/src/tools.ts +211 -8
package/dist/agent-context.d.ts +1 -0
package/dist/agent-context.d.ts.map +1 -1
package/dist/agent-context.js +21 -9
package/dist/agent-context.js.map +1 -1
package/dist/agent-execution.d.ts +2 -0
package/dist/agent-execution.d.ts.map +1 -1
package/dist/agent-execution.js +164 -15
package/dist/agent-execution.js.map +1 -1
package/dist/agent-semaphore.d.ts +24 -1
package/dist/agent-semaphore.d.ts.map +1 -1
package/dist/agent-semaphore.js +109 -20
package/dist/agent-semaphore.js.map +1 -1
package/dist/cli.js +3 -11
package/dist/cli.js.map +1 -1
package/dist/config.d.ts +2 -0
package/dist/config.d.ts.map +1 -1
package/dist/config.js +2 -0
package/dist/config.js.map +1 -1
package/dist/container-protocol.d.ts +22 -0
package/dist/container-protocol.d.ts.map +1 -1
package/dist/container-protocol.js.map +1 -1
package/dist/container-runner.d.ts +7 -0
package/dist/container-runner.d.ts.map +1 -1
package/dist/container-runner.js +417 -143
package/dist/container-runner.js.map +1 -1
package/dist/db.d.ts.map +1 -1
package/dist/db.js +46 -12
package/dist/db.js.map +1 -1
package/dist/error-messages.d.ts.map +1 -1
package/dist/error-messages.js +18 -4
package/dist/error-messages.js.map +1 -1
package/dist/failover-policy.d.ts +41 -0
package/dist/failover-policy.d.ts.map +1 -0
package/dist/failover-policy.js +261 -0
package/dist/failover-policy.js.map +1 -0
package/dist/index.js +1 -0
package/dist/index.js.map +1 -1
package/dist/ipc-dispatcher.d.ts.map +1 -1
package/dist/ipc-dispatcher.js +27 -43
package/dist/ipc-dispatcher.js.map +1 -1
package/dist/mcp-config.d.ts +22 -0
package/dist/mcp-config.d.ts.map +1 -0
package/dist/mcp-config.js +94 -0
package/dist/mcp-config.js.map +1 -0
package/dist/memory-backend.d.ts +27 -0
package/dist/memory-backend.d.ts.map +1 -0
package/dist/memory-backend.js +112 -0
package/dist/memory-backend.js.map +1 -0
package/dist/memory-recall.d.ts.map +1 -1
package/dist/memory-recall.js +135 -22
package/dist/memory-recall.js.map +1 -1
package/dist/memory-store.d.ts +1 -0
package/dist/memory-store.d.ts.map +1 -1
package/dist/memory-store.js +55 -7
package/dist/memory-store.js.map +1 -1
package/dist/message-pipeline.d.ts +24 -0
package/dist/message-pipeline.d.ts.map +1 -1
package/dist/message-pipeline.js +131 -27
package/dist/message-pipeline.js.map +1 -1
package/dist/metrics.d.ts +1 -0
package/dist/metrics.d.ts.map +1 -1
package/dist/metrics.js +9 -0
package/dist/metrics.js.map +1 -1
package/dist/providers/discord/discord-provider.d.ts.map +1 -1
package/dist/providers/discord/discord-provider.js +72 -4
package/dist/providers/discord/discord-provider.js.map +1 -1
package/dist/providers/telegram/telegram-provider.d.ts.map +1 -1
package/dist/providers/telegram/telegram-provider.js +65 -3
package/dist/providers/telegram/telegram-provider.js.map +1 -1
package/dist/recall-policy.d.ts +12 -0
package/dist/recall-policy.d.ts.map +1 -0
package/dist/recall-policy.js +89 -0
package/dist/recall-policy.js.map +1 -0
package/dist/runtime-config.d.ts +33 -0
package/dist/runtime-config.d.ts.map +1 -1
package/dist/runtime-config.js +109 -9
package/dist/runtime-config.js.map +1 -1
package/dist/streaming.d.ts.map +1 -1
package/dist/streaming.js +125 -33
package/dist/streaming.js.map +1 -1
package/dist/task-scheduler.d.ts.map +1 -1
package/dist/task-scheduler.js +4 -2
package/dist/task-scheduler.js.map +1 -1
package/dist/tool-policy.d.ts.map +1 -1
package/dist/tool-policy.js +26 -4
package/dist/tool-policy.js.map +1 -1
package/dist/trace-writer.d.ts +12 -0
package/dist/trace-writer.d.ts.map +1 -1
package/dist/trace-writer.js.map +1 -1
package/dist/turn-hygiene.d.ts +14 -0
package/dist/turn-hygiene.d.ts.map +1 -0
package/dist/turn-hygiene.js +214 -0
package/dist/turn-hygiene.js.map +1 -0
package/dist/webhook.d.ts.map +1 -1
package/dist/webhook.js +1 -0
package/dist/webhook.js.map +1 -1
package/package.json +15 -1
package/scripts/benchmark-baseline.js +365 -0
package/scripts/benchmark-harness.js +1413 -0
package/scripts/benchmark-scenarios.js +301 -0
package/scripts/canary-suite.js +123 -0
package/scripts/generate-controlled-traces.js +230 -0
package/scripts/release-slo-check.js +214 -0
package/scripts/run-live-canary.js +339 -0

package/container/agent-runner/src/index.ts CHANGED Viewed

@@ -33,6 +33,27 @@ import {
 import { loadPromptPackWithCanary, formatPromptPack, PromptPack } from './prompt-packs.js';
 import { buildSkillCatalog, type SkillCatalog } from './skill-loader.js';
 import { buildSystemPrompt } from './system-prompt.js';
+import { buildContextOverflowRecoveryPlan } from './context-overflow-recovery.js';
+import {
+  buildForcedSynthesisPrompt,
+  buildToolExecutionNudgePrompt,
+  buildToolOutcomeFallback,
+  compactToolConversationItems,
+  detectToolExecutionRequirement,
+  buildMalformedArgumentsRecoveryHint,
+  isNonRetryableToolError,
+  normalizeToolCallArguments,
+  normalizeToolCallSignature,
+  normalizeToolRoundSignature,
+  parseCreateReadFileInstruction,
+  parseListReadNewestInstruction,
+  shouldRetryIdempotentToolCall,
+} from './tool-loop-policy.js';
+import {
+  injectImagesIntoContextInput,
+  loadImageAttachmentsForInput,
+  messagesToOpenRouterInput,
+} from './openrouter-input.js';
 type OpenRouterResult = ReturnType<OpenRouter['callModel']>;
@@ -143,6 +164,10 @@ function log(message: string): void {
   console.error(`[agent-runner] ${message}`);
 }
+/**
+ * Return a promise that settles after roughly `ms` milliseconds.
+ * Negative delays are clamped to zero before the timer is armed.
+ */
+function sleep(ms: number): Promise<void> {
+  return new Promise<void>((done) => {
+    const delayMs = Math.max(0, ms);
+    setTimeout(done, delayMs);
+  });
+}
 function classifyError(err: unknown): 'retryable' | 'context_overflow' | null {
   const msg = err instanceof Error ? err.message : String(err);
   const lower = msg.toLowerCase();
@@ -494,55 +519,6 @@ function loadClaudeNotes(): { group: string | null; global: string | null } {
   };
 }
-// ── Image/Vision support ──────────────────────────────────────────────
-const MAX_IMAGE_BYTES = 5 * 1024 * 1024; // 5MB per image
-const MAX_TOTAL_IMAGE_BYTES = 20 * 1024 * 1024; // 20MB total across all images
-const IMAGE_MIME_TYPES = new Set(['image/jpeg', 'image/png', 'image/gif', 'image/webp']);
-function loadImageAttachments(attachments?: ContainerInput['attachments']): Array<{
-  type: 'image_url';
-  image_url: { url: string };
-}> {
-  if (!attachments) return [];
-  const images: Array<{ type: 'image_url'; image_url: { url: string } }> = [];
-  let totalBytes = 0;
-  for (const att of attachments) {
-    if (att.type !== 'photo') continue;
-    const mime = att.mime_type || 'image/jpeg';
-    if (!IMAGE_MIME_TYPES.has(mime)) continue;
-    try {
-      const stat = fs.statSync(att.path);
-      if (stat.size > MAX_IMAGE_BYTES) {
-        log(`Skipping image ${att.path}: ${stat.size} bytes exceeds ${MAX_IMAGE_BYTES}`);
-        continue;
-      }
-      if (totalBytes + stat.size > MAX_TOTAL_IMAGE_BYTES) {
-        log(`Skipping image ${att.path}: cumulative size would exceed ${MAX_TOTAL_IMAGE_BYTES}`);
-        break;
-      }
-      const data = fs.readFileSync(att.path);
-      totalBytes += data.length;
-      const b64 = data.toString('base64');
-      images.push({
-        type: 'image_url',
-        image_url: { url: `data:${mime};base64,${b64}` }
-      });
-    } catch (err) {
-      log(`Failed to load image ${att.path}: ${err instanceof Error ? err.message : err}`);
-    }
-  }
-  return images;
-}
-function messagesToOpenRouter(messages: Message[]) {
-  return messages.map(message => ({
-    role: message.role,
-    content: message.content
-  }));
-}
 function clampContextMessages(messages: Message[], tokensPerChar: number, maxTokens: number): Message[] {
   if (!Number.isFinite(maxTokens) || maxTokens <= 0) return messages;
   const tpc = tokensPerChar > 0 ? tokensPerChar : 0.25;
@@ -560,6 +536,44 @@ function clampContextMessages(messages: Message[], tokensPerChar: number, maxTok
   });
 }
+/**
+ * Report whether the prompt matches one of the conversation-recall patterns,
+ * in which case the caller may withhold the tool schema for the turn.
+ *
+ * @param prompt - Raw prompt text; nullish or blank input yields `false`.
+ * @param toolRequired - When `true` the answer is always `false`.
+ * @returns `true` only for a non-blank prompt that matches a recall pattern
+ *   while tool execution is not required.
+ */
+function shouldDisableToolsForPrompt(prompt: string, toolRequired: boolean): boolean {
+  if (toolRequired) {
+    return false;
+  }
+  const normalized = String(prompt || '').trim();
+  if (normalized.length === 0) {
+    return false;
+  }
+  // Explicit memory scenario tags plus phrasings that point back at the current chat.
+  const recallPatterns: RegExp[] = [
+    /\[(?:scenario:)?memory(?:_carryover)?\]/i,
+    /\bfrom\s+(?:this|our)\s+(?:same\s+)?(?:conversation|chat)\b/i,
+    /\bwhat\s+did\s+(?:i|you)\s+just\b/i,
+    /\bearlier\s+in\s+(?:this\s+)?(?:conversation|chat)\b/i,
+  ];
+  return recallPatterns.some((pattern) => pattern.test(normalized));
+}
+/**
+ * Derive an optional output cap from brevity cues in the prompt.
+ *
+ * Each recognised cue proposes a limit and the smallest proposal wins:
+ * "one word" / "single word" -> 48, "one [concise|short|brief] sentence" -> 180,
+ * "exactly N bullet(s)" -> 140 + 90 per bullet (N clamped to 1..10, result
+ * clamped to 180..900), and a standalone "concise" / "brief" / "short" -> 260.
+ *
+ * @param prompt - Raw prompt text; nullish or blank input yields `undefined`.
+ * @returns The tightest cap implied by the prompt, or `undefined` when no cue matches.
+ */
+function resolvePromptOutputCap(prompt: string): number | undefined {
+  const text = String(prompt || '').trim();
+  if (!text) return undefined;
+  let cap: number | undefined;
+  // Keep the tightest limit proposed so far.
+  const tighten = (limit: number): void => {
+    cap = cap === undefined ? limit : Math.min(cap, limit);
+  };
+  if (/\b(?:one|single)[-\s]?word\b/i.test(text)) {
+    tighten(48);
+  }
+  if (/\bone\s+(?:concise\s+|short\s+|brief\s+)?sentence\b/i.test(text)) {
+    tighten(180);
+  }
+  const bulletMatch = text.match(/\bexactly\s+(\d+)\s+bullet(?:\s+point)?s?\b/i);
+  if (bulletMatch) {
+    const bulletCount = Math.min(10, Math.max(1, Math.floor(Number(bulletMatch[1]) || 0)));
+    tighten(Math.max(180, Math.min(900, 140 + (bulletCount * 90))));
+  }
+  // The alternation is grouped so both word boundaries apply to every keyword.
+  // Ungrouped, `brief` matched inside "debrief" and `\bconcise` matched
+  // "conciseness", which capped prompts that never asked for brevity.
+  if (/\b(?:concise|brief|short)\b/i.test(text)) {
+    tighten(260);
+  }
+  return cap;
+}
 async function updateMemorySummary(params: {
   openrouter: OpenRouter;
   model: string;
@@ -686,6 +700,13 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
   const maxToolSteps = Number.isFinite(input.maxToolSteps)
     ? Math.max(1, Math.floor(input.maxToolSteps as number))
     : agent.tools.maxToolSteps;
+  const completionGuard = agent.tools.completionGuard;
+  const idempotentRetryAttempts = Math.max(1, Math.floor(completionGuard.idempotentRetryAttempts));
+  const idempotentRetryBackoffMs = Math.max(0, Math.floor(completionGuard.idempotentRetryBackoffMs));
+  const repeatedSignatureThreshold = Math.max(2, Math.floor(completionGuard.repeatedSignatureThreshold));
+  const repeatedRoundThreshold = Math.max(2, Math.floor(completionGuard.repeatedRoundThreshold));
+  const nonRetryableFailureThreshold = Math.max(1, Math.floor(completionGuard.nonRetryableFailureThreshold || 3));
+  const forceSynthesisAfterTools = completionGuard.forceSynthesisAfterTools !== false;
   const memoryExtractionEnabled = agent.memory.extraction.enabled;
   const isDaemon = process.env.DOTCLAW_DAEMON === '1';
   const memoryExtractionMaxMessages = agent.memory.extraction.maxMessages;
@@ -714,6 +735,27 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
     groupFolder: input.groupFolder,
     isMain: input.isMain
   }, agent.ipc);
+  const inputToolPolicy = (input.toolPolicy && typeof input.toolPolicy === 'object')
+    ? input.toolPolicy as { allow?: string[]; deny?: string[] }
+    : {};
+  const hasAllowPolicy = Array.isArray(inputToolPolicy.allow);
+  const allowedToolSet = new Set(
+    (hasAllowPolicy ? (inputToolPolicy.allow || []) : [])
+      .map((name) => String(name || '').trim().toLowerCase())
+      .filter(Boolean)
+  );
+  const deniedToolSet = new Set(
+    (inputToolPolicy.deny || [])
+      .map((name) => String(name || '').trim().toLowerCase())
+      .filter(Boolean)
+  );
+  const isToolAllowedByPolicy = (name: string): boolean => {
+    const normalized = String(name || '').trim().toLowerCase();
+    if (!normalized) return false;
+    if (deniedToolSet.has(normalized)) return false;
+    if (hasAllowPolicy && !allowedToolSet.has(normalized)) return false;
+    return true;
+  };
   const tools = createTools({
     chatJid: input.chatJid,
     groupFolder: input.groupFolder,
@@ -748,16 +790,28 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
         };
       };
       const mcp = await discoverMcpTools(agent, wrapMcp);
-      tools.push(...mcp.tools);
+      const filteredMcpTools = mcp.tools.filter(toolEntry => isToolAllowedByPolicy(toolEntry.function.name));
+      tools.push(...filteredMcpTools);
       mcpCleanup = mcp.cleanup;
-      if (mcp.tools.length > 0) {
-        log(`MCP: discovered ${mcp.tools.length} external tools`);
+      if (filteredMcpTools.length > 0) {
+        log(`MCP: discovered ${filteredMcpTools.length} external tools`);
       }
     } catch (err) {
       log(`MCP discovery failed: ${err instanceof Error ? err.message : String(err)}`);
     }
   }
+  const cleanupMcpConnections = async () => {
+    if (!mcpCleanup) return;
+    const cleanup = mcpCleanup;
+    mcpCleanup = null;
+    try {
+      await cleanup();
+    } catch {
+      // ignore cleanup errors
+    }
+  };
   // Build schema-only tools (no execute functions) for SDK — prevents the SDK from
   // auto-executing tools in its internal loop, which drops conversation context in
   // follow-up API calls (makeFollowupRequest only sends model output + tool results,
@@ -782,6 +836,7 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
   if (process.env.DOTCLAW_SELF_CHECK === '1') {
     try {
       const details = await runSelfCheck({ model });
+      await cleanupMcpConnections();
       return {
         status: 'success',
         result: `Self-check passed: ${details.join(', ')}`,
@@ -790,6 +845,7 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
     } catch (err) {
       const errorMessage = err instanceof Error ? err.message : String(err);
       log(`Self-check failed: ${errorMessage}`);
+      await cleanupMcpConnections();
       return {
         status: 'error',
         result: null,
@@ -821,6 +877,22 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
     }).join('\n');
     prompt = `${prompt}\n\n<latest_attachments>\n${attachmentSummary}\n</latest_attachments>`;
   }
+  const toolExecutionRequirement = detectToolExecutionRequirement(prompt);
+  const disableToolsForTurn = shouldDisableToolsForPrompt(prompt, toolExecutionRequirement.required);
+  const promptOutputCap = resolvePromptOutputCap(prompt);
+  const effectiveMaxOutputTokens = promptOutputCap
+    ? (
+      (typeof resolvedMaxOutputTokens === 'number' && Number.isFinite(resolvedMaxOutputTokens))
+        ? Math.max(64, Math.min(resolvedMaxOutputTokens, promptOutputCap))
+        : promptOutputCap
+    )
+    : resolvedMaxOutputTokens;
+  if (typeof effectiveMaxOutputTokens === 'number' && effectiveMaxOutputTokens !== resolvedMaxOutputTokens) {
+    log(`Applying prompt output cap: ${effectiveMaxOutputTokens} tokens`);
+  }
+  if (disableToolsForTurn) {
+    log('Prompt classified as conversation-recall: disabling tool schema for this turn');
+  }
   appendHistory(sessionCtx, 'user', prompt);
   let history = loadHistory(sessionCtx);
@@ -829,11 +901,11 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
     history = limitHistoryTurns(history, agent.context.maxHistoryTurns);
   }
-  // Dynamic context budget: if recentContextTokens is 0 (auto), allocate 50% of context to
-  // conversation history (matches OpenClaw's maxHistoryShare). System prompt gets up to 25%.
+  // Dynamic context budget: if recentContextTokens is 0 (auto), allocate 35% of context to
+  // conversation history, capped at 24K tokens for latency/throughput stability.
   const effectiveRecentTokens = config.recentContextTokens > 0
     ? config.recentContextTokens
-    : Math.floor(config.maxContextTokens * 0.50);
+    : Math.min(24_000, Math.floor(config.maxContextTokens * 0.35));
   const tokenRatio = tokenEstimate.tokensPerChar > 0 ? (0.25 / tokenEstimate.tokensPerChar) : 1;
   const adjustedRecentTokens = Math.max(1000, Math.floor(effectiveRecentTokens * tokenRatio));
@@ -970,7 +1042,8 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
   // Long-term memory is now tool-based (agent calls mcp__dotclaw__memory_search on demand).
   // Session recall removed — redundant with summary + facts + recent messages.
   const sessionRecallCount = 0;
-  const memoryRecallCount = input.memoryRecall ? input.memoryRecall.length : 0;
+  const memoryRecallCount = Array.isArray(input.memoryRecall) ? input.memoryRecall.length : 0;
+  const memoryRecallCountForOutput = input.memoryRecallAttempted ? memoryRecallCount : undefined;
   const sharedPromptDir = fs.existsSync(PROMPTS_DIR) ? PROMPTS_DIR : undefined;
   const taskPackResult = PROMPT_PACKS_ENABLED
@@ -1012,38 +1085,48 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
   if (memoryPolicyResult) promptPackVersions['memory-policy'] = memoryPolicyResult.pack.version;
   if (memoryRecallResult) promptPackVersions['memory-recall'] = memoryRecallResult.pack.version;
-  const resolveInstructions = (trimLevel = 0) => buildInstructions({
-    assistantName,
-    groupNotes: claudeNotes.group,
-    globalNotes: claudeNotes.global,
-    skillCatalog,
-    memorySummary: sessionCtx.state.summary,
-    memoryFacts: sessionCtx.state.facts,
-    sessionRecall: [],
-    longTermRecall: [],
-    userProfile: input.userProfile ?? null,
-    memoryStats: input.memoryStats,
-    availableGroups,
-    toolReliability: input.toolReliability,
-    behaviorConfig: input.behaviorConfig,
-    isScheduledTask: !!input.isScheduledTask,
-    taskId: input.taskId,
-    timezone: typeof input.timezone === 'string' ? input.timezone : undefined,
-    hostPlatform: typeof input.hostPlatform === 'string' ? input.hostPlatform : undefined,
-    messagingPlatform: input.chatJid?.includes(':') ? input.chatJid.split(':')[0] : undefined,
-    taskExtractionPack: taskPackResult?.pack || null,
-    responseQualityPack: responseQualityResult?.pack || null,
-    toolCallingPack: toolCallingResult?.pack || null,
-    toolOutcomePack: toolOutcomeResult?.pack || null,
-    memoryPolicyPack: memoryPolicyResult?.pack || null,
-    memoryRecallPack: memoryRecallResult?.pack || null,
-    maxToolSteps,
-    trimLevel
-  });
+  const resolveInstructions = (trimLevel = 0) => {
+    const base = buildInstructions({
+      assistantName,
+      groupNotes: claudeNotes.group,
+      globalNotes: claudeNotes.global,
+      skillCatalog,
+      memorySummary: sessionCtx.state.summary,
+      memoryFacts: sessionCtx.state.facts,
+      sessionRecall: [],
+      longTermRecall: [],
+      userProfile: input.userProfile ?? null,
+      memoryStats: input.memoryStats,
+      availableGroups,
+      toolReliability: input.toolReliability,
+      behaviorConfig: input.behaviorConfig,
+      isScheduledTask: !!input.isScheduledTask,
+      taskId: input.taskId,
+      timezone: typeof input.timezone === 'string' ? input.timezone : undefined,
+      hostPlatform: typeof input.hostPlatform === 'string' ? input.hostPlatform : undefined,
+      messagingPlatform: input.chatJid?.includes(':') ? input.chatJid.split(':')[0] : undefined,
+      taskExtractionPack: taskPackResult?.pack || null,
+      responseQualityPack: responseQualityResult?.pack || null,
+      toolCallingPack: toolCallingResult?.pack || null,
+      toolOutcomePack: toolOutcomeResult?.pack || null,
+      memoryPolicyPack: memoryPolicyResult?.pack || null,
+      memoryRecallPack: memoryRecallResult?.pack || null,
+      maxToolSteps,
+      trimLevel
+    });
+    if (!toolExecutionRequirement.required) return base;
+    const reason = toolExecutionRequirement.reason || 'required_tool_execution';
+    return `${base}\n\n[Tool Execution Requirement]\nThis request requires real tool execution (${reason}). Do not claim file/system/web actions unless matching tool calls in this turn succeeded. If tools fail, state the failure clearly and provide the best next action.`;
+  };
   const buildContext = () => {
-    // System prompt budget: 25% of context window
-    const maxSystemPromptTokens = Math.floor(config.maxContextTokens * 0.25);
+    // System prompt budget: keep prompt lean for lower p95 latency.
+    // Cap absolute size to avoid over-spending tokens on instructions.
+    const systemPromptShare = input.isScheduledTask ? 0.1 : 0.12;
+    const maxSystemPromptTokens = Math.max(
+      1200,
+      Math.min(6000, Math.floor(config.maxContextTokens * systemPromptShare))
+    );
     const MAX_TRIM_LEVEL = 4;
     let resolvedInstructions = '';
@@ -1062,7 +1145,10 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
     const outputReserve = resolvedMaxOutputTokens || Math.floor(config.maxContextTokens * 0.25);
     const resolvedMaxContext = Math.max(config.maxContextTokens - outputReserve - resolvedInstructionTokens, 2000);
-    const resolvedAdjusted = Math.max(1000, Math.floor(resolvedMaxContext * tokenRatio));
+    const resolvedAdjusted = Math.max(
+      1000,
+      Math.min(adjustedRecentTokens, Math.floor(resolvedMaxContext * tokenRatio))
+    );
     let { recentMessages: contextMessages } = splitRecentHistory(recentMessages, resolvedAdjusted, 6);
     contextMessages = clampContextMessages(contextMessages, tokenEstimate.tokensPerChar, resolvedMaxContextMessageTokens);
     contextMessages = pruneContextMessages(contextMessages, agent.context.contextPruning);
@@ -1077,9 +1163,65 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
   let completionTokens = 0;
   let promptTokens = 0;
   let latencyMs: number | undefined;
+  let toolRetryAttempts = 0;
+  let toolOutcomeVerificationForced = false;
+  let toolLoopBreakerTriggered = false;
+  let toolLoopBreakerReason: string | undefined;
   const modelChain = [model, ...(input.modelFallbacks || [])].slice(0, 3);
   let currentModel = model;
+  const toolTrimConfig = agent.context.contextPruning;
+  const toolSoftTrimMaxChars = Math.max(500, Math.floor(toolTrimConfig.softTrimMaxChars || 4000));
+  const toolSoftTrimHead = Math.max(100, Math.floor(toolTrimConfig.softTrimHeadChars || 1500));
+  const toolSoftTrimTail = Math.max(100, Math.floor(toolTrimConfig.softTrimTailChars || 1500));
+  const followupOutputMaxChars = Math.max(900, Math.floor(toolSoftTrimMaxChars * 0.75));
+  const followupArgumentMaxChars = Math.max(300, Math.floor(toolSoftTrimMaxChars * 0.25));
+  let streamSeq = 0;
+  if (input.streamDir) {
+    try {
+      fs.mkdirSync(input.streamDir, { recursive: true });
+    } catch {
+      // ignore stream dir creation failure; normal response still works
+    }
+  }
+  const writeStreamChunk = (text: string) => {
+    if (!input.streamDir) return;
+    streamSeq += 1;
+    const chunkFile = path.join(input.streamDir, `chunk_${String(streamSeq).padStart(6, '0')}.txt`);
+    const tmpFile = `${chunkFile}.tmp`;
+    try {
+      fs.writeFileSync(tmpFile, text);
+      fs.renameSync(tmpFile, chunkFile);
+    } catch (writeErr) {
+      log(`Stream write error at seq ${streamSeq}: ${writeErr instanceof Error ? writeErr.message : String(writeErr)}`);
+    }
+  };
+  const finalizeStream = () => {
+    if (!input.streamDir) return;
+    try {
+      const donePath = path.join(input.streamDir, 'done');
+      if (!fs.existsSync(donePath)) {
+        fs.writeFileSync(donePath, '');
+      }
+    } catch {
+      // ignore
+    }
+  };
+  const markStreamError = (errorMessage: string) => {
+    if (!input.streamDir) return;
+    try {
+      const donePath = path.join(input.streamDir, 'done');
+      if (!fs.existsSync(donePath)) {
+        fs.writeFileSync(path.join(input.streamDir, 'error'), errorMessage);
+      }
+    } catch {
+      // ignore
+    }
+  };
   try {
     const { instructions: resolvedInstructions, instructionsTokens: resolvedInstructionTokens, contextMessages } = buildContext();
@@ -1102,21 +1244,12 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
       }
     }
-    const contextInput = messagesToOpenRouter(contextMessages);
+    const contextInput = messagesToOpenRouterInput(contextMessages);
-    // Inject vision content into the last user message if images are present
-    const imageContent = loadImageAttachments(input.attachments);
-    if (imageContent.length > 0 && contextInput.length > 0) {
-      const lastMsg = contextInput[contextInput.length - 1];
-      if (lastMsg.role === 'user') {
-        // Convert string content to multi-modal content array
-        // eslint-disable-next-line @typescript-eslint/no-explicit-any
-        (lastMsg as any).content = [
-          { type: 'text', text: typeof lastMsg.content === 'string' ? lastMsg.content : '' },
-          ...imageContent
-        ];
-      }
-    }
+    // Inject vision content into the last user message if images are present.
+    // Uses OpenRouter Responses API content part types (input_text/input_image).
+    const imageContent = loadImageAttachmentsForInput(input.attachments, { log });
+    injectImagesIntoContextInput(contextInput, imageContent);
     let lastError: unknown = null;
     for (let attempt = 0; attempt < modelChain.length; attempt++) {
@@ -1128,9 +1261,9 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
       }
       if (attempt > 0) log(`Fallback ${attempt}: trying ${currentModel}`);
+      const startedAt = Date.now();
       try {
         log(`Starting OpenRouter call (${currentModel})...`);
-        const startedAt = Date.now();
         // ── Custom tool execution loop ──────────────────────────────────
         // The SDK's built-in tool loop (executeToolsIfNeeded) drops conversation
         // context in follow-up API calls — it only sends [function_calls, function_call_outputs]
@@ -1142,39 +1275,14 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
         // eslint-disable-next-line @typescript-eslint/no-explicit-any
         let conversationInput: any[] = [...contextInput];
         let step = 0;
-        let streamSeq = 0;
-        // Helper to write a stream chunk
-        const writeStreamChunk = (text: string) => {
-          if (!input.streamDir) return;
-          streamSeq++;
-          const chunkFile = path.join(input.streamDir, `chunk_${String(streamSeq).padStart(6, '0')}.txt`);
-          const tmpFile = chunkFile + '.tmp';
-          try {
-            fs.writeFileSync(tmpFile, text);
-            fs.renameSync(tmpFile, chunkFile);
-          } catch (writeErr) {
-            log(`Stream write error at seq ${streamSeq}: ${writeErr instanceof Error ? writeErr.message : String(writeErr)}`);
-          }
-        };
-        // Helper to finalize streaming
-        const finalizeStream = () => {
-          if (!input.streamDir) return;
-          try {
-            if (!fs.existsSync(path.join(input.streamDir, 'done'))) {
-              fs.writeFileSync(path.join(input.streamDir, 'done'), '');
-            }
-          } catch { /* ignore */ }
-        };
         // Initial call — uses streaming for real-time delivery
         const initialResult = openrouter.callModel({
           model: currentModel,
           instructions: resolvedInstructions,
           input: conversationInput,
-          tools: schemaTools,
-          maxOutputTokens: resolvedMaxOutputTokens,
+          tools: disableToolsForTurn ? undefined : schemaTools,
+          maxOutputTokens: effectiveMaxOutputTokens,
           temperature: config.temperature,
           reasoning: resolvedReasoning
         });
@@ -1182,13 +1290,12 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
         // Stream text from initial response
         if (input.streamDir) {
           try {
-            fs.mkdirSync(input.streamDir, { recursive: true });
             for await (const delta of initialResult.getTextStream()) {
               writeStreamChunk(delta);
             }
           } catch (streamErr) {
             log(`Stream error: ${streamErr instanceof Error ? streamErr.message : String(streamErr)}`);
-            try { fs.writeFileSync(path.join(input.streamDir, 'error'), streamErr instanceof Error ? streamErr.message : String(streamErr)); } catch { /* ignore */ }
+            markStreamError(streamErr instanceof Error ? streamErr.message : String(streamErr));
           }
         }
@@ -1200,15 +1307,188 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
         } catch (err) {
           const message = err instanceof Error ? err.message : String(err);
           log(`Initial getResponse failed: ${message}`);
-          finalizeStream();
           throw err;
         }
         responseText = extractTextFromApiResponse(lastResponse);
         let pendingCalls = extractFunctionCalls(lastResponse);
+        const callSignatureCounts = new Map<string, number>();
+        let previousRoundSignature = '';
+        let repeatedRoundCount = 0;
+        let runToolLoopBreakerTriggered = false;
+        let runToolLoopBreakerReason: string | undefined;
+        let runToolRetryAttempts = 0;
+        let runNonRetryableFailures = 0;
+        let runOutcomeVerificationForced = false;
+        const maxToolRequirementNudges = 2;
+        let toolRequirementNudgeAttempt = 0;
+        const nudgeReason = toolExecutionRequirement.reason || 'required_tool_execution';
+        const runDeterministicToolRequirementFallback = async (phase: 'pre_loop' | 'post_loop'): Promise<boolean> => { // Runs the required tools directly from the parsed prompt. Resolves true when it set responseText, false when nothing matched, an executor is missing, or a tool threw. `phase` is only used in log messages.
+          const createReadInstruction = parseCreateReadFileInstruction(prompt); // Strategy 1: "create a file, then read it" (parser is defined elsewhere; its matching rules are not visible here).
+          if (createReadInstruction) {
+            const writeExecutor = toolExecutors.get('Write');
+            const readExecutor = toolExecutors.get('Read');
+            if (!writeExecutor || !readExecutor) return false; // Both executors are needed; give up if either is missing.
+            runOutcomeVerificationForced = true; // Set before the attempt and not reset if the attempt fails.
+            log(`Tool requirement fallback (${phase}): deterministic create+read for ${createReadInstruction.path}`);
+            try {
+              await writeExecutor({
+                path: createReadInstruction.path,
+                content: createReadInstruction.lines.join('\n')
+              });
+              await readExecutor({ path: createReadInstruction.path }); // NOTE(review): the read-back result is discarded, not compared with the written content.
+              responseText = `Created file "${createReadInstruction.path}" with ${createReadInstruction.lines.length} lines and verified it by reading it back.`;
+              writeStreamChunk(responseText);
+              return true;
+            } catch (fallbackErr) { // Any error from either executor is logged and reported as "fallback not applied".
+              log(`Deterministic create+read fallback failed: ${fallbackErr instanceof Error ? fallbackErr.message : String(fallbackErr)}`);
+              return false;
+            }
+          }
+          const listReadInstruction = parseListReadNewestInstruction(prompt); // Strategy 2: glob a directory, rank files by mtime, read the newest one.
+          if (!listReadInstruction) return false; // Neither strategy matched the prompt.
+          const globExecutor = toolExecutors.get('Glob');
+          const readExecutor = toolExecutors.get('Read');
+          if (!globExecutor || !readExecutor) return false;
+          runOutcomeVerificationForced = true; // As above: stays true even if the attempt below fails.
+          log(`Tool requirement fallback (${phase}): deterministic list+read for ${listReadInstruction.directory}`);
+          try {
+            const normalizedDir = listReadInstruction.directory.replace(/\/+$/, ''); // Drop trailing slashes.
+            const globPattern = normalizedDir ? `${normalizedDir}/**/*` : '**/*'; // Recursive pattern; an empty directory string falls back to '**/*'.
+            const maxResults = Math.max(50, listReadInstruction.count * 20); // Request at least 50 matches, or 20x the requested count.
+            const globResult = await globExecutor({ pattern: globPattern, maxResults });
+            const matches = ( // Accept only an object result with an array `matches`; anything else yields [].
+              globResult &&
+              typeof globResult === 'object' &&
+              Array.isArray((globResult as { matches?: unknown }).matches)
+                ? (globResult as { matches: unknown[] }).matches
+                : []
+            ).map((item) => String(item || '').trim()).filter(Boolean); // Stringify, trim, and drop empty entries.
+            const rankedFiles = Array.from(new Set(matches)) // De-duplicate paths before stat-ing them.
+              .map((candidatePath) => {
+                try {
+                  const stat = fs.statSync(candidatePath); // NOTE(review): paths are used exactly as the Glob executor returned them; confirm they resolve from the process cwd.
+                  if (!stat.isFile()) return null; // Skip directories and other non-files.
+                  return { path: candidatePath, mtimeMs: stat.mtimeMs };
+                } catch { // Paths that cannot be stat-ed are skipped.
+                  return null;
+                }
+              })
+              .filter((entry): entry is { path: string; mtimeMs: number } => !!entry)
+              .sort((a, b) => b.mtimeMs - a.mtimeMs) // Newest first.
+              .slice(0, listReadInstruction.count); // Keep only the requested number of files.
+            if (rankedFiles.length === 0) { // Nothing to read: report that and treat the request as handled.
+              responseText = [
+                `- No files were found under \`${listReadInstruction.directory}\`.`,
+                '- I could not read a newest file because the directory appears empty.'
+              ].join('\n');
+              writeStreamChunk(responseText);
+              return true;
+            }
+            const newest = rankedFiles[0];
+            const readResult = await readExecutor({ path: newest.path });
+            const readContent = ( // Use `content` only when the result is an object with a string `content`; otherwise ''.
+              readResult &&
+              typeof readResult === 'object' &&
+              typeof (readResult as { content?: unknown }).content === 'string'
+                ? (readResult as { content: string }).content
+                : ''
+            ).trim();
+            const preview = readContent
+              ? readContent.split(/\r?\n/).map(line => line.trim()).filter(Boolean).slice(0, 3).join(' | ') // First three non-empty lines, trimmed and joined with ' | '.
+              : '[empty file]';
+            const relativePath = newest.path.startsWith(`${GROUP_DIR}/`) // Show the path without the GROUP_DIR prefix when it has one.
+              ? newest.path.slice(GROUP_DIR.length + 1)
+              : newest.path;
+            const newestBasenames = rankedFiles.map(entry => path.basename(entry.path)).join(', ');
+            const bulletCount = listReadInstruction.bulletCount || 2; // Falls back to 2 bullets when bulletCount is falsy.
+            const bulletLines = [
+              `- Newest file: \`${relativePath}\` (top ${rankedFiles.length} files from \`${listReadInstruction.directory}\`).`,
+              `- Preview: ${preview}.`,
+              `- Newest set: ${newestBasenames}.`
+            ].slice(0, bulletCount); // Only three bullets exist, so a larger bulletCount still yields three.
+            responseText = bulletLines.join('\n');
+            writeStreamChunk(responseText);
+            return true;
+          } catch (fallbackErr) { // Errors thrown by the Glob or Read executor are logged and reported as "fallback not applied".
+            log(`Deterministic list+read fallback failed: ${fallbackErr instanceof Error ? fallbackErr.message : String(fallbackErr)}`);
+            return false;
+          }
+        };
+        while (toolExecutionRequirement.required && pendingCalls.length === 0 && toolCalls.length === 0 && toolRequirementNudgeAttempt < maxToolRequirementNudges) {
+          toolRequirementNudgeAttempt += 1;
+          runOutcomeVerificationForced = true;
+          log(`Tool requirement nudge triggered (${nudgeReason}, attempt ${toolRequirementNudgeAttempt}/${maxToolRequirementNudges})`);
+          const nudgePrompt = buildToolExecutionNudgePrompt({
+            reason: nudgeReason,
+            attempt: toolRequirementNudgeAttempt
+          });
+          const responseItems = Array.isArray(lastResponse?.output) ? lastResponse.output : [];
+          conversationInput = [...conversationInput, ...responseItems, { role: 'user', content: nudgePrompt }];
+          try {
+            const nudgeResult = openrouter.callModel({
+              model: currentModel,
+              instructions: resolvedInstructions,
+              input: conversationInput,
+              tools: schemaTools,
+              maxOutputTokens: effectiveMaxOutputTokens,
+              temperature: Math.min(config.temperature, 0.1),
+              reasoning: { effort: 'low' as const }
+            });
+            lastResponse = await nudgeResult.getResponse();
+            const nudgeText = extractTextFromApiResponse(lastResponse);
+            if (nudgeText) {
+              responseText = nudgeText;
+              writeStreamChunk(nudgeText);
+            }
+            pendingCalls = extractFunctionCalls(lastResponse);
+          } catch (nudgeErr) {
+            log(`Tool requirement nudge failed: ${nudgeErr instanceof Error ? nudgeErr.message : String(nudgeErr)}`);
+            break;
+          }
+        }
+        if (toolExecutionRequirement.required && pendingCalls.length === 0 && toolCalls.length === 0) {
+          await runDeterministicToolRequirementFallback('pre_loop');
+        }
         // Tool execution loop — execute tools ourselves, include full context in follow-ups
         while (pendingCalls.length > 0 && step < maxToolSteps) {
+          const roundSignature = normalizeToolRoundSignature(pendingCalls);
+          if (roundSignature && roundSignature === previousRoundSignature) {
+            repeatedRoundCount += 1;
+          } else {
+            repeatedRoundCount = 1;
+            previousRoundSignature = roundSignature;
+          }
+          if (roundSignature && repeatedRoundCount >= repeatedRoundThreshold) {
+            runToolLoopBreakerTriggered = true;
+            runToolLoopBreakerReason = `repeated_round_signature(${repeatedRoundCount})`;
+            log(`Tool loop breaker triggered: ${runToolLoopBreakerReason}`);
+            break;
+          }
+          for (const fc of pendingCalls) {
+            const signature = normalizeToolCallSignature(fc);
+            const nextCount = (callSignatureCounts.get(signature) || 0) + 1;
+            callSignatureCounts.set(signature, nextCount);
+            if (nextCount >= repeatedSignatureThreshold) {
+              runToolLoopBreakerTriggered = true;
+              runToolLoopBreakerReason = `repeated_call_signature(${nextCount}): ${fc.name}`;
+              break;
+            }
+          }
+          if (runToolLoopBreakerTriggered) {
+            log(`Tool loop breaker triggered: ${runToolLoopBreakerReason || 'unknown_reason'}`);
+            break;
+          }
           log(`Step ${step}: executing ${pendingCalls.length} tool call(s): ${pendingCalls.map(c => c.name).join(', ')}`);
           // eslint-disable-next-line @typescript-eslint/no-explicit-any
@@ -1226,50 +1506,120 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
               continue;
             }
-            try {
-              // Calling the wrapped execute fires onToolCall/onToolResult callbacks
-              const result = await executor(fc.arguments);
-              toolResults.push({
-                type: 'function_call_output',
-                callId: fc.id,
-                output: JSON.stringify(result)
+            const normalizedArgs = normalizeToolCallArguments({
+              toolName: fc.name,
+              rawArguments: fc.arguments
+            });
+            if (normalizedArgs.malformedReason) {
+              const recoveryHint = buildMalformedArgumentsRecoveryHint({
+                toolName: fc.name,
+                malformedReason: normalizedArgs.malformedReason
               });
-            } catch (err) {
-              const error = err instanceof Error ? err.message : String(err);
+              const error = recoveryHint
+                ? `Malformed arguments for ${fc.name}: ${normalizedArgs.malformedReason}. ${recoveryHint}`
+                : `Malformed arguments for ${fc.name}: ${normalizedArgs.malformedReason}`;
               toolResults.push({
                 type: 'function_call_output',
                 callId: fc.id,
                 output: JSON.stringify({ error })
               });
+              toolOutputs.push({ name: fc.name, ok: false, error });
+              runNonRetryableFailures += 1;
+              if (runNonRetryableFailures >= nonRetryableFailureThreshold) {
+                runToolLoopBreakerTriggered = true;
+                runToolLoopBreakerReason = `non_retryable_failures(${runNonRetryableFailures})`;
+              }
+              step++;
+              if (runToolLoopBreakerTriggered) break;
+              continue;
+            }
+            let attemptNumber = 1;
+            // Retry only read/idempotent tools on transient failures.
+            for (;;) {
+              try {
+                // Calling the wrapped execute fires onToolCall/onToolResult callbacks.
+                const result = await executor(normalizedArgs.arguments);
+                toolResults.push({
+                  type: 'function_call_output',
+                  callId: fc.id,
+                  output: JSON.stringify(result)
+                });
+                break;
+              } catch (err) {
+                if (shouldRetryIdempotentToolCall({
+                  toolName: fc.name,
+                  error: err,
+                  attempt: attemptNumber,
+                  maxAttempts: idempotentRetryAttempts
+                })) {
+                  runToolRetryAttempts += 1;
+                  const delayMs = Math.min(2_000, idempotentRetryBackoffMs * attemptNumber);
+                  log(`Retrying idempotent tool ${fc.name} after transient error (attempt ${attemptNumber + 1}/${idempotentRetryAttempts})`);
+                  if (delayMs > 0) {
+                    await sleep(delayMs);
+                  }
+                  attemptNumber += 1;
+                  continue;
+                }
+                const error = err instanceof Error ? err.message : String(err);
+                toolResults.push({
+                  type: 'function_call_output',
+                  callId: fc.id,
+                  output: JSON.stringify({ error })
+                });
+                if (isNonRetryableToolError(error)) {
+                  runNonRetryableFailures += 1;
+                  if (runNonRetryableFailures >= nonRetryableFailureThreshold) {
+                    runToolLoopBreakerTriggered = true;
+                    runToolLoopBreakerReason = `non_retryable_failures(${runNonRetryableFailures})`;
+                  }
+                }
+                break;
+              }
             }
             step++;
+            if (runToolLoopBreakerTriggered) break;
+          }
+          if (runToolLoopBreakerTriggered) {
+            log(`Tool loop breaker triggered: ${runToolLoopBreakerReason || 'unknown_reason'}`);
+            break;
           }
           // Build follow-up input with FULL conversation context:
           // original messages + model output + tool results (accumulated each round)
           conversationInput = [...conversationInput, ...lastResponse.output, ...toolResults];
+          // Compact oversized tool payloads before follow-up calls to reduce context bloat.
+          const compactedConversation = compactToolConversationItems(conversationInput, {
+            maxOutputChars: followupOutputMaxChars,
+            outputHeadChars: Math.min(toolSoftTrimHead, Math.floor(followupOutputMaxChars * 0.6)),
+            outputTailChars: Math.min(toolSoftTrimTail, Math.floor(followupOutputMaxChars * 0.3)),
+            maxArgumentChars: followupArgumentMaxChars,
+          });
+          conversationInput = compactedConversation.items as typeof conversationInput;
+          if (compactedConversation.compacted > 0) {
+            log(`Tool loop: compacted ${compactedConversation.compacted} oversized payload(s)`);
+          }
           // Phase 1: Soft-trim oversized tool results (like OpenClaw's context-pruning extension).
           // Replace large tool result content with head+tail, preserving pair integrity.
           // PROTECT the most recent round's tool results — only trim older ones.
-          const SOFT_TRIM_MAX_CHARS = 4000;
-          const SOFT_TRIM_HEAD = 1500;
-          const SOFT_TRIM_TAIL = 1500;
           const protectedStart = conversationInput.length - toolResults.length;
           for (let idx = 0; idx < protectedStart; idx++) {
             // eslint-disable-next-line @typescript-eslint/no-explicit-any
             const anyItem = conversationInput[idx] as any;
-            if (anyItem?.type === 'function_call_output' && typeof anyItem.output === 'string' && anyItem.output.length > SOFT_TRIM_MAX_CHARS) {
+            if (anyItem?.type === 'function_call_output' && typeof anyItem.output === 'string' && anyItem.output.length > toolSoftTrimMaxChars) {
               const orig = anyItem.output;
-              anyItem.output = orig.slice(0, SOFT_TRIM_HEAD) + '\n...\n' + orig.slice(-SOFT_TRIM_TAIL)
-                + `\n[Tool result trimmed: kept first ${SOFT_TRIM_HEAD} and last ${SOFT_TRIM_TAIL} of ${orig.length} chars.]`;
+              anyItem.output = orig.slice(0, toolSoftTrimHead) + '\n...\n' + orig.slice(-toolSoftTrimTail)
+                + `\n[Tool result trimmed: kept first ${toolSoftTrimHead} and last ${toolSoftTrimTail} of ${orig.length} chars.]`;
             }
           }
           // Phase 2: If still over budget, remove only initial context messages (role/content items
           // without a 'type' field). NEVER remove function_call or function_call_output items —
           // orphaning either side of a pair causes API 400 errors.
-          const followupTokenLimit = Math.floor(config.maxContextTokens * 0.6);
+          const followupTokenLimit = Math.floor(config.maxContextTokens * 0.45);
           // eslint-disable-next-line @typescript-eslint/no-explicit-any
           const estimateInputTokens = (items: any[]) => items.reduce((sum: number, item: any) => {
             const content = typeof item === 'string' ? item : JSON.stringify(item);
@@ -1298,7 +1648,7 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
             instructions: resolvedInstructions,
             input: conversationInput,
             tools: schemaTools,
-            maxOutputTokens: resolvedMaxOutputTokens,
+            maxOutputTokens: effectiveMaxOutputTokens,
             temperature: config.temperature,
             reasoning: resolvedReasoning
           });
@@ -1335,7 +1685,7 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
                     instructions: resolvedInstructions,
                     input: conversationInput,
                     tools: schemaTools,
-                    maxOutputTokens: resolvedMaxOutputTokens,
+                    maxOutputTokens: effectiveMaxOutputTokens,
                     temperature: config.temperature,
                     reasoning: resolvedReasoning
                   });
@@ -1368,8 +1718,78 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
           pendingCalls = extractFunctionCalls(lastResponse);
         }
+        if (toolExecutionRequirement.required && toolCalls.length === 0) {
+          const fallbackApplied = await runDeterministicToolRequirementFallback('post_loop');
+          if (fallbackApplied) {
+            pendingCalls = [];
+          }
+        }
+        if (toolExecutionRequirement.required && toolCalls.length === 0) {
+          runOutcomeVerificationForced = true;
+          responseText = 'I could not execute the required tools for this request, so I cannot safely claim completion.';
+          writeStreamChunk(responseText);
+        }
+        const unresolvedCalls = pendingCalls.slice();
+        if (forceSynthesisAfterTools && toolCalls.length > 0 && (runToolLoopBreakerTriggered || unresolvedCalls.length > 0 || !responseText.trim())) {
+          runOutcomeVerificationForced = true;
+          const synthesisReason = runToolLoopBreakerTriggered
+            ? `stuck_loop:${runToolLoopBreakerReason || 'unknown'}`
+            : (unresolvedCalls.length > 0 ? 'unresolved_tool_calls' : 'empty_after_tools');
+          log(`Tool outcome verifier forcing synthesis (${synthesisReason})`);
+          const continuationPrompt = buildForcedSynthesisPrompt({
+            reason: synthesisReason,
+            pendingCalls: unresolvedCalls,
+            toolOutputs
+          });
+          conversationInput = [...conversationInput, { role: 'user', content: continuationPrompt }];
+          try {
+            const synthesisResult = openrouter.callModel({
+              model: currentModel,
+              instructions: resolvedInstructions,
+              input: conversationInput,
+              maxOutputTokens: effectiveMaxOutputTokens,
+              temperature: config.temperature,
+              reasoning: resolvedReasoning
+            });
+            const synthesisResponse = await synthesisResult.getResponse();
+            const synthesisText = extractTextFromApiResponse(synthesisResponse);
+            if (synthesisText && synthesisText.trim()) {
+              responseText = synthesisText;
+              writeStreamChunk(synthesisText);
+            }
+          } catch (synthesisErr) {
+            log(`Forced synthesis failed: ${synthesisErr instanceof Error ? synthesisErr.message : String(synthesisErr)}`);
+          }
+          if (!responseText || !responseText.trim()) {
+            responseText = buildToolOutcomeFallback({
+              reason: synthesisReason,
+              toolOutputs,
+              pendingCalls: unresolvedCalls
+            });
+            writeStreamChunk(responseText);
+          }
+        }
+        if (!responseText || !responseText.trim()) {
+          responseText = toolCalls.length > 0
+            ? 'I completed tool execution but received an empty model response. Please retry, and I will continue from this context.'
+            : 'I could not produce a response for that request. Please retry, and I will continue from this context.';
+          writeStreamChunk(responseText);
+        }
         finalizeStream();
         latencyMs = Date.now() - startedAt;
+        toolRetryAttempts += runToolRetryAttempts;
+        if (runOutcomeVerificationForced) {
+          toolOutcomeVerificationForced = true;
+        }
+        if (runToolLoopBreakerTriggered) {
+          toolLoopBreakerTriggered = true;
+          toolLoopBreakerReason = runToolLoopBreakerReason;
+        }
         if (responseText && responseText.trim()) {
           log(`Model returned text response (${responseText.length} chars, ${step} tool steps)`);
@@ -1391,10 +1811,13 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
         // rebuild system prompt at max trim level, then retry.
         if (errClass === 'context_overflow' && contextMessages.length > 4) {
           log(`Context overflow on ${currentModel}, emergency compaction + max trim`);
-          // Split: keep last 4 messages, compact the rest via summary
-          const keepCount = Math.min(4, contextMessages.length);
-          const toCompact = contextMessages.slice(0, contextMessages.length - keepCount);
-          const toKeep = contextMessages.slice(-keepCount);
+          const recoveryPlan = buildContextOverflowRecoveryPlan({
+            contextMessages: contextMessages.map(msg => ({ role: msg.role, content: msg.content })),
+            emergencySummary: null,
+            keepRecentCount: 4
+          });
+          const toCompact = recoveryPlan.toCompact;
+          const toKeep = recoveryPlan.toKeep;
           let emergencySummary = '';
           if (toCompact.length > 0) {
             try {
@@ -1420,22 +1843,30 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
           }
           // Rebuild system prompt at max trim level (includes updated summary)
           const minInstructions = resolveInstructions(4);
-          // Build trimmed input: summary context + recent messages
-          const compactedInput = emergencySummary
-            ? [{ role: 'user' as const, content: `[Previous conversation summary: ${emergencySummary}]` }, ...toKeep.map(m => ({ role: m.role, content: m.content }))]
-            : toKeep.map(m => ({ role: m.role, content: m.content }));
+          const compactedInput = buildContextOverflowRecoveryPlan({
+            contextMessages: toKeep,
+            emergencySummary,
+            keepRecentCount: Math.max(1, toKeep.length)
+          }).retryInput;
           try {
             const retryResult = openrouter.callModel({
               model: currentModel,
               instructions: minInstructions,
               input: compactedInput,
               tools: schemaTools,
-              maxOutputTokens: resolvedMaxOutputTokens,
+              maxOutputTokens: effectiveMaxOutputTokens,
               temperature: config.temperature,
               reasoning: resolvedReasoning
             });
             const retryResponse = await retryResult.getResponse();
             responseText = extractTextFromApiResponse(retryResponse) || '';
+            if (responseText) {
+              writeStreamChunk(responseText);
+            }
+            finalizeStream();
+            latencyMs = Date.now() - startedAt;
+            completionTokens = estimateTokensForModel(responseText || '', tokenEstimate.tokensPerChar);
+            promptTokens = resolvedPromptTokens;
             lastError = null;
             break;
           } catch (retryErr) {
@@ -1459,6 +1890,8 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
     const errorMessage = err instanceof Error ? err.message : String(err);
     const allFailed = modelChain.length > 1 ? `All models failed. Last error: ${errorMessage}` : errorMessage;
     log(`Agent error: ${allFailed}`);
+    markStreamError(allFailed);
+    await cleanupMcpConnections();
     return {
       status: 'error',
       result: null,
@@ -1470,12 +1903,16 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
       memory_facts: sessionCtx.state.facts,
       tokens_prompt: promptTokens,
       tokens_completion: completionTokens,
-      memory_recall_count: memoryRecallCount,
+      memory_recall_count: memoryRecallCountForOutput,
       session_recall_count: sessionRecallCount,
       memory_items_upserted: memoryItemsUpserted,
       memory_items_extracted: memoryItemsExtracted,
       timings: Object.keys(timings).length > 0 ? timings : undefined,
       tool_calls: toolCalls.length > 0 ? toolCalls : undefined,
+      tool_retry_attempts: toolRetryAttempts || undefined,
+      tool_outcome_verification_forced: toolOutcomeVerificationForced || undefined,
+      tool_loop_breaker_triggered: toolLoopBreakerTriggered || undefined,
+      tool_loop_breaker_reason: toolLoopBreakerReason,
       latency_ms: latencyMs
     };
   }
@@ -1569,10 +2006,7 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
     }
   }
-  // Cleanup MCP connections
-  if (mcpCleanup) {
-    try { await mcpCleanup(); } catch { /* ignore cleanup errors */ }
-  }
+  await cleanupMcpConnections();
   return {
     status: 'success',
@@ -1584,12 +2018,16 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
     memory_facts: sessionCtx.state.facts,
     tokens_prompt: promptTokens,
     tokens_completion: completionTokens,
-    memory_recall_count: memoryRecallCount,
+    memory_recall_count: memoryRecallCountForOutput,
     session_recall_count: sessionRecallCount,
     memory_items_upserted: memoryItemsUpserted,
     memory_items_extracted: memoryItemsExtracted,
     timings: Object.keys(timings).length > 0 ? timings : undefined,
     tool_calls: toolCalls.length > 0 ? toolCalls : undefined,
+    tool_retry_attempts: toolRetryAttempts || undefined,
+    tool_outcome_verification_forced: toolOutcomeVerificationForced || undefined,
+    tool_loop_breaker_triggered: toolLoopBreakerTriggered || undefined,
+    tool_loop_breaker_reason: toolLoopBreakerReason,
     latency_ms: latencyMs,
     replyToId
   };