@dotsetlabs/dotclaw 2.4.0 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. package/.env.example +9 -10
  2. package/README.md +8 -4
  3. package/config-examples/runtime.json +34 -8
  4. package/config-examples/tool-policy.json +12 -2
  5. package/container/agent-runner/package-lock.json +2 -2
  6. package/container/agent-runner/package.json +1 -1
  7. package/container/agent-runner/src/agent-config.ts +19 -3
  8. package/container/agent-runner/src/container-protocol.ts +11 -0
  9. package/container/agent-runner/src/context-overflow-recovery.ts +39 -0
  10. package/container/agent-runner/src/index.ts +603 -165
  11. package/container/agent-runner/src/openrouter-input.ts +159 -0
  12. package/container/agent-runner/src/system-prompt.ts +13 -3
  13. package/container/agent-runner/src/tool-loop-policy.ts +741 -0
  14. package/container/agent-runner/src/tools.ts +211 -8
  15. package/dist/agent-context.d.ts +1 -0
  16. package/dist/agent-context.d.ts.map +1 -1
  17. package/dist/agent-context.js +21 -9
  18. package/dist/agent-context.js.map +1 -1
  19. package/dist/agent-execution.d.ts +2 -0
  20. package/dist/agent-execution.d.ts.map +1 -1
  21. package/dist/agent-execution.js +164 -15
  22. package/dist/agent-execution.js.map +1 -1
  23. package/dist/agent-semaphore.d.ts +24 -1
  24. package/dist/agent-semaphore.d.ts.map +1 -1
  25. package/dist/agent-semaphore.js +109 -20
  26. package/dist/agent-semaphore.js.map +1 -1
  27. package/dist/cli.js +3 -11
  28. package/dist/cli.js.map +1 -1
  29. package/dist/config.d.ts +2 -0
  30. package/dist/config.d.ts.map +1 -1
  31. package/dist/config.js +2 -0
  32. package/dist/config.js.map +1 -1
  33. package/dist/container-protocol.d.ts +22 -0
  34. package/dist/container-protocol.d.ts.map +1 -1
  35. package/dist/container-protocol.js.map +1 -1
  36. package/dist/container-runner.d.ts +7 -0
  37. package/dist/container-runner.d.ts.map +1 -1
  38. package/dist/container-runner.js +417 -143
  39. package/dist/container-runner.js.map +1 -1
  40. package/dist/db.d.ts.map +1 -1
  41. package/dist/db.js +46 -12
  42. package/dist/db.js.map +1 -1
  43. package/dist/error-messages.d.ts.map +1 -1
  44. package/dist/error-messages.js +18 -4
  45. package/dist/error-messages.js.map +1 -1
  46. package/dist/failover-policy.d.ts +41 -0
  47. package/dist/failover-policy.d.ts.map +1 -0
  48. package/dist/failover-policy.js +261 -0
  49. package/dist/failover-policy.js.map +1 -0
  50. package/dist/index.js +1 -0
  51. package/dist/index.js.map +1 -1
  52. package/dist/ipc-dispatcher.d.ts.map +1 -1
  53. package/dist/ipc-dispatcher.js +27 -43
  54. package/dist/ipc-dispatcher.js.map +1 -1
  55. package/dist/mcp-config.d.ts +22 -0
  56. package/dist/mcp-config.d.ts.map +1 -0
  57. package/dist/mcp-config.js +94 -0
  58. package/dist/mcp-config.js.map +1 -0
  59. package/dist/memory-backend.d.ts +27 -0
  60. package/dist/memory-backend.d.ts.map +1 -0
  61. package/dist/memory-backend.js +112 -0
  62. package/dist/memory-backend.js.map +1 -0
  63. package/dist/memory-recall.d.ts.map +1 -1
  64. package/dist/memory-recall.js +135 -22
  65. package/dist/memory-recall.js.map +1 -1
  66. package/dist/memory-store.d.ts +1 -0
  67. package/dist/memory-store.d.ts.map +1 -1
  68. package/dist/memory-store.js +55 -7
  69. package/dist/memory-store.js.map +1 -1
  70. package/dist/message-pipeline.d.ts +24 -0
  71. package/dist/message-pipeline.d.ts.map +1 -1
  72. package/dist/message-pipeline.js +131 -27
  73. package/dist/message-pipeline.js.map +1 -1
  74. package/dist/metrics.d.ts +1 -0
  75. package/dist/metrics.d.ts.map +1 -1
  76. package/dist/metrics.js +9 -0
  77. package/dist/metrics.js.map +1 -1
  78. package/dist/providers/discord/discord-provider.d.ts.map +1 -1
  79. package/dist/providers/discord/discord-provider.js +72 -4
  80. package/dist/providers/discord/discord-provider.js.map +1 -1
  81. package/dist/providers/telegram/telegram-provider.d.ts.map +1 -1
  82. package/dist/providers/telegram/telegram-provider.js +65 -3
  83. package/dist/providers/telegram/telegram-provider.js.map +1 -1
  84. package/dist/recall-policy.d.ts +12 -0
  85. package/dist/recall-policy.d.ts.map +1 -0
  86. package/dist/recall-policy.js +89 -0
  87. package/dist/recall-policy.js.map +1 -0
  88. package/dist/runtime-config.d.ts +33 -0
  89. package/dist/runtime-config.d.ts.map +1 -1
  90. package/dist/runtime-config.js +109 -9
  91. package/dist/runtime-config.js.map +1 -1
  92. package/dist/streaming.d.ts.map +1 -1
  93. package/dist/streaming.js +125 -33
  94. package/dist/streaming.js.map +1 -1
  95. package/dist/task-scheduler.d.ts.map +1 -1
  96. package/dist/task-scheduler.js +4 -2
  97. package/dist/task-scheduler.js.map +1 -1
  98. package/dist/tool-policy.d.ts.map +1 -1
  99. package/dist/tool-policy.js +26 -4
  100. package/dist/tool-policy.js.map +1 -1
  101. package/dist/trace-writer.d.ts +12 -0
  102. package/dist/trace-writer.d.ts.map +1 -1
  103. package/dist/trace-writer.js.map +1 -1
  104. package/dist/turn-hygiene.d.ts +14 -0
  105. package/dist/turn-hygiene.d.ts.map +1 -0
  106. package/dist/turn-hygiene.js +214 -0
  107. package/dist/turn-hygiene.js.map +1 -0
  108. package/dist/webhook.d.ts.map +1 -1
  109. package/dist/webhook.js +1 -0
  110. package/dist/webhook.js.map +1 -1
  111. package/package.json +15 -1
  112. package/scripts/benchmark-baseline.js +365 -0
  113. package/scripts/benchmark-harness.js +1413 -0
  114. package/scripts/benchmark-scenarios.js +301 -0
  115. package/scripts/canary-suite.js +123 -0
  116. package/scripts/generate-controlled-traces.js +230 -0
  117. package/scripts/release-slo-check.js +214 -0
  118. package/scripts/run-live-canary.js +339 -0
@@ -33,6 +33,27 @@ import {
33
33
  import { loadPromptPackWithCanary, formatPromptPack, PromptPack } from './prompt-packs.js';
34
34
  import { buildSkillCatalog, type SkillCatalog } from './skill-loader.js';
35
35
  import { buildSystemPrompt } from './system-prompt.js';
36
+ import { buildContextOverflowRecoveryPlan } from './context-overflow-recovery.js';
37
+ import {
38
+ buildForcedSynthesisPrompt,
39
+ buildToolExecutionNudgePrompt,
40
+ buildToolOutcomeFallback,
41
+ compactToolConversationItems,
42
+ detectToolExecutionRequirement,
43
+ buildMalformedArgumentsRecoveryHint,
44
+ isNonRetryableToolError,
45
+ normalizeToolCallArguments,
46
+ normalizeToolCallSignature,
47
+ normalizeToolRoundSignature,
48
+ parseCreateReadFileInstruction,
49
+ parseListReadNewestInstruction,
50
+ shouldRetryIdempotentToolCall,
51
+ } from './tool-loop-policy.js';
52
+ import {
53
+ injectImagesIntoContextInput,
54
+ loadImageAttachmentsForInput,
55
+ messagesToOpenRouterInput,
56
+ } from './openrouter-input.js';
36
57
 
37
58
  type OpenRouterResult = ReturnType<OpenRouter['callModel']>;
38
59
 
@@ -143,6 +164,10 @@ function log(message: string): void {
143
164
  console.error(`[agent-runner] ${message}`);
144
165
  }
145
166
 
167
/**
 * Returns a promise that settles after a timer of roughly `ms` milliseconds.
 *
 * Negative delays are clamped to zero before the timer is scheduled.
 *
 * @param ms - Requested delay in milliseconds.
 * @returns A promise that resolves with no value once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    const delayMs = Math.max(0, ms);
    setTimeout(done, delayMs);
  });
}
170
+
146
171
  function classifyError(err: unknown): 'retryable' | 'context_overflow' | null {
147
172
  const msg = err instanceof Error ? err.message : String(err);
148
173
  const lower = msg.toLowerCase();
@@ -494,55 +519,6 @@ function loadClaudeNotes(): { group: string | null; global: string | null } {
494
519
  };
495
520
  }
496
521
 
497
-
498
- // ── Image/Vision support ──────────────────────────────────────────────
499
-
500
- const MAX_IMAGE_BYTES = 5 * 1024 * 1024; // 5MB per image
501
- const MAX_TOTAL_IMAGE_BYTES = 20 * 1024 * 1024; // 20MB total across all images
502
- const IMAGE_MIME_TYPES = new Set(['image/jpeg', 'image/png', 'image/gif', 'image/webp']);
503
-
504
- function loadImageAttachments(attachments?: ContainerInput['attachments']): Array<{
505
- type: 'image_url';
506
- image_url: { url: string };
507
- }> {
508
- if (!attachments) return [];
509
- const images: Array<{ type: 'image_url'; image_url: { url: string } }> = [];
510
- let totalBytes = 0;
511
- for (const att of attachments) {
512
- if (att.type !== 'photo') continue;
513
- const mime = att.mime_type || 'image/jpeg';
514
- if (!IMAGE_MIME_TYPES.has(mime)) continue;
515
- try {
516
- const stat = fs.statSync(att.path);
517
- if (stat.size > MAX_IMAGE_BYTES) {
518
- log(`Skipping image ${att.path}: ${stat.size} bytes exceeds ${MAX_IMAGE_BYTES}`);
519
- continue;
520
- }
521
- if (totalBytes + stat.size > MAX_TOTAL_IMAGE_BYTES) {
522
- log(`Skipping image ${att.path}: cumulative size would exceed ${MAX_TOTAL_IMAGE_BYTES}`);
523
- break;
524
- }
525
- const data = fs.readFileSync(att.path);
526
- totalBytes += data.length;
527
- const b64 = data.toString('base64');
528
- images.push({
529
- type: 'image_url',
530
- image_url: { url: `data:${mime};base64,${b64}` }
531
- });
532
- } catch (err) {
533
- log(`Failed to load image ${att.path}: ${err instanceof Error ? err.message : err}`);
534
- }
535
- }
536
- return images;
537
- }
538
-
539
- function messagesToOpenRouter(messages: Message[]) {
540
- return messages.map(message => ({
541
- role: message.role,
542
- content: message.content
543
- }));
544
- }
545
-
546
522
  function clampContextMessages(messages: Message[], tokensPerChar: number, maxTokens: number): Message[] {
547
523
  if (!Number.isFinite(maxTokens) || maxTokens <= 0) return messages;
548
524
  const tpc = tokensPerChar > 0 ? tokensPerChar : 0.25;
@@ -560,6 +536,44 @@ function clampContextMessages(messages: Message[], tokensPerChar: number, maxTok
560
536
  });
561
537
  }
562
538
 
539
/**
 * Decides whether the tool schema should be withheld for a prompt.
 *
 * Returns `true` only when tool execution is not required and the trimmed
 * prompt matches one of the conversation-recall patterns listed below.
 *
 * @param prompt - Prompt text to classify; falsy values are treated as empty.
 * @param toolRequired - When `true`, the result is always `false`.
 * @returns `true` if any recall pattern matches, otherwise `false`.
 */
function shouldDisableToolsForPrompt(prompt: string, toolRequired: boolean): boolean {
  if (toolRequired) return false;

  const normalized = String(prompt || '').trim();
  if (normalized.length === 0) return false;

  // None of these use the `g` flag, so `test` is stateless.
  const recallPatterns: RegExp[] = [
    // Explicit memory scenario tags such as "[memory]" or "[scenario:memory_carryover]".
    /\[(?:scenario:)?memory(?:_carryover)?\]/i,
    // "from this conversation", "from our same chat", ...
    /\bfrom\s+(?:this|our)\s+(?:same\s+)?(?:conversation|chat)\b/i,
    // "what did I just ...", "what did you just ..."
    /\bwhat\s+did\s+(?:i|you)\s+just\b/i,
    // "earlier in this conversation", "earlier in chat", ...
    /\bearlier\s+in\s+(?:this\s+)?(?:conversation|chat)\b/i,
  ];

  return recallPatterns.some((pattern) => pattern.test(normalized));
}
549
+
550
/**
 * Derives an output-size cap from brevity cues in the prompt text.
 *
 * Each matching cue contributes a candidate cap and the smallest one wins:
 * - "one word" / "single-word"             -> 48
 * - "one [concise|short|brief] sentence"   -> 180
 * - "exactly N bullet(s)/bullet point(s)"  -> 140 + 90 * N, with N clamped to
 *   1..10 and the result clamped to 180..900
 * - the standalone words "concise(ly)", "brief(ly)" or "short" -> 260
 *
 * The generic brevity cue is matched on whole words only. The earlier
 * pattern `/\bconcise|brief|short\b/` bound the word boundaries to the
 * outer alternatives alone, so the bare substring "brief" capped prompts
 * containing unrelated words such as "briefcase" or "debrief".
 *
 * @param prompt - Prompt text to inspect; falsy values are treated as empty.
 * @returns The smallest applicable cap, or `undefined` when no cue matches.
 */
function resolvePromptOutputCap(prompt: string): number | undefined {
  const text = String(prompt || '').trim();
  if (!text) return undefined;

  const caps: number[] = [];

  if (/\b(?:one|single)[-\s]?word\b/i.test(text)) {
    caps.push(48);
  }

  if (/\bone\s+(?:concise\s+|short\s+|brief\s+)?sentence\b/i.test(text)) {
    caps.push(180);
  }

  const bulletMatch = text.match(/\bexactly\s+(\d+)\s+bullet(?:\s+point)?s?\b/i);
  if (bulletMatch) {
    // A zero or unparsable count is treated as one bullet; large counts stop at ten.
    const bulletCount = Math.min(10, Math.max(1, Math.floor(Number(bulletMatch[1]) || 0)));
    caps.push(Math.max(180, Math.min(900, 140 + bulletCount * 90)));
  }

  // Group the alternation so both word boundaries apply to every alternative.
  if (/\b(?:concise(?:ly)?|brief(?:ly)?|short)\b/i.test(text)) {
    caps.push(260);
  }

  return caps.length > 0 ? Math.min(...caps) : undefined;
}
576
+
563
577
  async function updateMemorySummary(params: {
564
578
  openrouter: OpenRouter;
565
579
  model: string;
@@ -686,6 +700,13 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
686
700
  const maxToolSteps = Number.isFinite(input.maxToolSteps)
687
701
  ? Math.max(1, Math.floor(input.maxToolSteps as number))
688
702
  : agent.tools.maxToolSteps;
703
+ const completionGuard = agent.tools.completionGuard;
704
+ const idempotentRetryAttempts = Math.max(1, Math.floor(completionGuard.idempotentRetryAttempts));
705
+ const idempotentRetryBackoffMs = Math.max(0, Math.floor(completionGuard.idempotentRetryBackoffMs));
706
+ const repeatedSignatureThreshold = Math.max(2, Math.floor(completionGuard.repeatedSignatureThreshold));
707
+ const repeatedRoundThreshold = Math.max(2, Math.floor(completionGuard.repeatedRoundThreshold));
708
+ const nonRetryableFailureThreshold = Math.max(1, Math.floor(completionGuard.nonRetryableFailureThreshold || 3));
709
+ const forceSynthesisAfterTools = completionGuard.forceSynthesisAfterTools !== false;
689
710
  const memoryExtractionEnabled = agent.memory.extraction.enabled;
690
711
  const isDaemon = process.env.DOTCLAW_DAEMON === '1';
691
712
  const memoryExtractionMaxMessages = agent.memory.extraction.maxMessages;
@@ -714,6 +735,27 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
714
735
  groupFolder: input.groupFolder,
715
736
  isMain: input.isMain
716
737
  }, agent.ipc);
738
+ const inputToolPolicy = (input.toolPolicy && typeof input.toolPolicy === 'object')
739
+ ? input.toolPolicy as { allow?: string[]; deny?: string[] }
740
+ : {};
741
+ const hasAllowPolicy = Array.isArray(inputToolPolicy.allow);
742
+ const allowedToolSet = new Set(
743
+ (hasAllowPolicy ? (inputToolPolicy.allow || []) : [])
744
+ .map((name) => String(name || '').trim().toLowerCase())
745
+ .filter(Boolean)
746
+ );
747
+ const deniedToolSet = new Set(
748
+ (inputToolPolicy.deny || [])
749
+ .map((name) => String(name || '').trim().toLowerCase())
750
+ .filter(Boolean)
751
+ );
752
+ const isToolAllowedByPolicy = (name: string): boolean => {
753
+ const normalized = String(name || '').trim().toLowerCase();
754
+ if (!normalized) return false;
755
+ if (deniedToolSet.has(normalized)) return false;
756
+ if (hasAllowPolicy && !allowedToolSet.has(normalized)) return false;
757
+ return true;
758
+ };
717
759
  const tools = createTools({
718
760
  chatJid: input.chatJid,
719
761
  groupFolder: input.groupFolder,
@@ -748,16 +790,28 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
748
790
  };
749
791
  };
750
792
  const mcp = await discoverMcpTools(agent, wrapMcp);
751
- tools.push(...mcp.tools);
793
+ const filteredMcpTools = mcp.tools.filter(toolEntry => isToolAllowedByPolicy(toolEntry.function.name));
794
+ tools.push(...filteredMcpTools);
752
795
  mcpCleanup = mcp.cleanup;
753
- if (mcp.tools.length > 0) {
754
- log(`MCP: discovered ${mcp.tools.length} external tools`);
796
+ if (filteredMcpTools.length > 0) {
797
+ log(`MCP: discovered ${filteredMcpTools.length} external tools`);
755
798
  }
756
799
  } catch (err) {
757
800
  log(`MCP discovery failed: ${err instanceof Error ? err.message : String(err)}`);
758
801
  }
759
802
  }
760
803
 
804
+ const cleanupMcpConnections = async () => {
805
+ if (!mcpCleanup) return;
806
+ const cleanup = mcpCleanup;
807
+ mcpCleanup = null;
808
+ try {
809
+ await cleanup();
810
+ } catch {
811
+ // ignore cleanup errors
812
+ }
813
+ };
814
+
761
815
  // Build schema-only tools (no execute functions) for SDK — prevents the SDK from
762
816
  // auto-executing tools in its internal loop, which drops conversation context in
763
817
  // follow-up API calls (makeFollowupRequest only sends model output + tool results,
@@ -782,6 +836,7 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
782
836
  if (process.env.DOTCLAW_SELF_CHECK === '1') {
783
837
  try {
784
838
  const details = await runSelfCheck({ model });
839
+ await cleanupMcpConnections();
785
840
  return {
786
841
  status: 'success',
787
842
  result: `Self-check passed: ${details.join(', ')}`,
@@ -790,6 +845,7 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
790
845
  } catch (err) {
791
846
  const errorMessage = err instanceof Error ? err.message : String(err);
792
847
  log(`Self-check failed: ${errorMessage}`);
848
+ await cleanupMcpConnections();
793
849
  return {
794
850
  status: 'error',
795
851
  result: null,
@@ -821,6 +877,22 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
821
877
  }).join('\n');
822
878
  prompt = `${prompt}\n\n<latest_attachments>\n${attachmentSummary}\n</latest_attachments>`;
823
879
  }
880
+ const toolExecutionRequirement = detectToolExecutionRequirement(prompt);
881
+ const disableToolsForTurn = shouldDisableToolsForPrompt(prompt, toolExecutionRequirement.required);
882
+ const promptOutputCap = resolvePromptOutputCap(prompt);
883
+ const effectiveMaxOutputTokens = promptOutputCap
884
+ ? (
885
+ (typeof resolvedMaxOutputTokens === 'number' && Number.isFinite(resolvedMaxOutputTokens))
886
+ ? Math.max(64, Math.min(resolvedMaxOutputTokens, promptOutputCap))
887
+ : promptOutputCap
888
+ )
889
+ : resolvedMaxOutputTokens;
890
+ if (typeof effectiveMaxOutputTokens === 'number' && effectiveMaxOutputTokens !== resolvedMaxOutputTokens) {
891
+ log(`Applying prompt output cap: ${effectiveMaxOutputTokens} tokens`);
892
+ }
893
+ if (disableToolsForTurn) {
894
+ log('Prompt classified as conversation-recall: disabling tool schema for this turn');
895
+ }
824
896
 
825
897
  appendHistory(sessionCtx, 'user', prompt);
826
898
  let history = loadHistory(sessionCtx);
@@ -829,11 +901,11 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
829
901
  history = limitHistoryTurns(history, agent.context.maxHistoryTurns);
830
902
  }
831
903
 
832
- // Dynamic context budget: if recentContextTokens is 0 (auto), allocate 50% of context to
833
- // conversation history (matches OpenClaw's maxHistoryShare). System prompt gets up to 25%.
904
+ // Dynamic context budget: if recentContextTokens is 0 (auto), allocate 35% of context to
905
+ // conversation history, capped at 24K tokens for latency/throughput stability.
834
906
  const effectiveRecentTokens = config.recentContextTokens > 0
835
907
  ? config.recentContextTokens
836
- : Math.floor(config.maxContextTokens * 0.50);
908
+ : Math.min(24_000, Math.floor(config.maxContextTokens * 0.35));
837
909
  const tokenRatio = tokenEstimate.tokensPerChar > 0 ? (0.25 / tokenEstimate.tokensPerChar) : 1;
838
910
  const adjustedRecentTokens = Math.max(1000, Math.floor(effectiveRecentTokens * tokenRatio));
839
911
 
@@ -970,7 +1042,8 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
970
1042
  // Long-term memory is now tool-based (agent calls mcp__dotclaw__memory_search on demand).
971
1043
  // Session recall removed — redundant with summary + facts + recent messages.
972
1044
  const sessionRecallCount = 0;
973
- const memoryRecallCount = input.memoryRecall ? input.memoryRecall.length : 0;
1045
+ const memoryRecallCount = Array.isArray(input.memoryRecall) ? input.memoryRecall.length : 0;
1046
+ const memoryRecallCountForOutput = input.memoryRecallAttempted ? memoryRecallCount : undefined;
974
1047
 
975
1048
  const sharedPromptDir = fs.existsSync(PROMPTS_DIR) ? PROMPTS_DIR : undefined;
976
1049
  const taskPackResult = PROMPT_PACKS_ENABLED
@@ -1012,38 +1085,48 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
1012
1085
  if (memoryPolicyResult) promptPackVersions['memory-policy'] = memoryPolicyResult.pack.version;
1013
1086
  if (memoryRecallResult) promptPackVersions['memory-recall'] = memoryRecallResult.pack.version;
1014
1087
 
1015
- const resolveInstructions = (trimLevel = 0) => buildInstructions({
1016
- assistantName,
1017
- groupNotes: claudeNotes.group,
1018
- globalNotes: claudeNotes.global,
1019
- skillCatalog,
1020
- memorySummary: sessionCtx.state.summary,
1021
- memoryFacts: sessionCtx.state.facts,
1022
- sessionRecall: [],
1023
- longTermRecall: [],
1024
- userProfile: input.userProfile ?? null,
1025
- memoryStats: input.memoryStats,
1026
- availableGroups,
1027
- toolReliability: input.toolReliability,
1028
- behaviorConfig: input.behaviorConfig,
1029
- isScheduledTask: !!input.isScheduledTask,
1030
- taskId: input.taskId,
1031
- timezone: typeof input.timezone === 'string' ? input.timezone : undefined,
1032
- hostPlatform: typeof input.hostPlatform === 'string' ? input.hostPlatform : undefined,
1033
- messagingPlatform: input.chatJid?.includes(':') ? input.chatJid.split(':')[0] : undefined,
1034
- taskExtractionPack: taskPackResult?.pack || null,
1035
- responseQualityPack: responseQualityResult?.pack || null,
1036
- toolCallingPack: toolCallingResult?.pack || null,
1037
- toolOutcomePack: toolOutcomeResult?.pack || null,
1038
- memoryPolicyPack: memoryPolicyResult?.pack || null,
1039
- memoryRecallPack: memoryRecallResult?.pack || null,
1040
- maxToolSteps,
1041
- trimLevel
1042
- });
1088
+ const resolveInstructions = (trimLevel = 0) => {
1089
+ const base = buildInstructions({
1090
+ assistantName,
1091
+ groupNotes: claudeNotes.group,
1092
+ globalNotes: claudeNotes.global,
1093
+ skillCatalog,
1094
+ memorySummary: sessionCtx.state.summary,
1095
+ memoryFacts: sessionCtx.state.facts,
1096
+ sessionRecall: [],
1097
+ longTermRecall: [],
1098
+ userProfile: input.userProfile ?? null,
1099
+ memoryStats: input.memoryStats,
1100
+ availableGroups,
1101
+ toolReliability: input.toolReliability,
1102
+ behaviorConfig: input.behaviorConfig,
1103
+ isScheduledTask: !!input.isScheduledTask,
1104
+ taskId: input.taskId,
1105
+ timezone: typeof input.timezone === 'string' ? input.timezone : undefined,
1106
+ hostPlatform: typeof input.hostPlatform === 'string' ? input.hostPlatform : undefined,
1107
+ messagingPlatform: input.chatJid?.includes(':') ? input.chatJid.split(':')[0] : undefined,
1108
+ taskExtractionPack: taskPackResult?.pack || null,
1109
+ responseQualityPack: responseQualityResult?.pack || null,
1110
+ toolCallingPack: toolCallingResult?.pack || null,
1111
+ toolOutcomePack: toolOutcomeResult?.pack || null,
1112
+ memoryPolicyPack: memoryPolicyResult?.pack || null,
1113
+ memoryRecallPack: memoryRecallResult?.pack || null,
1114
+ maxToolSteps,
1115
+ trimLevel
1116
+ });
1117
+ if (!toolExecutionRequirement.required) return base;
1118
+ const reason = toolExecutionRequirement.reason || 'required_tool_execution';
1119
+ return `${base}\n\n[Tool Execution Requirement]\nThis request requires real tool execution (${reason}). Do not claim file/system/web actions unless matching tool calls in this turn succeeded. If tools fail, state the failure clearly and provide the best next action.`;
1120
+ };
1043
1121
 
1044
1122
  const buildContext = () => {
1045
- // System prompt budget: 25% of context window
1046
- const maxSystemPromptTokens = Math.floor(config.maxContextTokens * 0.25);
1123
+ // System prompt budget: keep prompt lean for lower p95 latency.
1124
+ // Cap absolute size to avoid over-spending tokens on instructions.
1125
+ const systemPromptShare = input.isScheduledTask ? 0.1 : 0.12;
1126
+ const maxSystemPromptTokens = Math.max(
1127
+ 1200,
1128
+ Math.min(6000, Math.floor(config.maxContextTokens * systemPromptShare))
1129
+ );
1047
1130
  const MAX_TRIM_LEVEL = 4;
1048
1131
 
1049
1132
  let resolvedInstructions = '';
@@ -1062,7 +1145,10 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
1062
1145
 
1063
1146
  const outputReserve = resolvedMaxOutputTokens || Math.floor(config.maxContextTokens * 0.25);
1064
1147
  const resolvedMaxContext = Math.max(config.maxContextTokens - outputReserve - resolvedInstructionTokens, 2000);
1065
- const resolvedAdjusted = Math.max(1000, Math.floor(resolvedMaxContext * tokenRatio));
1148
+ const resolvedAdjusted = Math.max(
1149
+ 1000,
1150
+ Math.min(adjustedRecentTokens, Math.floor(resolvedMaxContext * tokenRatio))
1151
+ );
1066
1152
  let { recentMessages: contextMessages } = splitRecentHistory(recentMessages, resolvedAdjusted, 6);
1067
1153
  contextMessages = clampContextMessages(contextMessages, tokenEstimate.tokensPerChar, resolvedMaxContextMessageTokens);
1068
1154
  contextMessages = pruneContextMessages(contextMessages, agent.context.contextPruning);
@@ -1077,9 +1163,65 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
1077
1163
  let completionTokens = 0;
1078
1164
  let promptTokens = 0;
1079
1165
  let latencyMs: number | undefined;
1166
+ let toolRetryAttempts = 0;
1167
+ let toolOutcomeVerificationForced = false;
1168
+ let toolLoopBreakerTriggered = false;
1169
+ let toolLoopBreakerReason: string | undefined;
1080
1170
 
1081
1171
  const modelChain = [model, ...(input.modelFallbacks || [])].slice(0, 3);
1082
1172
  let currentModel = model;
1173
+ const toolTrimConfig = agent.context.contextPruning;
1174
+ const toolSoftTrimMaxChars = Math.max(500, Math.floor(toolTrimConfig.softTrimMaxChars || 4000));
1175
+ const toolSoftTrimHead = Math.max(100, Math.floor(toolTrimConfig.softTrimHeadChars || 1500));
1176
+ const toolSoftTrimTail = Math.max(100, Math.floor(toolTrimConfig.softTrimTailChars || 1500));
1177
+ const followupOutputMaxChars = Math.max(900, Math.floor(toolSoftTrimMaxChars * 0.75));
1178
+ const followupArgumentMaxChars = Math.max(300, Math.floor(toolSoftTrimMaxChars * 0.25));
1179
+ let streamSeq = 0;
1180
+
1181
+ if (input.streamDir) {
1182
+ try {
1183
+ fs.mkdirSync(input.streamDir, { recursive: true });
1184
+ } catch {
1185
+ // ignore stream dir creation failure; normal response still works
1186
+ }
1187
+ }
1188
+
1189
+ const writeStreamChunk = (text: string) => {
1190
+ if (!input.streamDir) return;
1191
+ streamSeq += 1;
1192
+ const chunkFile = path.join(input.streamDir, `chunk_${String(streamSeq).padStart(6, '0')}.txt`);
1193
+ const tmpFile = `${chunkFile}.tmp`;
1194
+ try {
1195
+ fs.writeFileSync(tmpFile, text);
1196
+ fs.renameSync(tmpFile, chunkFile);
1197
+ } catch (writeErr) {
1198
+ log(`Stream write error at seq ${streamSeq}: ${writeErr instanceof Error ? writeErr.message : String(writeErr)}`);
1199
+ }
1200
+ };
1201
+
1202
+ const finalizeStream = () => {
1203
+ if (!input.streamDir) return;
1204
+ try {
1205
+ const donePath = path.join(input.streamDir, 'done');
1206
+ if (!fs.existsSync(donePath)) {
1207
+ fs.writeFileSync(donePath, '');
1208
+ }
1209
+ } catch {
1210
+ // ignore
1211
+ }
1212
+ };
1213
+
1214
+ const markStreamError = (errorMessage: string) => {
1215
+ if (!input.streamDir) return;
1216
+ try {
1217
+ const donePath = path.join(input.streamDir, 'done');
1218
+ if (!fs.existsSync(donePath)) {
1219
+ fs.writeFileSync(path.join(input.streamDir, 'error'), errorMessage);
1220
+ }
1221
+ } catch {
1222
+ // ignore
1223
+ }
1224
+ };
1083
1225
 
1084
1226
  try {
1085
1227
  const { instructions: resolvedInstructions, instructionsTokens: resolvedInstructionTokens, contextMessages } = buildContext();
@@ -1102,21 +1244,12 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
1102
1244
  }
1103
1245
  }
1104
1246
 
1105
- const contextInput = messagesToOpenRouter(contextMessages);
1247
+ const contextInput = messagesToOpenRouterInput(contextMessages);
1106
1248
 
1107
- // Inject vision content into the last user message if images are present
1108
- const imageContent = loadImageAttachments(input.attachments);
1109
- if (imageContent.length > 0 && contextInput.length > 0) {
1110
- const lastMsg = contextInput[contextInput.length - 1];
1111
- if (lastMsg.role === 'user') {
1112
- // Convert string content to multi-modal content array
1113
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
1114
- (lastMsg as any).content = [
1115
- { type: 'text', text: typeof lastMsg.content === 'string' ? lastMsg.content : '' },
1116
- ...imageContent
1117
- ];
1118
- }
1119
- }
1249
+ // Inject vision content into the last user message if images are present.
1250
+ // Uses OpenRouter Responses API content part types (input_text/input_image).
1251
+ const imageContent = loadImageAttachmentsForInput(input.attachments, { log });
1252
+ injectImagesIntoContextInput(contextInput, imageContent);
1120
1253
 
1121
1254
  let lastError: unknown = null;
1122
1255
  for (let attempt = 0; attempt < modelChain.length; attempt++) {
@@ -1128,9 +1261,9 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
1128
1261
  }
1129
1262
  if (attempt > 0) log(`Fallback ${attempt}: trying ${currentModel}`);
1130
1263
 
1264
+ const startedAt = Date.now();
1131
1265
  try {
1132
1266
  log(`Starting OpenRouter call (${currentModel})...`);
1133
- const startedAt = Date.now();
1134
1267
  // ── Custom tool execution loop ──────────────────────────────────
1135
1268
  // The SDK's built-in tool loop (executeToolsIfNeeded) drops conversation
1136
1269
  // context in follow-up API calls — it only sends [function_calls, function_call_outputs]
@@ -1142,39 +1275,14 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
1142
1275
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
1143
1276
  let conversationInput: any[] = [...contextInput];
1144
1277
  let step = 0;
1145
- let streamSeq = 0;
1146
-
1147
- // Helper to write a stream chunk
1148
- const writeStreamChunk = (text: string) => {
1149
- if (!input.streamDir) return;
1150
- streamSeq++;
1151
- const chunkFile = path.join(input.streamDir, `chunk_${String(streamSeq).padStart(6, '0')}.txt`);
1152
- const tmpFile = chunkFile + '.tmp';
1153
- try {
1154
- fs.writeFileSync(tmpFile, text);
1155
- fs.renameSync(tmpFile, chunkFile);
1156
- } catch (writeErr) {
1157
- log(`Stream write error at seq ${streamSeq}: ${writeErr instanceof Error ? writeErr.message : String(writeErr)}`);
1158
- }
1159
- };
1160
-
1161
- // Helper to finalize streaming
1162
- const finalizeStream = () => {
1163
- if (!input.streamDir) return;
1164
- try {
1165
- if (!fs.existsSync(path.join(input.streamDir, 'done'))) {
1166
- fs.writeFileSync(path.join(input.streamDir, 'done'), '');
1167
- }
1168
- } catch { /* ignore */ }
1169
- };
1170
1278
 
1171
1279
  // Initial call — uses streaming for real-time delivery
1172
1280
  const initialResult = openrouter.callModel({
1173
1281
  model: currentModel,
1174
1282
  instructions: resolvedInstructions,
1175
1283
  input: conversationInput,
1176
- tools: schemaTools,
1177
- maxOutputTokens: resolvedMaxOutputTokens,
1284
+ tools: disableToolsForTurn ? undefined : schemaTools,
1285
+ maxOutputTokens: effectiveMaxOutputTokens,
1178
1286
  temperature: config.temperature,
1179
1287
  reasoning: resolvedReasoning
1180
1288
  });
@@ -1182,13 +1290,12 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
1182
1290
  // Stream text from initial response
1183
1291
  if (input.streamDir) {
1184
1292
  try {
1185
- fs.mkdirSync(input.streamDir, { recursive: true });
1186
1293
  for await (const delta of initialResult.getTextStream()) {
1187
1294
  writeStreamChunk(delta);
1188
1295
  }
1189
1296
  } catch (streamErr) {
1190
1297
  log(`Stream error: ${streamErr instanceof Error ? streamErr.message : String(streamErr)}`);
1191
- try { fs.writeFileSync(path.join(input.streamDir, 'error'), streamErr instanceof Error ? streamErr.message : String(streamErr)); } catch { /* ignore */ }
1298
+ markStreamError(streamErr instanceof Error ? streamErr.message : String(streamErr));
1192
1299
  }
1193
1300
  }
1194
1301
 
@@ -1200,15 +1307,188 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
1200
1307
  } catch (err) {
1201
1308
  const message = err instanceof Error ? err.message : String(err);
1202
1309
  log(`Initial getResponse failed: ${message}`);
1203
- finalizeStream();
1204
1310
  throw err;
1205
1311
  }
1206
1312
 
1207
1313
  responseText = extractTextFromApiResponse(lastResponse);
1208
1314
  let pendingCalls = extractFunctionCalls(lastResponse);
1315
+ const callSignatureCounts = new Map<string, number>();
1316
+ let previousRoundSignature = '';
1317
+ let repeatedRoundCount = 0;
1318
+ let runToolLoopBreakerTriggered = false;
1319
+ let runToolLoopBreakerReason: string | undefined;
1320
+ let runToolRetryAttempts = 0;
1321
+ let runNonRetryableFailures = 0;
1322
+ let runOutcomeVerificationForced = false;
1323
+
1324
+ const maxToolRequirementNudges = 2;
1325
+ let toolRequirementNudgeAttempt = 0;
1326
+ const nudgeReason = toolExecutionRequirement.reason || 'required_tool_execution';
1327
+ const runDeterministicToolRequirementFallback = async (phase: 'pre_loop' | 'post_loop'): Promise<boolean> => {
1328
+ const createReadInstruction = parseCreateReadFileInstruction(prompt);
1329
+ if (createReadInstruction) {
1330
+ const writeExecutor = toolExecutors.get('Write');
1331
+ const readExecutor = toolExecutors.get('Read');
1332
+ if (!writeExecutor || !readExecutor) return false;
1333
+ runOutcomeVerificationForced = true;
1334
+ log(`Tool requirement fallback (${phase}): deterministic create+read for ${createReadInstruction.path}`);
1335
+ try {
1336
+ await writeExecutor({
1337
+ path: createReadInstruction.path,
1338
+ content: createReadInstruction.lines.join('\n')
1339
+ });
1340
+ await readExecutor({ path: createReadInstruction.path });
1341
+ responseText = `Created file "${createReadInstruction.path}" with ${createReadInstruction.lines.length} lines and verified it by reading it back.`;
1342
+ writeStreamChunk(responseText);
1343
+ return true;
1344
+ } catch (fallbackErr) {
1345
+ log(`Deterministic create+read fallback failed: ${fallbackErr instanceof Error ? fallbackErr.message : String(fallbackErr)}`);
1346
+ return false;
1347
+ }
1348
+ }
1349
+
1350
+ const listReadInstruction = parseListReadNewestInstruction(prompt);
1351
+ if (!listReadInstruction) return false;
1352
+ const globExecutor = toolExecutors.get('Glob');
1353
+ const readExecutor = toolExecutors.get('Read');
1354
+ if (!globExecutor || !readExecutor) return false;
1355
+ runOutcomeVerificationForced = true;
1356
+ log(`Tool requirement fallback (${phase}): deterministic list+read for ${listReadInstruction.directory}`);
1357
+ try {
1358
+ const normalizedDir = listReadInstruction.directory.replace(/\/+$/, '');
1359
+ const globPattern = normalizedDir ? `${normalizedDir}/**/*` : '**/*';
1360
+ const maxResults = Math.max(50, listReadInstruction.count * 20);
1361
+ const globResult = await globExecutor({ pattern: globPattern, maxResults });
1362
+ const matches = (
1363
+ globResult &&
1364
+ typeof globResult === 'object' &&
1365
+ Array.isArray((globResult as { matches?: unknown }).matches)
1366
+ ? (globResult as { matches: unknown[] }).matches
1367
+ : []
1368
+ ).map((item) => String(item || '').trim()).filter(Boolean);
1369
+
1370
+ const rankedFiles = Array.from(new Set(matches))
1371
+ .map((candidatePath) => {
1372
+ try {
1373
+ const stat = fs.statSync(candidatePath);
1374
+ if (!stat.isFile()) return null;
1375
+ return { path: candidatePath, mtimeMs: stat.mtimeMs };
1376
+ } catch {
1377
+ return null;
1378
+ }
1379
+ })
1380
+ .filter((entry): entry is { path: string; mtimeMs: number } => !!entry)
1381
+ .sort((a, b) => b.mtimeMs - a.mtimeMs)
1382
+ .slice(0, listReadInstruction.count);
1383
+
1384
+ if (rankedFiles.length === 0) {
1385
+ responseText = [
1386
+ `- No files were found under \`${listReadInstruction.directory}\`.`,
1387
+ '- I could not read a newest file because the directory appears empty.'
1388
+ ].join('\n');
1389
+ writeStreamChunk(responseText);
1390
+ return true;
1391
+ }
1392
+
1393
+ const newest = rankedFiles[0];
1394
+ const readResult = await readExecutor({ path: newest.path });
1395
+ const readContent = (
1396
+ readResult &&
1397
+ typeof readResult === 'object' &&
1398
+ typeof (readResult as { content?: unknown }).content === 'string'
1399
+ ? (readResult as { content: string }).content
1400
+ : ''
1401
+ ).trim();
1402
+ const preview = readContent
1403
+ ? readContent.split(/\r?\n/).map(line => line.trim()).filter(Boolean).slice(0, 3).join(' | ')
1404
+ : '[empty file]';
1405
+ const relativePath = newest.path.startsWith(`${GROUP_DIR}/`)
1406
+ ? newest.path.slice(GROUP_DIR.length + 1)
1407
+ : newest.path;
1408
+ const newestBasenames = rankedFiles.map(entry => path.basename(entry.path)).join(', ');
1409
+ const bulletCount = listReadInstruction.bulletCount || 2;
1410
+ const bulletLines = [
1411
+ `- Newest file: \`${relativePath}\` (top ${rankedFiles.length} files from \`${listReadInstruction.directory}\`).`,
1412
+ `- Preview: ${preview}.`,
1413
+ `- Newest set: ${newestBasenames}.`
1414
+ ].slice(0, bulletCount);
1415
+
1416
+ responseText = bulletLines.join('\n');
1417
+ writeStreamChunk(responseText);
1418
+ return true;
1419
+ } catch (fallbackErr) {
1420
+ log(`Deterministic list+read fallback failed: ${fallbackErr instanceof Error ? fallbackErr.message : String(fallbackErr)}`);
1421
+ return false;
1422
+ }
1423
+ };
1424
+
1425
+ while (toolExecutionRequirement.required && pendingCalls.length === 0 && toolCalls.length === 0 && toolRequirementNudgeAttempt < maxToolRequirementNudges) {
1426
+ toolRequirementNudgeAttempt += 1;
1427
+ runOutcomeVerificationForced = true;
1428
+ log(`Tool requirement nudge triggered (${nudgeReason}, attempt ${toolRequirementNudgeAttempt}/${maxToolRequirementNudges})`);
1429
+ const nudgePrompt = buildToolExecutionNudgePrompt({
1430
+ reason: nudgeReason,
1431
+ attempt: toolRequirementNudgeAttempt
1432
+ });
1433
+ const responseItems = Array.isArray(lastResponse?.output) ? lastResponse.output : [];
1434
+ conversationInput = [...conversationInput, ...responseItems, { role: 'user', content: nudgePrompt }];
1435
+ try {
1436
+ const nudgeResult = openrouter.callModel({
1437
+ model: currentModel,
1438
+ instructions: resolvedInstructions,
1439
+ input: conversationInput,
1440
+ tools: schemaTools,
1441
+ maxOutputTokens: effectiveMaxOutputTokens,
1442
+ temperature: Math.min(config.temperature, 0.1),
1443
+ reasoning: { effort: 'low' as const }
1444
+ });
1445
+ lastResponse = await nudgeResult.getResponse();
1446
+ const nudgeText = extractTextFromApiResponse(lastResponse);
1447
+ if (nudgeText) {
1448
+ responseText = nudgeText;
1449
+ writeStreamChunk(nudgeText);
1450
+ }
1451
+ pendingCalls = extractFunctionCalls(lastResponse);
1452
+ } catch (nudgeErr) {
1453
+ log(`Tool requirement nudge failed: ${nudgeErr instanceof Error ? nudgeErr.message : String(nudgeErr)}`);
1454
+ break;
1455
+ }
1456
+ }
1457
+
1458
+ if (toolExecutionRequirement.required && pendingCalls.length === 0 && toolCalls.length === 0) {
1459
+ await runDeterministicToolRequirementFallback('pre_loop');
1460
+ }
1209
1461
 
1210
1462
  // Tool execution loop — execute tools ourselves, include full context in follow-ups
1211
1463
  while (pendingCalls.length > 0 && step < maxToolSteps) {
1464
+ const roundSignature = normalizeToolRoundSignature(pendingCalls);
1465
+ if (roundSignature && roundSignature === previousRoundSignature) {
1466
+ repeatedRoundCount += 1;
1467
+ } else {
1468
+ repeatedRoundCount = 1;
1469
+ previousRoundSignature = roundSignature;
1470
+ }
1471
+ if (roundSignature && repeatedRoundCount >= repeatedRoundThreshold) {
1472
+ runToolLoopBreakerTriggered = true;
1473
+ runToolLoopBreakerReason = `repeated_round_signature(${repeatedRoundCount})`;
1474
+ log(`Tool loop breaker triggered: ${runToolLoopBreakerReason}`);
1475
+ break;
1476
+ }
1477
+ for (const fc of pendingCalls) {
1478
+ const signature = normalizeToolCallSignature(fc);
1479
+ const nextCount = (callSignatureCounts.get(signature) || 0) + 1;
1480
+ callSignatureCounts.set(signature, nextCount);
1481
+ if (nextCount >= repeatedSignatureThreshold) {
1482
+ runToolLoopBreakerTriggered = true;
1483
+ runToolLoopBreakerReason = `repeated_call_signature(${nextCount}): ${fc.name}`;
1484
+ break;
1485
+ }
1486
+ }
1487
+ if (runToolLoopBreakerTriggered) {
1488
+ log(`Tool loop breaker triggered: ${runToolLoopBreakerReason || 'unknown_reason'}`);
1489
+ break;
1490
+ }
1491
+
1212
1492
  log(`Step ${step}: executing ${pendingCalls.length} tool call(s): ${pendingCalls.map(c => c.name).join(', ')}`);
1213
1493
 
1214
1494
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
@@ -1226,50 +1506,120 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
1226
1506
  continue;
1227
1507
  }
1228
1508
 
1229
- try {
1230
- // Calling the wrapped execute fires onToolCall/onToolResult callbacks
1231
- const result = await executor(fc.arguments);
1232
- toolResults.push({
1233
- type: 'function_call_output',
1234
- callId: fc.id,
1235
- output: JSON.stringify(result)
1509
+ const normalizedArgs = normalizeToolCallArguments({
1510
+ toolName: fc.name,
1511
+ rawArguments: fc.arguments
1512
+ });
1513
+ if (normalizedArgs.malformedReason) {
1514
+ const recoveryHint = buildMalformedArgumentsRecoveryHint({
1515
+ toolName: fc.name,
1516
+ malformedReason: normalizedArgs.malformedReason
1236
1517
  });
1237
- } catch (err) {
1238
- const error = err instanceof Error ? err.message : String(err);
1518
+ const error = recoveryHint
1519
+ ? `Malformed arguments for ${fc.name}: ${normalizedArgs.malformedReason}. ${recoveryHint}`
1520
+ : `Malformed arguments for ${fc.name}: ${normalizedArgs.malformedReason}`;
1239
1521
  toolResults.push({
1240
1522
  type: 'function_call_output',
1241
1523
  callId: fc.id,
1242
1524
  output: JSON.stringify({ error })
1243
1525
  });
1526
+ toolOutputs.push({ name: fc.name, ok: false, error });
1527
+ runNonRetryableFailures += 1;
1528
+ if (runNonRetryableFailures >= nonRetryableFailureThreshold) {
1529
+ runToolLoopBreakerTriggered = true;
1530
+ runToolLoopBreakerReason = `non_retryable_failures(${runNonRetryableFailures})`;
1531
+ }
1532
+ step++;
1533
+ if (runToolLoopBreakerTriggered) break;
1534
+ continue;
1535
+ }
1536
+
1537
+ let attemptNumber = 1;
1538
+ // Retry only read/idempotent tools on transient failures.
1539
+ for (;;) {
1540
+ try {
1541
+ // Calling the wrapped execute fires onToolCall/onToolResult callbacks.
1542
+ const result = await executor(normalizedArgs.arguments);
1543
+ toolResults.push({
1544
+ type: 'function_call_output',
1545
+ callId: fc.id,
1546
+ output: JSON.stringify(result)
1547
+ });
1548
+ break;
1549
+ } catch (err) {
1550
+ if (shouldRetryIdempotentToolCall({
1551
+ toolName: fc.name,
1552
+ error: err,
1553
+ attempt: attemptNumber,
1554
+ maxAttempts: idempotentRetryAttempts
1555
+ })) {
1556
+ runToolRetryAttempts += 1;
1557
+ const delayMs = Math.min(2_000, idempotentRetryBackoffMs * attemptNumber);
1558
+ log(`Retrying idempotent tool ${fc.name} after transient error (attempt ${attemptNumber + 1}/${idempotentRetryAttempts})`);
1559
+ if (delayMs > 0) {
1560
+ await sleep(delayMs);
1561
+ }
1562
+ attemptNumber += 1;
1563
+ continue;
1564
+ }
1565
+ const error = err instanceof Error ? err.message : String(err);
1566
+ toolResults.push({
1567
+ type: 'function_call_output',
1568
+ callId: fc.id,
1569
+ output: JSON.stringify({ error })
1570
+ });
1571
+ if (isNonRetryableToolError(error)) {
1572
+ runNonRetryableFailures += 1;
1573
+ if (runNonRetryableFailures >= nonRetryableFailureThreshold) {
1574
+ runToolLoopBreakerTriggered = true;
1575
+ runToolLoopBreakerReason = `non_retryable_failures(${runNonRetryableFailures})`;
1576
+ }
1577
+ }
1578
+ break;
1579
+ }
1244
1580
  }
1245
1581
  step++;
1582
+ if (runToolLoopBreakerTriggered) break;
1583
+ }
1584
+ if (runToolLoopBreakerTriggered) {
1585
+ log(`Tool loop breaker triggered: ${runToolLoopBreakerReason || 'unknown_reason'}`);
1586
+ break;
1246
1587
  }
1247
1588
 
1248
1589
  // Build follow-up input with FULL conversation context:
1249
1590
  // original messages + model output + tool results (accumulated each round)
1250
1591
  conversationInput = [...conversationInput, ...lastResponse.output, ...toolResults];
1251
1592
 
1593
+ // Compact oversized tool payloads before follow-up calls to reduce context bloat.
1594
+ const compactedConversation = compactToolConversationItems(conversationInput, {
1595
+ maxOutputChars: followupOutputMaxChars,
1596
+ outputHeadChars: Math.min(toolSoftTrimHead, Math.floor(followupOutputMaxChars * 0.6)),
1597
+ outputTailChars: Math.min(toolSoftTrimTail, Math.floor(followupOutputMaxChars * 0.3)),
1598
+ maxArgumentChars: followupArgumentMaxChars,
1599
+ });
1600
+ conversationInput = compactedConversation.items as typeof conversationInput;
1601
+ if (compactedConversation.compacted > 0) {
1602
+ log(`Tool loop: compacted ${compactedConversation.compacted} oversized payload(s)`);
1603
+ }
1604
+
1252
1605
  // Phase 1: Soft-trim oversized tool results (like OpenClaw's context-pruning extension).
1253
1606
  // Replace large tool result content with head+tail, preserving pair integrity.
1254
1607
  // PROTECT the most recent round's tool results — only trim older ones.
1255
- const SOFT_TRIM_MAX_CHARS = 4000;
1256
- const SOFT_TRIM_HEAD = 1500;
1257
- const SOFT_TRIM_TAIL = 1500;
1258
1608
  const protectedStart = conversationInput.length - toolResults.length;
1259
1609
  for (let idx = 0; idx < protectedStart; idx++) {
1260
1610
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
1261
1611
  const anyItem = conversationInput[idx] as any;
1262
- if (anyItem?.type === 'function_call_output' && typeof anyItem.output === 'string' && anyItem.output.length > SOFT_TRIM_MAX_CHARS) {
1612
+ if (anyItem?.type === 'function_call_output' && typeof anyItem.output === 'string' && anyItem.output.length > toolSoftTrimMaxChars) {
1263
1613
  const orig = anyItem.output;
1264
- anyItem.output = orig.slice(0, SOFT_TRIM_HEAD) + '\n...\n' + orig.slice(-SOFT_TRIM_TAIL)
1265
- + `\n[Tool result trimmed: kept first ${SOFT_TRIM_HEAD} and last ${SOFT_TRIM_TAIL} of ${orig.length} chars.]`;
1614
+ anyItem.output = orig.slice(0, toolSoftTrimHead) + '\n...\n' + orig.slice(-toolSoftTrimTail)
1615
+ + `\n[Tool result trimmed: kept first ${toolSoftTrimHead} and last ${toolSoftTrimTail} of ${orig.length} chars.]`;
1266
1616
  }
1267
1617
  }
1268
1618
 
1269
1619
  // Phase 2: If still over budget, remove only initial context messages (role/content items
1270
1620
  // without a 'type' field). NEVER remove function_call or function_call_output items —
1271
1621
  // orphaning either side of a pair causes API 400 errors.
1272
- const followupTokenLimit = Math.floor(config.maxContextTokens * 0.6);
1622
+ const followupTokenLimit = Math.floor(config.maxContextTokens * 0.45);
1273
1623
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
1274
1624
  const estimateInputTokens = (items: any[]) => items.reduce((sum: number, item: any) => {
1275
1625
  const content = typeof item === 'string' ? item : JSON.stringify(item);
@@ -1298,7 +1648,7 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
1298
1648
  instructions: resolvedInstructions,
1299
1649
  input: conversationInput,
1300
1650
  tools: schemaTools,
1301
- maxOutputTokens: resolvedMaxOutputTokens,
1651
+ maxOutputTokens: effectiveMaxOutputTokens,
1302
1652
  temperature: config.temperature,
1303
1653
  reasoning: resolvedReasoning
1304
1654
  });
@@ -1335,7 +1685,7 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
1335
1685
  instructions: resolvedInstructions,
1336
1686
  input: conversationInput,
1337
1687
  tools: schemaTools,
1338
- maxOutputTokens: resolvedMaxOutputTokens,
1688
+ maxOutputTokens: effectiveMaxOutputTokens,
1339
1689
  temperature: config.temperature,
1340
1690
  reasoning: resolvedReasoning
1341
1691
  });
@@ -1368,8 +1718,78 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
1368
1718
  pendingCalls = extractFunctionCalls(lastResponse);
1369
1719
  }
1370
1720
 
1721
+ if (toolExecutionRequirement.required && toolCalls.length === 0) {
1722
+ const fallbackApplied = await runDeterministicToolRequirementFallback('post_loop');
1723
+ if (fallbackApplied) {
1724
+ pendingCalls = [];
1725
+ }
1726
+ }
1727
+
1728
+ if (toolExecutionRequirement.required && toolCalls.length === 0) {
1729
+ runOutcomeVerificationForced = true;
1730
+ responseText = 'I could not execute the required tools for this request, so I cannot safely claim completion.';
1731
+ writeStreamChunk(responseText);
1732
+ }
1733
+
1734
+ const unresolvedCalls = pendingCalls.slice();
1735
+ if (forceSynthesisAfterTools && toolCalls.length > 0 && (runToolLoopBreakerTriggered || unresolvedCalls.length > 0 || !responseText.trim())) {
1736
+ runOutcomeVerificationForced = true;
1737
+ const synthesisReason = runToolLoopBreakerTriggered
1738
+ ? `stuck_loop:${runToolLoopBreakerReason || 'unknown'}`
1739
+ : (unresolvedCalls.length > 0 ? 'unresolved_tool_calls' : 'empty_after_tools');
1740
+ log(`Tool outcome verifier forcing synthesis (${synthesisReason})`);
1741
+ const continuationPrompt = buildForcedSynthesisPrompt({
1742
+ reason: synthesisReason,
1743
+ pendingCalls: unresolvedCalls,
1744
+ toolOutputs
1745
+ });
1746
+ conversationInput = [...conversationInput, { role: 'user', content: continuationPrompt }];
1747
+ try {
1748
+ const synthesisResult = openrouter.callModel({
1749
+ model: currentModel,
1750
+ instructions: resolvedInstructions,
1751
+ input: conversationInput,
1752
+ maxOutputTokens: effectiveMaxOutputTokens,
1753
+ temperature: config.temperature,
1754
+ reasoning: resolvedReasoning
1755
+ });
1756
+ const synthesisResponse = await synthesisResult.getResponse();
1757
+ const synthesisText = extractTextFromApiResponse(synthesisResponse);
1758
+ if (synthesisText && synthesisText.trim()) {
1759
+ responseText = synthesisText;
1760
+ writeStreamChunk(synthesisText);
1761
+ }
1762
+ } catch (synthesisErr) {
1763
+ log(`Forced synthesis failed: ${synthesisErr instanceof Error ? synthesisErr.message : String(synthesisErr)}`);
1764
+ }
1765
+
1766
+ if (!responseText || !responseText.trim()) {
1767
+ responseText = buildToolOutcomeFallback({
1768
+ reason: synthesisReason,
1769
+ toolOutputs,
1770
+ pendingCalls: unresolvedCalls
1771
+ });
1772
+ writeStreamChunk(responseText);
1773
+ }
1774
+ }
1775
+
1776
+ if (!responseText || !responseText.trim()) {
1777
+ responseText = toolCalls.length > 0
1778
+ ? 'I completed tool execution but received an empty model response. Please retry, and I will continue from this context.'
1779
+ : 'I could not produce a response for that request. Please retry, and I will continue from this context.';
1780
+ writeStreamChunk(responseText);
1781
+ }
1782
+
1371
1783
  finalizeStream();
1372
1784
  latencyMs = Date.now() - startedAt;
1785
+ toolRetryAttempts += runToolRetryAttempts;
1786
+ if (runOutcomeVerificationForced) {
1787
+ toolOutcomeVerificationForced = true;
1788
+ }
1789
+ if (runToolLoopBreakerTriggered) {
1790
+ toolLoopBreakerTriggered = true;
1791
+ toolLoopBreakerReason = runToolLoopBreakerReason;
1792
+ }
1373
1793
 
1374
1794
  if (responseText && responseText.trim()) {
1375
1795
  log(`Model returned text response (${responseText.length} chars, ${step} tool steps)`);
@@ -1391,10 +1811,13 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
1391
1811
  // rebuild system prompt at max trim level, then retry.
1392
1812
  if (errClass === 'context_overflow' && contextMessages.length > 4) {
1393
1813
  log(`Context overflow on ${currentModel}, emergency compaction + max trim`);
1394
- // Split: keep last 4 messages, compact the rest via summary
1395
- const keepCount = Math.min(4, contextMessages.length);
1396
- const toCompact = contextMessages.slice(0, contextMessages.length - keepCount);
1397
- const toKeep = contextMessages.slice(-keepCount);
1814
+ const recoveryPlan = buildContextOverflowRecoveryPlan({
1815
+ contextMessages: contextMessages.map(msg => ({ role: msg.role, content: msg.content })),
1816
+ emergencySummary: null,
1817
+ keepRecentCount: 4
1818
+ });
1819
+ const toCompact = recoveryPlan.toCompact;
1820
+ const toKeep = recoveryPlan.toKeep;
1398
1821
  let emergencySummary = '';
1399
1822
  if (toCompact.length > 0) {
1400
1823
  try {
@@ -1420,22 +1843,30 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
1420
1843
  }
1421
1844
  // Rebuild system prompt at max trim level (includes updated summary)
1422
1845
  const minInstructions = resolveInstructions(4);
1423
- // Build trimmed input: summary context + recent messages
1424
- const compactedInput = emergencySummary
1425
- ? [{ role: 'user' as const, content: `[Previous conversation summary: ${emergencySummary}]` }, ...toKeep.map(m => ({ role: m.role, content: m.content }))]
1426
- : toKeep.map(m => ({ role: m.role, content: m.content }));
1846
+ const compactedInput = buildContextOverflowRecoveryPlan({
1847
+ contextMessages: toKeep,
1848
+ emergencySummary,
1849
+ keepRecentCount: Math.max(1, toKeep.length)
1850
+ }).retryInput;
1427
1851
  try {
1428
1852
  const retryResult = openrouter.callModel({
1429
1853
  model: currentModel,
1430
1854
  instructions: minInstructions,
1431
1855
  input: compactedInput,
1432
1856
  tools: schemaTools,
1433
- maxOutputTokens: resolvedMaxOutputTokens,
1857
+ maxOutputTokens: effectiveMaxOutputTokens,
1434
1858
  temperature: config.temperature,
1435
1859
  reasoning: resolvedReasoning
1436
1860
  });
1437
1861
  const retryResponse = await retryResult.getResponse();
1438
1862
  responseText = extractTextFromApiResponse(retryResponse) || '';
1863
+ if (responseText) {
1864
+ writeStreamChunk(responseText);
1865
+ }
1866
+ finalizeStream();
1867
+ latencyMs = Date.now() - startedAt;
1868
+ completionTokens = estimateTokensForModel(responseText || '', tokenEstimate.tokensPerChar);
1869
+ promptTokens = resolvedPromptTokens;
1439
1870
  lastError = null;
1440
1871
  break;
1441
1872
  } catch (retryErr) {
@@ -1459,6 +1890,8 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
1459
1890
  const errorMessage = err instanceof Error ? err.message : String(err);
1460
1891
  const allFailed = modelChain.length > 1 ? `All models failed. Last error: ${errorMessage}` : errorMessage;
1461
1892
  log(`Agent error: ${allFailed}`);
1893
+ markStreamError(allFailed);
1894
+ await cleanupMcpConnections();
1462
1895
  return {
1463
1896
  status: 'error',
1464
1897
  result: null,
@@ -1470,12 +1903,16 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
1470
1903
  memory_facts: sessionCtx.state.facts,
1471
1904
  tokens_prompt: promptTokens,
1472
1905
  tokens_completion: completionTokens,
1473
- memory_recall_count: memoryRecallCount,
1906
+ memory_recall_count: memoryRecallCountForOutput,
1474
1907
  session_recall_count: sessionRecallCount,
1475
1908
  memory_items_upserted: memoryItemsUpserted,
1476
1909
  memory_items_extracted: memoryItemsExtracted,
1477
1910
  timings: Object.keys(timings).length > 0 ? timings : undefined,
1478
1911
  tool_calls: toolCalls.length > 0 ? toolCalls : undefined,
1912
+ tool_retry_attempts: toolRetryAttempts || undefined,
1913
+ tool_outcome_verification_forced: toolOutcomeVerificationForced || undefined,
1914
+ tool_loop_breaker_triggered: toolLoopBreakerTriggered || undefined,
1915
+ tool_loop_breaker_reason: toolLoopBreakerReason,
1479
1916
  latency_ms: latencyMs
1480
1917
  };
1481
1918
  }
@@ -1569,10 +2006,7 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
1569
2006
  }
1570
2007
  }
1571
2008
 
1572
- // Cleanup MCP connections
1573
- if (mcpCleanup) {
1574
- try { await mcpCleanup(); } catch { /* ignore cleanup errors */ }
1575
- }
2009
+ await cleanupMcpConnections();
1576
2010
 
1577
2011
  return {
1578
2012
  status: 'success',
@@ -1584,12 +2018,16 @@ export async function runAgentOnce(input: ContainerInput): Promise<ContainerOutp
1584
2018
  memory_facts: sessionCtx.state.facts,
1585
2019
  tokens_prompt: promptTokens,
1586
2020
  tokens_completion: completionTokens,
1587
- memory_recall_count: memoryRecallCount,
2021
+ memory_recall_count: memoryRecallCountForOutput,
1588
2022
  session_recall_count: sessionRecallCount,
1589
2023
  memory_items_upserted: memoryItemsUpserted,
1590
2024
  memory_items_extracted: memoryItemsExtracted,
1591
2025
  timings: Object.keys(timings).length > 0 ? timings : undefined,
1592
2026
  tool_calls: toolCalls.length > 0 ? toolCalls : undefined,
2027
+ tool_retry_attempts: toolRetryAttempts || undefined,
2028
+ tool_outcome_verification_forced: toolOutcomeVerificationForced || undefined,
2029
+ tool_loop_breaker_triggered: toolLoopBreakerTriggered || undefined,
2030
+ tool_loop_breaker_reason: toolLoopBreakerReason,
1593
2031
  latency_ms: latencyMs,
1594
2032
  replyToId
1595
2033
  };