lynkr 7.2.4 → 8.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. package/README.md +2 -2
  2. package/config/model-tiers.json +89 -0
  3. package/docs/docs.html +1 -0
  4. package/docs/index.md +7 -0
  5. package/docs/toon-integration-spec.md +130 -0
  6. package/documentation/README.md +3 -2
  7. package/documentation/claude-code-cli.md +23 -16
  8. package/documentation/cursor-integration.md +17 -14
  9. package/documentation/docker.md +11 -4
  10. package/documentation/embeddings.md +7 -5
  11. package/documentation/faq.md +66 -12
  12. package/documentation/features.md +22 -15
  13. package/documentation/installation.md +66 -14
  14. package/documentation/production.md +43 -8
  15. package/documentation/providers.md +145 -42
  16. package/documentation/routing.md +476 -0
  17. package/documentation/token-optimization.md +7 -5
  18. package/documentation/troubleshooting.md +81 -5
  19. package/install.sh +6 -1
  20. package/package.json +5 -3
  21. package/scripts/setup.js +0 -1
  22. package/src/agents/executor.js +14 -6
  23. package/src/api/middleware/session.js +15 -2
  24. package/src/api/openai-router.js +130 -37
  25. package/src/api/providers-handler.js +15 -1
  26. package/src/api/router.js +107 -2
  27. package/src/budget/index.js +4 -3
  28. package/src/clients/databricks.js +431 -234
  29. package/src/clients/gpt-utils.js +181 -0
  30. package/src/clients/ollama-utils.js +66 -140
  31. package/src/clients/routing.js +0 -1
  32. package/src/clients/standard-tools.js +82 -5
  33. package/src/config/index.js +119 -35
  34. package/src/context/toon.js +173 -0
  35. package/src/headroom/launcher.js +8 -3
  36. package/src/logger/index.js +23 -0
  37. package/src/orchestrator/index.js +765 -212
  38. package/src/routing/agentic-detector.js +320 -0
  39. package/src/routing/complexity-analyzer.js +202 -2
  40. package/src/routing/cost-optimizer.js +305 -0
  41. package/src/routing/index.js +168 -159
  42. package/src/routing/model-registry.js +437 -0
  43. package/src/routing/model-tiers.js +365 -0
  44. package/src/server.js +2 -2
  45. package/src/sessions/cleanup.js +3 -3
  46. package/src/sessions/record.js +10 -1
  47. package/src/sessions/store.js +7 -2
  48. package/src/tools/agent-task.js +48 -1
  49. package/src/tools/index.js +15 -2
  50. package/src/tools/workspace.js +35 -4
  51. package/src/workspace/index.js +30 -0
  52. package/te +11622 -0
  53. package/test/README.md +1 -1
  54. package/test/azure-openai-config.test.js +17 -8
  55. package/test/azure-openai-integration.test.js +7 -1
  56. package/test/azure-openai-routing.test.js +41 -43
  57. package/test/bedrock-integration.test.js +18 -32
  58. package/test/hybrid-routing-integration.test.js +35 -20
  59. package/test/hybrid-routing-performance.test.js +74 -64
  60. package/test/llamacpp-integration.test.js +28 -9
  61. package/test/lmstudio-integration.test.js +20 -8
  62. package/test/openai-integration.test.js +17 -20
  63. package/test/performance-tests.js +1 -1
  64. package/test/routing.test.js +65 -59
  65. package/test/toon-compression.test.js +131 -0
  66. package/CLAWROUTER_ROUTING_PLAN.md +0 -910
  67. package/ROUTER_COMPARISON.md +0 -173
  68. package/TIER_ROUTING_PLAN.md +0 -771
@@ -1,6 +1,7 @@
1
1
  const config = require("../config");
2
2
  const { invokeModel } = require("../clients/databricks");
3
3
  const { appendTurnToSession } = require("../sessions/record");
4
+ const { upsertSession } = require("../sessions/store");
4
5
  const { executeToolCall } = require("../tools");
5
6
  const policy = require("../policy");
6
7
  const logger = require("../logger");
@@ -10,6 +11,7 @@ const tokens = require("../utils/tokens");
10
11
  const systemPrompt = require("../prompts/system");
11
12
  const historyCompression = require("../context/compression");
12
13
  const tokenBudget = require("../context/budget");
14
+ const { applyToonCompression } = require("../context/toon");
13
15
  const { classifyRequestType, selectToolsSmartly } = require("../tools/smart-selection");
14
16
  const { compressMessages: headroomCompress, isEnabled: isHeadroomEnabled } = require("../headroom");
15
17
  const { createAuditLogger } = require("../logger/audit-logger");
@@ -19,6 +21,8 @@ const crypto = require("crypto");
19
21
  const { asyncClone, asyncTransform, getPoolStats } = require("../workers/helpers");
20
22
  const { getSemanticCache, isSemanticCacheEnabled } = require("../cache/semantic");
21
23
  const lazyLoader = require("../tools/lazy-loader");
24
+ const { areSimilarToolCalls } = require("../clients/gpt-utils");
25
+ const { getModelRegistrySync } = require("../routing/model-registry");
22
26
 
23
27
  /**
24
28
  * Get destination URL for audit logging based on provider type
@@ -49,6 +53,8 @@ function getDestinationUrl(providerType) {
49
53
  return config.zai?.endpoint ?? 'unknown';
50
54
  case 'vertex':
51
55
  return config.vertex?.endpoint ?? 'unknown';
56
+ case 'moonshot':
57
+ return config.moonshot?.endpoint ?? 'unknown';
52
58
  default:
53
59
  return 'unknown';
54
60
  }
@@ -455,6 +461,192 @@ function injectToolLoopStopInstruction(messages, threshold = 5) {
455
461
  return messages;
456
462
  }
457
463
 
464
+ // === CROSS-REQUEST TOOL CALL DEDUP TRACKING ===
465
+ // These helpers track tool call signatures across multiple HTTP requests within
466
+ // the same session (client/passthrough mode). The inner-loop detection in
467
+ // runAgentLoop() only sees one request at a time, so repeated calls across
468
+ // requests escape it.
469
+
470
// Upper bound on distinct signatures kept in session.metadata.toolCallDedup.signatures;
// recordCrossRequestToolCall evicts the entries with the oldest lastSeen beyond this.
const DEDUP_MAX_SIGNATURES = 50;
// NOTE(review): presumably the repeat count at which a warning is injected into the
// conversation — the consumer is not visible in this section; confirm at the call site.
const DEDUP_WARN_THRESHOLD = 2;
// NOTE(review): presumably the repeat count at which the tool loop is force-terminated —
// confirm at the call site.
const DEDUP_TERMINATE_THRESHOLD = 3;
473
+
474
/**
 * Make sure session.metadata.toolCallDedup exists and is fully populated.
 *
 * A missing value is replaced with a fresh tracker. An existing value is kept,
 * but any field that is absent or of the wrong type is filled in, so callers
 * can dereference `signatures` / `similarGroups` without further checks (for
 * example when the metadata was restored from storage in a partial state).
 * Valid existing fields are never overwritten.
 *
 * Does nothing when the session or its metadata is absent.
 *
 * @param {Object} session - Session object; mutated in place.
 * @returns {void}
 */
function ensureDedupStructure(session) {
  if (!session || !session.metadata) return;

  const isRecord = (value) =>
    value !== null && typeof value === 'object' && !Array.isArray(value);

  const current = session.metadata.toolCallDedup;
  if (!isRecord(current)) {
    session.metadata.toolCallDedup = {
      signatures: {},
      similarGroups: {},
      lastResetAt: Date.now(),
      warningInjected: false,
    };
    return;
  }

  // Repair a partially-initialised tracker without discarding recorded counts.
  if (!isRecord(current.signatures)) current.signatures = {};
  if (!isRecord(current.similarGroups)) current.similarGroups = {};
  if (typeof current.lastResetAt !== 'number') current.lastResetAt = Date.now();
  if (typeof current.warningInjected !== 'boolean') current.warningInjected = false;
}
489
+
490
/**
 * Record a tool call into the cross-request dedup tracker stored on
 * session.metadata.toolCallDedup.
 *
 * Resolution order for an incoming call:
 *   1. Its signature (or the canonical signature it was previously aliased to)
 *      is already tracked -> bump that entry's count.
 *   2. Otherwise, the first tracked entry that areSimilarToolCalls() considers
 *      similar absorbs it -> the signature is aliased to that entry and the
 *      entry's count is bumped.
 *   3. Otherwise a new entry with count 1 is created.
 * Afterwards the tracker is trimmed to DEDUP_MAX_SIGNATURES entries by
 * dropping those with the oldest lastSeen, together with aliases pointing at them.
 *
 * Does nothing when the session has no metadata or no tool call is given.
 *
 * @param {Object} session - Session object; its metadata is mutated in place.
 * @param {Object} toolCall - tool_use block (Anthropic format: { name, input, id })
 *   or an OpenAI-style call ({ function: { name, arguments } }).
 * @returns {void}
 */
function recordCrossRequestToolCall(session, toolCall) {
  // Guard both inputs up front: a missing tool call would otherwise throw on
  // `toolCall.function` after the tracker had already been initialised.
  if (!session?.metadata || !toolCall) return;
  ensureDedupStructure(session);

  const dedup = session.metadata.toolCallDedup;
  const signature = getToolCallSignature(toolCall);
  const toolName = toolCall.function?.name ?? toolCall.name ?? 'unknown';
  const args = toolCall.function?.arguments ?? toolCall.input;
  // Only a bounded preview of the arguments is stored to keep metadata small.
  const argsPreview = (typeof args === 'string' ? args : JSON.stringify(args ?? {})).substring(0, 200);
  const now = Date.now();

  // A signature that was merged earlier resolves to its canonical entry.
  const canonicalSig = dedup.similarGroups[signature] || signature;
  const tracked = dedup.signatures[canonicalSig];

  if (tracked) {
    tracked.count += 1;
    tracked.lastSeen = now;
  } else {
    // Look for a similar tracked entry before creating a new one. The stored
    // preview stands in for the original arguments in the comparison.
    // NOTE(review): the preview is a string truncated to 200 chars, so the
    // comparison sees less than the original call did — confirm that
    // areSimilarToolCalls tolerates this.
    const similarSig = Object.keys(dedup.signatures).find((existingSig) => {
      const existingData = dedup.signatures[existingSig];
      return areSimilarToolCalls(toolCall, {
        name: existingData.toolName,
        input: existingData.argsPreview,
      });
    });

    // Compare against undefined rather than truthiness so that an empty-string
    // key cannot be both merged and re-created.
    if (similarSig !== undefined) {
      const canonical = dedup.signatures[similarSig];
      dedup.similarGroups[signature] = similarSig;
      canonical.count += 1;
      canonical.lastSeen = now;
      logger.debug({
        newSignature: signature,
        canonicalSignature: similarSig,
        toolName,
        count: canonical.count,
      }, "Cross-request tool dedup: merged similar call");
    } else {
      dedup.signatures[signature] = {
        count: 1,
        toolName,
        firstSeen: now,
        lastSeen: now,
        argsPreview,
      };
    }
  }

  // Enforce the cap: evict the least recently seen entries.
  const sigKeys = Object.keys(dedup.signatures);
  const overflow = sigKeys.length - DEDUP_MAX_SIGNATURES;
  if (overflow > 0) {
    const evicted = new Set(
      sigKeys
        .sort((a, b) => dedup.signatures[a].lastSeen - dedup.signatures[b].lastSeen)
        .slice(0, overflow)
    );
    for (const key of evicted) {
      delete dedup.signatures[key];
    }
    // Drop aliases pointing at evicted entries in a single pass.
    for (const [groupSig, canonical] of Object.entries(dedup.similarGroups)) {
      if (evicted.has(canonical)) delete dedup.similarGroups[groupSig];
    }
  }
}
566
+
567
/**
 * Find the most-repeated tracked tool call for a session.
 *
 * Ties are resolved in favour of the entry encountered first; entries whose
 * count is not greater than zero are never selected.
 *
 * @param {Object} session
 * @returns {{ maxCount: number, toolName: string|null, signature: string|null }}
 *   All-empty result when the session carries no dedup signatures.
 */
function getMaxDedupCount(session) {
  const best = { maxCount: 0, toolName: null, signature: null };

  const tracked = session?.metadata?.toolCallDedup?.signatures;
  if (!tracked) return best;

  for (const key of Object.keys(tracked)) {
    const entry = tracked[key];
    // Negated ">" (rather than "<=") so non-numeric counts are skipped too.
    if (!(entry.count > best.maxCount)) continue;
    best.maxCount = entry.count;
    best.toolName = entry.toolName;
    best.signature = key;
  }

  return best;
}
589
+
590
/**
 * Collect the tool_use blocks of the current turn.
 *
 * The current turn starts at the last user message that carries non-blank
 * text (either a string content, or a `text` / `input_text` block in an array
 * content). When no such message exists, the whole list is scanned. Only
 * assistant messages with array content contribute blocks.
 *
 * @param {Array} messages
 * @returns {Array} tool_use blocks in message order; empty for non-array input.
 */
function extractToolUseFromCurrentTurn(messages) {
  if (!Array.isArray(messages)) return [];

  // True for a user message that contains actual text (not only tool results).
  const carriesUserText = (msg) => {
    if (msg?.role !== 'user') return false;
    const { content } = msg;
    if (typeof content === 'string' && content.trim().length > 0) return true;
    if (!Array.isArray(content)) return false;
    return content.some(block =>
      (block?.type === 'text' && block?.text?.trim?.().length > 0) ||
      (block?.type === 'input_text' && block?.input_text?.trim?.().length > 0)
    );
  };

  // Walk backwards to the most recent user text message (-1 when none exists).
  let cursor = messages.length - 1;
  while (cursor >= 0 && !carriesUserText(messages[cursor])) {
    cursor -= 1;
  }
  const startIndex = Math.max(cursor, 0);

  return messages
    .slice(startIndex)
    .filter(msg => msg?.role === 'assistant' && Array.isArray(msg.content))
    .flatMap(msg => msg.content.filter(block => block?.type === 'tool_use'));
}
634
+
635
/**
 * Replace the session's cross-request dedup tracker with an empty one and
 * stamp the reset time. Intended for the moment a new user question is detected.
 * No-op when the session has no metadata.
 *
 * @param {Object} session - Session object; its metadata is mutated in place.
 */
function resetDedupTracking(session) {
  const metadata = session?.metadata;
  if (!metadata) return;

  const freshState = {
    signatures: {},
    similarGroups: {},
    lastResetAt: Date.now(),
    warningInjected: false,
  };
  metadata.toolCallDedup = freshState;

  logger.debug({ sessionId: session?.id ?? null }, "Cross-request tool dedup: reset tracking for new user question");
}
649
+
458
650
  function sanitiseAzureTools(tools) {
459
651
  if (!Array.isArray(tools) || tools.length === 0) return undefined;
460
652
  const allowed = new Set([
@@ -516,13 +708,51 @@ function parseExecutionContent(content) {
516
708
  const trimmed = content.trim();
517
709
  if (trimmed.startsWith("{") || trimmed.startsWith("[")) {
518
710
  try {
519
- return JSON.parse(trimmed);
711
+ const parsed = JSON.parse(trimmed);
712
+ // Handle Anthropic content blocks array - extract text
713
+ if (Array.isArray(parsed)) {
714
+ const textParts = parsed
715
+ .filter(block => block && typeof block === 'object')
716
+ .map(block => {
717
+ if (block.type === 'text' && typeof block.text === 'string') {
718
+ return block.text;
719
+ }
720
+ // Handle other block types gracefully
721
+ if (block.text) return block.text;
722
+ if (block.content) return typeof block.content === 'string' ? block.content : JSON.stringify(block.content);
723
+ return null;
724
+ })
725
+ .filter(text => text !== null);
726
+
727
+ if (textParts.length > 0) {
728
+ return textParts.join('\n');
729
+ }
730
+ }
731
+ return parsed;
520
732
  } catch {
521
733
  return content;
522
734
  }
523
735
  }
524
736
  return content;
525
737
  }
738
+ // Handle content that's already an array (content blocks)
739
+ if (Array.isArray(content)) {
740
+ const textParts = content
741
+ .filter(block => block && typeof block === 'object')
742
+ .map(block => {
743
+ if (block.type === 'text' && typeof block.text === 'string') {
744
+ return block.text;
745
+ }
746
+ if (block.text) return block.text;
747
+ if (block.content) return typeof block.content === 'string' ? block.content : JSON.stringify(block.content);
748
+ return null;
749
+ })
750
+ .filter(text => text !== null);
751
+
752
+ if (textParts.length > 0) {
753
+ return textParts.join('\n');
754
+ }
755
+ }
526
756
  return content;
527
757
  }
528
758
 
@@ -718,19 +948,17 @@ function stripThinkingBlocks(text) {
718
948
  return cleanedLines.join("\n").trim();
719
949
  }
720
950
 
951
+ /**
952
+ * Convert legacy Ollama /api/chat response to Anthropic Messages format.
953
+ * Used when Ollama < v0.14.0 (no native Anthropic endpoint).
954
+ */
721
955
  function ollamaToAnthropicResponse(ollamaResponse, requestedModel) {
722
- // Ollama response format:
723
- // { model, created_at, message: { role, content, tool_calls }, done, total_duration, ... }
724
- // { eval_count, prompt_eval_count, ... }
725
-
726
956
  const message = ollamaResponse?.message ?? {};
727
957
  const rawContent = message.content || "";
728
958
  const toolCalls = message.tool_calls || [];
729
959
 
730
- // Build content blocks
731
960
  const contentItems = [];
732
961
 
733
- // Add text content if present, after stripping thinking blocks
734
962
  if (typeof rawContent === "string" && rawContent.trim()) {
735
963
  const cleanedContent = stripThinkingBlocks(rawContent);
736
964
  if (cleanedContent) {
@@ -738,18 +966,31 @@ function ollamaToAnthropicResponse(ollamaResponse, requestedModel) {
738
966
  }
739
967
  }
740
968
 
741
- // Add tool calls if present
969
+ // Convert tool calls from OpenAI function-calling format to Anthropic tool_use
742
970
  if (Array.isArray(toolCalls) && toolCalls.length > 0) {
743
- const { buildAnthropicResponseFromOllama } = require("../clients/ollama-utils");
744
- // Use the utility function for tool call conversion
745
- return buildAnthropicResponseFromOllama(ollamaResponse, requestedModel);
971
+ for (const toolCall of toolCalls) {
972
+ const func = toolCall.function || {};
973
+ let input = {};
974
+ if (func.arguments) {
975
+ if (typeof func.arguments === "string") {
976
+ try { input = JSON.parse(func.arguments); } catch { input = {}; }
977
+ } else if (typeof func.arguments === "object") {
978
+ input = func.arguments;
979
+ }
980
+ }
981
+ contentItems.push({
982
+ type: "tool_use",
983
+ id: toolCall.id || `toolu_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`,
984
+ name: func.name || "unknown",
985
+ input,
986
+ });
987
+ }
746
988
  }
747
989
 
748
990
  if (contentItems.length === 0) {
749
991
  contentItems.push({ type: "text", text: "" });
750
992
  }
751
993
 
752
- // Ollama uses different token count fields
753
994
  const inputTokens = ollamaResponse.prompt_eval_count ?? 0;
754
995
  const outputTokens = ollamaResponse.eval_count ?? 0;
755
996
 
@@ -759,7 +1000,8 @@ function ollamaToAnthropicResponse(ollamaResponse, requestedModel) {
759
1000
  role: "assistant",
760
1001
  model: requestedModel,
761
1002
  content: contentItems,
762
- stop_reason: ollamaResponse.done ? "end_turn" : "max_tokens",
1003
+ stop_reason: toolCalls.length > 0 ? "tool_use" :
1004
+ ollamaResponse.done ? "end_turn" : "max_tokens",
763
1005
  stop_sequence: null,
764
1006
  usage: {
765
1007
  input_tokens: inputTokens,
@@ -851,6 +1093,9 @@ function sanitizePayload(payload) {
851
1093
  config.modelProvider?.defaultModel ??
852
1094
  "databricks-claude-sonnet-4-5";
853
1095
  clean.model = requestedModel;
1096
+ if (!clean.max_tokens) {
1097
+ clean.max_tokens = 16384;
1098
+ }
854
1099
  const providerType = config.modelProvider?.type ?? "databricks";
855
1100
  const flattenContent = providerType !== "azure-anthropic";
856
1101
  clean.messages = normaliseMessages(clean, { flattenContent }).filter((msg) => {
@@ -995,12 +1240,10 @@ function sanitizePayload(payload) {
995
1240
  // Check if this is a simple conversational message (no tools needed)
996
1241
  const isConversational = (() => {
997
1242
  if (!Array.isArray(clean.messages) || clean.messages.length === 0) {
998
- logger.debug({ reason: "No messages array" }, "Ollama conversational check");
999
1243
  return false;
1000
1244
  }
1001
1245
  const lastMessage = clean.messages[clean.messages.length - 1];
1002
1246
  if (lastMessage?.role !== "user") {
1003
- logger.debug({ role: lastMessage?.role }, "Ollama conversational check - not user");
1004
1247
  return false;
1005
1248
  }
1006
1249
 
@@ -1008,28 +1251,18 @@ function sanitizePayload(payload) {
1008
1251
  ? lastMessage.content
1009
1252
  : "";
1010
1253
 
1011
- logger.debug({
1012
- contentType: typeof lastMessage.content,
1013
- isString: typeof lastMessage.content === "string",
1014
- contentLength: typeof lastMessage.content === "string" ? lastMessage.content.length : "N/A",
1015
- actualContent: typeof lastMessage.content === "string" ? lastMessage.content.substring(0, 100) : JSON.stringify(lastMessage.content).substring(0, 100)
1016
- }, "Ollama conversational check - analyzing content");
1017
-
1018
1254
  const trimmed = content.trim().toLowerCase();
1019
1255
 
1020
1256
  // Simple greetings
1021
1257
  if (/^(hi|hello|hey|good morning|good afternoon|good evening|howdy|greetings)[\s\.\!\?]*$/.test(trimmed)) {
1022
- logger.debug({ matched: "greeting", trimmed }, "Ollama conversational check - matched");
1023
- return true;
1258
+ return "greeting";
1024
1259
  }
1025
1260
 
1026
- // Very short messages (< 20 chars) without code/technical keywords
1027
- if (trimmed.length < 20 && !/code|file|function|error|bug|fix|write|read|create/.test(trimmed)) {
1028
- logger.debug({ matched: "short", trimmed, length: trimmed.length }, "Ollama conversational check - matched");
1029
- return true;
1261
+ // Conversational phrases that don't need tools (thanks, farewells, acknowledgements)
1262
+ if (/^(thanks|thank you|thx|ty|bye|goodbye|see you|ok|okay|cool|nice|great|awesome|sure|got it|sounds good|no worries|np|cheers)[\s\.\!\?]*$/.test(trimmed)) {
1263
+ return "conversational";
1030
1264
  }
1031
1265
 
1032
- logger.debug({ trimmed: trimmed.substring(0, 50), length: trimmed.length }, "Ollama conversational check - not matched");
1033
1266
  return false;
1034
1267
  })();
1035
1268
 
@@ -1039,37 +1272,12 @@ function sanitizePayload(payload) {
1039
1272
  delete clean.tool_choice;
1040
1273
  logger.debug({
1041
1274
  model: config.ollama?.model,
1042
- message: "Removed tools for conversational message"
1043
- }, "Ollama conversational mode");
1275
+ reason: isConversational,
1276
+ }, "Ollama conversational mode - tools removed");
1044
1277
  } else if (modelSupportsTools && Array.isArray(clean.tools) && clean.tools.length > 0) {
1045
- // Ollama performance degrades with too many tools
1046
- // Limit to essential tools only
1047
- const OLLAMA_ESSENTIAL_TOOLS = new Set([
1048
- "Bash",
1049
- "Read",
1050
- "Write",
1051
- "Edit",
1052
- "Glob",
1053
- "Grep",
1054
- "WebSearch",
1055
- "WebFetch"
1056
- ]);
1057
-
1058
- const limitedTools = clean.tools.filter(tool =>
1059
- OLLAMA_ESSENTIAL_TOOLS.has(tool.name)
1060
- );
1061
-
1062
- logger.debug({
1063
- model: config.ollama?.model,
1064
- originalToolCount: clean.tools.length,
1065
- limitedToolCount: limitedTools.length,
1066
- keptTools: limitedTools.map(t => t.name)
1067
- }, "Ollama tools limited for performance");
1068
-
1069
- clean.tools = limitedTools.length > 0 ? limitedTools : undefined;
1070
- if (!clean.tools) {
1071
- delete clean.tools;
1072
- }
1278
+ // Keep all tools — Ollama receives them in Anthropic format (native API)
1279
+ // or they get converted to OpenAI format in invokeOllama (legacy API)
1280
+ clean.tools = ensureAnthropicToolFormat(clean.tools);
1073
1281
  } else {
1074
1282
  // Remove tools for models without tool support
1075
1283
  delete clean.tools;
@@ -1097,6 +1305,14 @@ function sanitizePayload(payload) {
1097
1305
  } else {
1098
1306
  clean.tools = ensureAnthropicToolFormat(clean.tools);
1099
1307
  }
1308
+ } else if (providerType === "moonshot") {
1309
+ // Moonshot supports tools - keep them in Anthropic format
1310
+ // They will be converted to OpenAI format in invokeMoonshot
1311
+ if (!Array.isArray(clean.tools) || clean.tools.length === 0) {
1312
+ delete clean.tools;
1313
+ } else {
1314
+ clean.tools = ensureAnthropicToolFormat(clean.tools);
1315
+ }
1100
1316
  } else if (Array.isArray(clean.tools)) {
1101
1317
  // Unknown provider - remove tools for safety
1102
1318
  delete clean.tools;
@@ -1172,6 +1388,10 @@ function sanitizePayload(payload) {
1172
1388
  }
1173
1389
  }
1174
1390
 
1391
+ // Optional TOON conversion for large JSON message payloads (prompt context only).
1392
+ // Run this BEFORE message coalescing to preserve parseable JSON boundaries.
1393
+ applyToonCompression(clean, config.toon, { logger });
1394
+
1175
1395
  // FIX: Handle consecutive messages with the same role (causes llama.cpp 400 error)
1176
1396
  // Strategy: Merge all consecutive messages, add instruction to focus on last request
1177
1397
  if (Array.isArray(clean.messages) && clean.messages.length > 0) {
@@ -1210,7 +1430,7 @@ function sanitizePayload(payload) {
1210
1430
  }
1211
1431
 
1212
1432
  if (merged.length !== clean.messages.length) {
1213
- logger.info({
1433
+ logger.debug({
1214
1434
  originalCount: clean.messages.length,
1215
1435
  mergedCount: merged.length,
1216
1436
  reduced: clean.messages.length - merged.length
@@ -1220,19 +1440,20 @@ function sanitizePayload(payload) {
1220
1440
  clean.messages = merged;
1221
1441
  }
1222
1442
 
1223
- // [CONTEXT_FLOW] Log payload after sanitization
1224
1443
  logger.debug({
1225
1444
  providerType: config.modelProvider?.type ?? "databricks",
1226
- phase: "after_sanitize",
1227
- systemField: typeof clean.system === 'string'
1228
- ? { type: 'string', length: clean.system.length }
1229
- : clean.system
1230
- ? { type: typeof clean.system, value: clean.system }
1231
- : undefined,
1232
1445
  messageCount: clean.messages?.length ?? 0,
1233
- firstMessageHasSystem: clean.messages?.[0]?.content?.includes?.('You are Claude Code') ?? false,
1234
1446
  toolCount: clean.tools?.length ?? 0
1235
- }, '[CONTEXT_FLOW] After sanitizePayload');
1447
+ }, 'After sanitizePayload');
1448
+
1449
+ // === Suggestion mode: tag request and override model if configured ===
1450
+ const { isSuggestionMode: isSuggestion } = detectSuggestionMode(clean.messages);
1451
+ clean._requestMode = isSuggestion ? "suggestion" : "main";
1452
+ const smConfig = config.modelProvider?.suggestionModeModel ?? "default";
1453
+ if (isSuggestion && smConfig.toLowerCase() !== "default" && smConfig.toLowerCase() !== "none") {
1454
+ clean.model = smConfig;
1455
+ clean._suggestionModeModel = smConfig;
1456
+ }
1236
1457
 
1237
1458
  return clean;
1238
1459
  }
@@ -1330,8 +1551,7 @@ async function runAgentLoop({
1330
1551
  providerType,
1331
1552
  headers,
1332
1553
  }) {
1333
- console.log('[DEBUG] runAgentLoop ENTERED - providerType:', providerType, 'messages:', cleanPayload.messages?.length);
1334
- logger.info({ providerType, messageCount: cleanPayload.messages?.length }, 'runAgentLoop ENTERED');
1554
+ logger.debug({ providerType, messageCount: cleanPayload.messages?.length }, 'runAgentLoop entered');
1335
1555
  const settings = resolveLoopOptions(options);
1336
1556
  // Initialize audit logger (no-op if disabled)
1337
1557
  const auditLogger = createAuditLogger(config.audit);
@@ -1378,7 +1598,6 @@ async function runAgentLoop({
1378
1598
  }
1379
1599
 
1380
1600
  steps += 1;
1381
- console.log('[LOOP DEBUG] Entered while loop - step:', steps);
1382
1601
  logger.debug(
1383
1602
  {
1384
1603
  sessionId: session?.id ?? null,
@@ -1388,6 +1607,19 @@ async function runAgentLoop({
1388
1607
  "Agent loop step",
1389
1608
  );
1390
1609
 
1610
+ // Trim messages when they grow too large to prevent OOM.
1611
+ // Keep the first 2 messages and the most recent ones, MAX_LOOP_MESSAGES in total.
1612
+ const MAX_LOOP_MESSAGES = 40;
1613
+ if (cleanPayload.messages && cleanPayload.messages.length > MAX_LOOP_MESSAGES) {
1614
+ const excess = cleanPayload.messages.length - MAX_LOOP_MESSAGES;
1615
+ // Keep first 2 messages (system context + initial user) and trim from the middle
1616
+ cleanPayload.messages.splice(2, excess);
1617
+ logger.debug(
1618
+ { trimmed: excess, remaining: cleanPayload.messages.length },
1619
+ "Trimmed intermediate messages to prevent memory growth",
1620
+ );
1621
+ }
1622
+
1391
1623
  // Debug: Log payload before sending to Azure
1392
1624
  if (providerType === "azure-anthropic") {
1393
1625
  logger.debug(
@@ -1472,14 +1704,11 @@ async function runAgentLoop({
1472
1704
  }
1473
1705
  }
1474
1706
 
1475
- // [CONTEXT_FLOW] Log after memory injection
1476
1707
  logger.debug({
1477
1708
  sessionId: session?.id ?? null,
1478
- phase: "after_memory",
1479
- systemPromptLength: cleanPayload.system?.length ?? 0,
1480
1709
  messageCount: cleanPayload.messages?.length ?? 0,
1481
1710
  toolCount: cleanPayload.tools?.length ?? 0
1482
- }, '[CONTEXT_FLOW] After memory injection');
1711
+ }, 'After memory injection');
1483
1712
 
1484
1713
  if (steps === 1 && (config.systemPrompt?.mode === 'dynamic' || config.systemPrompt?.toolDescriptions === 'minimal')) {
1485
1714
  try {
@@ -1568,9 +1797,26 @@ IMPORTANT TOOL USAGE RULES:
1568
1797
  logger.debug({ sessionId: session?.id ?? null }, 'Tool termination instructions injected for non-Claude model');
1569
1798
  }
1570
1799
 
1800
+ // Compute model-aware token budget thresholds
1801
+ const registry = getModelRegistrySync();
1802
+ const modelInfo = registry.getCost(requestedModel);
1803
+ const modelContextWindow = modelInfo?.context || config.tokenBudget?.max || 180000;
1804
+ const modelMax = Math.floor(modelContextWindow * 0.85);
1805
+ const effectiveMax = Math.min(modelMax, config.tokenBudget?.max || 180000);
1806
+ const effectiveWarning = Math.floor(effectiveMax * 0.65);
1807
+
1808
+ logger.debug({
1809
+ sessionId: session?.id ?? null,
1810
+ requestedModel,
1811
+ modelContextWindow,
1812
+ effectiveWarning,
1813
+ effectiveMax,
1814
+ source: modelInfo?.source || 'default',
1815
+ }, 'Model-aware token budget computed');
1816
+
1571
1817
  if (steps === 1 && config.tokenBudget?.enforcement !== false) {
1572
1818
  try {
1573
- const budgetCheck = tokenBudget.checkBudget(cleanPayload);
1819
+ const budgetCheck = tokenBudget.checkBudget(cleanPayload, effectiveWarning, effectiveMax);
1574
1820
 
1575
1821
  if (budgetCheck.atWarning) {
1576
1822
  logger.warn({
@@ -1584,8 +1830,8 @@ IMPORTANT TOOL USAGE RULES:
1584
1830
  if (budgetCheck.overMax) {
1585
1831
  // Apply adaptive compression to fit within budget
1586
1832
  const enforcement = tokenBudget.enforceBudget(cleanPayload, {
1587
- warningThreshold: config.tokenBudget?.warning,
1588
- maxThreshold: config.tokenBudget?.max,
1833
+ warningThreshold: effectiveWarning,
1834
+ maxThreshold: effectiveMax,
1589
1835
  enforcement: true
1590
1836
  });
1591
1837
 
@@ -1609,7 +1855,6 @@ IMPORTANT TOOL USAGE RULES:
1609
1855
  }
1610
1856
 
1611
1857
  // Track estimated token usage before model call
1612
- console.log('[TOKEN DEBUG] About to track token usage - step:', steps);
1613
1858
  const estimatedTokens = config.tokenTracking?.enabled !== false
1614
1859
  ? tokens.countPayloadTokens(cleanPayload)
1615
1860
  : null;
@@ -1623,15 +1868,6 @@ IMPORTANT TOOL USAGE RULES:
1623
1868
  }
1624
1869
 
1625
1870
  // Apply Headroom compression if enabled
1626
- const headroomEstTokens = Math.ceil(JSON.stringify(cleanPayload.messages || []).length / 4);
1627
- logger.info({
1628
- headroomEnabled: isHeadroomEnabled(),
1629
- messageCount: cleanPayload.messages?.length ?? 0,
1630
- estimatedTokens: headroomEstTokens,
1631
- threshold: config.headroom?.minTokens || 500,
1632
- willCompress: isHeadroomEnabled() && headroomEstTokens >= (config.headroom?.minTokens || 500),
1633
- }, 'Headroom compression check');
1634
-
1635
1871
  if (isHeadroomEnabled() && cleanPayload.messages && cleanPayload.messages.length > 0) {
1636
1872
  try {
1637
1873
  const compressionResult = await headroomCompress(
@@ -1640,36 +1876,27 @@ IMPORTANT TOOL USAGE RULES:
1640
1876
  {
1641
1877
  mode: config.headroom?.mode,
1642
1878
  queryContext: cleanPayload.messages[cleanPayload.messages.length - 1]?.content,
1879
+ model: requestedModel,
1880
+ modelLimit: modelContextWindow,
1881
+ tokenBudget: effectiveMax,
1643
1882
  }
1644
1883
  );
1645
1884
 
1646
- logger.info({
1647
- compressed: compressionResult.compressed,
1648
- tokensBefore: compressionResult.stats?.tokens_before,
1649
- tokensAfter: compressionResult.stats?.tokens_after,
1650
- savings: compressionResult.stats?.savings_percent ? `${compressionResult.stats.savings_percent}%` : 'N/A',
1651
- reason: compressionResult.stats?.reason || compressionResult.stats?.transforms_applied?.join(', ') || 'none',
1652
- }, 'Headroom compression result');
1653
-
1654
1885
  if (compressionResult.compressed) {
1655
1886
  cleanPayload.messages = compressionResult.messages;
1656
1887
  if (compressionResult.tools) {
1657
1888
  cleanPayload.tools = compressionResult.tools;
1658
1889
  }
1659
- logger.info({
1660
- sessionId: session?.id ?? null,
1661
- tokensBefore: compressionResult.stats?.tokens_before,
1662
- tokensAfter: compressionResult.stats?.tokens_after,
1663
- saved: compressionResult.stats?.tokens_saved,
1664
- savingsPercent: compressionResult.stats?.savings_percent,
1665
- transforms: compressionResult.stats?.transforms_applied,
1666
- }, 'Headroom compression applied to request');
1667
- } else {
1668
- logger.debug({
1669
- sessionId: session?.id ?? null,
1670
- reason: compressionResult.stats?.reason,
1671
- }, 'Headroom compression skipped');
1672
1890
  }
1891
+
1892
+ logger.debug({
1893
+ sessionId: session?.id ?? null,
1894
+ outcome: compressionResult.compressed ? 'applied' : 'skipped',
1895
+ tokensBefore: compressionResult.stats?.tokens_before,
1896
+ tokensAfter: compressionResult.stats?.tokens_after,
1897
+ savingsPercent: compressionResult.stats?.savings_percent,
1898
+ reason: compressionResult.stats?.reason || compressionResult.stats?.transforms_applied?.join(', ') || 'none',
1899
+ }, 'Headroom compression');
1673
1900
  } catch (headroomErr) {
1674
1901
  logger.warn({ err: headroomErr, sessionId: session?.id ?? null }, 'Headroom compression failed, using original messages');
1675
1902
  }
@@ -1902,11 +2129,26 @@ IMPORTANT TOOL USAGE RULES:
1902
2129
  toolCalls = Array.isArray(message.tool_calls) ? message.tool_calls : [];
1903
2130
  }
1904
2131
 
2132
+ // Guard: drop hallucinated tool calls when no tools were sent to the model.
2133
+ // Some models (e.g. Llama 3.1) hallucinate tool_call blocks from conversation
2134
+ // history even when the request contained zero tool definitions.
2135
+ const toolsWereSent = Array.isArray(cleanPayload.tools) && cleanPayload.tools.length > 0;
2136
+ if (toolCalls.length > 0 && !toolsWereSent) {
2137
+ logger.warn({
2138
+ sessionId: session?.id ?? null,
2139
+ step: steps,
2140
+ hallucinated: toolCalls.map(tc => tc.function?.name || tc.name),
2141
+ noToolInjection: !!cleanPayload._noToolInjection,
2142
+ }, "Dropped hallucinated tool calls (no tools were sent to model)");
2143
+ toolCalls = [];
2144
+ // If there's also no text content, treat as empty response (handled below)
2145
+ }
2146
+
1905
2147
  if (toolCalls.length > 0) {
1906
2148
  // Convert OpenAI/OpenRouter format to Anthropic format for session storage
1907
2149
  let sessionContent;
1908
2150
  if (providerType === "azure-anthropic") {
1909
- // Azure Anthropic already returns content in Anthropic format
2151
+ // Azure Anthropic already returns content in Anthropic format
1910
2152
  sessionContent = databricksResponse.json?.content ?? [];
1911
2153
  } else {
1912
2154
  // Convert OpenAI/OpenRouter format to Anthropic content blocks
@@ -1966,9 +2208,10 @@ IMPORTANT TOOL USAGE RULES:
1966
2208
  });
1967
2209
 
1968
2210
  let assistantToolMessage;
1969
- if (providerType === "azure-anthropic") {
1970
- // For Azure Anthropic, use the content array directly from the response
1971
- // It already contains both text and tool_use blocks in the correct format
2211
+ if (providerType === "azure-anthropic" || isAnthropicFormat) {
2212
+ // For Anthropic-format responses (azure-anthropic, Ollama native API,
2213
+ // azure-openai Responses API), use the content array directly
2214
+ // it already contains both text and tool_use blocks in the correct format
1972
2215
  assistantToolMessage = {
1973
2216
  role: "assistant",
1974
2217
  content: databricksResponse.json?.content ?? [],
@@ -1981,9 +2224,9 @@ IMPORTANT TOOL USAGE RULES:
1981
2224
  };
1982
2225
  }
1983
2226
 
1984
- // Only add fallback content for Databricks format (Azure already has content)
2227
+ // Only add fallback content for OpenAI-format responses (Anthropic format already has content)
1985
2228
  if (
1986
- providerType !== "azure-anthropic" &&
2229
+ providerType !== "azure-anthropic" && !isAnthropicFormat &&
1987
2230
  (!assistantToolMessage.content ||
1988
2231
  (typeof assistantToolMessage.content === "string" &&
1989
2232
  assistantToolMessage.content.trim().length === 0)) &&
@@ -2019,7 +2262,7 @@ IMPORTANT TOOL USAGE RULES:
2019
2262
  // If in passthrough/client mode and there are client-side tools, return them to client
2020
2263
  // Server-side tools (Task, Web) will be executed below
2021
2264
  if ((executionMode === "passthrough" || executionMode === "client") && clientSideToolCalls.length > 0) {
2022
- logger.info(
2265
+ logger.debug(
2023
2266
  {
2024
2267
  sessionId: session?.id ?? null,
2025
2268
  totalToolCount: toolCalls.length,
@@ -2044,7 +2287,7 @@ IMPORTANT TOOL USAGE RULES:
2044
2287
  type: "message",
2045
2288
  role: "assistant",
2046
2289
  content: clientContent,
2047
- model: databricksResponse.json?.model || clean.model,
2290
+ model: databricksResponse.json?.model || cleanPayload.model,
2048
2291
  stop_reason: "tool_use",
2049
2292
  usage: databricksResponse.json?.usage || {
2050
2293
  input_tokens: 0,
@@ -2065,6 +2308,27 @@ IMPORTANT TOOL USAGE RULES:
2065
2308
  // then continue the conversation loop. For now, let's fall through to execute server-side tools.
2066
2309
  if (serverSideToolCalls.length === 0) {
2067
2310
  // No server-side tools - pure passthrough
2311
+ // Record outbound client-side tool calls into cross-request dedup tracker
2312
+ if (session && clientSideToolCalls.length > 0) {
2313
+ ensureDedupStructure(session);
2314
+ for (const call of clientSideToolCalls) {
2315
+ recordCrossRequestToolCall(session, call);
2316
+ }
2317
+ // Persist dedup state (non-ephemeral sessions only)
2318
+ if (session.id && !session._ephemeral) {
2319
+ try { upsertSession(session.id, { metadata: session.metadata }); } catch (e) {
2320
+ logger.debug({ err: e.message }, "Failed to persist outbound dedup state");
2321
+ }
2322
+ }
2323
+ const { maxCount, toolName: dedupTool } = getMaxDedupCount(session);
2324
+ logger.debug({
2325
+ sessionId: session?.id ?? null,
2326
+ clientToolCount: clientSideToolCalls.length,
2327
+ maxDedupCount: maxCount,
2328
+ maxDedupTool: dedupTool,
2329
+ }, "Cross-request tool dedup: recorded outbound tool calls");
2330
+ }
2331
+
2068
2332
  return {
2069
2333
  response: {
2070
2334
  status: 200,
@@ -2081,7 +2345,7 @@ IMPORTANT TOOL USAGE RULES:
2081
2345
  // Override toolCalls to only include Server-side tools for server execution
2082
2346
  toolCalls = serverSideToolCalls;
2083
2347
 
2084
- logger.info(
2348
+ logger.debug(
2085
2349
  {
2086
2350
  sessionId: session?.id ?? null,
2087
2351
  serverToolCount: serverSideToolCalls.length,
@@ -2090,7 +2354,7 @@ IMPORTANT TOOL USAGE RULES:
2090
2354
  );
2091
2355
  } else if (executionMode === "passthrough" || executionMode === "client") {
2092
2356
  // Only Server-side tools, no Client-side tools - execute all server-side
2093
- logger.info(
2357
+ logger.debug(
2094
2358
  {
2095
2359
  sessionId: session?.id ?? null,
2096
2360
  serverToolCount: serverSideToolCalls.length,
@@ -2155,6 +2419,7 @@ IMPORTANT TOOL USAGE RULES:
2155
2419
  session,
2156
2420
  cwd,
2157
2421
  requestMessages: cleanPayload.messages,
2422
+ provider: providerType, // Pass provider for GPT-specific formatting
2158
2423
  }))
2159
2424
  );
2160
2425
 
@@ -2388,10 +2653,14 @@ IMPORTANT TOOL USAGE RULES:
2388
2653
  session,
2389
2654
  cwd,
2390
2655
  requestMessages: cleanPayload.messages,
2656
+ provider: providerType, // Pass provider for GPT-specific formatting
2391
2657
  });
2392
2658
 
2393
2659
  let toolMessage;
2394
- if (providerType === "azure-anthropic") {
2660
+ if (providerType === "azure-anthropic" || isAnthropicFormat) {
2661
+ // Anthropic-format tool result for providers whose responses use
2662
+ // Anthropic tool_use blocks (azure-anthropic, Ollama native API,
2663
+ // azure-openai Responses API)
2395
2664
  const parsedContent = parseExecutionContent(execution.content);
2396
2665
  const serialisedContent =
2397
2666
  typeof parsedContent === "string" || parsedContent === null
@@ -2502,34 +2771,54 @@ IMPORTANT TOOL USAGE RULES:
2502
2771
 
2503
2772
  // === TOOL CALL LOOP DETECTION ===
2504
2773
  // Track tool calls to detect infinite loops where the model calls the same tool
2505
- // repeatedly with identical parameters
2774
+ // repeatedly with identical or similar parameters
2775
+ // All providers use threshold 2 and similarity-based detection
2776
+ const loopThreshold = 2;
2777
+
2506
2778
  for (const call of toolCalls) {
2507
2779
  const signature = getToolCallSignature(call);
2508
- const count = (toolCallHistory.get(signature) || 0) + 1;
2509
- toolCallHistory.set(signature, count);
2780
+ const existingEntry = toolCallHistory.get(signature);
2781
+ let count = (existingEntry?.count || 0) + 1;
2782
+ toolCallHistory.set(signature, { count, call });
2510
2783
 
2511
2784
  const toolName = call.function?.name ?? call.name ?? 'unknown';
2512
2785
 
2513
- if (count === 3 && !loopWarningInjected) {
2786
+ // Check for similar (not just identical) tool calls across all providers
2787
+ // This catches cases where the model slightly varies parameters but is essentially looping
2788
+ for (const [existingSig, existingData] of toolCallHistory.entries()) {
2789
+ if (existingSig !== signature && areSimilarToolCalls(call, existingData.call)) {
2790
+ // Found a similar call - increase count to trigger loop detection earlier
2791
+ count = Math.max(count, existingData.count + 1);
2792
+ logger.debug({
2793
+ tool: toolName,
2794
+ currentSignature: signature,
2795
+ similarSignature: existingSig,
2796
+ combinedCount: count,
2797
+ }, "Similar tool call detected - combining counts");
2798
+ }
2799
+ }
2800
+
2801
+ if (count === loopThreshold && !loopWarningInjected) {
2514
2802
  logger.warn(
2515
2803
  {
2516
2804
  sessionId: session?.id ?? null,
2517
2805
  correlationId: options?.correlationId,
2518
2806
  tool: toolName,
2519
2807
  loopCount: count,
2808
+ loopThreshold,
2520
2809
  signature: signature,
2521
2810
  action: 'warning_injected',
2522
2811
  totalSteps: steps,
2523
2812
  remainingSteps: settings.maxSteps - steps,
2524
2813
  },
2525
- "Tool call loop detected - same tool called 3 times with identical parameters",
2814
+ `Tool call loop detected - same tool called ${loopThreshold} times with identical/similar parameters`,
2526
2815
  );
2527
2816
 
2528
2817
  // Inject warning message to model
2529
2818
  loopWarningInjected = true;
2530
2819
  const warningMessage = {
2531
2820
  role: "user",
2532
- content: "⚠️ System Warning: You have called the same tool with identical parameters 3 times in this request. This may indicate an infinite loop. Please provide a final answer to the user instead of calling the same tool again, or explain why you need to continue retrying with the same parameters.",
2821
+ content: `⚠️ CRITICAL SYSTEM WARNING: You have called the "${toolName}" tool ${count} times with identical or similar parameters. This IS an infinite loop. STOP calling this tool immediately. You MUST now provide a direct text response to the user based on the results you have received. If the tool returned "no results" or empty output, that IS the final answer - do not retry. Summarize your findings and respond.`,
2533
2822
  };
2534
2823
 
2535
2824
  cleanPayload.messages.push(warningMessage);
@@ -2544,11 +2833,12 @@ IMPORTANT TOOL USAGE RULES:
2544
2833
  reason: "tool_call_loop_warning",
2545
2834
  toolName,
2546
2835
  loopCount: count,
2836
+ loopThreshold,
2547
2837
  },
2548
2838
  });
2549
2839
  }
2550
- } else if (count > 3) {
2551
- // Force termination after 3 identical calls
2840
+ } else if (count > loopThreshold) {
2841
+ // Force termination after threshold exceeded
2552
2842
  // Log FULL context for debugging why the loop occurred
2553
2843
  logger.error(
2554
2844
  {
@@ -2556,6 +2846,7 @@ IMPORTANT TOOL USAGE RULES:
2556
2846
  correlationId: options?.correlationId,
2557
2847
  tool: toolName,
2558
2848
  loopCount: count,
2849
+ loopThreshold,
2559
2850
  signature: signature,
2560
2851
  action: 'request_terminated',
2561
2852
  totalSteps: steps,
@@ -2576,7 +2867,7 @@ IMPORTANT TOOL USAGE RULES:
2576
2867
  body: {
2577
2868
  error: {
2578
2869
  type: "tool_call_loop_detected",
2579
- message: `Tool call loop detected: The model called the same tool ("${toolName}") with identical parameters ${count} times. This indicates an infinite loop and execution has been terminated. Please try rephrasing your request or provide different parameters.`,
2870
+ message: `Tool call loop detected: The model called the same tool ("${toolName}") with identical parameters ${count} times (threshold: ${loopThreshold}). This indicates an infinite loop and execution has been terminated. Please try rephrasing your request or provide different parameters.`,
2580
2871
  },
2581
2872
  },
2582
2873
  terminationReason: "tool_call_loop",
@@ -2608,11 +2899,19 @@ IMPORTANT TOOL USAGE RULES:
2608
2899
  anthropicPayload.content = policy.sanitiseContent(anthropicPayload.content);
2609
2900
  }
2610
2901
  } else if (actualProvider === "ollama") {
2611
- anthropicPayload = ollamaToAnthropicResponse(
2612
- databricksResponse.json,
2613
- requestedModel,
2614
- );
2615
- anthropicPayload.content = policy.sanitiseContent(anthropicPayload.content);
2902
+ const ollamaJson = databricksResponse.json;
2903
+ // Detect response format: Anthropic API (v0.14.0+) has type:"message",
2904
+ // legacy /api/chat has message.role + message.content
2905
+ if (ollamaJson?.type === "message" && Array.isArray(ollamaJson?.content)) {
2906
+ // Anthropic-native response — passthrough
2907
+ anthropicPayload = ollamaJson;
2908
+ } else {
2909
+ // Legacy Ollama response — convert to Anthropic format
2910
+ anthropicPayload = ollamaToAnthropicResponse(ollamaJson, requestedModel);
2911
+ }
2912
+ if (Array.isArray(anthropicPayload?.content)) {
2913
+ anthropicPayload.content = policy.sanitiseContent(anthropicPayload.content);
2914
+ }
2616
2915
  } else if (actualProvider === "openrouter") {
2617
2916
  const { convertOpenRouterResponseToAnthropic } = require("../clients/openrouter-utils");
2618
2917
 
@@ -2841,6 +3140,16 @@ IMPORTANT TOOL USAGE RULES:
2841
3140
  if (Array.isArray(anthropicPayload?.content)) {
2842
3141
  anthropicPayload.content = policy.sanitiseContent(anthropicPayload.content);
2843
3142
  }
3143
+ } else if (actualProvider === "moonshot") {
3144
+ // Moonshot responses are already converted to Anthropic format in invokeMoonshot
3145
+ logger.info({
3146
+ hasJson: !!databricksResponse.json,
3147
+ jsonContent: JSON.stringify(databricksResponse.json?.content)?.substring(0, 300),
3148
+ }, "=== MOONSHOT ORCHESTRATOR DEBUG ===");
3149
+ anthropicPayload = databricksResponse.json;
3150
+ if (Array.isArray(anthropicPayload?.content)) {
3151
+ anthropicPayload.content = policy.sanitiseContent(anthropicPayload.content);
3152
+ }
2844
3153
  } else {
2845
3154
  anthropicPayload = toAnthropicResponse(
2846
3155
  databricksResponse.json,
@@ -3035,6 +3344,7 @@ IMPORTANT TOOL USAGE RULES:
3035
3344
  session,
3036
3345
  cwd,
3037
3346
  requestMessages: cleanPayload.messages,
3347
+ provider: providerType, // Pass provider for GPT-specific formatting
3038
3348
  });
3039
3349
 
3040
3350
  const toolResultMessage = createFallbackToolResultMessage(providerType, {
@@ -3243,6 +3553,34 @@ IMPORTANT TOOL USAGE RULES:
3243
3553
  };
3244
3554
  }
3245
3555
 
3556
/**
 * Detect whether the current request is a suggestion-mode call.
 *
 * Only the most recent message whose role is "user" is inspected; earlier
 * user messages are ignored even if they carry the marker. The message text
 * is either its string content, or the space-joined `text` fields of its
 * content blocks.
 *
 * @param {Array} messages - The conversation messages.
 * @returns {{ isSuggestionMode: boolean }} `isSuggestionMode` is true when the
 *   last user message contains the `[SUGGESTION MODE:` marker.
 */
function detectSuggestionMode(messages) {
  const SUGGESTION_MARKER = '[SUGGESTION MODE:';

  if (!Array.isArray(messages) || messages.length === 0) {
    return { isSuggestionMode: false };
  }

  // Walk backwards to locate the last user message (trailing non-user
  // messages are skipped).
  let lastUserMessage = null;
  for (let i = messages.length - 1; i >= 0; i--) {
    if (messages[i]?.role === 'user') {
      lastUserMessage = messages[i];
      break;
    }
  }
  if (lastUserMessage === null) {
    return { isSuggestionMode: false };
  }

  const { content } = lastUserMessage;
  let text = '';
  if (typeof content === 'string') {
    text = content;
  } else if (Array.isArray(content)) {
    // Null-safe: a null/undefined block (or one without string text) must
    // contribute an empty string instead of throwing a TypeError.
    text = content
      .map((block) => (typeof block?.text === 'string' ? block.text : ''))
      .join(' ');
  }

  return { isSuggestionMode: text.includes(SUGGESTION_MARKER) };
}
3583
+
3246
3584
  async function processMessage({ payload, headers, session, cwd, options = {} }) {
3247
3585
  const requestedModel =
3248
3586
  payload?.model ??
@@ -3252,102 +3590,317 @@ async function processMessage({ payload, headers, session, cwd, options = {} })
3252
3590
  typeof headers?.["anthropic-beta"] === "string" &&
3253
3591
  headers["anthropic-beta"].includes("interleaved-thinking");
3254
3592
 
3593
+ // === SUGGESTION MODE: Early return when SUGGESTION_MODE_MODEL=none ===
3594
+ const { isSuggestionMode } = detectSuggestionMode(payload?.messages);
3595
+ const suggestionModelConfig = config.modelProvider?.suggestionModeModel ?? "default";
3596
+ if (isSuggestionMode && suggestionModelConfig.toLowerCase() === "none") {
3597
+ logger.info('Suggestion mode: skipping LLM call (SUGGESTION_MODE_MODEL=none)');
3598
+ return {
3599
+ response: {
3600
+ json: {
3601
+ id: `msg_suggestion_skip_${Date.now()}`,
3602
+ type: "message",
3603
+ role: "assistant",
3604
+ content: [{ type: "text", text: "" }],
3605
+ model: requestedModel,
3606
+ stop_reason: "end_turn",
3607
+ stop_sequence: null,
3608
+ usage: { input_tokens: 0, output_tokens: 0 },
3609
+ },
3610
+ ok: true,
3611
+ status: 200,
3612
+ },
3613
+ steps: 0,
3614
+ durationMs: 0,
3615
+ terminationReason: "suggestion_mode_skip",
3616
+ };
3617
+ }
3618
+
3255
3619
  // === TOOL LOOP GUARD (EARLY CHECK) ===
3256
3620
  // Check BEFORE sanitization since sanitizePayload removes conversation history
3257
- const toolLoopThreshold = config.policy?.toolLoopThreshold ?? 3;
3621
+ // All providers use threshold 2 to catch loops early
3622
+ const providerType = config.modelProvider?.type ?? "databricks";
3623
+ const toolLoopThreshold = 2;
3258
3624
  const { toolResultCount, toolUseCount } = countToolCallsInHistory(payload?.messages);
3259
3625
 
3260
- console.log('[ToolLoopGuard EARLY] Checking ORIGINAL messages:', {
3261
- messageCount: payload?.messages?.length,
3262
- toolResultCount,
3263
- toolUseCount,
3264
- threshold: toolLoopThreshold,
3265
- });
3626
+ const executionMode = config.toolExecutionMode || "server";
3627
+ const isClientMode = executionMode === "client" || executionMode === "passthrough";
3628
+
3629
+ if (isClientMode && session) {
3630
+ // === CROSS-REQUEST DEDUP (CLIENT/PASSTHROUGH MODE) ===
3631
+ // The inner-loop guard resets each HTTP request so repeated calls across
3632
+ // requests escape detection. Track signatures in session metadata instead.
3633
+ ensureDedupStructure(session);
3634
+
3635
+ // Detect new user question → reset dedup tracking
3636
+ const dedup = session.metadata.toolCallDedup;
3637
+ const incomingToolUse = extractToolUseFromCurrentTurn(payload?.messages);
3638
+ // A user text message with no preceding tool_use means a brand-new question
3639
+ const hasNewUserText = (() => {
3640
+ const msgs = payload?.messages || [];
3641
+ for (let i = msgs.length - 1; i >= 0; i--) {
3642
+ const msg = msgs[i];
3643
+ if (msg?.role === 'user') {
3644
+ if (typeof msg.content === 'string' && msg.content.trim().length > 0) return true;
3645
+ if (Array.isArray(msg.content)) {
3646
+ return msg.content.some(block =>
3647
+ (block?.type === 'text' && block?.text?.trim?.().length > 0) ||
3648
+ (block?.type === 'input_text' && block?.input_text?.trim?.().length > 0)
3649
+ );
3650
+ }
3651
+ }
3652
+ break; // Only check the very last message
3653
+ }
3654
+ return false;
3655
+ })();
3266
3656
 
3267
- if (toolResultCount >= toolLoopThreshold) {
3268
- logger.error({
3269
- toolResultCount,
3270
- toolUseCount,
3271
- threshold: toolLoopThreshold,
3272
- sessionId: session?.id ?? null,
3273
- }, "[ToolLoopGuard] FORCE TERMINATING - too many tool calls in conversation");
3657
+ if (hasNewUserText && incomingToolUse.length === 0) {
3658
+ // Pure user text with no tool results → new question
3659
+ resetDedupTracking(session);
3660
+ } else {
3661
+ // Record each tool_use from the incoming messages into the dedup tracker
3662
+ for (const toolUseBlock of incomingToolUse) {
3663
+ recordCrossRequestToolCall(session, toolUseBlock);
3664
+ }
3274
3665
 
3275
- // Extract tool results ONLY from CURRENT TURN (after last user text message)
3276
- // This prevents showing old results from previous questions
3277
- let toolResultsSummary = "";
3278
- const messages = payload?.messages || [];
3666
+ const { maxCount, toolName: dedupToolName, signature: dedupSig } = getMaxDedupCount(session);
3279
3667
 
3280
- // Find the last user text message index (same logic as countToolCallsInHistory)
3281
- let lastUserTextIndex = -1;
3282
- for (let i = messages.length - 1; i >= 0; i--) {
3283
- const msg = messages[i];
3284
- if (msg?.role !== 'user') continue;
3285
- if (typeof msg.content === 'string' && msg.content.trim().length > 0) {
3286
- lastUserTextIndex = i;
3287
- break;
3668
+ if (maxCount >= DEDUP_TERMINATE_THRESHOLD) {
3669
+ // Force-terminate: same pattern as existing tool_loop_guard
3670
+ logger.error({
3671
+ toolName: dedupToolName,
3672
+ count: maxCount,
3673
+ threshold: DEDUP_TERMINATE_THRESHOLD,
3674
+ signature: dedupSig,
3675
+ sessionId: session?.id ?? null,
3676
+ }, "[CrossRequestDedup] FORCE TERMINATING - repeated tool call across requests");
3677
+
3678
+ // Extract tool results summary from current turn
3679
+ let toolResultsSummary = "";
3680
+ const messages = payload?.messages || [];
3681
+ const { lastUserTextIndex: luIdx } = countToolCallsInHistory(messages);
3682
+ const startIdx = luIdx >= 0 ? luIdx : 0;
3683
+ for (let i = startIdx; i < messages.length; i++) {
3684
+ const msg = messages[i];
3685
+ if (!msg || !Array.isArray(msg.content)) continue;
3686
+ for (const block of msg.content) {
3687
+ if (block?.type === 'tool_result' && block?.content) {
3688
+ const content = typeof block.content === 'string'
3689
+ ? block.content
3690
+ : JSON.stringify(block.content);
3691
+ if (content && !content.includes('Found 0')) {
3692
+ toolResultsSummary += content + "\n";
3693
+ }
3694
+ }
3695
+ }
3696
+ }
3697
+
3698
+ let responseText = `Based on the tool results, here's what I found:\n\n`;
3699
+ if (toolResultsSummary.trim()) {
3700
+ responseText += toolResultsSummary.trim();
3701
+ } else {
3702
+ responseText += `The tools executed but didn't return clear results. Please check the tool output above or try a different command.`;
3703
+ }
3704
+
3705
+ const forcedResponse = {
3706
+ id: `msg_forced_${Date.now()}`,
3707
+ type: "message",
3708
+ role: "assistant",
3709
+ content: [{ type: "text", text: responseText }],
3710
+ model: requestedModel || "unknown",
3711
+ stop_reason: "end_turn",
3712
+ stop_sequence: null,
3713
+ usage: { input_tokens: 0, output_tokens: 100 },
3714
+ };
3715
+
3716
+ // Reset dedup after termination so next question starts fresh
3717
+ resetDedupTracking(session);
3718
+ // Persist to DB (non-ephemeral sessions only)
3719
+ if (session.id && !session._ephemeral) {
3720
+ try { upsertSession(session.id, { metadata: session.metadata }); } catch (e) {
3721
+ logger.debug({ err: e.message }, "Failed to persist dedup reset");
3722
+ }
3723
+ }
3724
+
3725
+ return {
3726
+ status: 200,
3727
+ body: forcedResponse,
3728
+ terminationReason: "tool_loop_guard",
3729
+ };
3288
3730
  }
3289
- if (Array.isArray(msg.content)) {
3290
- const hasText = msg.content.some(block =>
3291
- (block?.type === 'text' && block?.text?.trim?.().length > 0) ||
3292
- (block?.type === 'input_text' && block?.input_text?.trim?.().length > 0)
3293
- );
3294
- if (hasText) {
3295
- lastUserTextIndex = i;
3296
- break;
3731
+
3732
+ if (maxCount >= DEDUP_WARN_THRESHOLD && !dedup.warningInjected) {
3733
+ logger.warn({
3734
+ toolName: dedupToolName,
3735
+ count: maxCount,
3736
+ threshold: DEDUP_WARN_THRESHOLD,
3737
+ signature: dedupSig,
3738
+ sessionId: session?.id ?? null,
3739
+ }, "[CrossRequestDedup] Warning - repeated tool call detected across requests");
3740
+
3741
+ dedup.warningInjected = true;
3742
+
3743
+ // Inject a strict warning into the payload so the model sees it
3744
+ if (Array.isArray(payload?.messages)) {
3745
+ payload.messages.push({
3746
+ role: "user",
3747
+ content: `⚠️ CRITICAL SYSTEM WARNING: You have called the "${dedupToolName}" tool ${maxCount} times with identical or similar parameters across multiple requests. This IS an infinite loop. STOP calling this tool immediately. You MUST now provide a direct text response based on the results you have received. If the tool returned "no results" or empty output, that IS the final answer - do not retry. Summarize your findings and respond.`,
3748
+ });
3749
+ }
3750
+ }
3751
+
3752
+ // Persist dedup state (non-ephemeral sessions only)
3753
+ if (session.id && !session._ephemeral) {
3754
+ try { upsertSession(session.id, { metadata: session.metadata }); } catch (e) {
3755
+ logger.debug({ err: e.message }, "Failed to persist dedup state");
3297
3756
  }
3298
3757
  }
3299
3758
  }
3300
3759
 
3301
- // Only extract tool results AFTER the last user text message
3302
- const startIndex = lastUserTextIndex >= 0 ? lastUserTextIndex : 0;
3303
- for (let i = startIndex; i < messages.length; i++) {
3304
- const msg = messages[i];
3305
- if (!msg || !Array.isArray(msg.content)) continue;
3306
- for (const block of msg.content) {
3307
- if (block?.type === 'tool_result' && block?.content) {
3308
- const content = typeof block.content === 'string'
3309
- ? block.content
3310
- : JSON.stringify(block.content);
3311
- if (content && !content.includes('Found 0')) {
3312
- toolResultsSummary += content + "\n";
3760
+ // Client mode still uses the relaxed per-request threshold for the count-based guard
3761
+ const effectiveThreshold = 10;
3762
+ if (toolResultCount >= effectiveThreshold) {
3763
+ logger.error({
3764
+ toolResultCount,
3765
+ toolUseCount,
3766
+ threshold: effectiveThreshold,
3767
+ sessionId: session?.id ?? null,
3768
+ }, "[ToolLoopGuard] FORCE TERMINATING - too many tool calls in conversation");
3769
+
3770
+ let toolResultsSummary = "";
3771
+ const messages = payload?.messages || [];
3772
+ let lastUserTextIndex = -1;
3773
+ for (let i = messages.length - 1; i >= 0; i--) {
3774
+ const msg = messages[i];
3775
+ if (msg?.role !== 'user') continue;
3776
+ if (typeof msg.content === 'string' && msg.content.trim().length > 0) {
3777
+ lastUserTextIndex = i;
3778
+ break;
3779
+ }
3780
+ if (Array.isArray(msg.content)) {
3781
+ const hasText = msg.content.some(block =>
3782
+ (block?.type === 'text' && block?.text?.trim?.().length > 0) ||
3783
+ (block?.type === 'input_text' && block?.input_text?.trim?.().length > 0)
3784
+ );
3785
+ if (hasText) {
3786
+ lastUserTextIndex = i;
3787
+ break;
3788
+ }
3789
+ }
3790
+ }
3791
+ const startIndex = lastUserTextIndex >= 0 ? lastUserTextIndex : 0;
3792
+ for (let i = startIndex; i < messages.length; i++) {
3793
+ const msg = messages[i];
3794
+ if (!msg || !Array.isArray(msg.content)) continue;
3795
+ for (const block of msg.content) {
3796
+ if (block?.type === 'tool_result' && block?.content) {
3797
+ const content = typeof block.content === 'string'
3798
+ ? block.content
3799
+ : JSON.stringify(block.content);
3800
+ if (content && !content.includes('Found 0')) {
3801
+ toolResultsSummary += content + "\n";
3802
+ }
3313
3803
  }
3314
3804
  }
3315
3805
  }
3316
- }
3317
3806
 
3318
- // Build response text based on actual results from CURRENT turn only
3319
- let responseText = `Based on the tool results, here's what I found:\n\n`;
3320
- if (toolResultsSummary.trim()) {
3321
- responseText += toolResultsSummary.trim();
3322
- } else {
3323
- responseText += `The tools executed but didn't return clear results. Please check the tool output above or try a different command.`;
3807
+ let responseText = `Based on the tool results, here's what I found:\n\n`;
3808
+ if (toolResultsSummary.trim()) {
3809
+ responseText += toolResultsSummary.trim();
3810
+ } else {
3811
+ responseText += `The tools executed but didn't return clear results. Please check the tool output above or try a different command.`;
3812
+ }
3813
+
3814
+ const forcedResponse = {
3815
+ id: `msg_forced_${Date.now()}`,
3816
+ type: "message",
3817
+ role: "assistant",
3818
+ content: [{ type: "text", text: responseText }],
3819
+ model: requestedModel || "unknown",
3820
+ stop_reason: "end_turn",
3821
+ stop_sequence: null,
3822
+ usage: { input_tokens: 0, output_tokens: 100 },
3823
+ };
3824
+
3825
+ return {
3826
+ status: 200,
3827
+ body: forcedResponse,
3828
+ terminationReason: "tool_loop_guard",
3829
+ };
3324
3830
  }
3831
+ } else {
3832
+ // Server mode: use existing threshold 2 with countToolCallsInHistory
3833
+ const effectiveThreshold = toolLoopThreshold;
3834
+
3835
+ if (toolResultCount >= effectiveThreshold) {
3836
+ logger.error({
3837
+ toolResultCount,
3838
+ toolUseCount,
3839
+ threshold: effectiveThreshold,
3840
+ sessionId: session?.id ?? null,
3841
+ }, "[ToolLoopGuard] FORCE TERMINATING - too many tool calls in conversation");
3842
+
3843
+ let toolResultsSummary = "";
3844
+ const messages = payload?.messages || [];
3845
+ let lastUserTextIndex = -1;
3846
+ for (let i = messages.length - 1; i >= 0; i--) {
3847
+ const msg = messages[i];
3848
+ if (msg?.role !== 'user') continue;
3849
+ if (typeof msg.content === 'string' && msg.content.trim().length > 0) {
3850
+ lastUserTextIndex = i;
3851
+ break;
3852
+ }
3853
+ if (Array.isArray(msg.content)) {
3854
+ const hasText = msg.content.some(block =>
3855
+ (block?.type === 'text' && block?.text?.trim?.().length > 0) ||
3856
+ (block?.type === 'input_text' && block?.input_text?.trim?.().length > 0)
3857
+ );
3858
+ if (hasText) {
3859
+ lastUserTextIndex = i;
3860
+ break;
3861
+ }
3862
+ }
3863
+ }
3864
+ const startIndex = lastUserTextIndex >= 0 ? lastUserTextIndex : 0;
3865
+ for (let i = startIndex; i < messages.length; i++) {
3866
+ const msg = messages[i];
3867
+ if (!msg || !Array.isArray(msg.content)) continue;
3868
+ for (const block of msg.content) {
3869
+ if (block?.type === 'tool_result' && block?.content) {
3870
+ const content = typeof block.content === 'string'
3871
+ ? block.content
3872
+ : JSON.stringify(block.content);
3873
+ if (content && !content.includes('Found 0')) {
3874
+ toolResultsSummary += content + "\n";
3875
+ }
3876
+ }
3877
+ }
3878
+ }
3325
3879
 
3326
- // Force return a response instead of continuing the loop
3327
- const forcedResponse = {
3328
- id: `msg_forced_${Date.now()}`,
3329
- type: "message",
3330
- role: "assistant",
3331
- content: [
3332
- {
3333
- type: "text",
3334
- text: responseText,
3335
- },
3336
- ],
3337
- model: requestedModel || "unknown",
3338
- stop_reason: "end_turn",
3339
- stop_sequence: null,
3340
- usage: {
3341
- input_tokens: 0,
3342
- output_tokens: 100,
3343
- },
3344
- };
3880
+ let responseText = `Based on the tool results, here's what I found:\n\n`;
3881
+ if (toolResultsSummary.trim()) {
3882
+ responseText += toolResultsSummary.trim();
3883
+ } else {
3884
+ responseText += `The tools executed but didn't return clear results. Please check the tool output above or try a different command.`;
3885
+ }
3345
3886
 
3346
- return {
3347
- status: 200,
3348
- body: forcedResponse,
3349
- terminationReason: "tool_loop_guard",
3350
- };
3887
+ const forcedResponse = {
3888
+ id: `msg_forced_${Date.now()}`,
3889
+ type: "message",
3890
+ role: "assistant",
3891
+ content: [{ type: "text", text: responseText }],
3892
+ model: requestedModel || "unknown",
3893
+ stop_reason: "end_turn",
3894
+ stop_sequence: null,
3895
+ usage: { input_tokens: 0, output_tokens: 100 },
3896
+ };
3897
+
3898
+ return {
3899
+ status: 200,
3900
+ body: forcedResponse,
3901
+ terminationReason: "tool_loop_guard",
3902
+ };
3903
+ }
3351
3904
  }
3352
3905
 
3353
3906
  const cleanPayload = sanitizePayload(payload);