lynkr 7.2.4 → 8.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. package/README.md +2 -2
  2. package/config/model-tiers.json +89 -0
  3. package/docs/docs.html +1 -0
  4. package/docs/index.md +7 -0
  5. package/docs/toon-integration-spec.md +130 -0
  6. package/documentation/README.md +3 -2
  7. package/documentation/claude-code-cli.md +23 -16
  8. package/documentation/cursor-integration.md +17 -14
  9. package/documentation/docker.md +11 -4
  10. package/documentation/embeddings.md +7 -5
  11. package/documentation/faq.md +66 -12
  12. package/documentation/features.md +22 -15
  13. package/documentation/installation.md +66 -14
  14. package/documentation/production.md +43 -8
  15. package/documentation/providers.md +145 -42
  16. package/documentation/routing.md +476 -0
  17. package/documentation/token-optimization.md +7 -5
  18. package/documentation/troubleshooting.md +81 -5
  19. package/install.sh +6 -1
  20. package/package.json +5 -3
  21. package/scripts/setup.js +0 -1
  22. package/src/agents/executor.js +14 -6
  23. package/src/api/middleware/session.js +15 -2
  24. package/src/api/openai-router.js +130 -37
  25. package/src/api/providers-handler.js +15 -1
  26. package/src/api/router.js +107 -2
  27. package/src/budget/index.js +4 -3
  28. package/src/clients/databricks.js +431 -234
  29. package/src/clients/gpt-utils.js +181 -0
  30. package/src/clients/ollama-utils.js +66 -140
  31. package/src/clients/routing.js +0 -1
  32. package/src/clients/standard-tools.js +82 -5
  33. package/src/config/index.js +119 -35
  34. package/src/context/toon.js +173 -0
  35. package/src/headroom/launcher.js +8 -3
  36. package/src/logger/index.js +23 -0
  37. package/src/orchestrator/index.js +765 -212
  38. package/src/routing/agentic-detector.js +320 -0
  39. package/src/routing/complexity-analyzer.js +202 -2
  40. package/src/routing/cost-optimizer.js +305 -0
  41. package/src/routing/index.js +168 -159
  42. package/src/routing/model-registry.js +437 -0
  43. package/src/routing/model-tiers.js +365 -0
  44. package/src/server.js +2 -2
  45. package/src/sessions/cleanup.js +3 -3
  46. package/src/sessions/record.js +10 -1
  47. package/src/sessions/store.js +7 -2
  48. package/src/tools/agent-task.js +48 -1
  49. package/src/tools/index.js +15 -2
  50. package/src/tools/workspace.js +35 -4
  51. package/src/workspace/index.js +30 -0
  52. package/te +11622 -0
  53. package/test/README.md +1 -1
  54. package/test/azure-openai-config.test.js +17 -8
  55. package/test/azure-openai-integration.test.js +7 -1
  56. package/test/azure-openai-routing.test.js +41 -43
  57. package/test/bedrock-integration.test.js +18 -32
  58. package/test/hybrid-routing-integration.test.js +35 -20
  59. package/test/hybrid-routing-performance.test.js +74 -64
  60. package/test/llamacpp-integration.test.js +28 -9
  61. package/test/lmstudio-integration.test.js +20 -8
  62. package/test/openai-integration.test.js +17 -20
  63. package/test/performance-tests.js +1 -1
  64. package/test/routing.test.js +65 -59
  65. package/test/toon-compression.test.js +131 -0
  66. package/CLAWROUTER_ROUTING_PLAN.md +0 -910
  67. package/ROUTER_COMPARISON.md +0 -173
  68. package/TIER_ROUTING_PLAN.md +0 -771
@@ -6,11 +6,12 @@ const { getCircuitBreakerRegistry } = require("./circuit-breaker");
6
6
  const { getMetricsCollector } = require("../observability/metrics");
7
7
  const { getHealthTracker } = require("../observability/health-tracker");
8
8
  const logger = require("../logger");
9
- const { STANDARD_TOOLS } = require("./standard-tools");
9
+ const { STANDARD_TOOLS, STANDARD_TOOL_NAMES } = require("./standard-tools");
10
10
  const { convertAnthropicToolsToOpenRouter } = require("./openrouter-utils");
11
11
  const {
12
12
  detectModelFamily
13
13
  } = require("./bedrock-utils");
14
+ const { getGPTSystemPromptAddendum } = require("./gpt-utils");
14
15
 
15
16
 
16
17
 
@@ -183,9 +184,9 @@ async function invokeDatabricks(body) {
183
184
  // Inject standard tools if client didn't send any (passthrough mode)
184
185
  if (!Array.isArray(databricksBody.tools) || databricksBody.tools.length === 0) {
185
186
  databricksBody.tools = STANDARD_TOOLS;
186
- logger.info({
187
+ logger.debug({
187
188
  injectedToolCount: STANDARD_TOOLS.length,
188
- injectedToolNames: STANDARD_TOOLS.map(t => t.name),
189
+ injectedToolNames: STANDARD_TOOL_NAMES,
189
190
  reason: "Client did not send tools (passthrough mode)"
190
191
  }, "=== INJECTING STANDARD TOOLS (Databricks) ===");
191
192
  }
@@ -224,9 +225,9 @@ async function invokeAzureAnthropic(body) {
224
225
  // Inject standard tools if client didn't send any (passthrough mode)
225
226
  if (!Array.isArray(body.tools) || body.tools.length === 0) {
226
227
  body.tools = STANDARD_TOOLS;
227
- logger.info({
228
+ logger.debug({
228
229
  injectedToolCount: STANDARD_TOOLS.length,
229
- injectedToolNames: STANDARD_TOOLS.map(t => t.name),
230
+ injectedToolNames: STANDARD_TOOL_NAMES,
230
231
  reason: "Client did not send tools (passthrough mode)"
231
232
  }, "=== INJECTING STANDARD TOOLS (Azure Anthropic) ===");
232
233
  }
@@ -248,42 +249,115 @@ async function invokeOllama(body) {
248
249
  throw new Error("Ollama endpoint is not configured.");
249
250
  }
250
251
 
251
- const { convertAnthropicToolsToOllama, checkOllamaToolSupport } = require("./ollama-utils");
252
+ const { checkOllamaToolSupport, hasAnthropicEndpoint, convertAnthropicToolsToOllama } = require("./ollama-utils");
253
+
254
+ const modelName = body._suggestionModeModel || body._tierModel || config.ollama.model;
255
+
256
+ // Detect whether Ollama has the native Anthropic Messages API (v0.14.0+)
257
+ const useAnthropicApi = await hasAnthropicEndpoint(config.ollama.endpoint);
258
+
259
+ // Check if model supports tools FIRST (before wasteful injection)
260
+ const supportsTools = await checkOllamaToolSupport(config.ollama.model);
261
+ const injectToolsOllama = process.env.INJECT_TOOLS_OLLAMA !== "false";
262
+
263
+ // Determine tools to send
264
+ let toolsToSend = body.tools;
265
+ let toolsInjected = false;
266
+
267
+ if (!supportsTools) {
268
+ toolsToSend = null;
269
+ } else if (injectToolsOllama && (!Array.isArray(toolsToSend) || toolsToSend.length === 0)) {
270
+ toolsToSend = STANDARD_TOOLS;
271
+ toolsInjected = true;
272
+ }
273
+
274
+ // Consolidated tool injection log
275
+ const toolCount = (supportsTools && Array.isArray(toolsToSend)) ? toolsToSend.length : 0;
276
+ let logMessage;
277
+ if (!supportsTools) {
278
+ logMessage = `Tools not supported (0 tools)`;
279
+ } else if (toolsInjected) {
280
+ logMessage = `injected ${toolCount} tools`;
281
+ } else if (Array.isArray(toolsToSend) && toolsToSend.length > 0) {
282
+ logMessage = `Using client-provided tools (${toolCount} tools)`;
283
+ } else if (!injectToolsOllama) {
284
+ logMessage = `Tool injection disabled (0 tools)`;
285
+ } else {
286
+ logMessage = `No tools (0 tools)`;
287
+ }
252
288
 
289
+ logger.debug({
290
+ model: modelName,
291
+ apiMode: useAnthropicApi ? "anthropic" : "legacy",
292
+ toolCount,
293
+ toolsInjected,
294
+ supportsTools,
295
+ toolNames: (Array.isArray(toolsToSend) && toolsToSend.length > 0) ? toolsToSend.map(t => t.name) : []
296
+ }, `=== Ollama STANDARD TOOLS INJECTION for ${config.ollama.model} === ${logMessage}`);
297
+
298
+ // ---- Anthropic-native path (Ollama v0.14.0+) ----
299
+ if (useAnthropicApi) {
300
+ const endpoint = `${config.ollama.endpoint}/v1/messages`;
301
+ const headers = {
302
+ "Content-Type": "application/json",
303
+ "anthropic-version": "2023-06-01",
304
+ };
305
+
306
+ // Build body with only valid Anthropic Messages API fields
307
+ const ollamaBody = {
308
+ model: modelName,
309
+ messages: body.messages,
310
+ max_tokens: body.max_tokens || 4096,
311
+ stream: false,
312
+ };
313
+
314
+ if (body.system) ollamaBody.system = body.system;
315
+ if (body.temperature !== undefined) ollamaBody.temperature = body.temperature;
316
+ if (body.top_p !== undefined) ollamaBody.top_p = body.top_p;
317
+ if (body.top_k !== undefined) ollamaBody.top_k = body.top_k;
318
+ if (body.stop_sequences) ollamaBody.stop_sequences = body.stop_sequences;
319
+ if (body.tool_choice) ollamaBody.tool_choice = body.tool_choice;
320
+ if (body.metadata) ollamaBody.metadata = body.metadata;
321
+
322
+ // Tools (already Anthropic format — no conversion needed)
323
+ if (supportsTools && Array.isArray(toolsToSend) && toolsToSend.length > 0) {
324
+ ollamaBody.tools = toolsToSend;
325
+ }
326
+
327
+ if (config.ollama.keepAlive !== undefined) {
328
+ const keepAlive = config.ollama.keepAlive;
329
+ ollamaBody.keep_alive = /^-?\d+$/.test(keepAlive)
330
+ ? parseInt(keepAlive, 10)
331
+ : keepAlive;
332
+ logger.debug({ keepAlive: ollamaBody.keep_alive }, "Ollama keep_alive configured");
333
+ }
334
+
335
+ return performJsonRequest(endpoint, { headers, body: ollamaBody }, "Ollama");
336
+ }
337
+
338
+ // ---- Legacy path (Ollama < v0.14.0, /api/chat with OpenAI format) ----
253
339
  const endpoint = `${config.ollama.endpoint}/api/chat`;
254
340
  const headers = { "Content-Type": "application/json" };
255
341
 
256
- // Convert Anthropic messages format to Ollama format
257
- // Ollama expects content as string, not content blocks array
342
+ // Convert Anthropic messages to Ollama format (content blocks → strings)
258
343
  const convertedMessages = [];
259
344
 
260
- // Handle system prompt (same pattern as other providers)
261
345
  if (body.system && typeof body.system === "string" && body.system.trim().length > 0) {
262
- convertedMessages.push({
263
- role: "system",
264
- content: body.system.trim()
265
- });
346
+ convertedMessages.push({ role: "system", content: body.system.trim() });
266
347
  }
267
348
 
268
- // Add user/assistant messages
269
349
  (body.messages || []).forEach(msg => {
270
350
  let content = msg.content;
271
-
272
- // Convert content blocks array to simple string
273
351
  if (Array.isArray(content)) {
274
352
  content = content
275
353
  .filter(block => block.type === 'text')
276
354
  .map(block => block.text || '')
277
355
  .join('\n');
278
356
  }
279
-
280
- convertedMessages.push({
281
- role: msg.role,
282
- content: content || ''
283
- });
357
+ convertedMessages.push({ role: msg.role, content: content || '' });
284
358
  });
285
359
 
286
- // FIX: Deduplicate consecutive messages with same role (Ollama may reject this)
360
+ // Deduplicate consecutive messages with same role
287
361
  const deduplicated = [];
288
362
  let lastRole = null;
289
363
  for (const msg of convertedMessages) {
@@ -298,85 +372,30 @@ async function invokeOllama(body) {
298
372
  lastRole = msg.role;
299
373
  }
300
374
 
301
- if (deduplicated.length !== convertedMessages.length) {
302
- logger.info({
303
- originalCount: convertedMessages.length,
304
- deduplicatedCount: deduplicated.length,
305
- removed: convertedMessages.length - deduplicated.length,
306
- messageRoles: convertedMessages.map(m => m.role).join(' → '),
307
- deduplicatedRoles: deduplicated.map(m => m.role).join(' → ')
308
- }, 'Ollama: Removed consecutive duplicate roles from message sequence');
309
- }
310
-
311
375
  const ollamaBody = {
312
- model: config.ollama.model,
376
+ model: modelName,
313
377
  messages: deduplicated,
314
- stream: false, // Force non-streaming for Ollama - streaming format conversion not yet implemented
378
+ stream: false,
315
379
  options: {
316
380
  temperature: body.temperature ?? 0.7,
317
- num_predict: body.max_tokens ?? 4096,
381
+ num_predict: body.max_tokens ?? 16384,
318
382
  top_p: body.top_p ?? 1.0,
319
383
  },
320
384
  };
321
385
 
322
- // Add keep_alive if configured (controls how long model stays loaded)
323
- // Accepts: duration strings ("10m", "24h"), numbers (seconds), -1 (permanent), 0 (immediate unload)
324
386
  if (config.ollama.keepAlive !== undefined) {
325
387
  const keepAlive = config.ollama.keepAlive;
326
- // Parse as number if it looks like one, otherwise use string
327
388
  ollamaBody.keep_alive = /^-?\d+$/.test(keepAlive)
328
389
  ? parseInt(keepAlive, 10)
329
390
  : keepAlive;
330
391
  logger.debug({ keepAlive: ollamaBody.keep_alive }, "Ollama keep_alive configured");
331
392
  }
332
393
 
333
- // Check if model supports tools FIRST (before wasteful injection)
334
- const supportsTools = await checkOllamaToolSupport(config.ollama.model);
335
-
336
- // Inject standard tools if client didn't send any (passthrough mode)
337
- let toolsToSend = body.tools;
338
- let toolsInjected = false;
339
-
340
- const injectToolsOllama = process.env.INJECT_TOOLS_OLLAMA !== "false";
341
-
342
- if (!supportsTools) {
343
- // Model doesn't support tools - don't inject them
344
- toolsToSend = null;
345
- } else if (injectToolsOllama && (!Array.isArray(toolsToSend) || toolsToSend.length === 0)) {
346
- // Model supports tools and none provided - inject them
347
- toolsToSend = STANDARD_TOOLS;
348
- toolsInjected = true;
349
- }
350
-
351
- // Add tools if present AND model supports them
394
+ // Tools need conversion to OpenAI function-calling format for legacy endpoint
352
395
  if (supportsTools && Array.isArray(toolsToSend) && toolsToSend.length > 0) {
353
396
  ollamaBody.tools = convertAnthropicToolsToOllama(toolsToSend);
354
397
  }
355
398
 
356
- // Single consolidated log message for all cases (easy to grep and compare across models)
357
- const toolCount = (supportsTools && Array.isArray(toolsToSend)) ? toolsToSend.length : 0;
358
- let logMessage;
359
-
360
- if (!supportsTools) {
361
- logMessage = `Tools not supported (0 tools)`;
362
- } else if (toolsInjected) {
363
- logMessage = `injected ${toolCount} tools`;
364
- } else if (Array.isArray(toolsToSend) && toolsToSend.length > 0) {
365
- logMessage = `Using client-provided tools (${toolCount} tools)`;
366
- } else if (!injectToolsOllama) {
367
- logMessage = `Tool injection disabled (0 tools)`;
368
- } else {
369
- logMessage = `No tools (0 tools)`;
370
- }
371
-
372
- logger.info({
373
- model: config.ollama.model,
374
- toolCount,
375
- toolsInjected,
376
- supportsTools,
377
- toolNames: (Array.isArray(toolsToSend) && toolsToSend.length > 0) ? toolsToSend.map(t => t.name) : []
378
- }, `=== Ollama STANDARD TOOLS INJECTION for ${config.ollama.model} === ${logMessage}`);
379
-
380
399
  return performJsonRequest(endpoint, { headers, body: ollamaBody }, "Ollama");
381
400
  }
382
401
 
@@ -410,7 +429,7 @@ async function invokeOpenRouter(body) {
410
429
  }
411
430
 
412
431
  const openRouterBody = {
413
- model: config.openrouter.model,
432
+ model: body._suggestionModeModel || body._tierModel || config.openrouter.model,
414
433
  messages,
415
434
  temperature: body.temperature ?? 0.7,
416
435
  max_tokens: body.max_tokens ?? 4096,
@@ -426,16 +445,16 @@ async function invokeOpenRouter(body) {
426
445
  // Client didn't send tools (likely passthrough mode) - inject standard Claude Code tools
427
446
  toolsToSend = STANDARD_TOOLS;
428
447
  toolsInjected = true;
429
- logger.info({
448
+ logger.debug({
430
449
  injectedToolCount: STANDARD_TOOLS.length,
431
- injectedToolNames: STANDARD_TOOLS.map(t => t.name),
450
+ injectedToolNames: STANDARD_TOOL_NAMES,
432
451
  reason: "Client did not send tools (passthrough mode)"
433
452
  }, "=== INJECTING STANDARD TOOLS (OpenRouter) ===");
434
453
  }
435
454
 
436
455
  if (Array.isArray(toolsToSend) && toolsToSend.length > 0) {
437
456
  openRouterBody.tools = convertAnthropicToolsToOpenRouter(toolsToSend);
438
- logger.info({
457
+ logger.debug({
439
458
  toolCount: toolsToSend.length,
440
459
  toolNames: toolsToSend.map(t => t.name),
441
460
  toolsInjected
@@ -490,13 +509,16 @@ async function invokeAzureOpenAI(body) {
490
509
  });
491
510
  }
492
511
 
512
+ // System prompt injection disabled - breaks model response
513
+ // Tool guidance now provided via tool descriptions instead
514
+
493
515
  const azureBody = {
494
516
  messages,
495
517
  temperature: body.temperature ?? 0.3, // Lower temperature for more deterministic, action-oriented behavior
496
518
  max_tokens: Math.min(body.max_tokens ?? 4096, 16384), // Cap at Azure OpenAI's limit
497
519
  top_p: body.top_p ?? 1.0,
498
520
  stream: false, // Force non-streaming for Azure OpenAI - streaming format conversion not yet implemented
499
- model: config.azureOpenAI.deployment
521
+ model: body._suggestionModeModel || body._tierModel || config.azureOpenAI.deployment
500
522
  };
501
523
 
502
524
  // Add tools - inject standard tools if client didn't send any (passthrough mode)
@@ -507,18 +529,18 @@ async function invokeAzureOpenAI(body) {
507
529
  // Client didn't send tools (likely passthrough mode) - inject standard Claude Code tools
508
530
  toolsToSend = STANDARD_TOOLS;
509
531
  toolsInjected = true;
510
- logger.info({
532
+ logger.debug({
511
533
  injectedToolCount: STANDARD_TOOLS.length,
512
- injectedToolNames: STANDARD_TOOLS.map(t => t.name),
534
+ injectedToolNames: STANDARD_TOOL_NAMES,
513
535
  reason: "Client did not send tools (passthrough mode)"
514
536
  }, "=== INJECTING STANDARD TOOLS ===");
515
537
  }
516
538
 
517
539
  if (Array.isArray(toolsToSend) && toolsToSend.length > 0) {
518
540
  azureBody.tools = convertAnthropicToolsToOpenRouter(toolsToSend);
519
- azureBody.parallel_tool_calls = true; // Enable parallel tool calling for better performance
541
+ azureBody.parallel_tool_calls = true; // Enable parallel tool calls
520
542
  azureBody.tool_choice = "auto"; // Explicitly enable tool use (helps GPT models understand they should use tools)
521
- logger.info({
543
+ logger.debug({
522
544
  toolCount: toolsToSend.length,
523
545
  toolNames: toolsToSend.map(t => t.name),
524
546
  toolsInjected,
@@ -529,7 +551,7 @@ async function invokeAzureOpenAI(body) {
529
551
  }, "=== SENDING TOOLS TO AZURE OPENAI ===");
530
552
  }
531
553
 
532
- logger.info({
554
+ logger.debug({
533
555
  endpoint,
534
556
  hasTools: !!azureBody.tools,
535
557
  toolCount: azureBody.tools?.length || 0,
@@ -563,14 +585,83 @@ async function invokeAzureOpenAI(body) {
563
585
  // Track function call IDs for matching with outputs
564
586
  const pendingCallIds = [];
565
587
 
588
+ // Detect if this is a continuation request (has tool results)
589
+ // Azure content filter triggers on full system prompt in continuations
590
+ // Check for:
591
+ // 1. tool_result blocks in user messages (Anthropic format)
592
+ // 2. tool messages (OpenAI format)
593
+ // 3. assistant messages with tool_use or tool_calls (indicates prior tool invocation)
594
+ // 4. Flattened continuation pattern from orchestrator (contains "IMPORTANT: Focus on")
595
+ const hasToolResults = (body.messages || []).some(msg => {
596
+ // Check for Anthropic format tool_result in user messages
597
+ if (msg.role === "user" && Array.isArray(msg.content)) {
598
+ if (msg.content.some(block => block.type === "tool_result")) return true;
599
+ }
600
+ // Check for OpenAI format tool messages
601
+ if (msg.role === "tool") return true;
602
+ // Check for assistant messages with tool_use (Anthropic) or tool_calls (OpenAI)
603
+ // If there's a prior tool use, this is a continuation
604
+ if (msg.role === "assistant") {
605
+ if (Array.isArray(msg.content)) {
606
+ if (msg.content.some(block => block.type === "tool_use")) return true;
607
+ }
608
+ if (msg.tool_calls && msg.tool_calls.length > 0) return true;
609
+ }
610
+ return false;
611
+ }) || azureBody.messages.some(msg => {
612
+ // Also check converted messages for flattened continuation pattern
613
+ // The orchestrator flattens tool results into user message with this marker
614
+ if (msg.role === "user" && typeof msg.content === "string") {
615
+ if (msg.content.includes("IMPORTANT: Focus on and respond ONLY to my most recent request")) return true;
616
+ }
617
+ return false;
618
+ });
619
+
620
+ if (hasToolResults) {
621
+ logger.debug({
622
+ hasToolResults: true,
623
+ originalMessageCount: (body.messages || []).length,
624
+ convertedMessageCount: azureBody.messages.length,
625
+ messageRoles: (body.messages || []).map(m => m.role),
626
+ }, "=== CONTINUATION REQUEST DETECTED - using minimal system prompt to avoid Azure content filter ===");
627
+ } else {
628
+ logger.debug({
629
+ hasToolResults: false,
630
+ originalMessageCount: (body.messages || []).length,
631
+ messageRoles: (body.messages || []).map(m => m.role),
632
+ }, "Initial request - using full system prompt");
633
+ }
634
+
635
+ // Helper function to strip <system-reminder> tags and meta-instructions from content
636
+ // Azure's jailbreak filter triggers on these instructions in continuation requests
637
+ const stripSystemReminders = (content) => {
638
+ if (!content || typeof content !== 'string') return content;
639
+ // Remove <system-reminder>...</system-reminder> blocks
640
+ let cleaned = content.replace(/<system-reminder>[\s\S]*?<\/system-reminder>/gi, '');
641
+ // Remove the continuation marker that orchestrator adds
642
+ cleaned = cleaned.replace(/---\s*IMPORTANT:\s*Focus on and respond ONLY to my most recent request[^\n]*/gi, '');
643
+ // Trim whitespace
644
+ return cleaned.trim();
645
+ };
646
+
566
647
  for (const msg of azureBody.messages) {
567
648
  if (msg.role === "system") {
568
- // System messages become developer messages
569
- responsesInput.push({
570
- type: "message",
571
- role: "developer",
572
- content: typeof msg.content === 'string' ? msg.content : JSON.stringify(msg.content)
573
- });
649
+ // For continuation requests, use minimal system prompt to avoid content filter
650
+ // Azure's jailbreak detection triggers on security-related text in continuations
651
+ if (hasToolResults) {
652
+ responsesInput.push({
653
+ type: "message",
654
+ role: "developer",
655
+ content: "You are a helpful coding assistant. Continue helping the user based on the tool results."
656
+ });
657
+ } else {
658
+ // Initial request - use full system prompt
659
+ responsesInput.push({
660
+ type: "message",
661
+ role: "developer",
662
+ content: typeof msg.content === 'string' ? msg.content : JSON.stringify(msg.content)
663
+ });
664
+ }
574
665
  } else if (msg.role === "user") {
575
666
  // Check if content contains tool_result blocks (Anthropic format)
576
667
  if (Array.isArray(msg.content)) {
@@ -585,19 +676,30 @@ async function invokeAzureOpenAI(body) {
585
676
  output: typeof block.content === 'string' ? block.content : JSON.stringify(block.content || "")
586
677
  });
587
678
  } else if (block.type === "text") {
588
- responsesInput.push({
589
- type: "message",
590
- role: "user",
591
- content: block.text || ""
592
- });
679
+ // For continuation requests, strip system-reminder tags to avoid jailbreak filter
680
+ const textContent = hasToolResults ? stripSystemReminders(block.text || "") : (block.text || "");
681
+ if (textContent) { // Only add if there's content after stripping
682
+ responsesInput.push({
683
+ type: "message",
684
+ role: "user",
685
+ content: textContent
686
+ });
687
+ }
593
688
  }
594
689
  }
595
690
  } else {
596
- responsesInput.push({
597
- type: "message",
598
- role: "user",
599
- content: typeof msg.content === 'string' ? msg.content : JSON.stringify(msg.content)
600
- });
691
+ // For continuation requests, strip system-reminder tags to avoid jailbreak filter
692
+ let userContent = typeof msg.content === 'string' ? msg.content : JSON.stringify(msg.content);
693
+ if (hasToolResults) {
694
+ userContent = stripSystemReminders(userContent);
695
+ }
696
+ if (userContent) { // Only add if there's content after stripping
697
+ responsesInput.push({
698
+ type: "message",
699
+ role: "user",
700
+ content: userContent
701
+ });
702
+ }
601
703
  }
602
704
  } else if (msg.role === "assistant") {
603
705
  // Assistant messages - handle tool_calls (OpenAI format) and tool_use blocks (Anthropic format)
@@ -663,7 +765,7 @@ async function invokeAzureOpenAI(body) {
663
765
  tool_choice: azureBody.tool_choice,
664
766
  stream: false
665
767
  };
666
- logger.info({
768
+ logger.debug({
667
769
  format: "responses",
668
770
  inputCount: responsesBody.input?.length,
669
771
  model: responsesBody.model,
@@ -681,7 +783,7 @@ async function invokeAzureOpenAI(body) {
681
783
  const textContent = messageOutput?.content?.find(c => c.type === "output_text")?.text || "";
682
784
 
683
785
  // Find function_call outputs (tool calls are separate items in output array)
684
- const toolCalls = outputArray
786
+ const rawToolCalls = outputArray
685
787
  .filter(o => o.type === "function_call")
686
788
  .map(tc => ({
687
789
  id: tc.call_id || tc.id || `call_${Date.now()}`,
@@ -692,7 +794,30 @@ async function invokeAzureOpenAI(body) {
692
794
  }
693
795
  }));
694
796
 
695
- logger.info({
797
+ // Deduplicate identical tool calls (GPT sometimes returns multiple identical calls)
798
+ const seenSignatures = new Set();
799
+ const toolCalls = rawToolCalls.filter(tc => {
800
+ const signature = `${tc.function.name}:${tc.function.arguments}`;
801
+ if (seenSignatures.has(signature)) {
802
+ logger.warn({
803
+ toolName: tc.function.name,
804
+ signature: signature.substring(0, 100),
805
+ }, "Filtered duplicate tool call from GPT response");
806
+ return false;
807
+ }
808
+ seenSignatures.add(signature);
809
+ return true;
810
+ });
811
+
812
+ if (rawToolCalls.length !== toolCalls.length) {
813
+ logger.debug({
814
+ originalCount: rawToolCalls.length,
815
+ dedupedCount: toolCalls.length,
816
+ removed: rawToolCalls.length - toolCalls.length,
817
+ }, "Deduplicated identical tool calls from single response");
818
+ }
819
+
820
+ logger.debug({
696
821
  outputTypes: outputArray.map(o => o.type),
697
822
  hasMessage: !!messageOutput,
698
823
  toolCallCount: toolCalls.length,
@@ -717,7 +842,7 @@ async function invokeAzureOpenAI(body) {
717
842
  usage: result.json.usage
718
843
  };
719
844
 
720
- logger.info({
845
+ logger.debug({
721
846
  convertedContent: textContent?.substring(0, 100),
722
847
  hasToolCalls: toolCalls.length > 0,
723
848
  toolCallCount: toolCalls.length
@@ -725,7 +850,7 @@ async function invokeAzureOpenAI(body) {
725
850
 
726
851
  // Now convert from Chat Completions format to Anthropic format
727
852
  const anthropicJson = convertOpenAIToAnthropic(result.json);
728
- logger.info({
853
+ logger.debug({
729
854
  anthropicContentTypes: anthropicJson.content?.map(c => c.type),
730
855
  stopReason: anthropicJson.stop_reason
731
856
  }, "Converted to Anthropic format");
@@ -747,67 +872,6 @@ async function invokeAzureOpenAI(body) {
747
872
  }
748
873
  }
749
874
 
750
- /**
751
- * Convert Azure Responses API response to Anthropic format
752
- */
753
- function convertResponsesAPIToAnthropic(response, model) {
754
- const content = [];
755
- const outputArray = response.output || [];
756
-
757
- // Extract text content from message output
758
- const messageOutput = outputArray.find(o => o.type === "message");
759
- if (messageOutput?.content) {
760
- for (const item of messageOutput.content) {
761
- if (item.type === "output_text" && item.text) {
762
- content.push({ type: "text", text: item.text });
763
- }
764
- }
765
- }
766
-
767
- // Extract tool calls from function_call outputs
768
- const toolCalls = outputArray
769
- .filter(o => o.type === "function_call")
770
- .map(tc => ({
771
- type: "tool_use",
772
- id: tc.call_id || tc.id || `call_${Date.now()}`,
773
- name: tc.name,
774
- input: typeof tc.arguments === 'string' ? JSON.parse(tc.arguments || "{}") : (tc.arguments || {})
775
- }));
776
-
777
- content.push(...toolCalls);
778
-
779
- // Handle reasoning_content for thinking models
780
- if (content.length === 0 && response.reasoning_content) {
781
- content.push({ type: "text", text: response.reasoning_content });
782
- }
783
-
784
- // Ensure at least empty text if no content
785
- if (content.length === 0) {
786
- content.push({ type: "text", text: "" });
787
- }
788
-
789
- // Determine stop reason
790
- let stopReason = "end_turn";
791
- if (toolCalls.length > 0) {
792
- stopReason = "tool_use";
793
- } else if (response.status === "incomplete" && response.incomplete_details?.reason === "max_output_tokens") {
794
- stopReason = "max_tokens";
795
- }
796
-
797
- return {
798
- id: response.id || `msg_${Date.now()}`,
799
- type: "message",
800
- role: "assistant",
801
- content,
802
- model: model || response.model,
803
- stop_reason: stopReason,
804
- stop_sequence: null,
805
- usage: {
806
- input_tokens: response.usage?.input_tokens || 0,
807
- output_tokens: response.usage?.output_tokens || 0,
808
- }
809
- };
810
- }
811
875
 
812
876
  async function invokeOpenAI(body) {
813
877
  if (!config.openai?.apiKey) {
@@ -841,8 +905,10 @@ async function invokeOpenAI(body) {
841
905
  });
842
906
  }
843
907
 
908
+ // System prompt injection disabled - breaks model response
909
+
844
910
  const openAIBody = {
845
- model: config.openai.model || "gpt-4o",
911
+ model: body._suggestionModeModel || body._tierModel || config.openai.model || "gpt-4o",
846
912
  messages,
847
913
  temperature: body.temperature ?? 0.7,
848
914
  max_tokens: body.max_tokens ?? 4096,
@@ -858,25 +924,25 @@ async function invokeOpenAI(body) {
858
924
  // Client didn't send tools (likely passthrough mode) - inject standard Claude Code tools
859
925
  toolsToSend = STANDARD_TOOLS;
860
926
  toolsInjected = true;
861
- logger.info({
927
+ logger.debug({
862
928
  injectedToolCount: STANDARD_TOOLS.length,
863
- injectedToolNames: STANDARD_TOOLS.map(t => t.name),
929
+ injectedToolNames: STANDARD_TOOL_NAMES,
864
930
  reason: "Client did not send tools (passthrough mode)"
865
931
  }, "=== INJECTING STANDARD TOOLS (OpenAI) ===");
866
932
  }
867
933
 
868
934
  if (Array.isArray(toolsToSend) && toolsToSend.length > 0) {
869
935
  openAIBody.tools = convertAnthropicToolsToOpenRouter(toolsToSend);
870
- openAIBody.parallel_tool_calls = true; // Enable parallel tool calling
936
+ openAIBody.parallel_tool_calls = false; // Disable parallel tool calls - GPT often makes duplicate calls
871
937
  openAIBody.tool_choice = "auto"; // Let the model decide when to use tools
872
- logger.info({
938
+ logger.debug({
873
939
  toolCount: toolsToSend.length,
874
940
  toolNames: toolsToSend.map(t => t.name),
875
941
  toolsInjected
876
942
  }, "=== SENDING TOOLS TO OPENAI ===");
877
943
  }
878
944
 
879
- logger.info({
945
+ logger.debug({
880
946
  endpoint,
881
947
  model: openAIBody.model,
882
948
  hasTools: !!openAIBody.tools,
@@ -934,7 +1000,7 @@ async function invokeLlamaCpp(body) {
934
1000
  }
935
1001
 
936
1002
  if (deduplicated.length !== messages.length) {
937
- logger.info({
1003
+ logger.debug({
938
1004
  originalCount: messages.length,
939
1005
  deduplicatedCount: deduplicated.length,
940
1006
  removed: messages.length - deduplicated.length,
@@ -959,26 +1025,26 @@ async function invokeLlamaCpp(body) {
959
1025
  if (injectToolsLlamacpp && (!Array.isArray(toolsToSend) || toolsToSend.length === 0)) {
960
1026
  toolsToSend = STANDARD_TOOLS;
961
1027
  toolsInjected = true;
962
- logger.info({
1028
+ logger.debug({
963
1029
  injectedToolCount: STANDARD_TOOLS.length,
964
- injectedToolNames: STANDARD_TOOLS.map(t => t.name),
1030
+ injectedToolNames: STANDARD_TOOL_NAMES,
965
1031
  reason: "Client did not send tools (passthrough mode)"
966
1032
  }, "=== INJECTING STANDARD TOOLS (llama.cpp) ===");
967
1033
  } else if (!injectToolsLlamacpp) {
968
- logger.info({}, "Tool injection disabled for llama.cpp (INJECT_TOOLS_LLAMACPP=false)");
1034
+ logger.debug({}, "Tool injection disabled for llama.cpp (INJECT_TOOLS_LLAMACPP=false)");
969
1035
  }
970
1036
 
971
1037
  if (Array.isArray(toolsToSend) && toolsToSend.length > 0) {
972
1038
  llamacppBody.tools = convertAnthropicToolsToOpenRouter(toolsToSend);
973
1039
  llamacppBody.tool_choice = "auto";
974
- logger.info({
1040
+ logger.debug({
975
1041
  toolCount: toolsToSend.length,
976
1042
  toolNames: toolsToSend.map(t => t.name),
977
1043
  toolsInjected
978
1044
  }, "=== SENDING TOOLS TO LLAMA.CPP ===");
979
1045
  }
980
1046
 
981
- logger.info({
1047
+ logger.debug({
982
1048
  endpoint,
983
1049
  hasTools: !!llamacppBody.tools,
984
1050
  toolCount: llamacppBody.tools?.length || 0,
@@ -1042,9 +1108,9 @@ async function invokeLMStudio(body) {
1042
1108
  if (!Array.isArray(toolsToSend) || toolsToSend.length === 0) {
1043
1109
  toolsToSend = STANDARD_TOOLS;
1044
1110
  toolsInjected = true;
1045
- logger.info({
1111
+ logger.debug({
1046
1112
  injectedToolCount: STANDARD_TOOLS.length,
1047
- injectedToolNames: STANDARD_TOOLS.map(t => t.name),
1113
+ injectedToolNames: STANDARD_TOOL_NAMES,
1048
1114
  reason: "Client did not send tools (passthrough mode)"
1049
1115
  }, "=== INJECTING STANDARD TOOLS (LM Studio) ===");
1050
1116
  }
@@ -1052,14 +1118,14 @@ async function invokeLMStudio(body) {
1052
1118
  if (Array.isArray(toolsToSend) && toolsToSend.length > 0) {
1053
1119
  lmstudioBody.tools = convertAnthropicToolsToOpenRouter(toolsToSend);
1054
1120
  lmstudioBody.tool_choice = "auto";
1055
- logger.info({
1121
+ logger.debug({
1056
1122
  toolCount: toolsToSend.length,
1057
1123
  toolNames: toolsToSend.map(t => t.name),
1058
1124
  toolsInjected
1059
1125
  }, "=== SENDING TOOLS TO LM STUDIO ===");
1060
1126
  }
1061
1127
 
1062
- logger.info({
1128
+ logger.debug({
1063
1129
  endpoint,
1064
1130
  hasTools: !!lmstudioBody.tools,
1065
1131
  toolCount: lmstudioBody.tools?.length || 0,
@@ -1080,7 +1146,7 @@ async function invokeBedrock(body) {
1080
1146
  }
1081
1147
 
1082
1148
  const bearerToken = config.bedrock.apiKey;
1083
- logger.info({ authMethod: "Bearer Token" }, "=== BEDROCK AUTH ===");
1149
+ logger.debug({ authMethod: "Bearer Token" }, "=== BEDROCK AUTH ===");
1084
1150
 
1085
1151
  // 2. Inject standard tools if needed
1086
1152
  let toolsToSend = body.tools;
@@ -1089,9 +1155,9 @@ async function invokeBedrock(body) {
1089
1155
  if (!Array.isArray(toolsToSend) || toolsToSend.length === 0) {
1090
1156
  toolsToSend = STANDARD_TOOLS;
1091
1157
  toolsInjected = true;
1092
- logger.info({
1158
+ logger.debug({
1093
1159
  injectedToolCount: STANDARD_TOOLS.length,
1094
- injectedToolNames: STANDARD_TOOLS.map(t => t.name),
1160
+ injectedToolNames: STANDARD_TOOL_NAMES,
1095
1161
  reason: "Client did not send tools (passthrough mode)"
1096
1162
  }, "=== INJECTING STANDARD TOOLS (Bedrock) ===");
1097
1163
  }
@@ -1099,10 +1165,10 @@ async function invokeBedrock(body) {
1099
1165
  const bedrockBody = { ...body, tools: toolsToSend };
1100
1166
 
1101
1167
  // 4. Detect model family and convert format
1102
- const modelId = config.bedrock.modelId;
1168
+ const modelId = body._tierModel || config.bedrock.modelId;
1103
1169
  const modelFamily = detectModelFamily(modelId);
1104
1170
 
1105
- logger.info({
1171
+ logger.debug({
1106
1172
  modelId,
1107
1173
  modelFamily,
1108
1174
  hasTools: !!bedrockBody.tools,
@@ -1167,7 +1233,7 @@ async function invokeBedrock(body) {
1167
1233
  const host = `bedrock-runtime.${config.bedrock.region}.amazonaws.com`;
1168
1234
  const endpoint = `https://${host}${path}`;
1169
1235
 
1170
- logger.info({
1236
+ logger.debug({
1171
1237
  endpoint,
1172
1238
  authMethod: "Bearer Token",
1173
1239
  hasSystem: !!converseBody.system,
@@ -1200,7 +1266,7 @@ async function invokeBedrock(body) {
1200
1266
  // Parse Converse API response (already parsed by performJsonRequest)
1201
1267
  const converseResponse = response.json; // Use property, not method
1202
1268
 
1203
- logger.info({
1269
+ logger.debug({
1204
1270
  stopReason: converseResponse.stopReason,
1205
1271
  inputTokens: converseResponse.usage?.inputTokens || 0,
1206
1272
  outputTokens: converseResponse.usage?.outputTokens || 0,
@@ -1280,7 +1346,7 @@ async function invokeZai(body) {
1280
1346
  "claude-3-haiku": "glm-4.5-air",
1281
1347
  };
1282
1348
 
1283
- const requestedModel = body.model || config.zai.model;
1349
+ const requestedModel = body._tierModel || body.model || config.zai.model;
1284
1350
  let mappedModel = modelMap[requestedModel] || config.zai.model || "glm-4.7";
1285
1351
  mappedModel = mappedModel.toLowerCase();
1286
1352
 
@@ -1357,7 +1423,7 @@ async function invokeZai(body) {
1357
1423
  // "required" was forcing tools even for simple greetings
1358
1424
  zaiBody.tool_choice = "auto";
1359
1425
  // Also enable parallel tool calls
1360
- zaiBody.parallel_tool_calls = true;
1426
+ zaiBody.parallel_tool_calls = false; // Disable parallel tool calls - GLM often makes duplicate calls
1361
1427
  }
1362
1428
 
1363
1429
  headers = {
@@ -1372,9 +1438,9 @@ async function invokeZai(body) {
1372
1438
  // Inject standard tools if client didn't send any (passthrough mode)
1373
1439
  if (!Array.isArray(zaiBody.tools) || zaiBody.tools.length === 0) {
1374
1440
  zaiBody.tools = STANDARD_TOOLS;
1375
- logger.info({
1441
+ logger.debug({
1376
1442
  injectedToolCount: STANDARD_TOOLS.length,
1377
- injectedToolNames: STANDARD_TOOLS.map(t => t.name),
1443
+ injectedToolNames: STANDARD_TOOL_NAMES,
1378
1444
  reason: "Client did not send tools (passthrough mode)"
1379
1445
  }, "=== INJECTING STANDARD TOOLS (Z.AI Anthropic) ===");
1380
1446
  }
@@ -1386,7 +1452,7 @@ async function invokeZai(body) {
1386
1452
  };
1387
1453
  }
1388
1454
 
1389
- logger.info({
1455
+ logger.debug({
1390
1456
  endpoint,
1391
1457
  format: isOpenAIFormat ? "openai" : "anthropic",
1392
1458
  model: zaiBody.model,
@@ -1416,7 +1482,7 @@ async function invokeZai(body) {
1416
1482
 
1417
1483
  const response = await performJsonRequest(endpoint, { headers, body: zaiBody }, "Z.AI");
1418
1484
 
1419
- logger.info({
1485
+ logger.debug({
1420
1486
  responseOk: response?.ok,
1421
1487
  responseStatus: response?.status,
1422
1488
  hasJson: !!response?.json,
@@ -1428,7 +1494,7 @@ async function invokeZai(body) {
1428
1494
  // Convert OpenAI response back to Anthropic format if needed
1429
1495
  if (isOpenAIFormat && response?.ok && response?.json) {
1430
1496
  const anthropicJson = convertOpenAIToAnthropic(response.json);
1431
- logger.info({
1497
+ logger.debug({
1432
1498
  convertedContent: JSON.stringify(anthropicJson.content).substring(0, 200),
1433
1499
  }, "=== Z.AI CONVERTED RESPONSE ===");
1434
1500
  // Return in the same format as other providers (with ok, status, json)
@@ -1448,6 +1514,118 @@ async function invokeZai(body) {
1448
1514
 
1449
1515
 
1450
1516
 
1517
/**
 * Moonshot AI (Kimi) Provider
 *
 * Moonshot offers Kimi models through an OpenAI-compatible chat completions API.
 * Uses native system role support (unlike Z.AI which merges into user message).
 *
 * @param {object} body - Anthropic-format request (messages, system, tools,
 *   max_tokens, temperature, top_p, plus internal `_tierModel` routing hint).
 * @returns {Promise<object>} Response envelope `{ ok, status, json, text, contentType, headers }`
 *   with `json` converted to Anthropic format on success; raw envelope on failure.
 * @throws {Error} If MOONSHOT_API_KEY is not configured.
 */
async function invokeMoonshot(body) {
  if (!config.moonshot?.apiKey) {
    throw new Error("Moonshot API key is not configured. Set MOONSHOT_API_KEY in your .env file.");
  }

  const {
    convertAnthropicToolsToOpenRouter,
    convertAnthropicMessagesToOpenRouter
  } = require("./openrouter-utils");

  const endpoint = config.moonshot.endpoint || "https://api.moonshot.ai/v1/chat/completions";

  // Model mapping: Anthropic names → Moonshot/Kimi names
  const modelMap = {
    "claude-sonnet-4-5-20250929": "kimi-k2-turbo-preview",
    "claude-sonnet-4-5": "kimi-k2-turbo-preview",
    "claude-sonnet-4.5": "kimi-k2-turbo-preview",
    "claude-3-5-sonnet": "kimi-k2-turbo-preview",
    "claude-haiku-4-5-20251001": "kimi-k2-turbo-preview",
    "claude-haiku-4-5": "kimi-k2-turbo-preview",
    "claude-3-haiku": "kimi-k2-turbo-preview",
  };

  const requestedModel = body._tierModel || body.model || config.moonshot.model;
  // FIX: `_tierModel` carries a provider-native model id selected by tier routing
  // (invokeOpenAI and invokeBedrock consume it directly). Previously it was passed
  // through `modelMap`, which only contains Claude-alias keys, so any tier-selected
  // Kimi model missed the map and was silently replaced by the configured default.
  // Use the tier model verbatim when present; otherwise map Claude names as before.
  const mappedModel = body._tierModel
    || modelMap[body.model]
    || config.moonshot.model
    || "kimi-k2-turbo-preview";

  // Convert messages using existing utility
  const messages = convertAnthropicMessagesToOpenRouter(body.messages || []);

  // Moonshot natively supports system role — add as system message.
  // Anthropic allows `system` as a string or as an array of {type, text} blocks.
  if (body.system) {
    const systemContent = Array.isArray(body.system)
      ? body.system.map(s => s.text || s).join("\n")
      : body.system;
    messages.unshift({ role: "system", content: systemContent });
  }

  const moonshotBody = {
    model: mappedModel,
    messages,
    max_tokens: body.max_tokens || 4096,
    temperature: body.temperature ?? 0.7,
    top_p: body.top_p ?? 1.0,
    stream: false, // Force non-streaming - OpenAI SSE to Anthropic SSE conversion not implemented
  };

  // Convert and add tools if present
  if (Array.isArray(body.tools) && body.tools.length > 0) {
    moonshotBody.tools = convertAnthropicToolsToOpenRouter(body.tools);
    moonshotBody.tool_choice = "auto";
    // Serial tool calls only — parallel calls tend to produce duplicates.
    moonshotBody.parallel_tool_calls = false;
  }

  const headers = {
    "Content-Type": "application/json",
    "Authorization": `Bearer ${config.moonshot.apiKey}`,
  };

  logger.debug({
    endpoint,
    model: moonshotBody.model,
    originalModel: requestedModel,
    messageCount: moonshotBody.messages?.length || 0,
    hasTools: !!moonshotBody.tools,
    toolCount: moonshotBody.tools?.length || 0,
  }, "=== Moonshot REQUEST ===");

  const response = await performJsonRequest(endpoint, { headers, body: moonshotBody }, "Moonshot");

  // Verbose raw-response logging: Kimi thinking models may split output across
  // `content` and `reasoning_content`, so log both for diagnosis.
  const rawMsg = response?.json?.choices?.[0]?.message;
  logger.debug({
    responseOk: response?.ok,
    responseStatus: response?.status,
    hasJson: !!response?.json,
    contentType: typeof rawMsg?.content,
    contentValue: typeof rawMsg?.content === 'string' ? rawMsg.content.substring(0, 300) : String(JSON.stringify(rawMsg?.content) || '').substring(0, 300),
    hasReasoning: !!rawMsg?.reasoning_content,
    reasoningType: typeof rawMsg?.reasoning_content,
    reasoningValue: typeof rawMsg?.reasoning_content === 'string' ? rawMsg.reasoning_content.substring(0, 300) : String(JSON.stringify(rawMsg?.reasoning_content) || '').substring(0, 300),
    finishReason: response?.json?.choices?.[0]?.finish_reason,
    messageKeys: rawMsg ? Object.keys(rawMsg) : [],
    fullRawResponse: String(JSON.stringify(response?.json) || '').substring(0, 800),
  }, "=== Moonshot RAW RESPONSE ===");

  // Convert OpenAI response back to Anthropic format
  if (response?.ok && response?.json) {
    const anthropicJson = convertOpenAIToAnthropic(response.json);
    logger.debug({
      convertedContent: JSON.stringify(anthropicJson.content).substring(0, 500),
      contentLength: anthropicJson.content?.length,
      firstContentType: anthropicJson.content?.[0]?.type,
      firstContentText: anthropicJson.content?.[0]?.text?.substring(0, 300),
    }, "=== Moonshot CONVERTED RESPONSE ===");
    return {
      ok: response.ok,
      status: response.status,
      json: anthropicJson,
      text: JSON.stringify(anthropicJson),
      contentType: "application/json",
      headers: response.headers,
    };
  }

  // Error responses pass through unconverted so callers see the provider payload.
  return response;
}
+
1451
1629
  /**
1452
1630
  * Convert OpenAI response to Anthropic format
1453
1631
  */
@@ -1463,11 +1641,17 @@ function convertOpenAIToAnthropic(response) {
1463
1641
  // Add text content from message.content
1464
1642
  // Don't add placeholder text if there are tool_calls - tools are the actual response
1465
1643
  const hasToolCalls = Array.isArray(message.tool_calls) && message.tool_calls.length > 0;
1466
- if (message.content) {
1467
- content.push({ type: "text", text: message.content });
1468
- } else if (message.reasoning_content && !message.content) {
1469
- // Thinking models (Kimi-K2, o1, etc.) return response in reasoning_content
1470
- content.push({ type: "text", text: message.reasoning_content });
1644
+
1645
+ // Extract text content - handle thinking models that split content/reasoning
1646
+ const textContent = typeof message.content === 'string' ? message.content : '';
1647
+ const reasoningContent = typeof message.reasoning_content === 'string' ? message.reasoning_content : '';
1648
+
1649
+ if (textContent) {
1650
+ // Has regular content - use it directly (ignore reasoning_content chain-of-thought)
1651
+ content.push({ type: "text", text: textContent });
1652
+ } else if (reasoningContent) {
1653
+ // Fallback: thinking models where content is empty but reasoning has the output
1654
+ content.push({ type: "text", text: reasoningContent });
1471
1655
  }
1472
1656
 
1473
1657
  // Convert tool calls
@@ -1488,13 +1672,14 @@ function convertOpenAIToAnthropic(response) {
1488
1672
  }
1489
1673
 
1490
1674
  // Determine stop reason
1675
+ // IMPORTANT: Check for actual tool_calls presence, not just finish_reason string.
1676
+ // Some providers (Moonshot, etc.) return finish_reason: "stop" even when tool_calls exist.
1677
+ // If we don't set stop_reason to "tool_use", the CLI won't execute the tool calls.
1491
1678
  let stopReason = "end_turn";
1492
- if (choice.finish_reason === "tool_calls") {
1679
+ if (hasToolCalls) {
1493
1680
  stopReason = "tool_use";
1494
1681
  } else if (choice.finish_reason === "length") {
1495
1682
  stopReason = "max_tokens";
1496
- } else if (choice.finish_reason === "stop") {
1497
- stopReason = "end_turn";
1498
1683
  }
1499
1684
 
1500
1685
  return {
@@ -1580,7 +1765,7 @@ async function invokeVertex(body) {
1580
1765
  };
1581
1766
 
1582
1767
  // Map model name
1583
- const requestedModel = body.model || config.vertex.model;
1768
+ const requestedModel = body._tierModel || body.model || config.vertex.model;
1584
1769
  const geminiModel = modelMap[requestedModel] || config.vertex.model || "gemini-2.0-flash";
1585
1770
 
1586
1771
  // Construct Gemini API endpoint
@@ -1626,7 +1811,7 @@ async function invokeVertex(body) {
1626
1811
  "Content-Type": "application/json",
1627
1812
  };
1628
1813
 
1629
- logger.info({
1814
+ logger.debug({
1630
1815
  endpoint: endpoint.replace(apiKey, "***"),
1631
1816
  model: geminiModel,
1632
1817
  originalModel: requestedModel,
@@ -1655,7 +1840,7 @@ async function invokeVertex(body) {
1655
1840
  // Convert Gemini response to Anthropic format
1656
1841
  if (response?.json) {
1657
1842
  const anthropicJson = convertGeminiToAnthropic(response.json, requestedModel);
1658
- logger.info({
1843
+ logger.debug({
1659
1844
  convertedContent: JSON.stringify(anthropicJson.content).substring(0, 200),
1660
1845
  }, "=== VERTEX AI (GEMINI) CONVERTED RESPONSE ===");
1661
1846
  return {
@@ -1816,35 +2001,44 @@ function convertGeminiToAnthropic(response, requestedModel) {
1816
2001
  }
1817
2002
 
1818
2003
  async function invokeModel(body, options = {}) {
1819
- const { determineProvider, isFallbackEnabled, getFallbackProvider, analyzeComplexity } = require("./routing");
2004
+ const { determineProviderSmart, isFallbackEnabled, getFallbackProvider } = require("./routing");
1820
2005
  const metricsCollector = getMetricsCollector();
1821
2006
  const registry = getCircuitBreakerRegistry();
1822
2007
  const healthTracker = getHealthTracker();
1823
2008
 
1824
- // Analyze complexity and determine provider
1825
- const complexityAnalysis = analyzeComplexity(body);
1826
- const initialProvider = options.forceProvider ?? determineProvider(body);
1827
- const preferOllama = config.modelProvider?.preferOllama ?? false;
2009
+ // Determine provider via async tier routing
2010
+ const routingResult = options.forceProvider
2011
+ ? { provider: options.forceProvider, model: null, method: 'forced' }
2012
+ : await determineProviderSmart(body);
2013
+ const initialProvider = routingResult.provider;
2014
+ const tierSelectedModel = routingResult.model;
2015
+
2016
+ // Inject tier-selected model into body so provider functions can use it
2017
+ if (tierSelectedModel) {
2018
+ body._tierModel = tierSelectedModel;
2019
+ }
1828
2020
 
1829
2021
  // Build routing decision object for response headers
1830
2022
  const routingDecision = {
1831
2023
  provider: initialProvider,
1832
- score: complexityAnalysis.score,
1833
- threshold: complexityAnalysis.threshold,
1834
- mode: complexityAnalysis.mode,
1835
- recommendation: complexityAnalysis.recommendation,
1836
- method: complexityAnalysis.score !== undefined ? 'complexity' : 'static',
1837
- taskType: complexityAnalysis.breakdown?.taskType?.reason,
2024
+ tier: routingResult.tier || null,
2025
+ model: tierSelectedModel || null,
2026
+ score: routingResult.score,
2027
+ threshold: routingResult.threshold,
2028
+ mode: routingResult.mode,
2029
+ reason: routingResult.reason,
2030
+ method: routingResult.method || 'static',
1838
2031
  };
1839
2032
 
1840
2033
  logger.debug({
1841
2034
  initialProvider,
1842
- preferOllama,
2035
+ tierSelectedModel,
2036
+ tier: routingResult.tier,
1843
2037
  fallbackEnabled: isFallbackEnabled(),
1844
2038
  toolCount: Array.isArray(body?.tools) ? body.tools.length : 0,
1845
- complexityScore: complexityAnalysis.score,
1846
- complexityThreshold: complexityAnalysis.threshold,
1847
- recommendation: complexityAnalysis.recommendation,
2039
+ score: routingResult.score,
2040
+ reason: routingResult.reason,
2041
+ method: routingResult.method,
1848
2042
  }, "Provider routing decision");
1849
2043
 
1850
2044
  metricsCollector.recordProviderRouting(initialProvider);
@@ -1885,6 +2079,8 @@ async function invokeModel(body, options = {}) {
1885
2079
  return await invokeZai(body);
1886
2080
  } else if (initialProvider === "vertex") {
1887
2081
  return await invokeVertex(body);
2082
+ } else if (initialProvider === "moonshot") {
2083
+ return await invokeMoonshot(body);
1888
2084
  }
1889
2085
  return await invokeDatabricks(body);
1890
2086
  });
@@ -1920,11 +2116,10 @@ async function invokeModel(body, options = {}) {
1920
2116
  metricsCollector.recordProviderFailure(initialProvider);
1921
2117
  healthTracker.recordFailure(initialProvider, err, err.status);
1922
2118
 
1923
- // Check if we should fallback
2119
+ // Check if we should fallback (any provider can fall back, not just ollama)
1924
2120
  const shouldFallback =
1925
- preferOllama &&
1926
- initialProvider === "ollama" &&
1927
2121
  isFallbackEnabled() &&
2122
+ initialProvider !== getFallbackProvider() &&
1928
2123
  !options.disableFallback;
1929
2124
 
1930
2125
  if (!shouldFallback) {
@@ -1941,7 +2136,7 @@ async function invokeModel(body, options = {}) {
1941
2136
  fallbackProvider,
1942
2137
  reason,
1943
2138
  error: err.message,
1944
- }, "Ollama failed, attempting transparent fallback to cloud");
2139
+ }, "Primary provider failed, attempting transparent fallback");
1945
2140
 
1946
2141
  metricsCollector.recordFallbackAttempt(initialProvider, fallbackProvider, reason);
1947
2142
 
@@ -1974,6 +2169,8 @@ async function invokeModel(body, options = {}) {
1974
2169
  return await invokeZai(body);
1975
2170
  } else if (fallbackProvider === "vertex") {
1976
2171
  return await invokeVertex(body);
2172
+ } else if (fallbackProvider === "moonshot") {
2173
+ return await invokeMoonshot(body);
1977
2174
  }
1978
2175
  return await invokeDatabricks(body);
1979
2176
  });
@@ -2023,7 +2220,7 @@ async function invokeModel(body, options = {}) {
2023
2220
  fallbackProvider,
2024
2221
  originalError: err.message,
2025
2222
  fallbackError: fallbackErr.message,
2026
- }, "Both Ollama and fallback provider failed");
2223
+ }, "Both primary and fallback provider failed");
2027
2224
 
2028
2225
  // Return fallback error (more actionable than Ollama error)
2029
2226
  throw fallbackErr;