@vybestack/llxprt-code-core 0.7.0-nightly.251208.a6190e71e → 0.7.0-nightly.251211.134f1920b

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. package/dist/prompt-config/defaults/default-prompts.json +1 -1
  2. package/dist/src/config/profileManager.js +6 -0
  3. package/dist/src/config/profileManager.js.map +1 -1
  4. package/dist/src/core/geminiChat.d.ts +15 -1
  5. package/dist/src/core/geminiChat.js +35 -10
  6. package/dist/src/core/geminiChat.js.map +1 -1
  7. package/dist/src/core/prompts.js +29 -9
  8. package/dist/src/core/prompts.js.map +1 -1
  9. package/dist/src/index.d.ts +1 -0
  10. package/dist/src/index.js.map +1 -1
  11. package/dist/src/mcp/sa-impersonation-provider.js.map +1 -1
  12. package/dist/src/mcp/token-storage/file-token-storage.js +2 -1
  13. package/dist/src/mcp/token-storage/file-token-storage.js.map +1 -1
  14. package/dist/src/mcp/token-storage/hybrid-token-storage.js.map +1 -1
  15. package/dist/src/prompt-config/defaults/core.md +0 -3
  16. package/dist/src/prompt-config/prompt-installer.d.ts +33 -2
  17. package/dist/src/prompt-config/prompt-installer.js +163 -31
  18. package/dist/src/prompt-config/prompt-installer.js.map +1 -1
  19. package/dist/src/prompt-config/prompt-resolver.js +49 -41
  20. package/dist/src/prompt-config/prompt-resolver.js.map +1 -1
  21. package/dist/src/prompt-config/types.d.ts +1 -0
  22. package/dist/src/providers/LoggingProviderWrapper.d.ts +2 -1
  23. package/dist/src/providers/LoggingProviderWrapper.js +16 -4
  24. package/dist/src/providers/LoggingProviderWrapper.js.map +1 -1
  25. package/dist/src/providers/ProviderManager.d.ts +6 -3
  26. package/dist/src/providers/ProviderManager.js +16 -4
  27. package/dist/src/providers/ProviderManager.js.map +1 -1
  28. package/dist/src/providers/anthropic/AnthropicProvider.js +164 -145
  29. package/dist/src/providers/anthropic/AnthropicProvider.js.map +1 -1
  30. package/dist/src/providers/gemini/GeminiProvider.js +91 -30
  31. package/dist/src/providers/gemini/GeminiProvider.js.map +1 -1
  32. package/dist/src/providers/openai/OpenAIProvider.d.ts +10 -2
  33. package/dist/src/providers/openai/OpenAIProvider.js +354 -132
  34. package/dist/src/providers/openai/OpenAIProvider.js.map +1 -1
  35. package/dist/src/providers/openai-vercel/OpenAIVercelProvider.d.ts +3 -0
  36. package/dist/src/providers/openai-vercel/OpenAIVercelProvider.js +255 -22
  37. package/dist/src/providers/openai-vercel/OpenAIVercelProvider.js.map +1 -1
  38. package/dist/src/providers/openai-vercel/messageConversion.d.ts +4 -1
  39. package/dist/src/providers/openai-vercel/messageConversion.js +41 -6
  40. package/dist/src/providers/openai-vercel/messageConversion.js.map +1 -1
  41. package/dist/src/providers/reasoning/reasoningUtils.d.ts +26 -1
  42. package/dist/src/providers/reasoning/reasoningUtils.js +157 -0
  43. package/dist/src/providers/reasoning/reasoningUtils.js.map +1 -1
  44. package/dist/src/providers/utils/cacheMetricsExtractor.d.ts +6 -0
  45. package/dist/src/providers/utils/cacheMetricsExtractor.js +36 -0
  46. package/dist/src/providers/utils/cacheMetricsExtractor.js.map +1 -0
  47. package/dist/src/providers/utils/dumpContext.d.ts +36 -0
  48. package/dist/src/providers/utils/dumpContext.js +93 -0
  49. package/dist/src/providers/utils/dumpContext.js.map +1 -0
  50. package/dist/src/providers/utils/dumpSDKContext.d.ts +13 -0
  51. package/dist/src/providers/utils/dumpSDKContext.js +39 -0
  52. package/dist/src/providers/utils/dumpSDKContext.js.map +1 -0
  53. package/dist/src/services/history/IContent.d.ts +3 -7
  54. package/dist/src/services/history/IContent.js.map +1 -1
  55. package/dist/src/settings/types.d.ts +4 -2
  56. package/dist/src/tools/IToolFormatter.d.ts +1 -1
  57. package/dist/src/tools/ToolIdStrategy.d.ts +25 -0
  58. package/dist/src/tools/ToolIdStrategy.js +108 -0
  59. package/dist/src/tools/ToolIdStrategy.js.map +1 -1
  60. package/dist/src/tools/modifiable-tool.js.map +1 -1
  61. package/dist/src/tools/task.js +14 -2
  62. package/dist/src/tools/task.js.map +1 -1
  63. package/dist/src/tools/tools.js.map +1 -1
  64. package/dist/src/types/modelParams.d.ts +6 -0
  65. package/dist/src/utils/generateContentResponseUtilities.js +6 -0
  66. package/dist/src/utils/generateContentResponseUtilities.js.map +1 -1
  67. package/dist/src/utils/retry.js +1 -0
  68. package/dist/src/utils/retry.js.map +1 -1
  69. package/package.json +1 -1
@@ -22,7 +22,7 @@ import crypto from 'node:crypto';
  import * as http from 'http';
  import * as https from 'https';
  import * as net from 'net';
- import { isKimiModel, getToolIdStrategy, } from '../../tools/ToolIdStrategy.js';
+ import { isKimiModel, isMistralModel, getToolIdStrategy, } from '../../tools/ToolIdStrategy.js';
  import { BaseProvider, } from '../BaseProvider.js';
  import { DebugLogger } from '../../debug/index.js';
  import { ToolFormatter } from '../../tools/ToolFormatter.js';
@@ -39,6 +39,8 @@ import { ToolCallPipeline } from './ToolCallPipeline.js';
  import { buildToolResponsePayload, EMPTY_TOOL_RESULT_PLACEHOLDER, } from '../utils/toolResponsePayload.js';
  import { isLocalEndpoint } from '../utils/localEndpoint.js';
  import { filterThinkingForContext, thinkingToReasoningField, extractThinkingBlocks, } from '../reasoning/reasoningUtils.js';
+ import { shouldDumpSDKContext, dumpSDKContext, } from '../utils/dumpSDKContext.js';
+ import { extractCacheMetrics } from '../utils/cacheMetricsExtractor.js';
  const MAX_TOOL_RESPONSE_CHARS = 1024;
  const MAX_TOOL_RESPONSE_RETRY_CHARS = 512;
  const TOOL_ARGS_PREVIEW_LENGTH = 500;
@@ -276,13 +278,12 @@ export class OpenAIProvider extends BaseProvider {
  // This preserves meaningful whitespace in regular text chunks during streaming
  // (e.g., " 5 Biggest" should remain " 5 Biggest", not become "5 Biggest")
  if (hadReasoningTags) {
- // Clean up multiple consecutive spaces/whitespace that may result from stripping
+ // Collapse multiple spaces/tabs but preserve newlines for proper paragraph/line breaks
  str = str.replace(/[ \t]+/g, ' ');
  str = str.replace(/\n{3,}/g, '\n\n');
- // Only trim leading whitespace when think tags were at the beginning
- // This prevents leading spaces from "<think>...</think>text" -> " text"
- // but preserves trailing whitespace for streaming chunk concatenation
- str = str.trimStart();
+ // Only trim leading horizontal whitespace (spaces/tabs), NOT newlines
+ // This preserves line breaks between think tags and content (fixes #721)
+ str = str.replace(/^[ \t]+/, '');
  }
  const afterLen = str.length;
  if (hadReasoningTags && afterLen !== beforeLen) {
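The behavioral change is easiest to see in isolation. A minimal sketch of the cleanup above (the function name is illustrative, not an export of this package):

// Minimal sketch of the post-<think> whitespace cleanup shown in this hunk.
function sanitizeAfterThinkStrip(str) {
    str = str.replace(/[ \t]+/g, ' '); // collapse runs of spaces/tabs
    str = str.replace(/\n{3,}/g, '\n\n'); // cap consecutive blank lines
    return str.replace(/^[ \t]+/, ''); // strip leading spaces/tabs, keep newlines
}
// '\n\nAnswer' keeps its line break; the old trimStart() would have removed it:
console.log(JSON.stringify(sanitizeAfterThinkStrip('\n\nAnswer'))); // "\n\nAnswer"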
@@ -437,62 +438,78 @@ export class OpenAIProvider extends BaseProvider {
  * and all tool info is only encoded in the text template.
  */
  extractKimiToolCallsFromText(raw) {
- if (!raw || !raw.includes('<|tool_calls_section_begin|>')) {
+ // Return early only if input is null/undefined/empty
+ if (!raw) {
  return { cleanedText: raw, toolCalls: [] };
  }
  const logger = this.getLogger();
  const toolCalls = [];
  let text = raw;
- const sectionRegex = /<\|tool_calls_section_begin\|>([\s\S]*?)<\|tool_calls_section_end\|>/g;
- text = text.replace(sectionRegex, (_sectionMatch, sectionBody) => {
- try {
- const callRegex = /<\|tool_call_begin\|>\s*([^<]+?)\s*<\|tool_call_argument_begin\|>\s*([\s\S]*?)\s*<\|tool_call_end\|>/g;
- let m;
- while ((m = callRegex.exec(sectionBody)) !== null) {
- const rawId = m[1].trim();
- const rawArgs = m[2].trim();
- // Infer tool name from ID.
- let toolName = '';
- const match = /^functions\.([A-Za-z0-9_]+):\d+/i.exec(rawId) ||
- /^[A-Za-z0-9_]+\.([A-Za-z0-9_]+):\d+/.exec(rawId);
- if (match) {
- toolName = match[1];
- }
- else {
- const colonParts = rawId.split(':');
- const head = colonParts[0] || rawId;
- const dotParts = head.split('.');
- toolName = dotParts[dotParts.length - 1] || head;
+ // Extract tool calls from complete sections if present
+ if (raw.includes('<|tool_calls_section_begin|>')) {
+ const sectionRegex = /<\|tool_calls_section_begin\|>([\s\S]*?)<\|tool_calls_section_end\|>/g;
+ text = text.replace(sectionRegex, (_sectionMatch, sectionBody) => {
+ try {
+ const callRegex = /<\|tool_call_begin\|>\s*([^<]+?)\s*<\|tool_call_argument_begin\|>\s*([\s\S]*?)\s*<\|tool_call_end\|>/g;
+ let m;
+ while ((m = callRegex.exec(sectionBody)) !== null) {
+ const rawId = m[1].trim();
+ const rawArgs = m[2].trim();
+ // Infer tool name from ID.
+ let toolName = '';
+ const match = /^functions\.([A-Za-z0-9_]+):\d+/i.exec(rawId) ||
+ /^[A-Za-z0-9_]+\.([A-Za-z0-9_]+):\d+/.exec(rawId);
+ if (match) {
+ toolName = match[1];
+ }
+ else {
+ const colonParts = rawId.split(':');
+ const head = colonParts[0] || rawId;
+ const dotParts = head.split('.');
+ toolName = dotParts[dotParts.length - 1] || head;
+ }
+ // Normalize tool name (handles Kimi-K2 style prefixes like call_functionsglob7)
+ toolName = this.normalizeToolName(toolName);
+ const sanitizedArgs = this.sanitizeToolArgumentsString(rawArgs);
+ const processedParameters = processToolParameters(sanitizedArgs, toolName);
+ toolCalls.push({
+ type: 'tool_call',
+ id: this.normalizeToHistoryToolId(rawId),
+ name: toolName,
+ parameters: processedParameters,
+ });
  }
- // Normalize tool name (handles Kimi-K2 style prefixes like call_functionsglob7)
- toolName = this.normalizeToolName(toolName);
- const sanitizedArgs = this.sanitizeToolArgumentsString(rawArgs);
- const processedParameters = processToolParameters(sanitizedArgs, toolName);
- toolCalls.push({
- type: 'tool_call',
- id: this.normalizeToHistoryToolId(rawId),
- name: toolName,
- parameters: processedParameters,
- });
  }
- }
- catch (err) {
- logger.debug(() => `[OpenAIProvider] Failed to parse Kimi tool_calls_section: ${err}`);
- }
- // Strip the entire tool section from user-visible text
- return '';
- });
- if (toolCalls.length > 0) {
- logger.debug(() => `[OpenAIProvider] Parsed Kimi tool_calls_section`, {
- toolCallCount: toolCalls.length,
- originalLength: raw.length,
- cleanedLength: text.length,
+ catch (err) {
+ logger.debug(() => `[OpenAIProvider] Failed to parse Kimi tool_calls_section: ${err}`);
+ }
+ // Strip the entire tool section from user-visible text
+ return '';
  });
+ if (toolCalls.length > 0) {
+ logger.debug(() => `[OpenAIProvider] Parsed Kimi tool_calls_section`, {
+ toolCallCount: toolCalls.length,
+ originalLength: raw.length,
+ cleanedLength: text.length,
+ });
+ }
  }
+ // ALWAYS run stray token cleanup, even if no complete sections were found
+ // This handles partial sections, malformed tokens, orphaned markers, etc.
+ text = text.replace(/<\|tool_call(?:_(?:begin|end|argument_begin))?\|>/g, '');
+ text = text.replace(/<\|tool_calls_section_(?:begin|end)\|>/g, '');
  // Don't trim - preserve leading/trailing newlines that are important for formatting
  // (e.g., numbered lists from Kimi K2 that have newlines between items)
  return { cleanedText: text, toolCalls };
  }
+ /**
+ * Clean Kimi K2 tool call tokens from thinking content.
+ * Used when extracting thinking from <think> tags that may contain embedded tool calls.
+ * @issue #749
+ */
+ cleanThinkingContent(thought) {
+ return this.extractKimiToolCallsFromText(thought).cleanedText;
+ }
  /**
  * @plan:PLAN-20251023-STATELESS-HARDENING.P09
  * @requirement:REQ-SP4-002
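To make the text template concrete, here is a hypothetical Kimi K2 output and what the extractor above would pull out of it (the ID and arguments are invented for illustration):

// Hypothetical Kimi K2 text template; the extractor strips the section from
// the visible text and recovers one tool call from it.
const raw = 'Let me search.' +
    '<|tool_calls_section_begin|>' +
    '<|tool_call_begin|>functions.glob:0<|tool_call_argument_begin|>' +
    '{"pattern":"**/*.ts"}' +
    '<|tool_call_end|>' +
    '<|tool_calls_section_end|>';
// Expected result shape:
// cleanedText: 'Let me search.'
// toolCalls: [{ type: 'tool_call', id: <normalized from 'functions.glob:0'>,
//               name: 'glob', parameters: { pattern: '**/*.ts' } }]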
@@ -909,9 +926,12 @@ export class OpenAIProvider extends BaseProvider {
  }
  else {
  // Assistant message with tool calls
+ // CRITICAL for Mistral API compatibility (#760):
+ // When tool_calls are present, we must NOT include a content property at all
+ // (not even null). Mistral's OpenAI-compatible API requires this.
+ // See: https://docs.mistral.ai/capabilities/function_calling
  messages.push({
  role: 'assistant',
- content: text || null,
  tool_calls: toolCalls.map((tc) => ({
  id: this.normalizeToOpenAIToolId(tc.id),
  type: 'function',
@@ -947,10 +967,16 @@ export class OpenAIProvider extends BaseProvider {
  }
  else {
  for (const tr of toolResponses) {
+ // CRITICAL for Mistral API compatibility (#760):
+ // Tool messages must include a name field matching the function name.
+ // See: https://docs.mistral.ai/capabilities/function_calling
+ // Note: The OpenAI SDK types don't include name, but Mistral requires it.
+ // We use a type assertion to add this required field.
  messages.push({
  role: 'tool',
  content: this.buildToolResponseContent(tr, config),
  tool_call_id: this.normalizeToOpenAIToolId(tr.callId),
+ name: tr.toolName,
  });
  }
  }
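Put together, the two Mistral fixes yield message shapes like the following (all field values here are invented for illustration):

// Assistant turn: no content key at all when tool_calls are present.
const assistantMsg = {
    role: 'assistant',
    tool_calls: [{
        id: 'Ab3kQ9mPt', // 9-char alphanumeric ID in Mistral format
        type: 'function',
        function: { name: 'get_weather', arguments: '{"city":"Paris"}' },
    }],
};
// Tool turn: Mistral additionally requires a name field, which the
// OpenAI SDK types omit.
const toolMsg = {
    role: 'tool',
    content: '{"temp_c":12}',
    tool_call_id: 'Ab3kQ9mPt',
    name: 'get_weather',
};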
@@ -976,8 +1002,9 @@ export class OpenAIProvider extends BaseProvider {
  const messages = [];
  // Create a ToolIdMapper based on the tool format
  // For Kimi K2, this generates sequential IDs in the format functions.{name}:{index}
- const toolIdMapper = toolFormat === 'kimi'
- ? getToolIdStrategy('kimi').createMapper(filteredContents)
+ // For Mistral, this generates 9-char alphanumeric IDs
+ const toolIdMapper = toolFormat === 'kimi' || toolFormat === 'mistral'
+ ? getToolIdStrategy(toolFormat).createMapper(filteredContents)
  : null;
  // Helper to resolve tool call IDs based on format
  const resolveToolCallId = (tc) => {
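The ToolIdStrategy implementation is not part of this excerpt; purely as a sketch under that caveat, a Mistral-format ID generator could look like this:

// Hedged sketch only - the real generator lives in ToolIdStrategy.js, whose
// body is not shown in this diff. Mistral requires exactly 9 alphanumeric
// characters per tool call ID.
function makeMistralToolId() {
    const alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789';
    let id = '';
    for (let i = 0; i < 9; i++) {
        id += alphabet[Math.floor(Math.random() * alphabet.length)];
    }
    return id; // e.g. 'aZ3kQ9mPt'
}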
@@ -1013,9 +1040,12 @@ export class OpenAIProvider extends BaseProvider {
  const toolCalls = content.blocks.filter((b) => b.type === 'tool_call');
  if (toolCalls.length > 0) {
  // Assistant message with tool calls
+ // CRITICAL for Mistral API compatibility (#760):
+ // When tool_calls are present, we must NOT include a content property at all
+ // (not even null). Mistral's OpenAI-compatible API requires this.
+ // See: https://docs.mistral.ai/capabilities/function_calling
  const baseMessage = {
  role: 'assistant',
- content: text || null,
  tool_calls: toolCalls.map((tc) => ({
  id: resolveToolCallId(tc),
  type: 'function',
@@ -1056,10 +1086,16 @@ export class OpenAIProvider extends BaseProvider {
  // Convert tool responses
  const toolResponses = content.blocks.filter((b) => b.type === 'tool_response');
  for (const tr of toolResponses) {
+ // CRITICAL for Mistral API compatibility (#760):
+ // Tool messages must include a name field matching the function name.
+ // See: https://docs.mistral.ai/capabilities/function_calling
+ // Note: The OpenAI SDK types don't include name, but Mistral requires it.
+ // We use a type assertion to add this required field.
  messages.push({
  role: 'tool',
  content: this.buildToolResponseContent(tr, options.config),
  tool_call_id: resolveToolResponseId(tr),
+ name: tr.toolName,
  });
  }
  }
@@ -1505,9 +1541,9 @@ export class OpenAIProvider extends BaseProvider {
  // Buffer for accumulating text chunks for providers that need it
  let textBuffer = '';
  // Use the same detected format from earlier for consistency
- const isKimiModel = model.toLowerCase().includes('kimi-k2');
+ const isKimiK2Model = model.toLowerCase().includes('kimi-k2');
  // Buffer text for Qwen format providers and Kimi-K2 to avoid stanza formatting
- const shouldBufferText = detectedFormat === 'qwen' || isKimiModel;
+ const shouldBufferText = detectedFormat === 'qwen' || isKimiK2Model;
  // Accumulate thinking content across the entire stream to emit as ONE block
  // This handles fragmented <think>word</think> streaming from Synthetic API
  // @plan PLAN-20251202-THINKING.P16
@@ -1574,12 +1610,29 @@ export class OpenAIProvider extends BaseProvider {
  continue;
  // Parse reasoning_content from streaming delta (Phase 16 integration)
  // ACCUMULATE instead of yielding immediately to handle token-by-token streaming
+ // Extract embedded Kimi K2 tool calls from reasoning_content (fixes #749)
  // @plan PLAN-20251202-THINKING.P16
- const reasoningBlock = this.parseStreamingReasoningDelta(choice.delta);
+ // @requirement REQ-KIMI-REASONING-001.1
+ const { thinking: reasoningBlock, toolCalls: reasoningToolCalls } = this.parseStreamingReasoningDelta(choice.delta);
  if (reasoningBlock) {
  // Accumulate reasoning content - will emit ONE block later
  accumulatedReasoningContent += reasoningBlock.thought;
  }
+ // Accumulate tool calls extracted from reasoning_content
+ if (reasoningToolCalls.length > 0) {
+ for (const toolCall of reasoningToolCalls) {
+ // Convert ToolCallBlock to accumulated format
+ const index = accumulatedToolCalls.length;
+ accumulatedToolCalls[index] = {
+ id: toolCall.id,
+ type: 'function',
+ function: {
+ name: toolCall.name,
+ arguments: JSON.stringify(toolCall.parameters),
+ },
+ };
+ }
+ }
  // Check for finish_reason to detect proper stream ending
  if (choice.finish_reason) {
  logger.debug(() => `[Streaming] Stream finished with reason: ${choice.finish_reason}`, {
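For one extracted ToolCallBlock, the conversion above produces the accumulator entry below (values invented for illustration):

// Input block recovered from reasoning_content:
const toolCall = { type: 'tool_call', id: 'hist_tool_1', name: 'glob',
    parameters: { pattern: '**/*.md' } };
// Accumulated OpenAI-style entry appended by the loop above:
const entry = {
    id: toolCall.id,
    type: 'function',
    function: { name: toolCall.name, arguments: JSON.stringify(toolCall.parameters) },
};
// entry.function.arguments === '{"pattern":"**/*.md"}'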
@@ -1600,13 +1653,25 @@ export class OpenAIProvider extends BaseProvider {
  }
  // Handle text content - buffer for Qwen format, emit immediately for others
  // Note: Synthetic API sends content that may duplicate reasoning_content.
- // This is the model's behavior - we don't filter it here.
+ // We now filter duplicates by tracking when content starts matching reasoning_content.
+ // fixes #721
  // @plan PLAN-20251202-THINKING.P16
  const rawDeltaContent = this.coerceMessageContentToString(choice.delta?.content);
  if (rawDeltaContent) {
- const deltaContent = isKimiModel
- ? rawDeltaContent
- : this.sanitizeProviderText(rawDeltaContent);
+ // For Kimi models, we need to buffer the RAW content without processing
+ // because Kimi tokens stream incrementally and partial tokens would leak
+ // through if we try to process them immediately. The buffer will be
+ // processed when flushed (at sentence boundaries or end of stream).
+ let deltaContent;
+ if (isKimiK2Model) {
+ // For Kimi: Don't process yet - just pass through and let buffering handle it
+ // We'll extract tool calls and sanitize when we flush the buffer
+ deltaContent = rawDeltaContent;
+ }
+ else {
+ // For non-Kimi models: sanitize immediately as before
+ deltaContent = this.sanitizeProviderText(rawDeltaContent);
+ }
  if (!deltaContent) {
  continue;
  }
@@ -1622,9 +1687,9 @@ export class OpenAIProvider extends BaseProvider {
  });
  // Buffer text to avoid stanza formatting
  textBuffer += deltaContent;
- const hasKimiBegin = textBuffer.includes('<|tool_calls_section_begin|>');
- const hasKimiEnd = textBuffer.includes('<|tool_calls_section_end|>');
- const hasOpenKimiSection = hasKimiBegin && !hasKimiEnd;
+ const kimiBeginCount = (textBuffer.match(/<\|tool_calls_section_begin\|>/g) || []).length;
+ const kimiEndCount = (textBuffer.match(/<\|tool_calls_section_end\|>/g) || []).length;
+ const hasOpenKimiSection = kimiBeginCount > kimiEndCount;
  // Emit buffered text when we have a complete sentence or paragraph
  // Look for natural break points, but avoid flushing mid Kimi section
  if (!hasOpenKimiSection &&
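Counting markers instead of checking for their presence matters once a buffer holds more than one section. A small illustration:

// With one closed section plus a second still streaming, the old
// includes()-based check saw both markers and reported "closed"; the
// counts (2 begins vs 1 end) correctly flag an open section.
const textBuffer = 'a<|tool_calls_section_begin|>x<|tool_calls_section_end|>' +
    'b<|tool_calls_section_begin|>partial...';
const begins = (textBuffer.match(/<\|tool_calls_section_begin\|>/g) || []).length; // 2
const ends = (textBuffer.match(/<\|tool_calls_section_end\|>/g) || []).length; // 1
console.log(begins > ends); // true -> keep buffering, don't flush mid-section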
@@ -1641,12 +1706,14 @@ export class OpenAIProvider extends BaseProvider {
  // @requirement REQ-THINK-003
  const tagBasedThinking = this.extractThinkTagsAsBlock(workingText);
  if (tagBasedThinking) {
+ // Clean Kimi tokens from thinking content before accumulating
+ const cleanedThought = this.cleanThinkingContent(tagBasedThinking.thought);
  // Accumulate thinking content - don't emit yet
  // Use newline to preserve formatting between chunks (not space)
  if (accumulatedThinkingContent.length > 0) {
  accumulatedThinkingContent += '\n';
  }
- accumulatedThinkingContent += tagBasedThinking.thought;
+ accumulatedThinkingContent += cleanedThought;
  logger.debug(() => `[Streaming legacy] Accumulated thinking: ${accumulatedThinkingContent.length} chars total`);
  }
  const kimiParsed = this.extractKimiToolCallsFromText(workingText);
@@ -1708,7 +1775,10 @@ export class OpenAIProvider extends BaseProvider {
  // Always use sanitized text to strip <think> tags (legacy streaming)
  // Bug fix: Previously Kimi used unsanitized workingText
  // @plan PLAN-20251202-THINKING.P16
- if (cleanedText.trim().length > 0) {
+ // Bug fix #721: Emit whitespace-only chunks (e.g., " " between words)
+ // Previously we used cleanedText.trim().length > 0 which dropped spaces,
+ // causing "list 5" to become "list5". Now we emit any non-empty cleanedText.
+ if (cleanedText.length > 0) {
  yield {
  speaker: 'ai',
  blocks: [
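The #721 regression reduces to one comparison:

// A chunk that is just a space between words must still be emitted, or
// "list 5" renders as "list5".
const cleanedText = ' ';
console.log(cleanedText.trim().length > 0); // false -> old check dropped the space
console.log(cleanedText.length > 0); // true  -> new check emits it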
@@ -1827,11 +1897,13 @@ export class OpenAIProvider extends BaseProvider {
  // @plan PLAN-20251202-THINKING.P16
  const tagBasedThinking = this.extractThinkTagsAsBlock(workingText);
  if (tagBasedThinking) {
+ // Clean Kimi tokens from thinking content before accumulating
+ const cleanedThought = this.cleanThinkingContent(tagBasedThinking.thought);
  // Use newline to preserve formatting between chunks (not space)
  if (accumulatedThinkingContent.length > 0) {
  accumulatedThinkingContent += '\n';
  }
- accumulatedThinkingContent += tagBasedThinking.thought;
+ accumulatedThinkingContent += cleanedThought;
  }
  const kimiParsed = this.extractKimiToolCallsFromText(workingText);
  if (kimiParsed.toolCalls.length > 0) {
@@ -1890,7 +1962,10 @@ export class OpenAIProvider extends BaseProvider {
  // Always use sanitized text to strip <think> tags (legacy final buffer)
  // Bug fix: Previously Kimi used unsanitized workingText
  // @plan PLAN-20251202-THINKING.P16
- if (cleanedText.trim().length > 0) {
+ // Bug fix #721: Emit whitespace-only chunks (e.g., " " between words)
+ // Previously we used cleanedText.trim().length > 0 which dropped spaces,
+ // causing "list 5" to become "list5". Now we emit any non-empty cleanedText.
+ if (cleanedText.length > 0) {
  yield {
  speaker: 'ai',
  blocks: [
@@ -1922,19 +1997,32 @@ export class OpenAIProvider extends BaseProvider {
  }
  // Emit accumulated reasoning_content as ONE ThinkingBlock (legacy path)
  // This consolidates token-by-token reasoning from Synthetic API into a single block
+ // Clean Kimi tokens from the accumulated content (not per-chunk) to handle split tokens
  // @plan PLAN-20251202-THINKING.P16
  if (accumulatedReasoningContent.length > 0) {
- yield {
- speaker: 'ai',
- blocks: [
- {
- type: 'thinking',
- thought: accumulatedReasoningContent,
- sourceField: 'reasoning_content',
- isHidden: false,
- },
- ],
- };
+ // Extract Kimi tool calls from the complete accumulated reasoning content
+ const { cleanedText: cleanedReasoning, toolCalls: reasoningToolCalls } = this.extractKimiToolCallsFromText(accumulatedReasoningContent);
+ // Emit the cleaned thinking block
+ if (cleanedReasoning.length > 0) {
+ yield {
+ speaker: 'ai',
+ blocks: [
+ {
+ type: 'thinking',
+ thought: cleanedReasoning,
+ sourceField: 'reasoning_content',
+ isHidden: false,
+ },
+ ],
+ };
+ }
+ // Emit any tool calls extracted from reasoning content
+ if (reasoningToolCalls.length > 0) {
+ yield {
+ speaker: 'ai',
+ blocks: reasoningToolCalls,
+ };
+ }
  }
  // Process and emit tool calls using legacy accumulated approach
  if (accumulatedToolCalls.length > 0) {
@@ -1961,6 +2049,7 @@ export class OpenAIProvider extends BaseProvider {
  };
  // Add usage metadata if we captured it from streaming
  if (streamingUsage) {
+ const cacheMetrics = extractCacheMetrics(streamingUsage);
  toolCallsContent.metadata = {
  usage: {
  promptTokens: streamingUsage.prompt_tokens || 0,
@@ -1968,6 +2057,9 @@ export class OpenAIProvider extends BaseProvider {
  totalTokens: streamingUsage.total_tokens ||
  (streamingUsage.prompt_tokens || 0) +
  (streamingUsage.completion_tokens || 0),
+ cachedTokens: cacheMetrics.cachedTokens,
+ cacheCreationTokens: cacheMetrics.cacheCreationTokens,
+ cacheMissTokens: cacheMetrics.cacheMissTokens,
  },
  };
  }
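The new cacheMetricsExtractor.js is added in this release, but its body is not part of this diff. A plausible sketch of its contract, assuming OpenAI-style usage.prompt_tokens_details.cached_tokens reporting:

// Hedged sketch only - the shipped extractCacheMetrics may differ.
function extractCacheMetrics(usage) {
    const cachedTokens = usage?.prompt_tokens_details?.cached_tokens ?? 0;
    return {
        cachedTokens,
        // Anthropic-style cache-write counts; assumed 0 when not reported
        cacheCreationTokens: 0,
        cacheMissTokens: Math.max(0, (usage?.prompt_tokens ?? 0) - cachedTokens),
    };
}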
@@ -1976,6 +2068,7 @@ export class OpenAIProvider extends BaseProvider {
  }
  // If we have usage information but no tool calls, emit a metadata-only response
  if (streamingUsage && accumulatedToolCalls.length === 0) {
+ const cacheMetrics = extractCacheMetrics(streamingUsage);
  yield {
  speaker: 'ai',
  blocks: [],
@@ -1986,6 +2079,9 @@ export class OpenAIProvider extends BaseProvider {
  totalTokens: streamingUsage.total_tokens ||
  (streamingUsage.prompt_tokens || 0) +
  (streamingUsage.completion_tokens || 0),
+ cachedTokens: cacheMetrics.cachedTokens,
+ cacheCreationTokens: cacheMetrics.cacheCreationTokens,
+ cacheMissTokens: cacheMetrics.cacheMissTokens,
  },
  },
  };
@@ -2050,8 +2146,10 @@ export class OpenAIProvider extends BaseProvider {
  }
  const blocks = [];
  // Parse reasoning_content from response (Phase 16 integration)
- const reasoningBlock = this.parseNonStreamingReasoning(choice.message);
- logger.debug(() => `[Non-streaming] parseNonStreamingReasoning result: ${reasoningBlock ? `found (${reasoningBlock.thought?.length} chars)` : 'not found'}`, {
+ // Extract embedded Kimi K2 tool calls from reasoning_content (fixes #749)
+ // @requirement REQ-KIMI-REASONING-001.2
+ const { thinking: reasoningBlock, toolCalls: reasoningToolCalls } = this.parseNonStreamingReasoning(choice.message);
+ logger.debug(() => `[Non-streaming] parseNonStreamingReasoning result: ${reasoningBlock ? `found (${reasoningBlock.thought?.length} chars)` : 'not found'}, tool calls: ${reasoningToolCalls.length}`, {
  hasReasoningContent: 'reasoning_content' in
  (choice.message ?? {}),
  messageKeys: Object.keys(choice.message ?? {}),
@@ -2059,6 +2157,11 @@ export class OpenAIProvider extends BaseProvider {
  if (reasoningBlock) {
  blocks.push(reasoningBlock);
  }
+ // Add tool calls extracted from reasoning_content
+ if (reasoningToolCalls.length > 0) {
+ blocks.push(...reasoningToolCalls);
+ logger.debug(() => `[Non-streaming] Added ${reasoningToolCalls.length} tool calls from reasoning_content`);
+ }
  // Handle text content (strip thinking / reasoning blocks) and Kimi tool sections
  const rawMessageContent = this.coerceMessageContentToString(choice.message?.content);
  let kimiCleanContent;
@@ -2162,6 +2265,7 @@ export class OpenAIProvider extends BaseProvider {
  };
  // Add usage metadata from non-streaming response
  if (completion.usage) {
+ const cacheMetrics = extractCacheMetrics(completion.usage);
  responseContent.metadata = {
  usage: {
  promptTokens: completion.usage.prompt_tokens || 0,
@@ -2169,6 +2273,9 @@ export class OpenAIProvider extends BaseProvider {
  totalTokens: completion.usage.total_tokens ||
  (completion.usage.prompt_tokens || 0) +
  (completion.usage.completion_tokens || 0),
+ cachedTokens: cacheMetrics.cachedTokens,
+ cacheCreationTokens: cacheMetrics.cacheCreationTokens,
+ cacheMissTokens: cacheMetrics.cacheMissTokens,
  },
  };
  }
@@ -2176,6 +2283,7 @@ export class OpenAIProvider extends BaseProvider {
  }
  else if (completion.usage) {
  // Emit metadata-only response if no content blocks but have usage info
+ const cacheMetrics = extractCacheMetrics(completion.usage);
  yield {
  speaker: 'ai',
  blocks: [],
@@ -2186,6 +2294,9 @@ export class OpenAIProvider extends BaseProvider {
  totalTokens: completion.usage.total_tokens ||
  (completion.usage.prompt_tokens || 0) +
  (completion.usage.completion_tokens || 0),
+ cachedTokens: cacheMetrics.cachedTokens,
+ cacheCreationTokens: cacheMetrics.cacheCreationTokens,
+ cacheMissTokens: cacheMetrics.cacheMissTokens,
  },
  },
  };
@@ -2438,6 +2549,10 @@ export class OpenAIProvider extends BaseProvider {
  overrideKeys: requestOverrides ? Object.keys(requestOverrides) : [],
  });
  }
+ // Get dump mode from ephemeral settings
+ const dumpMode = ephemeralSettings.dumpcontext;
+ const shouldDumpSuccess = shouldDumpSDKContext(dumpMode, false);
+ const shouldDumpError = shouldDumpSDKContext(dumpMode, true);
  if (streamingEnabled) {
  // Streaming mode - use retry loop with compression support
  let compressedOnce = false;
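dumpSDKContext.js is also new in this release and not shown here. Judging only from the call sites, the gate plausibly behaves like this sketch (the mode names are assumptions):

// Hedged sketch - the real shouldDumpSDKContext is not in this excerpt.
// Assumed semantics: 'always' dumps every request, 'error' dumps only
// failed ones, anything else disables dumping.
function shouldDumpSDKContext(dumpMode, isError) {
    if (dumpMode === 'always') return true;
    if (dumpMode === 'error') return isError;
    return false;
}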
@@ -2452,6 +2567,10 @@ export class OpenAIProvider extends BaseProvider {
  shouldRetryOnError: this.shouldRetryResponse.bind(this),
  trackThrottleWaitTime: this.throttleTracker,
  });
+ // Dump successful streaming request if enabled
+ if (shouldDumpSuccess) {
+ await dumpSDKContext('openai', '/chat/completions', requestBody, { streaming: true }, false, baseURL || 'https://api.openai.com/v1');
+ }
  break;
  }
  catch (error) {
@@ -2480,6 +2599,11 @@ export class OpenAIProvider extends BaseProvider {
  logger.warn(() => `[OpenAIProvider] Retrying streaming request after compressing tool responses due to provider 400`);
  continue;
  }
+ // Dump error if enabled
+ if (shouldDumpError) {
+ const dumpErrorMessage = error instanceof Error ? error.message : String(error);
+ await dumpSDKContext('openai', '/chat/completions', requestBody, { error: dumpErrorMessage }, true, baseURL || 'https://api.openai.com/v1');
+ }
  // Re-throw other errors as-is
  const capturedErrorMessage = error instanceof Error ? error.message : String(error);
  const status = typeof error === 'object' &&
@@ -2514,6 +2638,10 @@ export class OpenAIProvider extends BaseProvider {
  shouldRetryOnError: this.shouldRetryResponse.bind(this),
  trackThrottleWaitTime: this.throttleTracker,
  }));
+ // Dump successful non-streaming request if enabled
+ if (shouldDumpSuccess) {
+ await dumpSDKContext('openai', '/chat/completions', requestBody, response, false, baseURL || 'https://api.openai.com/v1');
+ }
  break;
  }
  catch (error) {
@@ -2548,6 +2676,11 @@ export class OpenAIProvider extends BaseProvider {
  logger.warn(() => `[OpenAIProvider] Retrying request after compressing tool responses due to provider 400`);
  continue;
  }
+ // Dump error if enabled
+ if (shouldDumpError) {
+ const dumpErrorMessage = error instanceof Error ? error.message : String(error);
+ await dumpSDKContext('openai', '/chat/completions', requestBody, { error: dumpErrorMessage }, true, baseURL || 'https://api.openai.com/v1');
+ }
  const capturedErrorMessage = error instanceof Error ? error.message : String(error);
  const status = typeof error === 'object' &&
  error !== null &&
@@ -2576,9 +2709,9 @@ export class OpenAIProvider extends BaseProvider {
  // Buffer for accumulating text chunks for providers that need it
  let textBuffer = '';
  // Use the same detected format from earlier for consistency
- const isKimiModel = model.toLowerCase().includes('kimi-k2');
+ const isKimiK2Model = model.toLowerCase().includes('kimi-k2');
  // Buffer text for Qwen format providers and Kimi-K2 to avoid stanza formatting
- const shouldBufferText = detectedFormat === 'qwen' || isKimiModel;
+ const shouldBufferText = detectedFormat === 'qwen' || isKimiK2Model;
  // Accumulate thinking content across the entire stream to emit as ONE block
  // This handles fragmented <think>word</think> streaming from Synthetic API
  // @plan PLAN-20251202-THINKING.P16
@@ -2648,13 +2781,28 @@ export class OpenAIProvider extends BaseProvider {
  continue;
  // Parse reasoning_content from streaming delta (Pipeline path)
  // ACCUMULATE instead of yielding immediately to handle token-by-token streaming
+ // Extract embedded Kimi K2 tool calls from reasoning_content (fixes #749)
  // @plan PLAN-20251202-THINKING.P16
- // @requirement REQ-THINK-003.1
- const reasoningBlock = this.parseStreamingReasoningDelta(choice.delta);
+ // @requirement REQ-THINK-003.1, REQ-KIMI-REASONING-001.1
+ const { thinking: reasoningBlock, toolCalls: reasoningToolCalls } = this.parseStreamingReasoningDelta(choice.delta);
  if (reasoningBlock) {
  // Accumulate reasoning content - will emit ONE block later
  accumulatedReasoningContent += reasoningBlock.thought;
  }
+ // Add tool calls extracted from reasoning_content to pipeline
+ if (reasoningToolCalls.length > 0) {
+ // Get current pipeline stats to determine next index
+ const stats = this.toolCallPipeline.getStats();
+ let baseIndex = stats.collector.totalCalls;
+ for (const toolCall of reasoningToolCalls) {
+ // Add complete tool call as fragments to pipeline
+ this.toolCallPipeline.addFragment(baseIndex, {
+ name: toolCall.name,
+ args: JSON.stringify(toolCall.parameters),
+ });
+ baseIndex++;
+ }
+ }
  // Check for finish_reason to detect proper stream ending
  if (choice.finish_reason) {
  logger.debug(() => `[Streaming] Stream finished with reason: ${choice.finish_reason}`, {
@@ -2675,13 +2823,24 @@ export class OpenAIProvider extends BaseProvider {
  }
  // Handle text content - buffer for Qwen format, emit immediately for others
  // Note: Synthetic API sends content that may duplicate reasoning_content.
- // This is the model's behavior - we don't filter it here.
+ // This is the model's behavior - we don't filter it here as detection is unreliable.
  // @plan PLAN-20251202-THINKING.P16
  const rawDeltaContent = this.coerceMessageContentToString(choice.delta?.content);
  if (rawDeltaContent) {
- const deltaContent = isKimiModel
- ? rawDeltaContent
- : this.sanitizeProviderText(rawDeltaContent);
+ // For Kimi models, we need to buffer the RAW content without processing
+ // because Kimi tokens stream incrementally and partial tokens would leak
+ // through if we try to process them immediately. The buffer will be
+ // processed when flushed (at sentence boundaries or end of stream).
+ let deltaContent;
+ if (isKimiK2Model) {
+ // For Kimi: Don't process yet - just pass through and let buffering handle it
+ // We'll extract tool calls and sanitize when we flush the buffer
+ deltaContent = rawDeltaContent;
+ }
+ else {
+ // For non-Kimi models: sanitize immediately as before
+ deltaContent = this.sanitizeProviderText(rawDeltaContent);
+ }
  if (!deltaContent) {
  continue;
  }
@@ -2697,9 +2856,9 @@ export class OpenAIProvider extends BaseProvider {
  });
  // Buffer text to avoid stanza formatting
  textBuffer += deltaContent;
- const hasKimiBegin = textBuffer.includes('<|tool_calls_section_begin|>');
- const hasKimiEnd = textBuffer.includes('<|tool_calls_section_end|>');
- const hasOpenKimiSection = hasKimiBegin && !hasKimiEnd;
+ const kimiBeginCount = (textBuffer.match(/<\|tool_calls_section_begin\|>/g) || []).length;
+ const kimiEndCount = (textBuffer.match(/<\|tool_calls_section_end\|>/g) || []).length;
+ const hasOpenKimiSection = kimiBeginCount > kimiEndCount;
  // Emit buffered text when we have a complete sentence or paragraph
  // Look for natural break points, avoiding flush mid Kimi section
  if (!hasOpenKimiSection &&
@@ -2716,12 +2875,14 @@ export class OpenAIProvider extends BaseProvider {
  // @requirement REQ-THINK-003
  const tagBasedThinking = this.extractThinkTagsAsBlock(workingText);
  if (tagBasedThinking) {
+ // Clean Kimi tokens from thinking content before accumulating
+ const cleanedThought = this.cleanThinkingContent(tagBasedThinking.thought);
  // Accumulate thinking content - don't emit yet
  // Use newline to preserve formatting between chunks (not space)
  if (accumulatedThinkingContent.length > 0) {
  accumulatedThinkingContent += '\n';
  }
- accumulatedThinkingContent += tagBasedThinking.thought;
+ accumulatedThinkingContent += cleanedThought;
  logger.debug(() => `[Streaming] Accumulated thinking: ${accumulatedThinkingContent.length} chars total`);
  }
  const kimiParsed = this.extractKimiToolCallsFromText(workingText);
@@ -2783,7 +2944,10 @@ export class OpenAIProvider extends BaseProvider {
  // Always use sanitized text to strip <think> tags (pipeline streaming)
  // Bug fix: Previously Kimi used unsanitized workingText
  // @plan PLAN-20251202-THINKING.P16
- if (cleanedText.trim().length > 0) {
+ // Bug fix #721: Emit whitespace-only chunks (e.g., " " between words)
+ // Previously we used cleanedText.trim().length > 0 which dropped spaces,
+ // causing "list 5" to become "list5". Now we emit any non-empty cleanedText.
+ if (cleanedText.length > 0) {
  yield {
  speaker: 'ai',
  blocks: [
@@ -2883,11 +3047,13 @@ export class OpenAIProvider extends BaseProvider {
  // @plan PLAN-20251202-THINKING.P16
  const tagBasedThinking = this.extractThinkTagsAsBlock(workingText);
  if (tagBasedThinking) {
+ // Clean Kimi tokens from thinking content before accumulating
+ const cleanedThought = this.cleanThinkingContent(tagBasedThinking.thought);
  // Use newline to preserve formatting between chunks (not space)
  if (accumulatedThinkingContent.length > 0) {
  accumulatedThinkingContent += '\n';
  }
- accumulatedThinkingContent += tagBasedThinking.thought;
+ accumulatedThinkingContent += cleanedThought;
  }
  const kimiParsed = this.extractKimiToolCallsFromText(workingText);
  if (kimiParsed.toolCalls.length > 0) {
@@ -2946,7 +3112,10 @@ export class OpenAIProvider extends BaseProvider {
  // Always use sanitized text to strip <think> tags (pipeline final buffer)
  // Bug fix: Previously Kimi used unsanitized workingText
  // @plan PLAN-20251202-THINKING.P16
- if (cleanedText.trim().length > 0) {
+ // Bug fix #721: Emit whitespace-only chunks (e.g., " " between words)
+ // Previously we used cleanedText.trim().length > 0 which dropped spaces,
+ // causing "list 5" to become "list5". Now we emit any non-empty cleanedText.
+ if (cleanedText.length > 0) {
  yield {
  speaker: 'ai',
  blocks: [
@@ -2978,19 +3147,32 @@ export class OpenAIProvider extends BaseProvider {
  }
  // Emit accumulated reasoning_content as ONE ThinkingBlock (pipeline path)
  // This consolidates token-by-token reasoning from Synthetic API into a single block
+ // Clean Kimi tokens from the accumulated content (not per-chunk) to handle split tokens
  // @plan PLAN-20251202-THINKING.P16
  if (accumulatedReasoningContent.length > 0) {
- yield {
- speaker: 'ai',
- blocks: [
- {
- type: 'thinking',
- thought: accumulatedReasoningContent,
- sourceField: 'reasoning_content',
- isHidden: false,
- },
- ],
- };
+ // Extract Kimi tool calls from the complete accumulated reasoning content
+ const { cleanedText: cleanedReasoning, toolCalls: reasoningToolCalls } = this.extractKimiToolCallsFromText(accumulatedReasoningContent);
+ // Emit the cleaned thinking block
+ if (cleanedReasoning.length > 0) {
+ yield {
+ speaker: 'ai',
+ blocks: [
+ {
+ type: 'thinking',
+ thought: cleanedReasoning,
+ sourceField: 'reasoning_content',
+ isHidden: false,
+ },
+ ],
+ };
+ }
+ // Emit any tool calls extracted from reasoning content
+ if (reasoningToolCalls.length > 0) {
+ yield {
+ speaker: 'ai',
+ blocks: reasoningToolCalls,
+ };
+ }
  }
  // Process and emit tool calls using the pipeline
  const pipelineResult = await this.toolCallPipeline.process(abortSignal);
@@ -3020,6 +3202,7 @@ export class OpenAIProvider extends BaseProvider {
  };
  // Add usage metadata if we captured it from streaming
  if (streamingUsage) {
+ const cacheMetrics = extractCacheMetrics(streamingUsage);
  toolCallsContent.metadata = {
  usage: {
  promptTokens: streamingUsage.prompt_tokens || 0,
@@ -3027,6 +3210,9 @@ export class OpenAIProvider extends BaseProvider {
  totalTokens: streamingUsage.total_tokens ||
  (streamingUsage.prompt_tokens || 0) +
  (streamingUsage.completion_tokens || 0),
+ cachedTokens: cacheMetrics.cachedTokens,
+ cacheCreationTokens: cacheMetrics.cacheCreationTokens,
+ cacheMissTokens: cacheMetrics.cacheMissTokens,
  },
  };
  }
@@ -3036,6 +3222,7 @@ export class OpenAIProvider extends BaseProvider {
  }
  // If we have usage information but no tool calls, emit a metadata-only response
  if (streamingUsage &&
  this.toolCallPipeline.getStats().collector.totalCalls === 0) {
+ const cacheMetrics = extractCacheMetrics(streamingUsage);
  yield {
  speaker: 'ai',
  blocks: [],
@@ -3046,6 +3233,9 @@ export class OpenAIProvider extends BaseProvider {
  totalTokens: streamingUsage.total_tokens ||
  (streamingUsage.prompt_tokens || 0) +
  (streamingUsage.completion_tokens || 0),
+ cachedTokens: cacheMetrics.cachedTokens,
+ cacheCreationTokens: cacheMetrics.cacheCreationTokens,
+ cacheMissTokens: cacheMetrics.cacheMissTokens,
  },
  },
  };
@@ -3198,6 +3388,7 @@ export class OpenAIProvider extends BaseProvider {
  };
  // Add usage metadata from non-streaming response
  if (completion.usage) {
+ const cacheMetrics = extractCacheMetrics(completion.usage);
  responseContent.metadata = {
  usage: {
  promptTokens: completion.usage.prompt_tokens || 0,
@@ -3205,6 +3396,9 @@ export class OpenAIProvider extends BaseProvider {
  totalTokens: completion.usage.total_tokens ||
  (completion.usage.prompt_tokens || 0) +
  (completion.usage.completion_tokens || 0),
+ cachedTokens: cacheMetrics.cachedTokens,
+ cacheCreationTokens: cacheMetrics.cacheCreationTokens,
+ cacheMissTokens: cacheMetrics.cacheMissTokens,
  },
  };
  }
@@ -3212,6 +3406,7 @@ export class OpenAIProvider extends BaseProvider {
  }
  else if (completion.usage) {
  // Emit metadata-only response if no content blocks but have usage info
+ const cacheMetrics = extractCacheMetrics(completion.usage);
  yield {
  speaker: 'ai',
  blocks: [],
@@ -3222,6 +3417,9 @@ export class OpenAIProvider extends BaseProvider {
  totalTokens: completion.usage.total_tokens ||
  (completion.usage.prompt_tokens || 0) +
  (completion.usage.completion_tokens || 0),
+ cachedTokens: cacheMetrics.cachedTokens,
+ cacheCreationTokens: cacheMetrics.cacheCreationTokens,
+ cacheMissTokens: cacheMetrics.cacheMissTokens,
  },
  },
  };
@@ -3256,6 +3454,12 @@ export class OpenAIProvider extends BaseProvider {
  logger.debug(() => `Auto-detected 'kimi' format for K2 model: ${modelName}`);
  return 'kimi';
  }
+ // Check for Mistral models (requires 9-char alphanumeric IDs)
+ // This applies to both hosted API and self-hosted Mistral models
+ if (isMistralModel(modelName)) {
+ logger.debug(() => `Auto-detected 'mistral' format for Mistral model: ${modelName}`);
+ return 'mistral';
+ }
  const lowerModelName = modelName.toLowerCase();
  // Check for GLM-4 models (glm-4, glm-4.5, glm-4.6, glm-4-5, etc.)
  if (lowerModelName.includes('glm-4')) {
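isMistralModel comes from ToolIdStrategy.js, which this excerpt does not show. One plausible shape, offered only as a sketch:

// Hedged sketch - the shipped check may match a different set of names.
function isMistralModel(modelName) {
    const lower = modelName.toLowerCase();
    return ['mistral', 'mixtral', 'ministral', 'codestral', 'devstral']
        .some((family) => lower.includes(family));
}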
@@ -3338,57 +3542,75 @@ export class OpenAIProvider extends BaseProvider {
  * Parse reasoning_content from streaming delta.
  *
  * @plan PLAN-20251202-THINKING.P11, PLAN-20251202-THINKING.P16
- * @requirement REQ-THINK-003.1, REQ-THINK-003.3, REQ-THINK-003.4
+ * @requirement REQ-THINK-003.1, REQ-THINK-003.3, REQ-THINK-003.4, REQ-KIMI-REASONING-001.1
+ * @issue #749
  */
  parseStreamingReasoningDelta(delta) {
  if (!delta) {
- return null;
+ return { thinking: null, toolCalls: [] };
  }
  // Access reasoning_content via type assertion since OpenAI SDK doesn't declare it
  const reasoningContent = delta
  .reasoning_content;
  // Handle absent, null, or non-string
  if (!reasoningContent || typeof reasoningContent !== 'string') {
- return null;
- }
- // Handle empty string or whitespace-only
- if (reasoningContent.trim().length === 0) {
- return null;
- }
- return {
- type: 'thinking',
- thought: reasoningContent,
- sourceField: 'reasoning_content',
- isHidden: false,
- };
+ return { thinking: null, toolCalls: [] };
+ }
+ // Handle empty string only - preserve whitespace-only content (spaces, tabs)
+ // to maintain proper formatting in accumulated reasoning (fixes issue #721)
+ if (reasoningContent.length === 0) {
+ return { thinking: null, toolCalls: [] };
+ }
+ // Extract Kimi K2 tool calls embedded in reasoning_content (fixes issue #749)
+ const { cleanedText, toolCalls } = this.extractKimiToolCallsFromText(reasoningContent);
+ // For streaming, preserve whitespace-only content for proper formatting (issue #721)
+ // Only return null if the cleaned text is empty (length 0)
+ const thinkingBlock = cleanedText.length === 0
+ ? null
+ : {
+ type: 'thinking',
+ thought: cleanedText,
+ sourceField: 'reasoning_content',
+ isHidden: false,
+ };
+ return { thinking: thinkingBlock, toolCalls };
  }
  /**
  * Parse reasoning_content from non-streaming message.
  *
  * @plan PLAN-20251202-THINKING.P11, PLAN-20251202-THINKING.P16
- * @requirement REQ-THINK-003.2, REQ-THINK-003.3, REQ-THINK-003.4
+ * @requirement REQ-THINK-003.2, REQ-THINK-003.3, REQ-THINK-003.4, REQ-KIMI-REASONING-001.2
+ * @issue #749
  */
  parseNonStreamingReasoning(message) {
  if (!message) {
- return null;
+ return { thinking: null, toolCalls: [] };
  }
  // Access reasoning_content via type assertion since OpenAI SDK doesn't declare it
  const reasoningContent = message
  .reasoning_content;
  // Handle absent, null, or non-string
  if (!reasoningContent || typeof reasoningContent !== 'string') {
- return null;
+ return { thinking: null, toolCalls: [] };
  }
- // Handle empty string or whitespace-only
+ // Handle empty string or whitespace-only - for non-streaming complete responses,
+ // whitespace-only reasoning is unusual and should be treated as no reasoning
  if (reasoningContent.trim().length === 0) {
- return null;
- }
- return {
- type: 'thinking',
- thought: reasoningContent,
- sourceField: 'reasoning_content',
- isHidden: false,
- };
+ return { thinking: null, toolCalls: [] };
+ }
+ // Extract Kimi K2 tool calls embedded in reasoning_content (fixes issue #749)
+ const { cleanedText, toolCalls } = this.extractKimiToolCallsFromText(reasoningContent);
+ // For non-streaming, trim whitespace after extraction
+ const trimmedText = cleanedText.trim();
+ const thinkingBlock = trimmedText.length === 0
+ ? null
+ : {
+ type: 'thinking',
+ thought: trimmedText,
+ sourceField: 'reasoning_content',
+ isHidden: false,
+ };
+ return { thinking: thinkingBlock, toolCalls };
  }
  }
  //# sourceMappingURL=OpenAIProvider.js.map
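The net effect of the reworked parsers is a new return contract. In outline (the sample delta is invented; the block type names are the package's own):

// Old contract: parseStreamingReasoningDelta(delta) -> ThinkingBlock | null
// New contract: -> { thinking: ThinkingBlock | null, toolCalls: ToolCallBlock[] }
// A hypothetical delta whose reasoning_content embeds a Kimi tool call now
// yields both a cleaned thought and the extracted call:
const delta = {
    reasoning_content: 'I will search.' +
        '<|tool_calls_section_begin|>' +
        '<|tool_call_begin|>functions.glob:0<|tool_call_argument_begin|>' +
        '{"pattern":"*.md"}<|tool_call_end|>' +
        '<|tool_calls_section_end|>',
};
// -> thinking.thought === 'I will search.'
// -> toolCalls.length === 1 (name 'glob', parameters { pattern: '*.md' })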