@vybestack/llxprt-code-core 0.7.0-nightly.251209.0061bd6bf → 0.7.0-nightly.251211.134f1920b

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. package/dist/src/config/profileManager.js +2 -0
  2. package/dist/src/config/profileManager.js.map +1 -1
  3. package/dist/src/prompt-config/prompt-resolver.js +4 -0
  4. package/dist/src/prompt-config/prompt-resolver.js.map +1 -1
  5. package/dist/src/providers/LoggingProviderWrapper.d.ts +2 -1
  6. package/dist/src/providers/LoggingProviderWrapper.js +16 -4
  7. package/dist/src/providers/LoggingProviderWrapper.js.map +1 -1
  8. package/dist/src/providers/ProviderManager.d.ts +6 -3
  9. package/dist/src/providers/ProviderManager.js +16 -4
  10. package/dist/src/providers/ProviderManager.js.map +1 -1
  11. package/dist/src/providers/openai/OpenAIProvider.d.ts +10 -2
  12. package/dist/src/providers/openai/OpenAIProvider.js +335 -136
  13. package/dist/src/providers/openai/OpenAIProvider.js.map +1 -1
  14. package/dist/src/providers/openai-vercel/OpenAIVercelProvider.d.ts +3 -0
  15. package/dist/src/providers/openai-vercel/OpenAIVercelProvider.js +255 -22
  16. package/dist/src/providers/openai-vercel/OpenAIVercelProvider.js.map +1 -1
  17. package/dist/src/providers/openai-vercel/messageConversion.d.ts +4 -1
  18. package/dist/src/providers/openai-vercel/messageConversion.js +41 -6
  19. package/dist/src/providers/openai-vercel/messageConversion.js.map +1 -1
  20. package/dist/src/providers/reasoning/reasoningUtils.d.ts +26 -1
  21. package/dist/src/providers/reasoning/reasoningUtils.js +157 -0
  22. package/dist/src/providers/reasoning/reasoningUtils.js.map +1 -1
  23. package/dist/src/providers/utils/cacheMetricsExtractor.d.ts +6 -0
  24. package/dist/src/providers/utils/cacheMetricsExtractor.js +36 -0
  25. package/dist/src/providers/utils/cacheMetricsExtractor.js.map +1 -0
  26. package/dist/src/services/history/IContent.d.ts +3 -7
  27. package/dist/src/services/history/IContent.js.map +1 -1
  28. package/dist/src/tools/IToolFormatter.d.ts +1 -1
  29. package/dist/src/tools/ToolIdStrategy.d.ts +25 -0
  30. package/dist/src/tools/ToolIdStrategy.js +108 -0
  31. package/dist/src/tools/ToolIdStrategy.js.map +1 -1
  32. package/dist/src/tools/task.js +14 -2
  33. package/dist/src/tools/task.js.map +1 -1
  34. package/dist/src/utils/generateContentResponseUtilities.js +6 -0
  35. package/dist/src/utils/generateContentResponseUtilities.js.map +1 -1
  36. package/package.json +1 -1
@@ -22,7 +22,7 @@ import crypto from 'node:crypto';
  import * as http from 'http';
  import * as https from 'https';
  import * as net from 'net';
- import { isKimiModel, getToolIdStrategy, } from '../../tools/ToolIdStrategy.js';
+ import { isKimiModel, isMistralModel, getToolIdStrategy, } from '../../tools/ToolIdStrategy.js';
  import { BaseProvider, } from '../BaseProvider.js';
  import { DebugLogger } from '../../debug/index.js';
  import { ToolFormatter } from '../../tools/ToolFormatter.js';
@@ -40,6 +40,7 @@ import { buildToolResponsePayload, EMPTY_TOOL_RESULT_PLACEHOLDER, } from '../uti
  import { isLocalEndpoint } from '../utils/localEndpoint.js';
  import { filterThinkingForContext, thinkingToReasoningField, extractThinkingBlocks, } from '../reasoning/reasoningUtils.js';
  import { shouldDumpSDKContext, dumpSDKContext, } from '../utils/dumpSDKContext.js';
+ import { extractCacheMetrics } from '../utils/cacheMetricsExtractor.js';
  const MAX_TOOL_RESPONSE_CHARS = 1024;
  const MAX_TOOL_RESPONSE_RETRY_CHARS = 512;
  const TOOL_ARGS_PREVIEW_LENGTH = 500;
@@ -277,13 +278,12 @@ export class OpenAIProvider extends BaseProvider {
  // This preserves meaningful whitespace in regular text chunks during streaming
  // (e.g., " 5 Biggest" should remain " 5 Biggest", not become "5 Biggest")
  if (hadReasoningTags) {
- // Clean up multiple consecutive spaces/whitespace that may result from stripping
+ // Collapse multiple spaces/tabs but preserve newlines for proper paragraph/line breaks
  str = str.replace(/[ \t]+/g, ' ');
  str = str.replace(/\n{3,}/g, '\n\n');
- // Only trim leading whitespace when think tags were at the beginning
- // This prevents leading spaces from "<think>...</think>text" -> " text"
- // but preserves trailing whitespace for streaming chunk concatenation
- str = str.trimStart();
+ // Only trim leading horizontal whitespace (spaces/tabs), NOT newlines
+ // This preserves line breaks between think tags and content (fixes #721)
+ str = str.replace(/^[ \t]+/, '');
  }
  const afterLen = str.length;
  if (hadReasoningTags && afterLen !== beforeLen) {
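For illustration, a minimal sketch of how the three replacements above behave on a sample chunk (the sample string is invented, not taken from the package):

// Illustrative only - mirrors the three replace() calls in the hunk above.
let str = '   \n\nFirst   point\n\n\n\nSecond\tpoint  ';
str = str.replace(/[ \t]+/g, ' ');    // collapse runs of spaces/tabs, keep newlines
str = str.replace(/\n{3,}/g, '\n\n'); // cap blank runs at a single blank line
str = str.replace(/^[ \t]+/, '');     // trim leading spaces/tabs only, not newlines
// Result: '\n\nFirst point\n\nSecond point ' - line breaks and the trailing space
// survive, which is what streaming chunk concatenation relies on.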
@@ -438,62 +438,78 @@ export class OpenAIProvider extends BaseProvider {
  * and all tool info is only encoded in the text template.
  */
  extractKimiToolCallsFromText(raw) {
- if (!raw || !raw.includes('<|tool_calls_section_begin|>')) {
+ // Return early only if input is null/undefined/empty
+ if (!raw) {
  return { cleanedText: raw, toolCalls: [] };
  }
  const logger = this.getLogger();
  const toolCalls = [];
  let text = raw;
- const sectionRegex = /<\|tool_calls_section_begin\|>([\s\S]*?)<\|tool_calls_section_end\|>/g;
- text = text.replace(sectionRegex, (_sectionMatch, sectionBody) => {
- try {
- const callRegex = /<\|tool_call_begin\|>\s*([^<]+?)\s*<\|tool_call_argument_begin\|>\s*([\s\S]*?)\s*<\|tool_call_end\|>/g;
- let m;
- while ((m = callRegex.exec(sectionBody)) !== null) {
- const rawId = m[1].trim();
- const rawArgs = m[2].trim();
- // Infer tool name from ID.
- let toolName = '';
- const match = /^functions\.([A-Za-z0-9_]+):\d+/i.exec(rawId) ||
- /^[A-Za-z0-9_]+\.([A-Za-z0-9_]+):\d+/.exec(rawId);
- if (match) {
- toolName = match[1];
- }
- else {
- const colonParts = rawId.split(':');
- const head = colonParts[0] || rawId;
- const dotParts = head.split('.');
- toolName = dotParts[dotParts.length - 1] || head;
+ // Extract tool calls from complete sections if present
+ if (raw.includes('<|tool_calls_section_begin|>')) {
+ const sectionRegex = /<\|tool_calls_section_begin\|>([\s\S]*?)<\|tool_calls_section_end\|>/g;
+ text = text.replace(sectionRegex, (_sectionMatch, sectionBody) => {
+ try {
+ const callRegex = /<\|tool_call_begin\|>\s*([^<]+?)\s*<\|tool_call_argument_begin\|>\s*([\s\S]*?)\s*<\|tool_call_end\|>/g;
+ let m;
+ while ((m = callRegex.exec(sectionBody)) !== null) {
+ const rawId = m[1].trim();
+ const rawArgs = m[2].trim();
+ // Infer tool name from ID.
+ let toolName = '';
+ const match = /^functions\.([A-Za-z0-9_]+):\d+/i.exec(rawId) ||
+ /^[A-Za-z0-9_]+\.([A-Za-z0-9_]+):\d+/.exec(rawId);
+ if (match) {
+ toolName = match[1];
+ }
+ else {
+ const colonParts = rawId.split(':');
+ const head = colonParts[0] || rawId;
+ const dotParts = head.split('.');
+ toolName = dotParts[dotParts.length - 1] || head;
+ }
+ // Normalize tool name (handles Kimi-K2 style prefixes like call_functionsglob7)
+ toolName = this.normalizeToolName(toolName);
+ const sanitizedArgs = this.sanitizeToolArgumentsString(rawArgs);
+ const processedParameters = processToolParameters(sanitizedArgs, toolName);
+ toolCalls.push({
+ type: 'tool_call',
+ id: this.normalizeToHistoryToolId(rawId),
+ name: toolName,
+ parameters: processedParameters,
+ });
  }
- // Normalize tool name (handles Kimi-K2 style prefixes like call_functionsglob7)
- toolName = this.normalizeToolName(toolName);
- const sanitizedArgs = this.sanitizeToolArgumentsString(rawArgs);
- const processedParameters = processToolParameters(sanitizedArgs, toolName);
- toolCalls.push({
- type: 'tool_call',
- id: this.normalizeToHistoryToolId(rawId),
- name: toolName,
- parameters: processedParameters,
- });
  }
- }
- catch (err) {
- logger.debug(() => `[OpenAIProvider] Failed to parse Kimi tool_calls_section: ${err}`);
- }
- // Strip the entire tool section from user-visible text
- return '';
- });
- if (toolCalls.length > 0) {
- logger.debug(() => `[OpenAIProvider] Parsed Kimi tool_calls_section`, {
- toolCallCount: toolCalls.length,
- originalLength: raw.length,
- cleanedLength: text.length,
+ catch (err) {
+ logger.debug(() => `[OpenAIProvider] Failed to parse Kimi tool_calls_section: ${err}`);
+ }
+ // Strip the entire tool section from user-visible text
+ return '';
  });
+ if (toolCalls.length > 0) {
+ logger.debug(() => `[OpenAIProvider] Parsed Kimi tool_calls_section`, {
+ toolCallCount: toolCalls.length,
+ originalLength: raw.length,
+ cleanedLength: text.length,
+ });
+ }
  }
+ // ALWAYS run stray token cleanup, even if no complete sections were found
+ // This handles partial sections, malformed tokens, orphaned markers, etc.
+ text = text.replace(/<\|tool_call(?:_(?:begin|end|argument_begin))?\|>/g, '');
+ text = text.replace(/<\|tool_calls_section_(?:begin|end)\|>/g, '');
  // Don't trim - preserve leading/trailing newlines that are important for formatting
  // (e.g., numbered lists from Kimi K2 that have newlines between items)
  return { cleanedText: text, toolCalls };
  }
+ /**
+ * Clean Kimi K2 tool call tokens from thinking content.
+ * Used when extracting thinking from <think> tags that may contain embedded tool calls.
+ * @issue #749
+ */
+ cleanThinkingContent(thought) {
+ return this.extractKimiToolCallsFromText(thought).cleanedText;
+ }
  /**
  * @plan:PLAN-20251023-STATELESS-HARDENING.P09
  * @requirement:REQ-SP4-002
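As a rough illustration of the token format this method parses, here is an invented Kimi K2 response fragment and the approximate result (the tool name, arguments, and exact ID normalization are made up for the example):

const raw = 'Listing files.<|tool_calls_section_begin|>' +
  '<|tool_call_begin|>functions.glob:0<|tool_call_argument_begin|>{"pattern":"*.ts"}<|tool_call_end|>' +
  '<|tool_calls_section_end|>';
// extractKimiToolCallsFromText(raw) strips the section from the visible text and
// returns roughly { cleanedText: 'Listing files.', toolCalls: [{ type: 'tool_call',
//   name: 'glob', parameters: { pattern: '*.ts' }, id: /* normalized from 'functions.glob:0' */ }] }.
// With the new fallback cleanup, stray markers such as a lone '<|tool_call_end|>'
// are also removed even when no complete section is present.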
@@ -910,9 +926,12 @@ export class OpenAIProvider extends BaseProvider {
  }
  else {
  // Assistant message with tool calls
+ // CRITICAL for Mistral API compatibility (#760):
+ // When tool_calls are present, we must NOT include a content property at all
+ // (not even null). Mistral's OpenAI-compatible API requires this.
+ // See: https://docs.mistral.ai/capabilities/function_calling
  messages.push({
  role: 'assistant',
- content: text || null,
  tool_calls: toolCalls.map((tc) => ({
  id: this.normalizeToOpenAIToolId(tc.id),
  type: 'function',
@@ -948,10 +967,16 @@ export class OpenAIProvider extends BaseProvider {
  }
  else {
  for (const tr of toolResponses) {
+ // CRITICAL for Mistral API compatibility (#760):
+ // Tool messages must include a name field matching the function name.
+ // See: https://docs.mistral.ai/capabilities/function_calling
+ // Note: The OpenAI SDK types don't include name, but Mistral requires it.
+ // We use a type assertion to add this required field.
  messages.push({
  role: 'tool',
  content: this.buildToolResponseContent(tr, config),
  tool_call_id: this.normalizeToOpenAIToolId(tr.callId),
+ name: tr.toolName,
  });
  }
  }
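A sketch of the resulting message shapes with invented values, to show the Mistral-compatible layout (assistant turn with tool_calls and no content key, tool turn carrying name):

// Assistant turn that calls a tool - note there is no content property at all:
// { role: 'assistant', tool_calls: [{ id: 'aBc123XyZ', type: 'function',
//     function: { name: 'get_weather', arguments: '{"city":"Paris"}' } }] }
// Matching tool result - name echoes the function name, as Mistral requires:
// { role: 'tool', tool_call_id: 'aBc123XyZ', name: 'get_weather', content: '{"temp":12}' }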
@@ -977,8 +1002,9 @@ export class OpenAIProvider extends BaseProvider {
  const messages = [];
  // Create a ToolIdMapper based on the tool format
  // For Kimi K2, this generates sequential IDs in the format functions.{name}:{index}
- const toolIdMapper = toolFormat === 'kimi'
- ? getToolIdStrategy('kimi').createMapper(filteredContents)
+ // For Mistral, this generates 9-char alphanumeric IDs
+ const toolIdMapper = toolFormat === 'kimi' || toolFormat === 'mistral'
+ ? getToolIdStrategy(toolFormat).createMapper(filteredContents)
  : null;
  // Helper to resolve tool call IDs based on format
  const resolveToolCallId = (tc) => {
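The ToolIdStrategy implementation itself is not shown in this section; as a hedged sketch only, a Mistral-format ID could be produced like this (hypothetical helper, not the package's code):

// Hypothetical sketch - illustrates the "9-char alphanumeric" constraint mentioned above.
function makeMistralToolCallId() {
  const alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789';
  let id = '';
  for (let i = 0; i < 9; i++) {
    id += alphabet[Math.floor(Math.random() * alphabet.length)];
  }
  return id; // e.g. 'aZ3kQ9mPb' - exactly nine alphanumeric characters
}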
@@ -1014,9 +1040,12 @@ export class OpenAIProvider extends BaseProvider {
  const toolCalls = content.blocks.filter((b) => b.type === 'tool_call');
  if (toolCalls.length > 0) {
  // Assistant message with tool calls
+ // CRITICAL for Mistral API compatibility (#760):
+ // When tool_calls are present, we must NOT include a content property at all
+ // (not even null). Mistral's OpenAI-compatible API requires this.
+ // See: https://docs.mistral.ai/capabilities/function_calling
  const baseMessage = {
  role: 'assistant',
- content: text || null,
  tool_calls: toolCalls.map((tc) => ({
  id: resolveToolCallId(tc),
  type: 'function',
@@ -1057,10 +1086,16 @@ export class OpenAIProvider extends BaseProvider {
  // Convert tool responses
  const toolResponses = content.blocks.filter((b) => b.type === 'tool_response');
  for (const tr of toolResponses) {
+ // CRITICAL for Mistral API compatibility (#760):
+ // Tool messages must include a name field matching the function name.
+ // See: https://docs.mistral.ai/capabilities/function_calling
+ // Note: The OpenAI SDK types don't include name, but Mistral requires it.
+ // We use a type assertion to add this required field.
  messages.push({
  role: 'tool',
  content: this.buildToolResponseContent(tr, options.config),
  tool_call_id: resolveToolResponseId(tr),
+ name: tr.toolName,
  });
  }
  }
@@ -1506,9 +1541,9 @@ export class OpenAIProvider extends BaseProvider {
  // Buffer for accumulating text chunks for providers that need it
  let textBuffer = '';
  // Use the same detected format from earlier for consistency
- const isKimiModel = model.toLowerCase().includes('kimi-k2');
+ const isKimiK2Model = model.toLowerCase().includes('kimi-k2');
  // Buffer text for Qwen format providers and Kimi-K2 to avoid stanza formatting
- const shouldBufferText = detectedFormat === 'qwen' || isKimiModel;
+ const shouldBufferText = detectedFormat === 'qwen' || isKimiK2Model;
  // Accumulate thinking content across the entire stream to emit as ONE block
  // This handles fragmented <think>word</think> streaming from Synthetic API
  // @plan PLAN-20251202-THINKING.P16
@@ -1575,12 +1610,29 @@ export class OpenAIProvider extends BaseProvider {
  continue;
  // Parse reasoning_content from streaming delta (Phase 16 integration)
  // ACCUMULATE instead of yielding immediately to handle token-by-token streaming
+ // Extract embedded Kimi K2 tool calls from reasoning_content (fixes #749)
  // @plan PLAN-20251202-THINKING.P16
- const reasoningBlock = this.parseStreamingReasoningDelta(choice.delta);
+ // @requirement REQ-KIMI-REASONING-001.1
+ const { thinking: reasoningBlock, toolCalls: reasoningToolCalls } = this.parseStreamingReasoningDelta(choice.delta);
  if (reasoningBlock) {
  // Accumulate reasoning content - will emit ONE block later
  accumulatedReasoningContent += reasoningBlock.thought;
  }
+ // Accumulate tool calls extracted from reasoning_content
+ if (reasoningToolCalls.length > 0) {
+ for (const toolCall of reasoningToolCalls) {
+ // Convert ToolCallBlock to accumulated format
+ const index = accumulatedToolCalls.length;
+ accumulatedToolCalls[index] = {
+ id: toolCall.id,
+ type: 'function',
+ function: {
+ name: toolCall.name,
+ arguments: JSON.stringify(toolCall.parameters),
+ },
+ };
+ }
+ }
  // Check for finish_reason to detect proper stream ending
  if (choice.finish_reason) {
  logger.debug(() => `[Streaming] Stream finished with reason: ${choice.finish_reason}`, {
@@ -1601,13 +1653,25 @@ export class OpenAIProvider extends BaseProvider {
  }
  // Handle text content - buffer for Qwen format, emit immediately for others
  // Note: Synthetic API sends content that may duplicate reasoning_content.
- // This is the model's behavior - we don't filter it here.
+ // We now filter duplicates by tracking when content starts matching reasoning_content.
+ // fixes #721
  // @plan PLAN-20251202-THINKING.P16
  const rawDeltaContent = this.coerceMessageContentToString(choice.delta?.content);
  if (rawDeltaContent) {
- const deltaContent = isKimiModel
- ? rawDeltaContent
- : this.sanitizeProviderText(rawDeltaContent);
+ // For Kimi models, we need to buffer the RAW content without processing
+ // because Kimi tokens stream incrementally and partial tokens would leak
+ // through if we try to process them immediately. The buffer will be
+ // processed when flushed (at sentence boundaries or end of stream).
+ let deltaContent;
+ if (isKimiK2Model) {
+ // For Kimi: Don't process yet - just pass through and let buffering handle it
+ // We'll extract tool calls and sanitize when we flush the buffer
+ deltaContent = rawDeltaContent;
+ }
+ else {
+ // For non-Kimi models: sanitize immediately as before
+ deltaContent = this.sanitizeProviderText(rawDeltaContent);
+ }
  if (!deltaContent) {
  continue;
  }
@@ -1623,9 +1687,9 @@ export class OpenAIProvider extends BaseProvider {
  });
  // Buffer text to avoid stanza formatting
  textBuffer += deltaContent;
- const hasKimiBegin = textBuffer.includes('<|tool_calls_section_begin|>');
- const hasKimiEnd = textBuffer.includes('<|tool_calls_section_end|>');
- const hasOpenKimiSection = hasKimiBegin && !hasKimiEnd;
+ const kimiBeginCount = (textBuffer.match(/<\|tool_calls_section_begin\|>/g) || []).length;
+ const kimiEndCount = (textBuffer.match(/<\|tool_calls_section_end\|>/g) || []).length;
+ const hasOpenKimiSection = kimiBeginCount > kimiEndCount;
  // Emit buffered text when we have a complete sentence or paragraph
  // Look for natural break points, but avoid flushing mid Kimi section
  if (!hasOpenKimiSection &&
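To see why counting markers beats a simple includes() check, consider a buffer that already holds one complete section plus the start of a second (invented string):

const buf = 'a<|tool_calls_section_begin|>x<|tool_calls_section_end|><|tool_calls_section_begin|>y';
const begins = (buf.match(/<\|tool_calls_section_begin\|>/g) || []).length; // 2
const ends = (buf.match(/<\|tool_calls_section_end\|>/g) || []).length;     // 1
// Old check: both markers are present, so the section looked closed and could flush mid-call.
// New check: begins > ends, so the buffer is held until the second section closes.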
@@ -1642,12 +1706,14 @@ export class OpenAIProvider extends BaseProvider {
  // @requirement REQ-THINK-003
  const tagBasedThinking = this.extractThinkTagsAsBlock(workingText);
  if (tagBasedThinking) {
+ // Clean Kimi tokens from thinking content before accumulating
+ const cleanedThought = this.cleanThinkingContent(tagBasedThinking.thought);
  // Accumulate thinking content - don't emit yet
  // Use newline to preserve formatting between chunks (not space)
  if (accumulatedThinkingContent.length > 0) {
  accumulatedThinkingContent += '\n';
  }
- accumulatedThinkingContent += tagBasedThinking.thought;
+ accumulatedThinkingContent += cleanedThought;
  logger.debug(() => `[Streaming legacy] Accumulated thinking: ${accumulatedThinkingContent.length} chars total`);
  }
  const kimiParsed = this.extractKimiToolCallsFromText(workingText);
@@ -1709,7 +1775,10 @@ export class OpenAIProvider extends BaseProvider {
  // Always use sanitized text to strip <think> tags (legacy streaming)
  // Bug fix: Previously Kimi used unsanitized workingText
  // @plan PLAN-20251202-THINKING.P16
- if (cleanedText.trim().length > 0) {
+ // Bug fix #721: Emit whitespace-only chunks (e.g., " " between words)
+ // Previously we used cleanedText.trim().length > 0 which dropped spaces,
+ // causing "list 5" to become "list5". Now we emit any non-empty cleanedText.
+ if (cleanedText.length > 0) {
  yield {
  speaker: 'ai',
  blocks: [
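The one-line reason for this change: a chunk consisting only of a space fails the old check, so the separator between words was silently dropped (sample values for illustration).

const spaceChunk = ' ';
console.log(spaceChunk.trim().length > 0); // false - old condition dropped the chunk ("list5")
console.log(spaceChunk.length > 0);        // true  - new condition emits it ("list 5")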
@@ -1828,11 +1897,13 @@ export class OpenAIProvider extends BaseProvider {
  // @plan PLAN-20251202-THINKING.P16
  const tagBasedThinking = this.extractThinkTagsAsBlock(workingText);
  if (tagBasedThinking) {
+ // Clean Kimi tokens from thinking content before accumulating
+ const cleanedThought = this.cleanThinkingContent(tagBasedThinking.thought);
  // Use newline to preserve formatting between chunks (not space)
  if (accumulatedThinkingContent.length > 0) {
  accumulatedThinkingContent += '\n';
  }
- accumulatedThinkingContent += tagBasedThinking.thought;
+ accumulatedThinkingContent += cleanedThought;
  }
  const kimiParsed = this.extractKimiToolCallsFromText(workingText);
  if (kimiParsed.toolCalls.length > 0) {
@@ -1891,7 +1962,10 @@ export class OpenAIProvider extends BaseProvider {
  // Always use sanitized text to strip <think> tags (legacy final buffer)
  // Bug fix: Previously Kimi used unsanitized workingText
  // @plan PLAN-20251202-THINKING.P16
- if (cleanedText.trim().length > 0) {
+ // Bug fix #721: Emit whitespace-only chunks (e.g., " " between words)
+ // Previously we used cleanedText.trim().length > 0 which dropped spaces,
+ // causing "list 5" to become "list5". Now we emit any non-empty cleanedText.
+ if (cleanedText.length > 0) {
  yield {
  speaker: 'ai',
  blocks: [
@@ -1923,19 +1997,32 @@ export class OpenAIProvider extends BaseProvider {
  }
  // Emit accumulated reasoning_content as ONE ThinkingBlock (legacy path)
  // This consolidates token-by-token reasoning from Synthetic API into a single block
+ // Clean Kimi tokens from the accumulated content (not per-chunk) to handle split tokens
  // @plan PLAN-20251202-THINKING.P16
  if (accumulatedReasoningContent.length > 0) {
- yield {
- speaker: 'ai',
- blocks: [
- {
- type: 'thinking',
- thought: accumulatedReasoningContent,
- sourceField: 'reasoning_content',
- isHidden: false,
- },
- ],
- };
+ // Extract Kimi tool calls from the complete accumulated reasoning content
+ const { cleanedText: cleanedReasoning, toolCalls: reasoningToolCalls } = this.extractKimiToolCallsFromText(accumulatedReasoningContent);
+ // Emit the cleaned thinking block
+ if (cleanedReasoning.length > 0) {
+ yield {
+ speaker: 'ai',
+ blocks: [
+ {
+ type: 'thinking',
+ thought: cleanedReasoning,
+ sourceField: 'reasoning_content',
+ isHidden: false,
+ },
+ ],
+ };
+ }
+ // Emit any tool calls extracted from reasoning content
+ if (reasoningToolCalls.length > 0) {
+ yield {
+ speaker: 'ai',
+ blocks: reasoningToolCalls,
+ };
+ }
  }
  // Process and emit tool calls using legacy accumulated approach
  if (accumulatedToolCalls.length > 0) {
@@ -1962,6 +2049,7 @@ export class OpenAIProvider extends BaseProvider {
  };
  // Add usage metadata if we captured it from streaming
  if (streamingUsage) {
+ const cacheMetrics = extractCacheMetrics(streamingUsage);
  toolCallsContent.metadata = {
  usage: {
  promptTokens: streamingUsage.prompt_tokens || 0,
@@ -1969,6 +2057,9 @@ export class OpenAIProvider extends BaseProvider {
  totalTokens: streamingUsage.total_tokens ||
  (streamingUsage.prompt_tokens || 0) +
  (streamingUsage.completion_tokens || 0),
+ cachedTokens: cacheMetrics.cachedTokens,
+ cacheCreationTokens: cacheMetrics.cacheCreationTokens,
+ cacheMissTokens: cacheMetrics.cacheMissTokens,
  },
  };
  }
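The body of cacheMetricsExtractor.js is not part of this section, so as a hedged sketch only: the extractor presumably maps provider usage fields onto the three metrics added above. The input field names below are assumptions based on common OpenAI-, DeepSeek-, and Anthropic-style usage payloads, not the package's actual logic.

// Hypothetical sketch, not the package's cacheMetricsExtractor.js.
function extractCacheMetricsSketch(usage) {
  const cachedTokens =
    usage?.prompt_tokens_details?.cached_tokens ??  // OpenAI-style detail block
    usage?.prompt_cache_hit_tokens ?? 0;            // DeepSeek-style field
  const cacheMissTokens = usage?.prompt_cache_miss_tokens ?? 0;
  const cacheCreationTokens = usage?.cache_creation_input_tokens ?? 0; // Anthropic-style field
  return { cachedTokens, cacheCreationTokens, cacheMissTokens };
}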
@@ -1977,6 +2068,7 @@ export class OpenAIProvider extends BaseProvider {
  }
  // If we have usage information but no tool calls, emit a metadata-only response
  if (streamingUsage && accumulatedToolCalls.length === 0) {
+ const cacheMetrics = extractCacheMetrics(streamingUsage);
  yield {
  speaker: 'ai',
  blocks: [],
@@ -1987,6 +2079,9 @@ export class OpenAIProvider extends BaseProvider {
  totalTokens: streamingUsage.total_tokens ||
  (streamingUsage.prompt_tokens || 0) +
  (streamingUsage.completion_tokens || 0),
+ cachedTokens: cacheMetrics.cachedTokens,
+ cacheCreationTokens: cacheMetrics.cacheCreationTokens,
+ cacheMissTokens: cacheMetrics.cacheMissTokens,
  },
  },
  };
@@ -2051,8 +2146,10 @@ export class OpenAIProvider extends BaseProvider {
  }
  const blocks = [];
  // Parse reasoning_content from response (Phase 16 integration)
- const reasoningBlock = this.parseNonStreamingReasoning(choice.message);
- logger.debug(() => `[Non-streaming] parseNonStreamingReasoning result: ${reasoningBlock ? `found (${reasoningBlock.thought?.length} chars)` : 'not found'}`, {
+ // Extract embedded Kimi K2 tool calls from reasoning_content (fixes #749)
+ // @requirement REQ-KIMI-REASONING-001.2
+ const { thinking: reasoningBlock, toolCalls: reasoningToolCalls } = this.parseNonStreamingReasoning(choice.message);
+ logger.debug(() => `[Non-streaming] parseNonStreamingReasoning result: ${reasoningBlock ? `found (${reasoningBlock.thought?.length} chars)` : 'not found'}, tool calls: ${reasoningToolCalls.length}`, {
  hasReasoningContent: 'reasoning_content' in
  (choice.message ?? {}),
  messageKeys: Object.keys(choice.message ?? {}),
@@ -2060,6 +2157,11 @@ export class OpenAIProvider extends BaseProvider {
  if (reasoningBlock) {
  blocks.push(reasoningBlock);
  }
+ // Add tool calls extracted from reasoning_content
+ if (reasoningToolCalls.length > 0) {
+ blocks.push(...reasoningToolCalls);
+ logger.debug(() => `[Non-streaming] Added ${reasoningToolCalls.length} tool calls from reasoning_content`);
+ }
  // Handle text content (strip thinking / reasoning blocks) and Kimi tool sections
  const rawMessageContent = this.coerceMessageContentToString(choice.message?.content);
  let kimiCleanContent;
@@ -2163,6 +2265,7 @@ export class OpenAIProvider extends BaseProvider {
  };
  // Add usage metadata from non-streaming response
  if (completion.usage) {
+ const cacheMetrics = extractCacheMetrics(completion.usage);
  responseContent.metadata = {
  usage: {
  promptTokens: completion.usage.prompt_tokens || 0,
@@ -2170,6 +2273,9 @@ export class OpenAIProvider extends BaseProvider {
  totalTokens: completion.usage.total_tokens ||
  (completion.usage.prompt_tokens || 0) +
  (completion.usage.completion_tokens || 0),
+ cachedTokens: cacheMetrics.cachedTokens,
+ cacheCreationTokens: cacheMetrics.cacheCreationTokens,
+ cacheMissTokens: cacheMetrics.cacheMissTokens,
  },
  };
  }
@@ -2177,6 +2283,7 @@ export class OpenAIProvider extends BaseProvider {
  }
  else if (completion.usage) {
  // Emit metadata-only response if no content blocks but have usage info
+ const cacheMetrics = extractCacheMetrics(completion.usage);
  yield {
  speaker: 'ai',
  blocks: [],
@@ -2187,6 +2294,9 @@ export class OpenAIProvider extends BaseProvider {
  totalTokens: completion.usage.total_tokens ||
  (completion.usage.prompt_tokens || 0) +
  (completion.usage.completion_tokens || 0),
+ cachedTokens: cacheMetrics.cachedTokens,
+ cacheCreationTokens: cacheMetrics.cacheCreationTokens,
+ cacheMissTokens: cacheMetrics.cacheMissTokens,
  },
  },
  };
@@ -2459,7 +2569,7 @@ export class OpenAIProvider extends BaseProvider {
  });
  // Dump successful streaming request if enabled
  if (shouldDumpSuccess) {
- await dumpSDKContext('openai', '/v1/chat/completions', requestBody, { streaming: true }, false, baseURL || 'https://api.openai.com');
+ await dumpSDKContext('openai', '/chat/completions', requestBody, { streaming: true }, false, baseURL || 'https://api.openai.com/v1');
  }
  break;
  }
@@ -2492,7 +2602,7 @@ export class OpenAIProvider extends BaseProvider {
  // Dump error if enabled
  if (shouldDumpError) {
  const dumpErrorMessage = error instanceof Error ? error.message : String(error);
- await dumpSDKContext('openai', '/v1/chat/completions', requestBody, { error: dumpErrorMessage }, true, baseURL || 'https://api.openai.com');
+ await dumpSDKContext('openai', '/chat/completions', requestBody, { error: dumpErrorMessage }, true, baseURL || 'https://api.openai.com/v1');
  }
  // Re-throw other errors as-is
  const capturedErrorMessage = error instanceof Error ? error.message : String(error);
@@ -2530,7 +2640,7 @@ export class OpenAIProvider extends BaseProvider {
  }));
  // Dump successful non-streaming request if enabled
  if (shouldDumpSuccess) {
- await dumpSDKContext('openai', '/v1/chat/completions', requestBody, response, false, baseURL || 'https://api.openai.com');
+ await dumpSDKContext('openai', '/chat/completions', requestBody, response, false, baseURL || 'https://api.openai.com/v1');
  }
  break;
  }
@@ -2569,7 +2679,7 @@ export class OpenAIProvider extends BaseProvider {
  // Dump error if enabled
  if (shouldDumpError) {
  const dumpErrorMessage = error instanceof Error ? error.message : String(error);
- await dumpSDKContext('openai', '/v1/chat/completions', requestBody, { error: dumpErrorMessage }, true, baseURL || 'https://api.openai.com');
+ await dumpSDKContext('openai', '/chat/completions', requestBody, { error: dumpErrorMessage }, true, baseURL || 'https://api.openai.com/v1');
  }
  const capturedErrorMessage = error instanceof Error ? error.message : String(error);
  const status = typeof error === 'object' &&
@@ -2599,9 +2709,9 @@ export class OpenAIProvider extends BaseProvider {
  // Buffer for accumulating text chunks for providers that need it
  let textBuffer = '';
  // Use the same detected format from earlier for consistency
- const isKimiModel = model.toLowerCase().includes('kimi-k2');
+ const isKimiK2Model = model.toLowerCase().includes('kimi-k2');
  // Buffer text for Qwen format providers and Kimi-K2 to avoid stanza formatting
- const shouldBufferText = detectedFormat === 'qwen' || isKimiModel;
+ const shouldBufferText = detectedFormat === 'qwen' || isKimiK2Model;
  // Accumulate thinking content across the entire stream to emit as ONE block
  // This handles fragmented <think>word</think> streaming from Synthetic API
  // @plan PLAN-20251202-THINKING.P16
@@ -2671,13 +2781,28 @@ export class OpenAIProvider extends BaseProvider {
  continue;
  // Parse reasoning_content from streaming delta (Pipeline path)
  // ACCUMULATE instead of yielding immediately to handle token-by-token streaming
+ // Extract embedded Kimi K2 tool calls from reasoning_content (fixes #749)
  // @plan PLAN-20251202-THINKING.P16
- // @requirement REQ-THINK-003.1
- const reasoningBlock = this.parseStreamingReasoningDelta(choice.delta);
+ // @requirement REQ-THINK-003.1, REQ-KIMI-REASONING-001.1
+ const { thinking: reasoningBlock, toolCalls: reasoningToolCalls } = this.parseStreamingReasoningDelta(choice.delta);
  if (reasoningBlock) {
  // Accumulate reasoning content - will emit ONE block later
  accumulatedReasoningContent += reasoningBlock.thought;
  }
+ // Add tool calls extracted from reasoning_content to pipeline
+ if (reasoningToolCalls.length > 0) {
+ // Get current pipeline stats to determine next index
+ const stats = this.toolCallPipeline.getStats();
+ let baseIndex = stats.collector.totalCalls;
+ for (const toolCall of reasoningToolCalls) {
+ // Add complete tool call as fragments to pipeline
+ this.toolCallPipeline.addFragment(baseIndex, {
+ name: toolCall.name,
+ args: JSON.stringify(toolCall.parameters),
+ });
+ baseIndex++;
+ }
+ }
  // Check for finish_reason to detect proper stream ending
  if (choice.finish_reason) {
  logger.debug(() => `[Streaming] Stream finished with reason: ${choice.finish_reason}`, {
@@ -2698,13 +2823,24 @@ export class OpenAIProvider extends BaseProvider {
  }
  // Handle text content - buffer for Qwen format, emit immediately for others
  // Note: Synthetic API sends content that may duplicate reasoning_content.
- // This is the model's behavior - we don't filter it here.
+ // This is the model's behavior - we don't filter it here as detection is unreliable.
  // @plan PLAN-20251202-THINKING.P16
  const rawDeltaContent = this.coerceMessageContentToString(choice.delta?.content);
  if (rawDeltaContent) {
- const deltaContent = isKimiModel
- ? rawDeltaContent
- : this.sanitizeProviderText(rawDeltaContent);
+ // For Kimi models, we need to buffer the RAW content without processing
+ // because Kimi tokens stream incrementally and partial tokens would leak
+ // through if we try to process them immediately. The buffer will be
+ // processed when flushed (at sentence boundaries or end of stream).
+ let deltaContent;
+ if (isKimiK2Model) {
+ // For Kimi: Don't process yet - just pass through and let buffering handle it
+ // We'll extract tool calls and sanitize when we flush the buffer
+ deltaContent = rawDeltaContent;
+ }
+ else {
+ // For non-Kimi models: sanitize immediately as before
+ deltaContent = this.sanitizeProviderText(rawDeltaContent);
+ }
  if (!deltaContent) {
  continue;
  }
@@ -2720,9 +2856,9 @@ export class OpenAIProvider extends BaseProvider {
  });
  // Buffer text to avoid stanza formatting
  textBuffer += deltaContent;
- const hasKimiBegin = textBuffer.includes('<|tool_calls_section_begin|>');
- const hasKimiEnd = textBuffer.includes('<|tool_calls_section_end|>');
- const hasOpenKimiSection = hasKimiBegin && !hasKimiEnd;
+ const kimiBeginCount = (textBuffer.match(/<\|tool_calls_section_begin\|>/g) || []).length;
+ const kimiEndCount = (textBuffer.match(/<\|tool_calls_section_end\|>/g) || []).length;
+ const hasOpenKimiSection = kimiBeginCount > kimiEndCount;
  // Emit buffered text when we have a complete sentence or paragraph
  // Look for natural break points, avoiding flush mid Kimi section
  if (!hasOpenKimiSection &&
@@ -2739,12 +2875,14 @@ export class OpenAIProvider extends BaseProvider {
  // @requirement REQ-THINK-003
  const tagBasedThinking = this.extractThinkTagsAsBlock(workingText);
  if (tagBasedThinking) {
+ // Clean Kimi tokens from thinking content before accumulating
+ const cleanedThought = this.cleanThinkingContent(tagBasedThinking.thought);
  // Accumulate thinking content - don't emit yet
  // Use newline to preserve formatting between chunks (not space)
  if (accumulatedThinkingContent.length > 0) {
  accumulatedThinkingContent += '\n';
  }
- accumulatedThinkingContent += tagBasedThinking.thought;
+ accumulatedThinkingContent += cleanedThought;
  logger.debug(() => `[Streaming] Accumulated thinking: ${accumulatedThinkingContent.length} chars total`);
  }
  const kimiParsed = this.extractKimiToolCallsFromText(workingText);
@@ -2806,7 +2944,10 @@ export class OpenAIProvider extends BaseProvider {
  // Always use sanitized text to strip <think> tags (pipeline streaming)
  // Bug fix: Previously Kimi used unsanitized workingText
  // @plan PLAN-20251202-THINKING.P16
- if (cleanedText.trim().length > 0) {
+ // Bug fix #721: Emit whitespace-only chunks (e.g., " " between words)
+ // Previously we used cleanedText.trim().length > 0 which dropped spaces,
+ // causing "list 5" to become "list5". Now we emit any non-empty cleanedText.
+ if (cleanedText.length > 0) {
  yield {
  speaker: 'ai',
  blocks: [
@@ -2906,11 +3047,13 @@ export class OpenAIProvider extends BaseProvider {
  // @plan PLAN-20251202-THINKING.P16
  const tagBasedThinking = this.extractThinkTagsAsBlock(workingText);
  if (tagBasedThinking) {
+ // Clean Kimi tokens from thinking content before accumulating
+ const cleanedThought = this.cleanThinkingContent(tagBasedThinking.thought);
  // Use newline to preserve formatting between chunks (not space)
  if (accumulatedThinkingContent.length > 0) {
  accumulatedThinkingContent += '\n';
  }
- accumulatedThinkingContent += tagBasedThinking.thought;
+ accumulatedThinkingContent += cleanedThought;
  }
  const kimiParsed = this.extractKimiToolCallsFromText(workingText);
  if (kimiParsed.toolCalls.length > 0) {
@@ -2969,7 +3112,10 @@ export class OpenAIProvider extends BaseProvider {
  // Always use sanitized text to strip <think> tags (pipeline final buffer)
  // Bug fix: Previously Kimi used unsanitized workingText
  // @plan PLAN-20251202-THINKING.P16
- if (cleanedText.trim().length > 0) {
+ // Bug fix #721: Emit whitespace-only chunks (e.g., " " between words)
+ // Previously we used cleanedText.trim().length > 0 which dropped spaces,
+ // causing "list 5" to become "list5". Now we emit any non-empty cleanedText.
+ if (cleanedText.length > 0) {
  yield {
  speaker: 'ai',
  blocks: [
@@ -3001,19 +3147,32 @@ export class OpenAIProvider extends BaseProvider {
  }
  // Emit accumulated reasoning_content as ONE ThinkingBlock (pipeline path)
  // This consolidates token-by-token reasoning from Synthetic API into a single block
+ // Clean Kimi tokens from the accumulated content (not per-chunk) to handle split tokens
  // @plan PLAN-20251202-THINKING.P16
  if (accumulatedReasoningContent.length > 0) {
- yield {
- speaker: 'ai',
- blocks: [
- {
- type: 'thinking',
- thought: accumulatedReasoningContent,
- sourceField: 'reasoning_content',
- isHidden: false,
- },
- ],
- };
+ // Extract Kimi tool calls from the complete accumulated reasoning content
+ const { cleanedText: cleanedReasoning, toolCalls: reasoningToolCalls } = this.extractKimiToolCallsFromText(accumulatedReasoningContent);
+ // Emit the cleaned thinking block
+ if (cleanedReasoning.length > 0) {
+ yield {
+ speaker: 'ai',
+ blocks: [
+ {
+ type: 'thinking',
+ thought: cleanedReasoning,
+ sourceField: 'reasoning_content',
+ isHidden: false,
+ },
+ ],
+ };
+ }
+ // Emit any tool calls extracted from reasoning content
+ if (reasoningToolCalls.length > 0) {
+ yield {
+ speaker: 'ai',
+ blocks: reasoningToolCalls,
+ };
+ }
  }
  // Process and emit tool calls using the pipeline
  const pipelineResult = await this.toolCallPipeline.process(abortSignal);
@@ -3043,6 +3202,7 @@ export class OpenAIProvider extends BaseProvider {
  };
  // Add usage metadata if we captured it from streaming
  if (streamingUsage) {
+ const cacheMetrics = extractCacheMetrics(streamingUsage);
  toolCallsContent.metadata = {
  usage: {
  promptTokens: streamingUsage.prompt_tokens || 0,
@@ -3050,6 +3210,9 @@ export class OpenAIProvider extends BaseProvider {
  totalTokens: streamingUsage.total_tokens ||
  (streamingUsage.prompt_tokens || 0) +
  (streamingUsage.completion_tokens || 0),
+ cachedTokens: cacheMetrics.cachedTokens,
+ cacheCreationTokens: cacheMetrics.cacheCreationTokens,
+ cacheMissTokens: cacheMetrics.cacheMissTokens,
  },
  };
  }
@@ -3059,6 +3222,7 @@ export class OpenAIProvider extends BaseProvider {
  // If we have usage information but no tool calls, emit a metadata-only response
  if (streamingUsage &&
  this.toolCallPipeline.getStats().collector.totalCalls === 0) {
+ const cacheMetrics = extractCacheMetrics(streamingUsage);
  yield {
  speaker: 'ai',
  blocks: [],
@@ -3069,6 +3233,9 @@ export class OpenAIProvider extends BaseProvider {
  totalTokens: streamingUsage.total_tokens ||
  (streamingUsage.prompt_tokens || 0) +
  (streamingUsage.completion_tokens || 0),
+ cachedTokens: cacheMetrics.cachedTokens,
+ cacheCreationTokens: cacheMetrics.cacheCreationTokens,
+ cacheMissTokens: cacheMetrics.cacheMissTokens,
  },
  },
  };
@@ -3221,6 +3388,7 @@ export class OpenAIProvider extends BaseProvider {
  };
  // Add usage metadata from non-streaming response
  if (completion.usage) {
+ const cacheMetrics = extractCacheMetrics(completion.usage);
  responseContent.metadata = {
  usage: {
  promptTokens: completion.usage.prompt_tokens || 0,
@@ -3228,6 +3396,9 @@ export class OpenAIProvider extends BaseProvider {
  totalTokens: completion.usage.total_tokens ||
  (completion.usage.prompt_tokens || 0) +
  (completion.usage.completion_tokens || 0),
+ cachedTokens: cacheMetrics.cachedTokens,
+ cacheCreationTokens: cacheMetrics.cacheCreationTokens,
+ cacheMissTokens: cacheMetrics.cacheMissTokens,
  },
  };
  }
@@ -3235,6 +3406,7 @@ export class OpenAIProvider extends BaseProvider {
  }
  else if (completion.usage) {
  // Emit metadata-only response if no content blocks but have usage info
+ const cacheMetrics = extractCacheMetrics(completion.usage);
  yield {
  speaker: 'ai',
  blocks: [],
@@ -3245,6 +3417,9 @@ export class OpenAIProvider extends BaseProvider {
  totalTokens: completion.usage.total_tokens ||
  (completion.usage.prompt_tokens || 0) +
  (completion.usage.completion_tokens || 0),
+ cachedTokens: cacheMetrics.cachedTokens,
+ cacheCreationTokens: cacheMetrics.cacheCreationTokens,
+ cacheMissTokens: cacheMetrics.cacheMissTokens,
  },
  },
  };
@@ -3279,6 +3454,12 @@ export class OpenAIProvider extends BaseProvider {
  logger.debug(() => `Auto-detected 'kimi' format for K2 model: ${modelName}`);
  return 'kimi';
  }
+ // Check for Mistral models (requires 9-char alphanumeric IDs)
+ // This applies to both hosted API and self-hosted Mistral models
+ if (isMistralModel(modelName)) {
+ logger.debug(() => `Auto-detected 'mistral' format for Mistral model: ${modelName}`);
+ return 'mistral';
+ }
  const lowerModelName = modelName.toLowerCase();
  // Check for GLM-4 models (glm-4, glm-4.5, glm-4.6, glm-4-5, etc.)
  if (lowerModelName.includes('glm-4')) {
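isMistralModel comes from the new ToolIdStrategy.js, whose body is not shown in this section; a plausible sketch of such a check follows, with the matched family names being assumptions rather than the package's actual list:

// Hypothetical sketch only - the real isMistralModel may match differently.
function isMistralModelSketch(modelName) {
  const lower = (modelName || '').toLowerCase();
  return ['mistral', 'mixtral', 'codestral', 'devstral', 'ministral']
    .some((family) => lower.includes(family));
}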
@@ -3361,57 +3542,75 @@ export class OpenAIProvider extends BaseProvider {
  * Parse reasoning_content from streaming delta.
  *
  * @plan PLAN-20251202-THINKING.P11, PLAN-20251202-THINKING.P16
- * @requirement REQ-THINK-003.1, REQ-THINK-003.3, REQ-THINK-003.4
+ * @requirement REQ-THINK-003.1, REQ-THINK-003.3, REQ-THINK-003.4, REQ-KIMI-REASONING-001.1
+ * @issue #749
  */
  parseStreamingReasoningDelta(delta) {
  if (!delta) {
- return null;
+ return { thinking: null, toolCalls: [] };
  }
  // Access reasoning_content via type assertion since OpenAI SDK doesn't declare it
  const reasoningContent = delta
  .reasoning_content;
  // Handle absent, null, or non-string
  if (!reasoningContent || typeof reasoningContent !== 'string') {
- return null;
- }
- // Handle empty string or whitespace-only
- if (reasoningContent.trim().length === 0) {
- return null;
- }
- return {
- type: 'thinking',
- thought: reasoningContent,
- sourceField: 'reasoning_content',
- isHidden: false,
- };
+ return { thinking: null, toolCalls: [] };
+ }
+ // Handle empty string only - preserve whitespace-only content (spaces, tabs)
+ // to maintain proper formatting in accumulated reasoning (fixes issue #721)
+ if (reasoningContent.length === 0) {
+ return { thinking: null, toolCalls: [] };
+ }
+ // Extract Kimi K2 tool calls embedded in reasoning_content (fixes issue #749)
+ const { cleanedText, toolCalls } = this.extractKimiToolCallsFromText(reasoningContent);
+ // For streaming, preserve whitespace-only content for proper formatting (issue #721)
+ // Only return null if the cleaned text is empty (length 0)
+ const thinkingBlock = cleanedText.length === 0
+ ? null
+ : {
+ type: 'thinking',
+ thought: cleanedText,
+ sourceField: 'reasoning_content',
+ isHidden: false,
+ };
+ return { thinking: thinkingBlock, toolCalls };
  }
  /**
  * Parse reasoning_content from non-streaming message.
  *
  * @plan PLAN-20251202-THINKING.P11, PLAN-20251202-THINKING.P16
- * @requirement REQ-THINK-003.2, REQ-THINK-003.3, REQ-THINK-003.4
+ * @requirement REQ-THINK-003.2, REQ-THINK-003.3, REQ-THINK-003.4, REQ-KIMI-REASONING-001.2
+ * @issue #749
  */
  parseNonStreamingReasoning(message) {
  if (!message) {
- return null;
+ return { thinking: null, toolCalls: [] };
  }
  // Access reasoning_content via type assertion since OpenAI SDK doesn't declare it
  const reasoningContent = message
  .reasoning_content;
  // Handle absent, null, or non-string
  if (!reasoningContent || typeof reasoningContent !== 'string') {
- return null;
+ return { thinking: null, toolCalls: [] };
  }
- // Handle empty string or whitespace-only
+ // Handle empty string or whitespace-only - for non-streaming complete responses,
+ // whitespace-only reasoning is unusual and should be treated as no reasoning
  if (reasoningContent.trim().length === 0) {
- return null;
- }
- return {
- type: 'thinking',
- thought: reasoningContent,
- sourceField: 'reasoning_content',
- isHidden: false,
- };
+ return { thinking: null, toolCalls: [] };
+ }
+ // Extract Kimi K2 tool calls embedded in reasoning_content (fixes issue #749)
+ const { cleanedText, toolCalls } = this.extractKimiToolCallsFromText(reasoningContent);
+ // For non-streaming, trim whitespace after extraction
+ const trimmedText = cleanedText.trim();
+ const thinkingBlock = trimmedText.length === 0
+ ? null
+ : {
+ type: 'thinking',
+ thought: trimmedText,
+ sourceField: 'reasoning_content',
+ isHidden: false,
+ };
+ return { thinking: thinkingBlock, toolCalls };
  }
  }
  //# sourceMappingURL=OpenAIProvider.js.map