@vybestack/llxprt-code-core 0.7.0-nightly.251209.0061bd6bf → 0.7.0-nightly.251211.134f1920b

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. package/dist/src/config/profileManager.js +2 -0
  2. package/dist/src/config/profileManager.js.map +1 -1
  3. package/dist/src/prompt-config/prompt-resolver.js +4 -0
  4. package/dist/src/prompt-config/prompt-resolver.js.map +1 -1
  5. package/dist/src/providers/LoggingProviderWrapper.d.ts +2 -1
  6. package/dist/src/providers/LoggingProviderWrapper.js +16 -4
  7. package/dist/src/providers/LoggingProviderWrapper.js.map +1 -1
  8. package/dist/src/providers/ProviderManager.d.ts +6 -3
  9. package/dist/src/providers/ProviderManager.js +16 -4
  10. package/dist/src/providers/ProviderManager.js.map +1 -1
  11. package/dist/src/providers/openai/OpenAIProvider.d.ts +10 -2
  12. package/dist/src/providers/openai/OpenAIProvider.js +335 -136
  13. package/dist/src/providers/openai/OpenAIProvider.js.map +1 -1
  14. package/dist/src/providers/openai-vercel/OpenAIVercelProvider.d.ts +3 -0
  15. package/dist/src/providers/openai-vercel/OpenAIVercelProvider.js +255 -22
  16. package/dist/src/providers/openai-vercel/OpenAIVercelProvider.js.map +1 -1
  17. package/dist/src/providers/openai-vercel/messageConversion.d.ts +4 -1
  18. package/dist/src/providers/openai-vercel/messageConversion.js +41 -6
  19. package/dist/src/providers/openai-vercel/messageConversion.js.map +1 -1
  20. package/dist/src/providers/reasoning/reasoningUtils.d.ts +26 -1
  21. package/dist/src/providers/reasoning/reasoningUtils.js +157 -0
  22. package/dist/src/providers/reasoning/reasoningUtils.js.map +1 -1
  23. package/dist/src/providers/utils/cacheMetricsExtractor.d.ts +6 -0
  24. package/dist/src/providers/utils/cacheMetricsExtractor.js +36 -0
  25. package/dist/src/providers/utils/cacheMetricsExtractor.js.map +1 -0
  26. package/dist/src/services/history/IContent.d.ts +3 -7
  27. package/dist/src/services/history/IContent.js.map +1 -1
  28. package/dist/src/tools/IToolFormatter.d.ts +1 -1
  29. package/dist/src/tools/ToolIdStrategy.d.ts +25 -0
  30. package/dist/src/tools/ToolIdStrategy.js +108 -0
  31. package/dist/src/tools/ToolIdStrategy.js.map +1 -1
  32. package/dist/src/tools/task.js +14 -2
  33. package/dist/src/tools/task.js.map +1 -1
  34. package/dist/src/utils/generateContentResponseUtilities.js +6 -0
  35. package/dist/src/utils/generateContentResponseUtilities.js.map +1 -1
  36. package/package.json +1 -1
@@ -22,7 +22,7 @@ import crypto from 'node:crypto';
  import * as http from 'http';
  import * as https from 'https';
  import * as net from 'net';
- import { isKimiModel, getToolIdStrategy, } from '../../tools/ToolIdStrategy.js';
+ import { isKimiModel, isMistralModel, getToolIdStrategy, } from '../../tools/ToolIdStrategy.js';
  import { BaseProvider, } from '../BaseProvider.js';
  import { DebugLogger } from '../../debug/index.js';
  import { ToolFormatter } from '../../tools/ToolFormatter.js';
@@ -40,6 +40,7 @@ import { buildToolResponsePayload, EMPTY_TOOL_RESULT_PLACEHOLDER, } from '../uti
  import { isLocalEndpoint } from '../utils/localEndpoint.js';
  import { filterThinkingForContext, thinkingToReasoningField, extractThinkingBlocks, } from '../reasoning/reasoningUtils.js';
  import { shouldDumpSDKContext, dumpSDKContext, } from '../utils/dumpSDKContext.js';
+ import { extractCacheMetrics } from '../utils/cacheMetricsExtractor.js';
  const MAX_TOOL_RESPONSE_CHARS = 1024;
  const MAX_TOOL_RESPONSE_RETRY_CHARS = 512;
  const TOOL_ARGS_PREVIEW_LENGTH = 500;
@@ -277,13 +278,12 @@ export class OpenAIProvider extends BaseProvider {
  // This preserves meaningful whitespace in regular text chunks during streaming
  // (e.g., " 5 Biggest" should remain " 5 Biggest", not become "5 Biggest")
  if (hadReasoningTags) {
- // Clean up multiple consecutive spaces/whitespace that may result from stripping
+ // Collapse multiple spaces/tabs but preserve newlines for proper paragraph/line breaks
  str = str.replace(/[ \t]+/g, ' ');
  str = str.replace(/\n{3,}/g, '\n\n');
- // Only trim leading whitespace when think tags were at the beginning
- // This prevents leading spaces from "<think>...</think>text" -> " text"
- // but preserves trailing whitespace for streaming chunk concatenation
- str = str.trimStart();
+ // Only trim leading horizontal whitespace (spaces/tabs), NOT newlines
+ // This preserves line breaks between think tags and content (fixes #721)
+ str = str.replace(/^[ \t]+/, '');
  }
  const afterLen = str.length;
  if (hadReasoningTags && afterLen !== beforeLen) {
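For illustration, a minimal sketch of how the three replacements above behave on a sample chunk (the sample string is invented, not taken from the package):

// Illustrative only - mirrors the three replace() calls in the hunk above.
let str = '   \n\nFirst   point\n\n\n\nSecond\tpoint  ';
str = str.replace(/[ \t]+/g, ' ');    // collapse runs of spaces/tabs, keep newlines
str = str.replace(/\n{3,}/g, '\n\n'); // cap blank runs at a single blank line
str = str.replace(/^[ \t]+/, '');     // trim leading spaces/tabs only, not newlines
// Result: '\n\nFirst point\n\nSecond point ' - line breaks and the trailing space
// survive, which is what streaming chunk concatenation relies on.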
@@ -438,62 +438,78 @@ export class OpenAIProvider extends BaseProvider {
  * and all tool info is only encoded in the text template.
  */
  extractKimiToolCallsFromText(raw) {
- if (!raw || !raw.includes('<|tool_calls_section_begin|>')) {
+ // Return early only if input is null/undefined/empty
+ if (!raw) {
  return { cleanedText: raw, toolCalls: [] };
  }
  const logger = this.getLogger();
  const toolCalls = [];
  let text = raw;
- const sectionRegex = /<\|tool_calls_section_begin\|>([\s\S]*?)<\|tool_calls_section_end\|>/g;
- text = text.replace(sectionRegex, (_sectionMatch, sectionBody) => {
- try {
- const callRegex = /<\|tool_call_begin\|>\s*([^<]+?)\s*<\|tool_call_argument_begin\|>\s*([\s\S]*?)\s*<\|tool_call_end\|>/g;
- let m;
- while ((m = callRegex.exec(sectionBody)) !== null) {
- const rawId = m[1].trim();
- const rawArgs = m[2].trim();
- // Infer tool name from ID.
- let toolName = '';
- const match = /^functions\.([A-Za-z0-9_]+):\d+/i.exec(rawId) ||
- /^[A-Za-z0-9_]+\.([A-Za-z0-9_]+):\d+/.exec(rawId);
- if (match) {
- toolName = match[1];
- }
- else {
- const colonParts = rawId.split(':');
- const head = colonParts[0] || rawId;
- const dotParts = head.split('.');
- toolName = dotParts[dotParts.length - 1] || head;
+ // Extract tool calls from complete sections if present
+ if (raw.includes('<|tool_calls_section_begin|>')) {
+ const sectionRegex = /<\|tool_calls_section_begin\|>([\s\S]*?)<\|tool_calls_section_end\|>/g;
+ text = text.replace(sectionRegex, (_sectionMatch, sectionBody) => {
+ try {
+ const callRegex = /<\|tool_call_begin\|>\s*([^<]+?)\s*<\|tool_call_argument_begin\|>\s*([\s\S]*?)\s*<\|tool_call_end\|>/g;
+ let m;
+ while ((m = callRegex.exec(sectionBody)) !== null) {
+ const rawId = m[1].trim();
+ const rawArgs = m[2].trim();
+ // Infer tool name from ID.
+ let toolName = '';
+ const match = /^functions\.([A-Za-z0-9_]+):\d+/i.exec(rawId) ||
+ /^[A-Za-z0-9_]+\.([A-Za-z0-9_]+):\d+/.exec(rawId);
+ if (match) {
+ toolName = match[1];
+ }
+ else {
+ const colonParts = rawId.split(':');
+ const head = colonParts[0] || rawId;
+ const dotParts = head.split('.');
+ toolName = dotParts[dotParts.length - 1] || head;
+ }
+ // Normalize tool name (handles Kimi-K2 style prefixes like call_functionsglob7)
+ toolName = this.normalizeToolName(toolName);
+ const sanitizedArgs = this.sanitizeToolArgumentsString(rawArgs);
+ const processedParameters = processToolParameters(sanitizedArgs, toolName);
+ toolCalls.push({
+ type: 'tool_call',
+ id: this.normalizeToHistoryToolId(rawId),
+ name: toolName,
+ parameters: processedParameters,
+ });
  }
- // Normalize tool name (handles Kimi-K2 style prefixes like call_functionsglob7)
- toolName = this.normalizeToolName(toolName);
- const sanitizedArgs = this.sanitizeToolArgumentsString(rawArgs);
- const processedParameters = processToolParameters(sanitizedArgs, toolName);
- toolCalls.push({
- type: 'tool_call',
- id: this.normalizeToHistoryToolId(rawId),
- name: toolName,
- parameters: processedParameters,
- });
  }
- }
- catch (err) {
- logger.debug(() => `[OpenAIProvider] Failed to parse Kimi tool_calls_section: ${err}`);
- }
- // Strip the entire tool section from user-visible text
- return '';
- });
- if (toolCalls.length > 0) {
- logger.debug(() => `[OpenAIProvider] Parsed Kimi tool_calls_section`, {
- toolCallCount: toolCalls.length,
- originalLength: raw.length,
- cleanedLength: text.length,
+ catch (err) {
+ logger.debug(() => `[OpenAIProvider] Failed to parse Kimi tool_calls_section: ${err}`);
+ }
+ // Strip the entire tool section from user-visible text
+ return '';
  });
+ if (toolCalls.length > 0) {
+ logger.debug(() => `[OpenAIProvider] Parsed Kimi tool_calls_section`, {
+ toolCallCount: toolCalls.length,
+ originalLength: raw.length,
+ cleanedLength: text.length,
+ });
+ }
  }
+ // ALWAYS run stray token cleanup, even if no complete sections were found
+ // This handles partial sections, malformed tokens, orphaned markers, etc.
+ text = text.replace(/<\|tool_call(?:_(?:begin|end|argument_begin))?\|>/g, '');
+ text = text.replace(/<\|tool_calls_section_(?:begin|end)\|>/g, '');
  // Don't trim - preserve leading/trailing newlines that are important for formatting
  // (e.g., numbered lists from Kimi K2 that have newlines between items)
  return { cleanedText: text, toolCalls };
  }
+ /**
+ * Clean Kimi K2 tool call tokens from thinking content.
+ * Used when extracting thinking from <think> tags that may contain embedded tool calls.
+ * @issue #749
+ */
+ cleanThinkingContent(thought) {
+ return this.extractKimiToolCallsFromText(thought).cleanedText;
+ }
  /**
  * @plan:PLAN-20251023-STATELESS-HARDENING.P09
  * @requirement:REQ-SP4-002
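As a rough illustration of the token format this method parses, here is an invented Kimi K2 response fragment and the approximate result (the tool name, arguments, and exact ID normalization are made up for the example):

const raw = 'Listing files.<|tool_calls_section_begin|>' +
  '<|tool_call_begin|>functions.glob:0<|tool_call_argument_begin|>{"pattern":"*.ts"}<|tool_call_end|>' +
  '<|tool_calls_section_end|>';
// extractKimiToolCallsFromText(raw) strips the section from the visible text and
// returns roughly { cleanedText: 'Listing files.', toolCalls: [{ type: 'tool_call',
//   name: 'glob', parameters: { pattern: '*.ts' }, id: /* normalized from 'functions.glob:0' */ }] }.
// With the new fallback cleanup, stray markers such as a lone '<|tool_call_end|>'
// are also removed even when no complete section is present.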
@@ -910,9 +926,12 @@ export class OpenAIProvider extends BaseProvider {
  }
  else {
  // Assistant message with tool calls
+ // CRITICAL for Mistral API compatibility (#760):
+ // When tool_calls are present, we must NOT include a content property at all
+ // (not even null). Mistral's OpenAI-compatible API requires this.
+ // See: https://docs.mistral.ai/capabilities/function_calling
  messages.push({
  role: 'assistant',
- content: text || null,
  tool_calls: toolCalls.map((tc) => ({
  id: this.normalizeToOpenAIToolId(tc.id),
  type: 'function',
@@ -948,10 +967,16 @@ export class OpenAIProvider extends BaseProvider {
  }
  else {
  for (const tr of toolResponses) {
+ // CRITICAL for Mistral API compatibility (#760):
+ // Tool messages must include a name field matching the function name.
+ // See: https://docs.mistral.ai/capabilities/function_calling
+ // Note: The OpenAI SDK types don't include name, but Mistral requires it.
+ // We use a type assertion to add this required field.
  messages.push({
  role: 'tool',
  content: this.buildToolResponseContent(tr, config),
  tool_call_id: this.normalizeToOpenAIToolId(tr.callId),
+ name: tr.toolName,
  });
  }
  }
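A sketch of the resulting message shapes with invented values, to show the Mistral-compatible layout (assistant turn with tool_calls and no content key, tool turn carrying name):

// Assistant turn that calls a tool - note there is no content property at all:
// { role: 'assistant', tool_calls: [{ id: 'aBc123XyZ', type: 'function',
//     function: { name: 'get_weather', arguments: '{"city":"Paris"}' } }] }
// Matching tool result - name echoes the function name, as Mistral requires:
// { role: 'tool', tool_call_id: 'aBc123XyZ', name: 'get_weather', content: '{"temp":12}' }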
@@ -977,8 +1002,9 @@ export class OpenAIProvider extends BaseProvider {
  const messages = [];
  // Create a ToolIdMapper based on the tool format
  // For Kimi K2, this generates sequential IDs in the format functions.{name}:{index}
- const toolIdMapper = toolFormat === 'kimi'
- ? getToolIdStrategy('kimi').createMapper(filteredContents)
+ // For Mistral, this generates 9-char alphanumeric IDs
+ const toolIdMapper = toolFormat === 'kimi' || toolFormat === 'mistral'
+ ? getToolIdStrategy(toolFormat).createMapper(filteredContents)
  : null;
  // Helper to resolve tool call IDs based on format
  const resolveToolCallId = (tc) => {
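The ToolIdStrategy implementation itself is not shown in this section; as a hedged sketch only, a Mistral-format ID could be produced like this (hypothetical helper, not the package's code):

// Hypothetical sketch - illustrates the "9-char alphanumeric" constraint mentioned above.
function makeMistralToolCallId() {
  const alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789';
  let id = '';
  for (let i = 0; i < 9; i++) {
    id += alphabet[Math.floor(Math.random() * alphabet.length)];
  }
  return id; // e.g. 'aZ3kQ9mPb' - exactly nine alphanumeric characters
}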
@@ -1014,9 +1040,12 @@ export class OpenAIProvider extends BaseProvider {
  const toolCalls = content.blocks.filter((b) => b.type === 'tool_call');
  if (toolCalls.length > 0) {
  // Assistant message with tool calls
+ // CRITICAL for Mistral API compatibility (#760):
+ // When tool_calls are present, we must NOT include a content property at all
+ // (not even null). Mistral's OpenAI-compatible API requires this.
+ // See: https://docs.mistral.ai/capabilities/function_calling
  const baseMessage = {
  role: 'assistant',
- content: text || null,
  tool_calls: toolCalls.map((tc) => ({
  id: resolveToolCallId(tc),
  type: 'function',
@@ -1057,10 +1086,16 @@ export class OpenAIProvider extends BaseProvider {
  // Convert tool responses
  const toolResponses = content.blocks.filter((b) => b.type === 'tool_response');
  for (const tr of toolResponses) {
+ // CRITICAL for Mistral API compatibility (#760):
+ // Tool messages must include a name field matching the function name.
+ // See: https://docs.mistral.ai/capabilities/function_calling
+ // Note: The OpenAI SDK types don't include name, but Mistral requires it.
+ // We use a type assertion to add this required field.
  messages.push({
  role: 'tool',
  content: this.buildToolResponseContent(tr, options.config),
  tool_call_id: resolveToolResponseId(tr),
+ name: tr.toolName,
  });
  }
  }
@@ -1506,9 +1541,9 @@ export class OpenAIProvider extends BaseProvider {
  // Buffer for accumulating text chunks for providers that need it
  let textBuffer = '';
  // Use the same detected format from earlier for consistency
- const isKimiModel = model.toLowerCase().includes('kimi-k2');
+ const isKimiK2Model = model.toLowerCase().includes('kimi-k2');
  // Buffer text for Qwen format providers and Kimi-K2 to avoid stanza formatting
- const shouldBufferText = detectedFormat === 'qwen' || isKimiModel;
+ const shouldBufferText = detectedFormat === 'qwen' || isKimiK2Model;
  // Accumulate thinking content across the entire stream to emit as ONE block
  // This handles fragmented <think>word</think> streaming from Synthetic API
  // @plan PLAN-20251202-THINKING.P16
@@ -1575,12 +1610,29 @@ export class OpenAIProvider extends BaseProvider {
  continue;
  // Parse reasoning_content from streaming delta (Phase 16 integration)
  // ACCUMULATE instead of yielding immediately to handle token-by-token streaming
+ // Extract embedded Kimi K2 tool calls from reasoning_content (fixes #749)
  // @plan PLAN-20251202-THINKING.P16
- const reasoningBlock = this.parseStreamingReasoningDelta(choice.delta);
+ // @requirement REQ-KIMI-REASONING-001.1
+ const { thinking: reasoningBlock, toolCalls: reasoningToolCalls } = this.parseStreamingReasoningDelta(choice.delta);
  if (reasoningBlock) {
  // Accumulate reasoning content - will emit ONE block later
  accumulatedReasoningContent += reasoningBlock.thought;
  }
+ // Accumulate tool calls extracted from reasoning_content
+ if (reasoningToolCalls.length > 0) {
+ for (const toolCall of reasoningToolCalls) {
+ // Convert ToolCallBlock to accumulated format
+ const index = accumulatedToolCalls.length;
+ accumulatedToolCalls[index] = {
+ id: toolCall.id,
+ type: 'function',
+ function: {
+ name: toolCall.name,
+ arguments: JSON.stringify(toolCall.parameters),
+ },
+ };
+ }
+ }
  // Check for finish_reason to detect proper stream ending
  if (choice.finish_reason) {
  logger.debug(() => `[Streaming] Stream finished with reason: ${choice.finish_reason}`, {
@@ -1601,13 +1653,25 @@ export class OpenAIProvider extends BaseProvider {
  }
  // Handle text content - buffer for Qwen format, emit immediately for others
  // Note: Synthetic API sends content that may duplicate reasoning_content.
- // This is the model's behavior - we don't filter it here.
+ // We now filter duplicates by tracking when content starts matching reasoning_content.
+ // fixes #721
  // @plan PLAN-20251202-THINKING.P16
  const rawDeltaContent = this.coerceMessageContentToString(choice.delta?.content);
  if (rawDeltaContent) {
- const deltaContent = isKimiModel
- ? rawDeltaContent
- : this.sanitizeProviderText(rawDeltaContent);
+ // For Kimi models, we need to buffer the RAW content without processing
+ // because Kimi tokens stream incrementally and partial tokens would leak
+ // through if we try to process them immediately. The buffer will be
+ // processed when flushed (at sentence boundaries or end of stream).
+ let deltaContent;
+ if (isKimiK2Model) {
+ // For Kimi: Don't process yet - just pass through and let buffering handle it
+ // We'll extract tool calls and sanitize when we flush the buffer
+ deltaContent = rawDeltaContent;
+ }
+ else {
+ // For non-Kimi models: sanitize immediately as before
+ deltaContent = this.sanitizeProviderText(rawDeltaContent);
+ }
  if (!deltaContent) {
  continue;
  }
@@ -1623,9 +1687,9 @@ export class OpenAIProvider extends BaseProvider {
  });
  // Buffer text to avoid stanza formatting
  textBuffer += deltaContent;
- const hasKimiBegin = textBuffer.includes('<|tool_calls_section_begin|>');
- const hasKimiEnd = textBuffer.includes('<|tool_calls_section_end|>');
- const hasOpenKimiSection = hasKimiBegin && !hasKimiEnd;
+ const kimiBeginCount = (textBuffer.match(/<\|tool_calls_section_begin\|>/g) || []).length;
+ const kimiEndCount = (textBuffer.match(/<\|tool_calls_section_end\|>/g) || []).length;
+ const hasOpenKimiSection = kimiBeginCount > kimiEndCount;
  // Emit buffered text when we have a complete sentence or paragraph
  // Look for natural break points, but avoid flushing mid Kimi section
  if (!hasOpenKimiSection &&
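To see why counting markers beats a simple includes() check, consider a buffer that already holds one complete section plus the start of a second (invented string):

const buf = 'a<|tool_calls_section_begin|>x<|tool_calls_section_end|><|tool_calls_section_begin|>y';
const begins = (buf.match(/<\|tool_calls_section_begin\|>/g) || []).length; // 2
const ends = (buf.match(/<\|tool_calls_section_end\|>/g) || []).length;     // 1
// Old check: both markers are present, so the section looked closed and could flush mid-call.
// New check: begins > ends, so the buffer is held until the second section closes.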
@@ -1642,12 +1706,14 @@ export class OpenAIProvider extends BaseProvider {
  // @requirement REQ-THINK-003
  const tagBasedThinking = this.extractThinkTagsAsBlock(workingText);
  if (tagBasedThinking) {
+ // Clean Kimi tokens from thinking content before accumulating
+ const cleanedThought = this.cleanThinkingContent(tagBasedThinking.thought);
  // Accumulate thinking content - don't emit yet
  // Use newline to preserve formatting between chunks (not space)
  if (accumulatedThinkingContent.length > 0) {
  accumulatedThinkingContent += '\n';
  }
- accumulatedThinkingContent += tagBasedThinking.thought;
+ accumulatedThinkingContent += cleanedThought;
  logger.debug(() => `[Streaming legacy] Accumulated thinking: ${accumulatedThinkingContent.length} chars total`);
  }
  const kimiParsed = this.extractKimiToolCallsFromText(workingText);
@@ -1709,7 +1775,10 @@ export class OpenAIProvider extends BaseProvider {
  // Always use sanitized text to strip <think> tags (legacy streaming)
  // Bug fix: Previously Kimi used unsanitized workingText
  // @plan PLAN-20251202-THINKING.P16
- if (cleanedText.trim().length > 0) {
+ // Bug fix #721: Emit whitespace-only chunks (e.g., " " between words)
+ // Previously we used cleanedText.trim().length > 0 which dropped spaces,
+ // causing "list 5" to become "list5". Now we emit any non-empty cleanedText.
+ if (cleanedText.length > 0) {
  yield {
  speaker: 'ai',
  blocks: [
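The one-line reason for this change: a chunk consisting only of a space fails the old check, so the separator between words was silently dropped (sample values for illustration).

const spaceChunk = ' ';
console.log(spaceChunk.trim().length > 0); // false - old condition dropped the chunk ("list5")
console.log(spaceChunk.length > 0);        // true  - new condition emits it ("list 5")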
@@ -1828,11 +1897,13 @@ export class OpenAIProvider extends BaseProvider {
  // @plan PLAN-20251202-THINKING.P16
  const tagBasedThinking = this.extractThinkTagsAsBlock(workingText);
  if (tagBasedThinking) {
+ // Clean Kimi tokens from thinking content before accumulating
+ const cleanedThought = this.cleanThinkingContent(tagBasedThinking.thought);
  // Use newline to preserve formatting between chunks (not space)
  if (accumulatedThinkingContent.length > 0) {
  accumulatedThinkingContent += '\n';
  }
- accumulatedThinkingContent += tagBasedThinking.thought;
+ accumulatedThinkingContent += cleanedThought;
  }
  const kimiParsed = this.extractKimiToolCallsFromText(workingText);
  if (kimiParsed.toolCalls.length > 0) {
@@ -1891,7 +1962,10 @@ export class OpenAIProvider extends BaseProvider {
  // Always use sanitized text to strip <think> tags (legacy final buffer)
  // Bug fix: Previously Kimi used unsanitized workingText
  // @plan PLAN-20251202-THINKING.P16
- if (cleanedText.trim().length > 0) {
+ // Bug fix #721: Emit whitespace-only chunks (e.g., " " between words)
+ // Previously we used cleanedText.trim().length > 0 which dropped spaces,
+ // causing "list 5" to become "list5". Now we emit any non-empty cleanedText.
+ if (cleanedText.length > 0) {
  yield {
  speaker: 'ai',
  blocks: [
@@ -1923,19 +1997,32 @@ export class OpenAIProvider extends BaseProvider {
  }
  // Emit accumulated reasoning_content as ONE ThinkingBlock (legacy path)
  // This consolidates token-by-token reasoning from Synthetic API into a single block
+ // Clean Kimi tokens from the accumulated content (not per-chunk) to handle split tokens
  // @plan PLAN-20251202-THINKING.P16
  if (accumulatedReasoningContent.length > 0) {
- yield {
- speaker: 'ai',
- blocks: [
- {
- type: 'thinking',
- thought: accumulatedReasoningContent,
- sourceField: 'reasoning_content',
- isHidden: false,
- },
- ],
- };
+ // Extract Kimi tool calls from the complete accumulated reasoning content
+ const { cleanedText: cleanedReasoning, toolCalls: reasoningToolCalls } = this.extractKimiToolCallsFromText(accumulatedReasoningContent);
+ // Emit the cleaned thinking block
+ if (cleanedReasoning.length > 0) {
+ yield {
+ speaker: 'ai',
+ blocks: [
+ {
+ type: 'thinking',
+ thought: cleanedReasoning,
+ sourceField: 'reasoning_content',
+ isHidden: false,
+ },
+ ],
+ };
+ }
+ // Emit any tool calls extracted from reasoning content
+ if (reasoningToolCalls.length > 0) {
+ yield {
+ speaker: 'ai',
+ blocks: reasoningToolCalls,
+ };
+ }
  }
  // Process and emit tool calls using legacy accumulated approach
  if (accumulatedToolCalls.length > 0) {
@@ -1962,6 +2049,7 @@ export class OpenAIProvider extends BaseProvider {
  };
  // Add usage metadata if we captured it from streaming
  if (streamingUsage) {
+ const cacheMetrics = extractCacheMetrics(streamingUsage);
  toolCallsContent.metadata = {
  usage: {
  promptTokens: streamingUsage.prompt_tokens || 0,
@@ -1969,6 +2057,9 @@ export class OpenAIProvider extends BaseProvider {
  totalTokens: streamingUsage.total_tokens ||
  (streamingUsage.prompt_tokens || 0) +
  (streamingUsage.completion_tokens || 0),
+ cachedTokens: cacheMetrics.cachedTokens,
+ cacheCreationTokens: cacheMetrics.cacheCreationTokens,
+ cacheMissTokens: cacheMetrics.cacheMissTokens,
  },
  };
  }
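The body of cacheMetricsExtractor.js is not part of this section, so as a hedged sketch only: the extractor presumably maps provider usage fields onto the three metrics added above. The input field names below are assumptions based on common OpenAI-, DeepSeek-, and Anthropic-style usage payloads, not the package's actual logic.

// Hypothetical sketch, not the package's cacheMetricsExtractor.js.
function extractCacheMetricsSketch(usage) {
  const cachedTokens =
    usage?.prompt_tokens_details?.cached_tokens ??  // OpenAI-style detail block
    usage?.prompt_cache_hit_tokens ?? 0;            // DeepSeek-style field
  const cacheMissTokens = usage?.prompt_cache_miss_tokens ?? 0;
  const cacheCreationTokens = usage?.cache_creation_input_tokens ?? 0; // Anthropic-style field
  return { cachedTokens, cacheCreationTokens, cacheMissTokens };
}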
@@ -1977,6 +2068,7 @@ export class OpenAIProvider extends BaseProvider {
  }
  // If we have usage information but no tool calls, emit a metadata-only response
  if (streamingUsage && accumulatedToolCalls.length === 0) {
+ const cacheMetrics = extractCacheMetrics(streamingUsage);
  yield {
  speaker: 'ai',
  blocks: [],
@@ -1987,6 +2079,9 @@ export class OpenAIProvider extends BaseProvider {
  totalTokens: streamingUsage.total_tokens ||
  (streamingUsage.prompt_tokens || 0) +
  (streamingUsage.completion_tokens || 0),
+ cachedTokens: cacheMetrics.cachedTokens,
+ cacheCreationTokens: cacheMetrics.cacheCreationTokens,
+ cacheMissTokens: cacheMetrics.cacheMissTokens,
  },
  },
  };
@@ -2051,8 +2146,10 @@ export class OpenAIProvider extends BaseProvider {
  }
  const blocks = [];
  // Parse reasoning_content from response (Phase 16 integration)
- const reasoningBlock = this.parseNonStreamingReasoning(choice.message);
- logger.debug(() => `[Non-streaming] parseNonStreamingReasoning result: ${reasoningBlock ? `found (${reasoningBlock.thought?.length} chars)` : 'not found'}`, {
+ // Extract embedded Kimi K2 tool calls from reasoning_content (fixes #749)
+ // @requirement REQ-KIMI-REASONING-001.2
+ const { thinking: reasoningBlock, toolCalls: reasoningToolCalls } = this.parseNonStreamingReasoning(choice.message);
+ logger.debug(() => `[Non-streaming] parseNonStreamingReasoning result: ${reasoningBlock ? `found (${reasoningBlock.thought?.length} chars)` : 'not found'}, tool calls: ${reasoningToolCalls.length}`, {
  hasReasoningContent: 'reasoning_content' in
  (choice.message ?? {}),
  messageKeys: Object.keys(choice.message ?? {}),
@@ -2060,6 +2157,11 @@ export class OpenAIProvider extends BaseProvider {
  if (reasoningBlock) {
  blocks.push(reasoningBlock);
  }
+ // Add tool calls extracted from reasoning_content
+ if (reasoningToolCalls.length > 0) {
+ blocks.push(...reasoningToolCalls);
+ logger.debug(() => `[Non-streaming] Added ${reasoningToolCalls.length} tool calls from reasoning_content`);
+ }
  // Handle text content (strip thinking / reasoning blocks) and Kimi tool sections
  const rawMessageContent = this.coerceMessageContentToString(choice.message?.content);
  let kimiCleanContent;
@@ -2163,6 +2265,7 @@ export class OpenAIProvider extends BaseProvider {
  };
  // Add usage metadata from non-streaming response
  if (completion.usage) {
+ const cacheMetrics = extractCacheMetrics(completion.usage);
  responseContent.metadata = {
  usage: {
  promptTokens: completion.usage.prompt_tokens || 0,
@@ -2170,6 +2273,9 @@ export class OpenAIProvider extends BaseProvider {
  totalTokens: completion.usage.total_tokens ||
  (completion.usage.prompt_tokens || 0) +
  (completion.usage.completion_tokens || 0),
+ cachedTokens: cacheMetrics.cachedTokens,
+ cacheCreationTokens: cacheMetrics.cacheCreationTokens,
+ cacheMissTokens: cacheMetrics.cacheMissTokens,
  },
  };
  }
@@ -2177,6 +2283,7 @@ export class OpenAIProvider extends BaseProvider {
  }
  else if (completion.usage) {
  // Emit metadata-only response if no content blocks but have usage info
+ const cacheMetrics = extractCacheMetrics(completion.usage);
  yield {
  speaker: 'ai',
  blocks: [],
@@ -2187,6 +2294,9 @@ export class OpenAIProvider extends BaseProvider {
  totalTokens: completion.usage.total_tokens ||
  (completion.usage.prompt_tokens || 0) +
  (completion.usage.completion_tokens || 0),
+ cachedTokens: cacheMetrics.cachedTokens,
+ cacheCreationTokens: cacheMetrics.cacheCreationTokens,
+ cacheMissTokens: cacheMetrics.cacheMissTokens,
  },
  },
  };
@@ -2459,7 +2569,7 @@ export class OpenAIProvider extends BaseProvider {
  });
  // Dump successful streaming request if enabled
  if (shouldDumpSuccess) {
- await dumpSDKContext('openai', '/v1/chat/completions', requestBody, { streaming: true }, false, baseURL || 'https://api.openai.com');
+ await dumpSDKContext('openai', '/chat/completions', requestBody, { streaming: true }, false, baseURL || 'https://api.openai.com/v1');
  }
  break;
  }
@@ -2492,7 +2602,7 @@ export class OpenAIProvider extends BaseProvider {
  // Dump error if enabled
  if (shouldDumpError) {
  const dumpErrorMessage = error instanceof Error ? error.message : String(error);
- await dumpSDKContext('openai', '/v1/chat/completions', requestBody, { error: dumpErrorMessage }, true, baseURL || 'https://api.openai.com');
+ await dumpSDKContext('openai', '/chat/completions', requestBody, { error: dumpErrorMessage }, true, baseURL || 'https://api.openai.com/v1');
  }
  // Re-throw other errors as-is
  const capturedErrorMessage = error instanceof Error ? error.message : String(error);
@@ -2530,7 +2640,7 @@ export class OpenAIProvider extends BaseProvider {
  }));
  // Dump successful non-streaming request if enabled
  if (shouldDumpSuccess) {
- await dumpSDKContext('openai', '/v1/chat/completions', requestBody, response, false, baseURL || 'https://api.openai.com');
+ await dumpSDKContext('openai', '/chat/completions', requestBody, response, false, baseURL || 'https://api.openai.com/v1');
  }
  break;
  }
@@ -2569,7 +2679,7 @@ export class OpenAIProvider extends BaseProvider {
  // Dump error if enabled
  if (shouldDumpError) {
  const dumpErrorMessage = error instanceof Error ? error.message : String(error);
- await dumpSDKContext('openai', '/v1/chat/completions', requestBody, { error: dumpErrorMessage }, true, baseURL || 'https://api.openai.com');
+ await dumpSDKContext('openai', '/chat/completions', requestBody, { error: dumpErrorMessage }, true, baseURL || 'https://api.openai.com/v1');
  }
  const capturedErrorMessage = error instanceof Error ? error.message : String(error);
  const status = typeof error === 'object' &&
@@ -2599,9 +2709,9 @@ export class OpenAIProvider extends BaseProvider {
  // Buffer for accumulating text chunks for providers that need it
  let textBuffer = '';
  // Use the same detected format from earlier for consistency
- const isKimiModel = model.toLowerCase().includes('kimi-k2');
+ const isKimiK2Model = model.toLowerCase().includes('kimi-k2');
  // Buffer text for Qwen format providers and Kimi-K2 to avoid stanza formatting
- const shouldBufferText = detectedFormat === 'qwen' || isKimiModel;
+ const shouldBufferText = detectedFormat === 'qwen' || isKimiK2Model;
  // Accumulate thinking content across the entire stream to emit as ONE block
  // This handles fragmented <think>word</think> streaming from Synthetic API
  // @plan PLAN-20251202-THINKING.P16
@@ -2671,13 +2781,28 @@ export class OpenAIProvider extends BaseProvider {
  continue;
  // Parse reasoning_content from streaming delta (Pipeline path)
  // ACCUMULATE instead of yielding immediately to handle token-by-token streaming
+ // Extract embedded Kimi K2 tool calls from reasoning_content (fixes #749)
  // @plan PLAN-20251202-THINKING.P16
- // @requirement REQ-THINK-003.1
- const reasoningBlock = this.parseStreamingReasoningDelta(choice.delta);
+ // @requirement REQ-THINK-003.1, REQ-KIMI-REASONING-001.1
+ const { thinking: reasoningBlock, toolCalls: reasoningToolCalls } = this.parseStreamingReasoningDelta(choice.delta);
  if (reasoningBlock) {
  // Accumulate reasoning content - will emit ONE block later
  accumulatedReasoningContent += reasoningBlock.thought;
  }
+ // Add tool calls extracted from reasoning_content to pipeline
+ if (reasoningToolCalls.length > 0) {
+ // Get current pipeline stats to determine next index
+ const stats = this.toolCallPipeline.getStats();
+ let baseIndex = stats.collector.totalCalls;
+ for (const toolCall of reasoningToolCalls) {
+ // Add complete tool call as fragments to pipeline
+ this.toolCallPipeline.addFragment(baseIndex, {
+ name: toolCall.name,
+ args: JSON.stringify(toolCall.parameters),
+ });
+ baseIndex++;
+ }
+ }
  // Check for finish_reason to detect proper stream ending
  if (choice.finish_reason) {
  logger.debug(() => `[Streaming] Stream finished with reason: ${choice.finish_reason}`, {
@@ -2698,13 +2823,24 @@ export class OpenAIProvider extends BaseProvider {
  }
  // Handle text content - buffer for Qwen format, emit immediately for others
  // Note: Synthetic API sends content that may duplicate reasoning_content.
- // This is the model's behavior - we don't filter it here.
+ // This is the model's behavior - we don't filter it here as detection is unreliable.
  // @plan PLAN-20251202-THINKING.P16
  const rawDeltaContent = this.coerceMessageContentToString(choice.delta?.content);
  if (rawDeltaContent) {
- const deltaContent = isKimiModel
- ? rawDeltaContent
- : this.sanitizeProviderText(rawDeltaContent);
+ // For Kimi models, we need to buffer the RAW content without processing
+ // because Kimi tokens stream incrementally and partial tokens would leak
+ // through if we try to process them immediately. The buffer will be
+ // processed when flushed (at sentence boundaries or end of stream).
+ let deltaContent;
+ if (isKimiK2Model) {
+ // For Kimi: Don't process yet - just pass through and let buffering handle it
+ // We'll extract tool calls and sanitize when we flush the buffer
+ deltaContent = rawDeltaContent;
+ }
+ else {
+ // For non-Kimi models: sanitize immediately as before
+ deltaContent = this.sanitizeProviderText(rawDeltaContent);
+ }
  if (!deltaContent) {
  continue;
  }
@@ -2720,9 +2856,9 @@ export class OpenAIProvider extends BaseProvider {
  });
  // Buffer text to avoid stanza formatting
  textBuffer += deltaContent;
- const hasKimiBegin = textBuffer.includes('<|tool_calls_section_begin|>');
- const hasKimiEnd = textBuffer.includes('<|tool_calls_section_end|>');
- const hasOpenKimiSection = hasKimiBegin && !hasKimiEnd;
+ const kimiBeginCount = (textBuffer.match(/<\|tool_calls_section_begin\|>/g) || []).length;
+ const kimiEndCount = (textBuffer.match(/<\|tool_calls_section_end\|>/g) || []).length;
+ const hasOpenKimiSection = kimiBeginCount > kimiEndCount;
  // Emit buffered text when we have a complete sentence or paragraph
  // Look for natural break points, avoiding flush mid Kimi section
  if (!hasOpenKimiSection &&
@@ -2739,12 +2875,14 @@ export class OpenAIProvider extends BaseProvider {
  // @requirement REQ-THINK-003
  const tagBasedThinking = this.extractThinkTagsAsBlock(workingText);
  if (tagBasedThinking) {
+ // Clean Kimi tokens from thinking content before accumulating
+ const cleanedThought = this.cleanThinkingContent(tagBasedThinking.thought);
  // Accumulate thinking content - don't emit yet
  // Use newline to preserve formatting between chunks (not space)
  if (accumulatedThinkingContent.length > 0) {
  accumulatedThinkingContent += '\n';
  }
- accumulatedThinkingContent += tagBasedThinking.thought;
+ accumulatedThinkingContent += cleanedThought;
  logger.debug(() => `[Streaming] Accumulated thinking: ${accumulatedThinkingContent.length} chars total`);
  }
  const kimiParsed = this.extractKimiToolCallsFromText(workingText);
@@ -2806,7 +2944,10 @@ export class OpenAIProvider extends BaseProvider {
  // Always use sanitized text to strip <think> tags (pipeline streaming)
  // Bug fix: Previously Kimi used unsanitized workingText
  // @plan PLAN-20251202-THINKING.P16
- if (cleanedText.trim().length > 0) {
+ // Bug fix #721: Emit whitespace-only chunks (e.g., " " between words)
+ // Previously we used cleanedText.trim().length > 0 which dropped spaces,
+ // causing "list 5" to become "list5". Now we emit any non-empty cleanedText.
+ if (cleanedText.length > 0) {
  yield {
  speaker: 'ai',
  blocks: [
@@ -2906,11 +3047,13 @@ export class OpenAIProvider extends BaseProvider {
  // @plan PLAN-20251202-THINKING.P16
  const tagBasedThinking = this.extractThinkTagsAsBlock(workingText);
  if (tagBasedThinking) {
+ // Clean Kimi tokens from thinking content before accumulating
+ const cleanedThought = this.cleanThinkingContent(tagBasedThinking.thought);
  // Use newline to preserve formatting between chunks (not space)
  if (accumulatedThinkingContent.length > 0) {
  accumulatedThinkingContent += '\n';
  }
- accumulatedThinkingContent += tagBasedThinking.thought;
+ accumulatedThinkingContent += cleanedThought;
  }
  const kimiParsed = this.extractKimiToolCallsFromText(workingText);
  if (kimiParsed.toolCalls.length > 0) {
@@ -2969,7 +3112,10 @@ export class OpenAIProvider extends BaseProvider {
  // Always use sanitized text to strip <think> tags (pipeline final buffer)
  // Bug fix: Previously Kimi used unsanitized workingText
  // @plan PLAN-20251202-THINKING.P16
- if (cleanedText.trim().length > 0) {
+ // Bug fix #721: Emit whitespace-only chunks (e.g., " " between words)
+ // Previously we used cleanedText.trim().length > 0 which dropped spaces,
+ // causing "list 5" to become "list5". Now we emit any non-empty cleanedText.
+ if (cleanedText.length > 0) {
  yield {
  speaker: 'ai',
  blocks: [
@@ -3001,19 +3147,32 @@ export class OpenAIProvider extends BaseProvider {
  }
  // Emit accumulated reasoning_content as ONE ThinkingBlock (pipeline path)
  // This consolidates token-by-token reasoning from Synthetic API into a single block
+ // Clean Kimi tokens from the accumulated content (not per-chunk) to handle split tokens
  // @plan PLAN-20251202-THINKING.P16
  if (accumulatedReasoningContent.length > 0) {
- yield {
- speaker: 'ai',
- blocks: [
- {
- type: 'thinking',
- thought: accumulatedReasoningContent,
- sourceField: 'reasoning_content',
- isHidden: false,
- },
- ],
- };
+ // Extract Kimi tool calls from the complete accumulated reasoning content
+ const { cleanedText: cleanedReasoning, toolCalls: reasoningToolCalls } = this.extractKimiToolCallsFromText(accumulatedReasoningContent);
+ // Emit the cleaned thinking block
+ if (cleanedReasoning.length > 0) {
+ yield {
+ speaker: 'ai',
+ blocks: [
+ {
+ type: 'thinking',
+ thought: cleanedReasoning,
+ sourceField: 'reasoning_content',
+ isHidden: false,
+ },
+ ],
+ };
+ }
+ // Emit any tool calls extracted from reasoning content
+ if (reasoningToolCalls.length > 0) {
+ yield {
+ speaker: 'ai',
+ blocks: reasoningToolCalls,
+ };
+ }
  }
  // Process and emit tool calls using the pipeline
  const pipelineResult = await this.toolCallPipeline.process(abortSignal);
@@ -3043,6 +3202,7 @@ export class OpenAIProvider extends BaseProvider {
  };
  // Add usage metadata if we captured it from streaming
  if (streamingUsage) {
+ const cacheMetrics = extractCacheMetrics(streamingUsage);
  toolCallsContent.metadata = {
  usage: {
  promptTokens: streamingUsage.prompt_tokens || 0,
@@ -3050,6 +3210,9 @@ export class OpenAIProvider extends BaseProvider {
  totalTokens: streamingUsage.total_tokens ||
  (streamingUsage.prompt_tokens || 0) +
  (streamingUsage.completion_tokens || 0),
+ cachedTokens: cacheMetrics.cachedTokens,
+ cacheCreationTokens: cacheMetrics.cacheCreationTokens,
+ cacheMissTokens: cacheMetrics.cacheMissTokens,
  },
  };
  }
@@ -3059,6 +3222,7 @@ export class OpenAIProvider extends BaseProvider {
  // If we have usage information but no tool calls, emit a metadata-only response
  if (streamingUsage &&
  this.toolCallPipeline.getStats().collector.totalCalls === 0) {
+ const cacheMetrics = extractCacheMetrics(streamingUsage);
  yield {
  speaker: 'ai',
  blocks: [],
@@ -3069,6 +3233,9 @@ export class OpenAIProvider extends BaseProvider {
  totalTokens: streamingUsage.total_tokens ||
  (streamingUsage.prompt_tokens || 0) +
  (streamingUsage.completion_tokens || 0),
+ cachedTokens: cacheMetrics.cachedTokens,
+ cacheCreationTokens: cacheMetrics.cacheCreationTokens,
+ cacheMissTokens: cacheMetrics.cacheMissTokens,
  },
  },
  };
@@ -3221,6 +3388,7 @@ export class OpenAIProvider extends BaseProvider {
  };
  // Add usage metadata from non-streaming response
  if (completion.usage) {
+ const cacheMetrics = extractCacheMetrics(completion.usage);
  responseContent.metadata = {
  usage: {
  promptTokens: completion.usage.prompt_tokens || 0,
@@ -3228,6 +3396,9 @@ export class OpenAIProvider extends BaseProvider {
  totalTokens: completion.usage.total_tokens ||
  (completion.usage.prompt_tokens || 0) +
  (completion.usage.completion_tokens || 0),
+ cachedTokens: cacheMetrics.cachedTokens,
+ cacheCreationTokens: cacheMetrics.cacheCreationTokens,
+ cacheMissTokens: cacheMetrics.cacheMissTokens,
  },
  };
  }
@@ -3235,6 +3406,7 @@ export class OpenAIProvider extends BaseProvider {
  }
  else if (completion.usage) {
  // Emit metadata-only response if no content blocks but have usage info
+ const cacheMetrics = extractCacheMetrics(completion.usage);
  yield {
  speaker: 'ai',
  blocks: [],
@@ -3245,6 +3417,9 @@ export class OpenAIProvider extends BaseProvider {
  totalTokens: completion.usage.total_tokens ||
  (completion.usage.prompt_tokens || 0) +
  (completion.usage.completion_tokens || 0),
+ cachedTokens: cacheMetrics.cachedTokens,
+ cacheCreationTokens: cacheMetrics.cacheCreationTokens,
+ cacheMissTokens: cacheMetrics.cacheMissTokens,
  },
  },
  };
@@ -3279,6 +3454,12 @@ export class OpenAIProvider extends BaseProvider {
  logger.debug(() => `Auto-detected 'kimi' format for K2 model: ${modelName}`);
  return 'kimi';
  }
+ // Check for Mistral models (requires 9-char alphanumeric IDs)
+ // This applies to both hosted API and self-hosted Mistral models
+ if (isMistralModel(modelName)) {
+ logger.debug(() => `Auto-detected 'mistral' format for Mistral model: ${modelName}`);
+ return 'mistral';
+ }
  const lowerModelName = modelName.toLowerCase();
  // Check for GLM-4 models (glm-4, glm-4.5, glm-4.6, glm-4-5, etc.)
  if (lowerModelName.includes('glm-4')) {
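isMistralModel comes from the new ToolIdStrategy.js, whose body is not shown in this section; a plausible sketch of such a check follows, with the matched family names being assumptions rather than the package's actual list:

// Hypothetical sketch only - the real isMistralModel may match differently.
function isMistralModelSketch(modelName) {
  const lower = (modelName || '').toLowerCase();
  return ['mistral', 'mixtral', 'codestral', 'devstral', 'ministral']
    .some((family) => lower.includes(family));
}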
@@ -3361,57 +3542,75 @@ export class OpenAIProvider extends BaseProvider {
  * Parse reasoning_content from streaming delta.
  *
  * @plan PLAN-20251202-THINKING.P11, PLAN-20251202-THINKING.P16
- * @requirement REQ-THINK-003.1, REQ-THINK-003.3, REQ-THINK-003.4
+ * @requirement REQ-THINK-003.1, REQ-THINK-003.3, REQ-THINK-003.4, REQ-KIMI-REASONING-001.1
+ * @issue #749
  */
  parseStreamingReasoningDelta(delta) {
  if (!delta) {
- return null;
+ return { thinking: null, toolCalls: [] };
  }
  // Access reasoning_content via type assertion since OpenAI SDK doesn't declare it
  const reasoningContent = delta
  .reasoning_content;
  // Handle absent, null, or non-string
  if (!reasoningContent || typeof reasoningContent !== 'string') {
- return null;
- }
- // Handle empty string or whitespace-only
- if (reasoningContent.trim().length === 0) {
- return null;
- }
- return {
- type: 'thinking',
- thought: reasoningContent,
- sourceField: 'reasoning_content',
- isHidden: false,
- };
+ return { thinking: null, toolCalls: [] };
+ }
+ // Handle empty string only - preserve whitespace-only content (spaces, tabs)
+ // to maintain proper formatting in accumulated reasoning (fixes issue #721)
+ if (reasoningContent.length === 0) {
+ return { thinking: null, toolCalls: [] };
+ }
+ // Extract Kimi K2 tool calls embedded in reasoning_content (fixes issue #749)
+ const { cleanedText, toolCalls } = this.extractKimiToolCallsFromText(reasoningContent);
+ // For streaming, preserve whitespace-only content for proper formatting (issue #721)
+ // Only return null if the cleaned text is empty (length 0)
+ const thinkingBlock = cleanedText.length === 0
+ ? null
+ : {
+ type: 'thinking',
+ thought: cleanedText,
+ sourceField: 'reasoning_content',
+ isHidden: false,
+ };
+ return { thinking: thinkingBlock, toolCalls };
  }
  /**
  * Parse reasoning_content from non-streaming message.
  *
  * @plan PLAN-20251202-THINKING.P11, PLAN-20251202-THINKING.P16
- * @requirement REQ-THINK-003.2, REQ-THINK-003.3, REQ-THINK-003.4
+ * @requirement REQ-THINK-003.2, REQ-THINK-003.3, REQ-THINK-003.4, REQ-KIMI-REASONING-001.2
+ * @issue #749
  */
  parseNonStreamingReasoning(message) {
  if (!message) {
- return null;
+ return { thinking: null, toolCalls: [] };
  }
  // Access reasoning_content via type assertion since OpenAI SDK doesn't declare it
  const reasoningContent = message
  .reasoning_content;
  // Handle absent, null, or non-string
  if (!reasoningContent || typeof reasoningContent !== 'string') {
- return null;
+ return { thinking: null, toolCalls: [] };
  }
- // Handle empty string or whitespace-only
+ // Handle empty string or whitespace-only - for non-streaming complete responses,
+ // whitespace-only reasoning is unusual and should be treated as no reasoning
  if (reasoningContent.trim().length === 0) {
- return null;
- }
- return {
- type: 'thinking',
- thought: reasoningContent,
- sourceField: 'reasoning_content',
- isHidden: false,
- };
+ return { thinking: null, toolCalls: [] };
+ }
+ // Extract Kimi K2 tool calls embedded in reasoning_content (fixes issue #749)
+ const { cleanedText, toolCalls } = this.extractKimiToolCallsFromText(reasoningContent);
+ // For non-streaming, trim whitespace after extraction
+ const trimmedText = cleanedText.trim();
+ const thinkingBlock = trimmedText.length === 0
+ ? null
+ : {
+ type: 'thinking',
+ thought: trimmedText,
+ sourceField: 'reasoning_content',
+ isHidden: false,
+ };
+ return { thinking: thinkingBlock, toolCalls };
  }
  }
  //# sourceMappingURL=OpenAIProvider.js.map