@lobehub/lobehub 2.0.0-next.114 → 2.0.0-next.116

This diff compares the contents of publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
Files changed (31)
  1. package/CHANGELOG.md +50 -0
  2. package/changelog/v1.json +18 -0
  3. package/package.json +1 -1
  4. package/packages/const/src/models.ts +6 -0
  5. package/packages/context-engine/src/processors/MessageContent.ts +100 -6
  6. package/packages/context-engine/src/processors/__tests__/MessageContent.test.ts +239 -0
  7. package/packages/fetch-sse/src/fetchSSE.ts +30 -0
  8. package/packages/model-bank/src/aiModels/aihubmix.ts +35 -1
  9. package/packages/model-bank/src/aiModels/anthropic.ts +37 -2
  10. package/packages/model-bank/src/aiModels/bedrock.ts +26 -11
  11. package/packages/model-bank/src/aiModels/openrouter.ts +28 -1
  12. package/packages/model-bank/src/aiModels/zenmux.ts +30 -1
  13. package/packages/model-runtime/src/core/contextBuilders/google.test.ts +78 -24
  14. package/packages/model-runtime/src/core/contextBuilders/google.ts +10 -2
  15. package/packages/model-runtime/src/core/parameterResolver.ts +3 -0
  16. package/packages/model-runtime/src/core/streams/google/google-ai.test.ts +451 -20
  17. package/packages/model-runtime/src/core/streams/google/index.ts +113 -3
  18. package/packages/model-runtime/src/core/streams/protocol.ts +19 -0
  19. package/packages/types/src/message/common/base.ts +26 -0
  20. package/packages/types/src/message/common/metadata.ts +7 -0
  21. package/packages/utils/src/index.ts +1 -0
  22. package/packages/utils/src/multimodalContent.ts +25 -0
  23. package/src/components/Thinking/index.tsx +3 -3
  24. package/src/features/ChatList/Messages/Assistant/DisplayContent.tsx +44 -0
  25. package/src/features/ChatList/Messages/Assistant/MessageBody.tsx +96 -0
  26. package/src/features/ChatList/Messages/Assistant/Reasoning/index.tsx +26 -13
  27. package/src/features/ChatList/Messages/Assistant/index.tsx +8 -6
  28. package/src/features/ChatList/Messages/Default.tsx +4 -7
  29. package/src/features/ChatList/components/RichContentRenderer.tsx +35 -0
  30. package/src/store/chat/slices/aiChat/actions/streamingExecutor.ts +244 -17
  31. package/src/features/ChatList/Messages/Assistant/MessageContent.tsx +0 -78
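The headline change across these files is a new multimodal streaming protocol: chunks that the Google AI stream used to emit as plain text/reasoning SSE events are now emitted as structured content_part/reasoning_part events carrying a JSON payload, so a single assistant turn can interleave text and images in both the reasoning trace and the final content. The sketch below reconstructs that payload shape from the test expectations and the StreamPartChunkData usages in the hunks that follow; the authoritative definition lives in packages/model-runtime/src/core/streams/protocol.ts (+19 lines, not shown here), so treat this as an approximation rather than the exported type.

// Approximate shape of the new part payload, inferred from the diffs below.
interface StreamPartChunkData {
  // Text content, or base64-encoded bytes for image parts
  content: string;
  // true when the part belongs to the reasoning (thought) trace
  inReasoning?: boolean;
  // Set for image parts, e.g. 'image/jpeg'
  mimeType?: string;
  // Discriminates how `content` should be interpreted
  partType: 'text' | 'image';
  // Gemini thought signature, forwarded when the provider attaches one
  thoughtSignature?: string;
}

// Old wire format:            New wire format:
//   event: text                 event: content_part
//   data: "234"                 data: {"content":"234","partType":"text"}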
package/packages/model-runtime/src/core/streams/google/google-ai.test.ts
@@ -251,16 +251,16 @@ describe('GoogleGenerativeAIStream', () => {
     expect(chunks).toEqual(
       [
         'id: chat_1',
-        'event: text',
-        'data: "234"\n',
+        'event: content_part',
+        'data: {"content":"234","partType":"text"}\n',
 
         'id: chat_1',
         'event: text',
         'data: ""\n',
 
         'id: chat_1',
-        'event: text',
-        `data: "567890\\n"\n`,
+        'event: content_part',
+        `data: {"content":"567890\\n","partType":"text"}\n`,
 
         // stop
         'id: chat_1',
         'event: stop',
@@ -376,20 +376,20 @@ describe('GoogleGenerativeAIStream', () => {
     expect(chunks).toEqual(
       [
         'id: chat_1',
-        'event: reasoning',
-        'data: "**Understanding the Conditional Logic**\\n\\n"\n',
+        'event: reasoning_part',
+        'data: {"content":"**Understanding the Conditional Logic**\\n\\n","inReasoning":true,"partType":"text"}\n',
 
         'id: chat_1',
-        'event: reasoning',
-        `data: "**Finalizing Interpretation**\\n\\n"\n`,
+        'event: reasoning_part',
+        `data: {"content":"**Finalizing Interpretation**\\n\\n","inReasoning":true,"partType":"text"}\n`,
 
         'id: chat_1',
-        'event: text',
-        `data: "简单来说,"\n`,
+        'event: content_part',
+        `data: {"content":"简单来说,","partType":"text"}\n`,
 
         'id: chat_1',
-        'event: text',
-        `data: "文本内容。"\n`,
+        'event: content_part',
+        `data: {"content":"文本内容。","partType":"text"}\n`,
 
         // stop
         'id: chat_1',
         'event: stop',
@@ -471,12 +471,12 @@ describe('GoogleGenerativeAIStream', () => {
     expect(chunks).toEqual(
       [
         'id: chat_1',
-        'event: text',
-        'data: "234"\n',
+        'event: content_part',
+        'data: {"content":"234","partType":"text"}\n',
 
         'id: chat_1',
-        'event: text',
-        `data: "567890\\n"\n`,
+        'event: content_part',
+        `data: {"content":"567890\\n","partType":"text"}\n`,
 
         // stop
         'id: chat_1',
         'event: stop',
@@ -1166,8 +1166,8 @@ describe('GoogleGenerativeAIStream', () => {
     expect(chunks).toEqual(
       [
         'id: chat_1',
-        'event: text',
-        'data: "你好!很高兴为你服务。请问有什么我可以帮你的吗?\\n\\n无论是回答问题、协助写作、翻译,还是随便聊聊,我都随时待命!"\n',
+        'event: content_part',
+        'data: {"content":"你好!很高兴为你服务。请问有什么我可以帮你的吗?\\n\\n无论是回答问题、协助写作、翻译,还是随便聊聊,我都随时待命!","partType":"text"}\n',
 
         'id: chat_1',
         'event: stop',
@@ -1286,8 +1286,8 @@ describe('GoogleGenerativeAIStream', () => {
     expect(chunks).toEqual(
       [
         'id: chat_1',
-        'event: text',
-        'data: "Here is my answer"\n',
+        'event: content_part',
+        'data: {"content":"Here is my answer","partType":"text","thoughtSignature":"sig123"}\n',
 
         'id: chat_1',
         'event: stop',
@@ -1300,4 +1300,435 @@ describe('GoogleGenerativeAIStream', () => {
       );
     });
   });
+
+  describe('Multimodal parts (reasoning_part and content_part)', () => {
+    it('should handle mixed reasoning text and reasoning image parts', async () => {
+      vi.spyOn(uuidModule, 'nanoid').mockReturnValueOnce('1');
+
+      const data = [
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    text: "**Clarifying the Core Concept**\n\nI'm now focusing on the visual metaphor. I plan to depict Agent Runtime as a software environment that manages and executes agents' tasks, similar to how an operating system functions. I aim to create an informative and intuitive infographic.\n\n\n",
+                    thought: true,
+                  },
+                ],
+                role: 'model',
+              },
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 9,
+            totalTokenCount: 9,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 9 }],
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+          responseId: 'MRYkaeWsL5bSjMcPlsLRiQo',
+        },
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    text: '**Developing Visual Representation**\n\nI\'m now iterating on the visual representation. The "command center" metaphor is proving useful. I\'m focusing on the interplay of the core components: the central engine coordinates perception, memory, planning, and action, with tools and plugins as extensions. The goal is to clearly show the flow of information through the system, from input to output, using visual cues. The aesthetic aims for a futuristic, tech-inspired look with glowing lines and circuit board elements, using a palette of blues, purples, and oranges.\n\n\n',
+                    thought: true,
+                  },
+                ],
+                role: 'model',
+              },
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 9,
+            totalTokenCount: 9,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 9 }],
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+          responseId: 'MRYkaeWsL5bSjMcPlsLRiQo',
+        },
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    text: "**Constructing the Architecture**\n\nI'm presently building out the architecture of the infographic. I've broken down \"Agent Runtime\" into its core components and I'm designing the visual relationships between them. The central engine will be the focal point, with modules for perception, memory, planning, action, and tools radiating outwards. My aim is to illustrate the workflow from input to output clearly. I'll utilize arrows to represent the flow of data and instructions between each module.\n\n\n",
+                    thought: true,
+                  },
+                ],
+                role: 'model',
+              },
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 9,
+            totalTokenCount: 9,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 9 }],
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+          responseId: 'MRYkaeWsL5bSjMcPlsLRiQo',
+        },
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    inlineData: {
+                      mimeType: 'image/jpeg',
+                      data: '/9j/4AAQSkZJRgABAQEBLAEsAAD/2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQBiUgnjeAMHrkrk/mMk8k57YpwnCsBtzkcgk+/ONuO3QE4JJ3ccuJ373EbEg5KgDkAg4xtySNrHv14HYm9ne7au7JpLRfm+vn63Emtk9FZWv1t+Ntt9L9Wj//2Q==',
+                    },
+                    thought: true,
+                  },
+                ],
+                role: 'model',
+              },
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 9,
+            totalTokenCount: 9,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 9 }],
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+          responseId: 'MRYkaeWsL5bSjMcPlsLRiQo',
+        },
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    text: '**Constructing an Infographic**\n\nI\'ve successfully created an infographic depicting an "Agent Runtime." The design employs a tech-inspired circuit board aesthetic, placing the core engine at the center. I\'ve clearly represented six essential modules: perception, memory, planning, action, tools, and learning. Arrows and text annotations vividly illustrate the data flow and processing.\n\n\n',
+                    thought: true,
+                  },
+                ],
+                role: 'model',
+              },
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 9,
+            totalTokenCount: 9,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 9 }],
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+          responseId: 'MRYkaeWsL5bSjMcPlsLRiQo',
+        },
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    text: "**Defining Agent Runtime Modules**\n\nI'm making progress clarifying the architecture of an \"Agent Runtime\" system. I've designed an infographic with a circuit board aesthetic, centered on the core engine. Six key modules are now visualized: perception, memory, planning, action, tools, and learning. I've incorporated arrows and annotations to show data flow effectively.\n\n\n",
+                    thought: true,
+                  },
+                ],
+                role: 'model',
+              },
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 9,
+            totalTokenCount: 9,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 9 }],
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+          responseId: 'MRYkaeWsL5bSjMcPlsLRiQo',
+        },
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    inlineData: {
+                      mimeType: 'image/jpeg',
+                      data: '/9j/4AAQSkZJRgABAQEBLAEsAAD/2wBDAAEBAQEBAQEBA2Q==',
+                    },
+                    thoughtSignature:
+                      'EueybArjsmwB0e2Kby+QPRkacnmPuV+CqMr6tiey3M5BHLHgIiggQOMeFmnKzsoux6PI6dQMgmdbXE1OTLLcWUmUD1CgFn+C2VdI09FpHrVhxVAtSk/zFVSlsjfCuANxtkP8tCDppVZqIya0QYjzg5K1fEO0m42CZX2/MHyqL8NjzR0lT8ENdoV3RSaK2tXqPH45uIb6nGeBSuX1n2EUMzO',
+                  },
+                ],
+                role: 'model',
+              },
+              finishReason: 'STOP',
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 9,
+            candidatesTokenCount: 1358,
+            totalTokenCount: 1728,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 9 }],
+            candidatesTokensDetails: [{ modality: 'IMAGE', tokenCount: 1120 }],
+            thoughtsTokenCount: 361,
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+          responseId: 'MRYkaeWsL5bSjMcPlsLRiQo',
+        },
+      ];
+
+      const mockGoogleStream = new ReadableStream({
+        start(controller) {
+          data.forEach((item) => {
+            controller.enqueue(item);
+          });
+          controller.close();
+        },
+      });
+
+      const protocolStream = GoogleGenerativeAIStream(mockGoogleStream);
+      const chunks = await decodeStreamChunks(protocolStream);
+
+      expect(chunks).toEqual(
+        [
+          // First reasoning text
+          'id: chat_1',
+          'event: reasoning_part',
+          'data: {"content":"**Clarifying the Core Concept**\\n\\nI\'m now focusing on the visual metaphor. I plan to depict Agent Runtime as a software environment that manages and executes agents\' tasks, similar to how an operating system functions. I aim to create an informative and intuitive infographic.\\n\\n\\n","inReasoning":true,"partType":"text"}\n',
+
+          // Second reasoning text
+          'id: chat_1',
+          'event: reasoning_part',
+          'data: {"content":"**Developing Visual Representation**\\n\\nI\'m now iterating on the visual representation. The \\"command center\\" metaphor is proving useful. I\'m focusing on the interplay of the core components: the central engine coordinates perception, memory, planning, and action, with tools and plugins as extensions. The goal is to clearly show the flow of information through the system, from input to output, using visual cues. The aesthetic aims for a futuristic, tech-inspired look with glowing lines and circuit board elements, using a palette of blues, purples, and oranges.\\n\\n\\n","inReasoning":true,"partType":"text"}\n',
+
+          // Third reasoning text
+          'id: chat_1',
+          'event: reasoning_part',
+          'data: {"content":"**Constructing the Architecture**\\n\\nI\'m presently building out the architecture of the infographic. I\'ve broken down \\"Agent Runtime\\" into its core components and I\'m designing the visual relationships between them. The central engine will be the focal point, with modules for perception, memory, planning, action, and tools radiating outwards. My aim is to illustrate the workflow from input to output clearly. I\'ll utilize arrows to represent the flow of data and instructions between each module.\\n\\n\\n","inReasoning":true,"partType":"text"}\n',
+
+          // First reasoning image
+          'id: chat_1',
+          'event: reasoning_part',
+          'data: {"content":"/9j/4AAQSkZJRgABAQEBLAEsAAD/2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQBiUgnjeAMHrkrk/mMk8k57YpwnCsBtzkcgk+/ONuO3QE4JJ3ccuJ373EbEg5KgDkAg4xtySNrHv14HYm9ne7au7JpLRfm+vn63Emtk9FZWv1t+Ntt9L9Wj//2Q==","inReasoning":true,"mimeType":"image/jpeg","partType":"image"}\n',
+
+          // Fourth reasoning text
+          'id: chat_1',
+          'event: reasoning_part',
+          'data: {"content":"**Constructing an Infographic**\\n\\nI\'ve successfully created an infographic depicting an \\"Agent Runtime.\\" The design employs a tech-inspired circuit board aesthetic, placing the core engine at the center. I\'ve clearly represented six essential modules: perception, memory, planning, action, tools, and learning. Arrows and text annotations vividly illustrate the data flow and processing.\\n\\n\\n","inReasoning":true,"partType":"text"}\n',
+
+          // Fifth reasoning text
+          'id: chat_1',
+          'event: reasoning_part',
+          'data: {"content":"**Defining Agent Runtime Modules**\\n\\nI\'m making progress clarifying the architecture of an \\"Agent Runtime\\" system. I\'ve designed an infographic with a circuit board aesthetic, centered on the core engine. Six key modules are now visualized: perception, memory, planning, action, tools, and learning. I\'ve incorporated arrows and annotations to show data flow effectively.\\n\\n\\n","inReasoning":true,"partType":"text"}\n',
+
+          // Content image (with thoughtSignature but not thought:true)
+          'id: chat_1',
+          'event: content_part',
+          'data: {"content":"/9j/4AAQSkZJRgABAQEBLAEsAAD/2wBDAAEBAQEBAQEBA2Q==","mimeType":"image/jpeg","partType":"image","thoughtSignature":"EueybArjsmwB0e2Kby+QPRkacnmPuV+CqMr6tiey3M5BHLHgIiggQOMeFmnKzsoux6PI6dQMgmdbXE1OTLLcWUmUD1CgFn+C2VdI09FpHrVhxVAtSk/zFVSlsjfCuANxtkP8tCDppVZqIya0QYjzg5K1fEO0m42CZX2/MHyqL8NjzR0lT8ENdoV3RSaK2tXqPH45uIb6nGeBSuX1n2EUMzO"}\n',
+
+          // stop
+          'id: chat_1',
+          'event: stop',
+          'data: "STOP"\n',
+
+          // usage
+          'id: chat_1',
+          'event: usage',
+          'data: {"inputTextTokens":9,"outputImageTokens":1120,"outputReasoningTokens":361,"outputTextTokens":238,"totalInputTokens":9,"totalOutputTokens":1719,"totalTokens":1728}\n',
+        ].map((i) => i + '\n'),
+      );
+    });
+
+    it('should handle content text and image parts without reasoning', async () => {
+      vi.spyOn(uuidModule, 'nanoid').mockReturnValueOnce('1');
+
+      const data = [
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    text: 'This is the description: ',
+                  },
+                ],
+                role: 'model',
+              },
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 5,
+            totalTokenCount: 5,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 5 }],
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+        },
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    inlineData: {
+                      mimeType: 'image/png',
+                      data: 'iVBORw0KGgoAAAANSUhEUgAAAAUA',
+                    },
+                  },
+                ],
+                role: 'model',
+              },
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 5,
+            totalTokenCount: 5,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 5 }],
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+        },
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    text: ' an example.',
+                  },
+                ],
+                role: 'model',
+              },
+              finishReason: 'STOP',
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 5,
+            candidatesTokenCount: 10,
+            totalTokenCount: 15,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 5 }],
+            candidatesTokensDetails: [{ modality: 'TEXT', tokenCount: 10 }],
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+        },
+      ];
+
+      const mockGoogleStream = new ReadableStream({
+        start(controller) {
+          data.forEach((item) => {
+            controller.enqueue(item);
+          });
+          controller.close();
+        },
+      });
+
+      const protocolStream = GoogleGenerativeAIStream(mockGoogleStream);
+      const chunks = await decodeStreamChunks(protocolStream);
+
+      expect(chunks).toEqual(
+        [
+          'id: chat_1',
+          'event: content_part',
+          'data: {"content":"This is the description: ","partType":"text"}\n',
+
+          'id: chat_1',
+          'event: content_part',
+          'data: {"content":"iVBORw0KGgoAAAANSUhEUgAAAAUA","mimeType":"image/png","partType":"image"}\n',
+
+          'id: chat_1',
+          'event: content_part',
+          'data: {"content":" an example.","partType":"text"}\n',
+
+          'id: chat_1',
+          'event: stop',
+          'data: "STOP"\n',
+
+          'id: chat_1',
+          'event: usage',
+          'data: {"inputTextTokens":5,"outputImageTokens":0,"outputTextTokens":10,"totalInputTokens":5,"totalOutputTokens":10,"totalTokens":15}\n',
+        ].map((i) => i + '\n'),
+      );
+    });
+
+    it('should handle mixed reasoning and content parts in single chunk', async () => {
+      vi.spyOn(uuidModule, 'nanoid').mockReturnValueOnce('1');
+
+      const data = [
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    text: 'Analyzing the request...',
+                    thought: true,
+                  },
+                  {
+                    text: 'Here is the answer: ',
+                  },
+                  {
+                    inlineData: {
+                      mimeType: 'image/png',
+                      data: 'base64data',
+                    },
+                  },
+                ],
+                role: 'model',
+              },
+              finishReason: 'STOP',
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 10,
+            candidatesTokenCount: 20,
+            totalTokenCount: 30,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 10 }],
+            thoughtsTokenCount: 5,
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+        },
+      ];
+
+      const mockGoogleStream = new ReadableStream({
+        start(controller) {
+          data.forEach((item) => {
+            controller.enqueue(item);
+          });
+          controller.close();
+        },
+      });
+
+      const protocolStream = GoogleGenerativeAIStream(mockGoogleStream);
+      const chunks = await decodeStreamChunks(protocolStream);
+
+      expect(chunks).toEqual(
+        [
+          'id: chat_1',
+          'event: reasoning_part',
+          'data: {"content":"Analyzing the request...","inReasoning":true,"partType":"text"}\n',
+
+          'id: chat_1',
+          'event: content_part',
+          'data: {"content":"Here is the answer: ","partType":"text"}\n',
+
+          'id: chat_1',
+          'event: content_part',
+          'data: {"content":"base64data","mimeType":"image/png","partType":"image"}\n',
+
+          'id: chat_1',
+          'event: stop',
+          'data: "STOP"\n',
+
+          'id: chat_1',
+          'event: usage',
+          'data: {"inputTextTokens":10,"outputImageTokens":0,"outputReasoningTokens":5,"outputTextTokens":20,"totalInputTokens":10,"totalOutputTokens":25,"totalTokens":30}\n',
+        ].map((i) => i + '\n'),
+      );
+    });
+  });
 });
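The frames asserted above follow the standard SSE layout (id:/event:/data: lines separated by a blank line). As a hedged sketch of the consuming side — the event names and payload fields come straight from the expectations above, but the callback names here are hypothetical, not the actual fetchSSE API:

// Hypothetical dispatcher for the new part events; illustrative only.
type StreamPart = {
  content: string;
  inReasoning?: boolean;
  mimeType?: string;
  partType: 'text' | 'image';
  thoughtSignature?: string;
};

const dispatchChunk = (
  event: string,
  data: string,
  handlers: {
    onContentPart: (part: StreamPart) => void;
    onReasoningPart: (part: StreamPart) => void;
  },
) => {
  switch (event) {
    case 'reasoning_part':
      // Payload is JSON rather than a bare string, unlike the old events
      handlers.onReasoningPart(JSON.parse(data));
      break;
    case 'content_part':
      handlers.onContentPart(JSON.parse(data));
      break;
    // 'text', 'reasoning', 'stop', 'usage', ... keep their existing handling
  }
};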
package/packages/model-runtime/src/core/streams/google/index.ts
@@ -7,6 +7,7 @@ import { convertGoogleAIUsage } from '../../usageConverters/google-ai';
 import {
   ChatPayloadForTransformStream,
   StreamContext,
+  StreamPartChunkData,
   StreamProtocolChunk,
   StreamToolCallChunkData,
   createCallbacksTransformer,
@@ -114,12 +115,121 @@
       .join('') || '';
 
   if (candidate) {
-    // First, check whether this is reasoning content (thought: true)
-    if (Array.isArray(candidate.content?.parts) && candidate.content.parts.length > 0) {
+    // Check if this response contains reasoning or multimodal content
+    const parts = candidate.content?.parts || [];
+    const hasReasoningParts = parts.some((p: any) => p.thought === true);
+    const hasImageParts = parts.some((p: any) => p.inlineData);
+    const hasThoughtSignature = parts.some((p: any) => p.thoughtSignature);
+    const hasThoughtsInMetadata = (usageMetadata as any)?.thoughtsTokenCount > 0;
+
+    // Check the model version to determine whether the new format should be used
+    const modelVersion = (chunk as any).modelVersion || '';
+    const isGemini25Plus = modelVersion.includes('gemini-2.5') || modelVersion.includes('gemini-3');
+    const isGemini3Model =
+      modelVersion.includes('gemini-3') || modelVersion.includes('image-preview');
+
+    // Check if this is the old single-image scenario (single image part with finishReason)
+    // This should use the legacy base64_image event format (only for gemini-2.0 and earlier)
+    const isSingleImageWithFinish =
+      parts.length === 1 &&
+      hasImageParts &&
+      !hasReasoningParts &&
+      candidate.finishReason &&
+      !isGemini25Plus;
+
+    // Check if this has grounding metadata (should use legacy text + grounding events)
+    const hasGroundingMetadata = !!candidate.groundingMetadata?.groundingChunks;
+
+    // Use content_part/reasoning_part events when:
+    // 1. There are reasoning parts in the current chunk (thought: true)
+    // 2. There are multiple parts with images (multimodal content)
+    // 3. There are thoughtSignatures in parts (reasoning metadata attached to content)
+    // 4. There is a thoughtsTokenCount in metadata (indicates the response contains reasoning)
+    // 5. This is a Gemini 3 model with image generation (always use the new format for consistency)
+    // BUT NOT for:
+    // - The legacy single-image scenario
+    // - The grounding metadata scenario (uses legacy text + grounding events)
+    const shouldUseMultimodalProcessing =
+      (hasReasoningParts ||
+        (hasImageParts && parts.length > 1) ||
+        hasThoughtSignature ||
+        hasThoughtsInMetadata ||
+        isGemini3Model) &&
+      !isSingleImageWithFinish &&
+      !hasGroundingMetadata;
+
+    // Process multimodal parts (text and images in reasoning or content)
+    if (
+      shouldUseMultimodalProcessing &&
+      Array.isArray(candidate.content?.parts) &&
+      candidate.content.parts.length > 0
+    ) {
+      const results: StreamProtocolChunk[] = [];
+
       for (const part of candidate.content.parts) {
+        // 1. Reasoning text part
         if (part && part.text && part.thought === true) {
-          return { data: part.text, id: context.id, type: 'reasoning' };
+          results.push({
+            data: {
+              content: part.text,
+              inReasoning: true,
+              partType: 'text',
+              thoughtSignature: part.thoughtSignature,
+            } as StreamPartChunkData,
+            id: context.id,
+            type: 'reasoning_part',
+          });
+        }
+
+        // 2. Reasoning image part
+        else if (part && part.inlineData && part.thought === true) {
+          results.push({
+            data: {
+              content: part.inlineData.data,
+              inReasoning: true,
+              mimeType: part.inlineData.mimeType,
+              partType: 'image',
+              thoughtSignature: part.thoughtSignature,
+            } as StreamPartChunkData,
+            id: context.id,
+            type: 'reasoning_part',
+          });
+        }
+
+        // 3. Content text part
+        else if (part && part.text && !part.thought) {
+          results.push({
+            data: {
+              content: part.text,
+              partType: 'text',
+              thoughtSignature: part.thoughtSignature,
+            } as StreamPartChunkData,
+            id: context.id,
+            type: 'content_part',
+          });
+        }
+
+        // 4. Content image part
+        else if (part && part.inlineData && !part.thought) {
+          results.push({
+            data: {
+              content: part.inlineData.data,
+              mimeType: part.inlineData.mimeType,
+              partType: 'image',
+              thoughtSignature: part.thoughtSignature,
+            } as StreamPartChunkData,
+            id: context.id,
+            type: 'content_part',
+          });
+        }
+      }
+
+      // If we found multimodal parts, return them along with usage chunks
+      if (results.length > 0) {
+        if (candidate.finishReason && usageMetadata) {
+          results.push(...usageChunks);
         }
+        return results;
       }
     }
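Read on its own, the routing rule above distills to a single predicate. The following restates the diff's logic as a pure function for clarity — GooglePart is a simplified stand-in for the SDK's part type, and the parameter list is an assumption of this sketch, not the actual signature in index.ts:

// Standalone restatement of shouldUseMultimodalProcessing (sketch).
interface GooglePart {
  inlineData?: { data: string; mimeType: string };
  text?: string;
  thought?: boolean;
  thoughtSignature?: string;
}

const shouldUseMultimodalProcessing = (
  parts: GooglePart[],
  modelVersion: string,
  thoughtsTokenCount: number | undefined,
  finishReason: string | undefined,
  hasGroundingMetadata: boolean,
): boolean => {
  const hasReasoningParts = parts.some((p) => p.thought === true);
  const hasImageParts = parts.some((p) => p.inlineData);
  const hasThoughtSignature = parts.some((p) => p.thoughtSignature);
  const hasThoughtsInMetadata = (thoughtsTokenCount ?? 0) > 0;

  const isGemini25Plus =
    modelVersion.includes('gemini-2.5') || modelVersion.includes('gemini-3');
  const isGemini3Model =
    modelVersion.includes('gemini-3') || modelVersion.includes('image-preview');

  // Legacy path: a lone image part arriving with a finishReason on a
  // pre-2.5 model keeps the old base64_image event.
  const isSingleImageWithFinish =
    parts.length === 1 && hasImageParts && !hasReasoningParts && !!finishReason && !isGemini25Plus;

  return (
    (hasReasoningParts ||
      (hasImageParts && parts.length > 1) ||
      hasThoughtSignature ||
      hasThoughtsInMetadata ||
      isGemini3Model) &&
    !isSingleImageWithFinish &&
    !hasGroundingMetadata
  );
};

For example, the single-chunk fixture in the last test above (one thought part, one text part, one inline image, finishReason 'STOP' on gemini-3-pro-image-preview) satisfies the predicate via hasReasoningParts, which is why every part in that test surfaces as a reasoning_part or content_part event.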