@lobehub/lobehub 2.0.0-next.114 → 2.0.0-next.116

This diff compares the contents of publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
Files changed (31)
  1. package/CHANGELOG.md +50 -0
  2. package/changelog/v1.json +18 -0
  3. package/package.json +1 -1
  4. package/packages/const/src/models.ts +6 -0
  5. package/packages/context-engine/src/processors/MessageContent.ts +100 -6
  6. package/packages/context-engine/src/processors/__tests__/MessageContent.test.ts +239 -0
  7. package/packages/fetch-sse/src/fetchSSE.ts +30 -0
  8. package/packages/model-bank/src/aiModels/aihubmix.ts +35 -1
  9. package/packages/model-bank/src/aiModels/anthropic.ts +37 -2
  10. package/packages/model-bank/src/aiModels/bedrock.ts +26 -11
  11. package/packages/model-bank/src/aiModels/openrouter.ts +28 -1
  12. package/packages/model-bank/src/aiModels/zenmux.ts +30 -1
  13. package/packages/model-runtime/src/core/contextBuilders/google.test.ts +78 -24
  14. package/packages/model-runtime/src/core/contextBuilders/google.ts +10 -2
  15. package/packages/model-runtime/src/core/parameterResolver.ts +3 -0
  16. package/packages/model-runtime/src/core/streams/google/google-ai.test.ts +451 -20
  17. package/packages/model-runtime/src/core/streams/google/index.ts +113 -3
  18. package/packages/model-runtime/src/core/streams/protocol.ts +19 -0
  19. package/packages/types/src/message/common/base.ts +26 -0
  20. package/packages/types/src/message/common/metadata.ts +7 -0
  21. package/packages/utils/src/index.ts +1 -0
  22. package/packages/utils/src/multimodalContent.ts +25 -0
  23. package/src/components/Thinking/index.tsx +3 -3
  24. package/src/features/ChatList/Messages/Assistant/DisplayContent.tsx +44 -0
  25. package/src/features/ChatList/Messages/Assistant/MessageBody.tsx +96 -0
  26. package/src/features/ChatList/Messages/Assistant/Reasoning/index.tsx +26 -13
  27. package/src/features/ChatList/Messages/Assistant/index.tsx +8 -6
  28. package/src/features/ChatList/Messages/Default.tsx +4 -7
  29. package/src/features/ChatList/components/RichContentRenderer.tsx +35 -0
  30. package/src/store/chat/slices/aiChat/actions/streamingExecutor.ts +244 -17
  31. package/src/features/ChatList/Messages/Assistant/MessageContent.tsx +0 -78
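The headline change across these files is a new multimodal streaming protocol: chunks that the Google AI stream used to emit as plain text/reasoning SSE events are now emitted as structured content_part/reasoning_part events carrying a JSON payload, so a single assistant turn can interleave text and images in both the reasoning trace and the final content. The sketch below reconstructs that payload shape from the test expectations and the StreamPartChunkData usages in the hunks that follow; the authoritative definition lives in packages/model-runtime/src/core/streams/protocol.ts (+19 lines, not shown here), so treat this as an approximation rather than the exported type.

// Approximate shape of the new part payload, inferred from the diffs below.
interface StreamPartChunkData {
  // Text content, or base64-encoded bytes for image parts
  content: string;
  // true when the part belongs to the reasoning (thought) trace
  inReasoning?: boolean;
  // Set for image parts, e.g. 'image/jpeg'
  mimeType?: string;
  // Discriminates how `content` should be interpreted
  partType: 'text' | 'image';
  // Gemini thought signature, forwarded when the provider attaches one
  thoughtSignature?: string;
}

// Old wire format:            New wire format:
//   event: text                 event: content_part
//   data: "234"                 data: {"content":"234","partType":"text"}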
package/packages/model-runtime/src/core/streams/google/google-ai.test.ts
@@ -251,16 +251,16 @@ describe('GoogleGenerativeAIStream', () => {
     expect(chunks).toEqual(
       [
         'id: chat_1',
-        'event: text',
-        'data: "234"\n',
+        'event: content_part',
+        'data: {"content":"234","partType":"text"}\n',
 
         'id: chat_1',
         'event: text',
         'data: ""\n',
 
         'id: chat_1',
-        'event: text',
-        `data: "567890\\n"\n`,
+        'event: content_part',
+        `data: {"content":"567890\\n","partType":"text"}\n`,
 
         // stop
         'id: chat_1',
         'event: stop',
@@ -376,20 +376,20 @@ describe('GoogleGenerativeAIStream', () => {
     expect(chunks).toEqual(
       [
         'id: chat_1',
-        'event: reasoning',
-        'data: "**Understanding the Conditional Logic**\\n\\n"\n',
+        'event: reasoning_part',
+        'data: {"content":"**Understanding the Conditional Logic**\\n\\n","inReasoning":true,"partType":"text"}\n',
 
         'id: chat_1',
-        'event: reasoning',
-        `data: "**Finalizing Interpretation**\\n\\n"\n`,
+        'event: reasoning_part',
+        `data: {"content":"**Finalizing Interpretation**\\n\\n","inReasoning":true,"partType":"text"}\n`,
 
         'id: chat_1',
-        'event: text',
-        `data: "简单来说,"\n`,
+        'event: content_part',
+        `data: {"content":"简单来说,","partType":"text"}\n`,
 
         'id: chat_1',
-        'event: text',
-        `data: "文本内容。"\n`,
+        'event: content_part',
+        `data: {"content":"文本内容。","partType":"text"}\n`,
 
         // stop
         'id: chat_1',
         'event: stop',
@@ -471,12 +471,12 @@ describe('GoogleGenerativeAIStream', () => {
     expect(chunks).toEqual(
       [
         'id: chat_1',
-        'event: text',
-        'data: "234"\n',
+        'event: content_part',
+        'data: {"content":"234","partType":"text"}\n',
 
         'id: chat_1',
-        'event: text',
-        `data: "567890\\n"\n`,
+        'event: content_part',
+        `data: {"content":"567890\\n","partType":"text"}\n`,
 
         // stop
         'id: chat_1',
         'event: stop',
@@ -1166,8 +1166,8 @@ describe('GoogleGenerativeAIStream', () => {
     expect(chunks).toEqual(
       [
         'id: chat_1',
-        'event: text',
-        'data: "你好!很高兴为你服务。请问有什么我可以帮你的吗?\\n\\n无论是回答问题、协助写作、翻译,还是随便聊聊,我都随时待命!"\n',
+        'event: content_part',
+        'data: {"content":"你好!很高兴为你服务。请问有什么我可以帮你的吗?\\n\\n无论是回答问题、协助写作、翻译,还是随便聊聊,我都随时待命!","partType":"text"}\n',
 
         'id: chat_1',
         'event: stop',
@@ -1286,8 +1286,8 @@ describe('GoogleGenerativeAIStream', () => {
     expect(chunks).toEqual(
       [
         'id: chat_1',
-        'event: text',
-        'data: "Here is my answer"\n',
+        'event: content_part',
+        'data: {"content":"Here is my answer","partType":"text","thoughtSignature":"sig123"}\n',
 
         'id: chat_1',
         'event: stop',
@@ -1300,4 +1300,435 @@ describe('GoogleGenerativeAIStream', () => {
       );
     });
   });
+
+  describe('Multimodal parts (reasoning_part and content_part)', () => {
+    it('should handle mixed reasoning text and reasoning image parts', async () => {
+      vi.spyOn(uuidModule, 'nanoid').mockReturnValueOnce('1');
+
+      const data = [
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    text: "**Clarifying the Core Concept**\n\nI'm now focusing on the visual metaphor. I plan to depict Agent Runtime as a software environment that manages and executes agents' tasks, similar to how an operating system functions. I aim to create an informative and intuitive infographic.\n\n\n",
+                    thought: true,
+                  },
+                ],
+                role: 'model',
+              },
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 9,
+            totalTokenCount: 9,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 9 }],
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+          responseId: 'MRYkaeWsL5bSjMcPlsLRiQo',
+        },
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    text: '**Developing Visual Representation**\n\nI\'m now iterating on the visual representation. The "command center" metaphor is proving useful. I\'m focusing on the interplay of the core components: the central engine coordinates perception, memory, planning, and action, with tools and plugins as extensions. The goal is to clearly show the flow of information through the system, from input to output, using visual cues. The aesthetic aims for a futuristic, tech-inspired look with glowing lines and circuit board elements, using a palette of blues, purples, and oranges.\n\n\n',
+                    thought: true,
+                  },
+                ],
+                role: 'model',
+              },
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 9,
+            totalTokenCount: 9,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 9 }],
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+          responseId: 'MRYkaeWsL5bSjMcPlsLRiQo',
+        },
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    text: "**Constructing the Architecture**\n\nI'm presently building out the architecture of the infographic. I've broken down \"Agent Runtime\" into its core components and I'm designing the visual relationships between them. The central engine will be the focal point, with modules for perception, memory, planning, action, and tools radiating outwards. My aim is to illustrate the workflow from input to output clearly. I'll utilize arrows to represent the flow of data and instructions between each module.\n\n\n",
+                    thought: true,
+                  },
+                ],
+                role: 'model',
+              },
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 9,
+            totalTokenCount: 9,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 9 }],
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+          responseId: 'MRYkaeWsL5bSjMcPlsLRiQo',
+        },
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    inlineData: {
+                      mimeType: 'image/jpeg',
+                      data: '/9j/4AAQSkZJRgABAQEBLAEsAAD/2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQBiUgnjeAMHrkrk/mMk8k57YpwnCsBtzkcgk+/ONuO3QE4JJ3ccuJ373EbEg5KgDkAg4xtySNrHv14HYm9ne7au7JpLRfm+vn63Emtk9FZWv1t+Ntt9L9Wj//2Q==',
+                    },
+                    thought: true,
+                  },
+                ],
+                role: 'model',
+              },
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 9,
+            totalTokenCount: 9,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 9 }],
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+          responseId: 'MRYkaeWsL5bSjMcPlsLRiQo',
+        },
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    text: '**Constructing an Infographic**\n\nI\'ve successfully created an infographic depicting an "Agent Runtime." The design employs a tech-inspired circuit board aesthetic, placing the core engine at the center. I\'ve clearly represented six essential modules: perception, memory, planning, action, tools, and learning. Arrows and text annotations vividly illustrate the data flow and processing.\n\n\n',
+                    thought: true,
+                  },
+                ],
+                role: 'model',
+              },
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 9,
+            totalTokenCount: 9,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 9 }],
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+          responseId: 'MRYkaeWsL5bSjMcPlsLRiQo',
+        },
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    text: "**Defining Agent Runtime Modules**\n\nI'm making progress clarifying the architecture of an \"Agent Runtime\" system. I've designed an infographic with a circuit board aesthetic, centered on the core engine. Six key modules are now visualized: perception, memory, planning, action, tools, and learning. I've incorporated arrows and annotations to show data flow effectively.\n\n\n",
+                    thought: true,
+                  },
+                ],
+                role: 'model',
+              },
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 9,
+            totalTokenCount: 9,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 9 }],
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+          responseId: 'MRYkaeWsL5bSjMcPlsLRiQo',
+        },
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    inlineData: {
+                      mimeType: 'image/jpeg',
+                      data: '/9j/4AAQSkZJRgABAQEBLAEsAAD/2wBDAAEBAQEBAQEBA2Q==',
+                    },
+                    thoughtSignature:
+                      'EueybArjsmwB0e2Kby+QPRkacnmPuV+CqMr6tiey3M5BHLHgIiggQOMeFmnKzsoux6PI6dQMgmdbXE1OTLLcWUmUD1CgFn+C2VdI09FpHrVhxVAtSk/zFVSlsjfCuANxtkP8tCDppVZqIya0QYjzg5K1fEO0m42CZX2/MHyqL8NjzR0lT8ENdoV3RSaK2tXqPH45uIb6nGeBSuX1n2EUMzO',
+                  },
+                ],
+                role: 'model',
+              },
+              finishReason: 'STOP',
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 9,
+            candidatesTokenCount: 1358,
+            totalTokenCount: 1728,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 9 }],
+            candidatesTokensDetails: [{ modality: 'IMAGE', tokenCount: 1120 }],
+            thoughtsTokenCount: 361,
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+          responseId: 'MRYkaeWsL5bSjMcPlsLRiQo',
+        },
+      ];
+
+      const mockGoogleStream = new ReadableStream({
+        start(controller) {
+          data.forEach((item) => {
+            controller.enqueue(item);
+          });
+          controller.close();
+        },
+      });
+
+      const protocolStream = GoogleGenerativeAIStream(mockGoogleStream);
+      const chunks = await decodeStreamChunks(protocolStream);
+
+      expect(chunks).toEqual(
+        [
+          // First reasoning text
+          'id: chat_1',
+          'event: reasoning_part',
+          'data: {"content":"**Clarifying the Core Concept**\\n\\nI\'m now focusing on the visual metaphor. I plan to depict Agent Runtime as a software environment that manages and executes agents\' tasks, similar to how an operating system functions. I aim to create an informative and intuitive infographic.\\n\\n\\n","inReasoning":true,"partType":"text"}\n',
+
+          // Second reasoning text
+          'id: chat_1',
+          'event: reasoning_part',
+          'data: {"content":"**Developing Visual Representation**\\n\\nI\'m now iterating on the visual representation. The \\"command center\\" metaphor is proving useful. I\'m focusing on the interplay of the core components: the central engine coordinates perception, memory, planning, and action, with tools and plugins as extensions. The goal is to clearly show the flow of information through the system, from input to output, using visual cues. The aesthetic aims for a futuristic, tech-inspired look with glowing lines and circuit board elements, using a palette of blues, purples, and oranges.\\n\\n\\n","inReasoning":true,"partType":"text"}\n',
+
+          // Third reasoning text
+          'id: chat_1',
+          'event: reasoning_part',
+          'data: {"content":"**Constructing the Architecture**\\n\\nI\'m presently building out the architecture of the infographic. I\'ve broken down \\"Agent Runtime\\" into its core components and I\'m designing the visual relationships between them. The central engine will be the focal point, with modules for perception, memory, planning, action, and tools radiating outwards. My aim is to illustrate the workflow from input to output clearly. I\'ll utilize arrows to represent the flow of data and instructions between each module.\\n\\n\\n","inReasoning":true,"partType":"text"}\n',
+
+          // First reasoning image
+          'id: chat_1',
+          'event: reasoning_part',
+          'data: {"content":"/9j/4AAQSkZJRgABAQEBLAEsAAD/2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQBiUgnjeAMHrkrk/mMk8k57YpwnCsBtzkcgk+/ONuO3QE4JJ3ccuJ373EbEg5KgDkAg4xtySNrHv14HYm9ne7au7JpLRfm+vn63Emtk9FZWv1t+Ntt9L9Wj//2Q==","inReasoning":true,"mimeType":"image/jpeg","partType":"image"}\n',
+
+          // Fourth reasoning text
+          'id: chat_1',
+          'event: reasoning_part',
+          'data: {"content":"**Constructing an Infographic**\\n\\nI\'ve successfully created an infographic depicting an \\"Agent Runtime.\\" The design employs a tech-inspired circuit board aesthetic, placing the core engine at the center. I\'ve clearly represented six essential modules: perception, memory, planning, action, tools, and learning. Arrows and text annotations vividly illustrate the data flow and processing.\\n\\n\\n","inReasoning":true,"partType":"text"}\n',
+
+          // Fifth reasoning text
+          'id: chat_1',
+          'event: reasoning_part',
+          'data: {"content":"**Defining Agent Runtime Modules**\\n\\nI\'m making progress clarifying the architecture of an \\"Agent Runtime\\" system. I\'ve designed an infographic with a circuit board aesthetic, centered on the core engine. Six key modules are now visualized: perception, memory, planning, action, tools, and learning. I\'ve incorporated arrows and annotations to show data flow effectively.\\n\\n\\n","inReasoning":true,"partType":"text"}\n',
+
+          // Content image (with thoughtSignature but not thought:true)
+          'id: chat_1',
+          'event: content_part',
+          'data: {"content":"/9j/4AAQSkZJRgABAQEBLAEsAAD/2wBDAAEBAQEBAQEBA2Q==","mimeType":"image/jpeg","partType":"image","thoughtSignature":"EueybArjsmwB0e2Kby+QPRkacnmPuV+CqMr6tiey3M5BHLHgIiggQOMeFmnKzsoux6PI6dQMgmdbXE1OTLLcWUmUD1CgFn+C2VdI09FpHrVhxVAtSk/zFVSlsjfCuANxtkP8tCDppVZqIya0QYjzg5K1fEO0m42CZX2/MHyqL8NjzR0lT8ENdoV3RSaK2tXqPH45uIb6nGeBSuX1n2EUMzO"}\n',
+
+          // stop
+          'id: chat_1',
+          'event: stop',
+          'data: "STOP"\n',
+
+          // usage
+          'id: chat_1',
+          'event: usage',
+          'data: {"inputTextTokens":9,"outputImageTokens":1120,"outputReasoningTokens":361,"outputTextTokens":238,"totalInputTokens":9,"totalOutputTokens":1719,"totalTokens":1728}\n',
+        ].map((i) => i + '\n'),
+      );
+    });
+
+    it('should handle content text and image parts without reasoning', async () => {
+      vi.spyOn(uuidModule, 'nanoid').mockReturnValueOnce('1');
+
+      const data = [
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    text: 'This is the description: ',
+                  },
+                ],
+                role: 'model',
+              },
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 5,
+            totalTokenCount: 5,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 5 }],
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+        },
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    inlineData: {
+                      mimeType: 'image/png',
+                      data: 'iVBORw0KGgoAAAANSUhEUgAAAAUA',
+                    },
+                  },
+                ],
+                role: 'model',
+              },
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 5,
+            totalTokenCount: 5,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 5 }],
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+        },
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    text: ' an example.',
+                  },
+                ],
+                role: 'model',
+              },
+              finishReason: 'STOP',
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 5,
+            candidatesTokenCount: 10,
+            totalTokenCount: 15,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 5 }],
+            candidatesTokensDetails: [{ modality: 'TEXT', tokenCount: 10 }],
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+        },
+      ];
+
+      const mockGoogleStream = new ReadableStream({
+        start(controller) {
+          data.forEach((item) => {
+            controller.enqueue(item);
+          });
+          controller.close();
+        },
+      });
+
+      const protocolStream = GoogleGenerativeAIStream(mockGoogleStream);
+      const chunks = await decodeStreamChunks(protocolStream);
+
+      expect(chunks).toEqual(
+        [
+          'id: chat_1',
+          'event: content_part',
+          'data: {"content":"This is the description: ","partType":"text"}\n',
+
+          'id: chat_1',
+          'event: content_part',
+          'data: {"content":"iVBORw0KGgoAAAANSUhEUgAAAAUA","mimeType":"image/png","partType":"image"}\n',
+
+          'id: chat_1',
+          'event: content_part',
+          'data: {"content":" an example.","partType":"text"}\n',
+
+          'id: chat_1',
+          'event: stop',
+          'data: "STOP"\n',
+
+          'id: chat_1',
+          'event: usage',
+          'data: {"inputTextTokens":5,"outputImageTokens":0,"outputTextTokens":10,"totalInputTokens":5,"totalOutputTokens":10,"totalTokens":15}\n',
+        ].map((i) => i + '\n'),
+      );
+    });
+
+    it('should handle mixed reasoning and content parts in single chunk', async () => {
+      vi.spyOn(uuidModule, 'nanoid').mockReturnValueOnce('1');
+
+      const data = [
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    text: 'Analyzing the request...',
+                    thought: true,
+                  },
+                  {
+                    text: 'Here is the answer: ',
+                  },
+                  {
+                    inlineData: {
+                      mimeType: 'image/png',
+                      data: 'base64data',
+                    },
+                  },
+                ],
+                role: 'model',
+              },
+              finishReason: 'STOP',
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 10,
+            candidatesTokenCount: 20,
+            totalTokenCount: 30,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 10 }],
+            thoughtsTokenCount: 5,
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+        },
+      ];
+
+      const mockGoogleStream = new ReadableStream({
+        start(controller) {
+          data.forEach((item) => {
+            controller.enqueue(item);
+          });
+          controller.close();
+        },
+      });
+
+      const protocolStream = GoogleGenerativeAIStream(mockGoogleStream);
+      const chunks = await decodeStreamChunks(protocolStream);
+
+      expect(chunks).toEqual(
+        [
+          'id: chat_1',
+          'event: reasoning_part',
+          'data: {"content":"Analyzing the request...","inReasoning":true,"partType":"text"}\n',
+
+          'id: chat_1',
+          'event: content_part',
+          'data: {"content":"Here is the answer: ","partType":"text"}\n',
+
+          'id: chat_1',
+          'event: content_part',
+          'data: {"content":"base64data","mimeType":"image/png","partType":"image"}\n',
+
+          'id: chat_1',
+          'event: stop',
+          'data: "STOP"\n',
+
+          'id: chat_1',
+          'event: usage',
+          'data: {"inputTextTokens":10,"outputImageTokens":0,"outputReasoningTokens":5,"outputTextTokens":20,"totalInputTokens":10,"totalOutputTokens":25,"totalTokens":30}\n',
+        ].map((i) => i + '\n'),
+      );
+    });
+  });
 });
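The frames asserted above follow the standard SSE layout (id:/event:/data: lines separated by a blank line). As a hedged sketch of the consuming side — the event names and payload fields come straight from the expectations above, but the callback names here are hypothetical, not the actual fetchSSE API:

// Hypothetical dispatcher for the new part events; illustrative only.
type StreamPart = {
  content: string;
  inReasoning?: boolean;
  mimeType?: string;
  partType: 'text' | 'image';
  thoughtSignature?: string;
};

const dispatchChunk = (
  event: string,
  data: string,
  handlers: {
    onContentPart: (part: StreamPart) => void;
    onReasoningPart: (part: StreamPart) => void;
  },
) => {
  switch (event) {
    case 'reasoning_part':
      // Payload is JSON rather than a bare string, unlike the old events
      handlers.onReasoningPart(JSON.parse(data));
      break;
    case 'content_part':
      handlers.onContentPart(JSON.parse(data));
      break;
    // 'text', 'reasoning', 'stop', 'usage', ... keep their existing handling
  }
};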
package/packages/model-runtime/src/core/streams/google/index.ts
@@ -7,6 +7,7 @@ import { convertGoogleAIUsage } from '../../usageConverters/google-ai';
 import {
   ChatPayloadForTransformStream,
   StreamContext,
+  StreamPartChunkData,
   StreamProtocolChunk,
   StreamToolCallChunkData,
   createCallbacksTransformer,
@@ -114,12 +115,121 @@
       .join('') || '';
 
   if (candidate) {
-    // First, check whether this is reasoning content (thought: true)
-    if (Array.isArray(candidate.content?.parts) && candidate.content.parts.length > 0) {
+    // Check if this response contains reasoning or multimodal content
+    const parts = candidate.content?.parts || [];
+    const hasReasoningParts = parts.some((p: any) => p.thought === true);
+    const hasImageParts = parts.some((p: any) => p.inlineData);
+    const hasThoughtSignature = parts.some((p: any) => p.thoughtSignature);
+    const hasThoughtsInMetadata = (usageMetadata as any)?.thoughtsTokenCount > 0;
+
+    // Check the model version to determine whether the new format should be used
+    const modelVersion = (chunk as any).modelVersion || '';
+    const isGemini25Plus = modelVersion.includes('gemini-2.5') || modelVersion.includes('gemini-3');
+    const isGemini3Model =
+      modelVersion.includes('gemini-3') || modelVersion.includes('image-preview');
+
+    // Check if this is the old single-image scenario (single image part with finishReason)
+    // This should use the legacy base64_image event format (only for gemini-2.0 and earlier)
+    const isSingleImageWithFinish =
+      parts.length === 1 &&
+      hasImageParts &&
+      !hasReasoningParts &&
+      candidate.finishReason &&
+      !isGemini25Plus;
+
+    // Check if this has grounding metadata (should use legacy text + grounding events)
+    const hasGroundingMetadata = !!candidate.groundingMetadata?.groundingChunks;
+
+    // Use content_part/reasoning_part events when:
+    // 1. There are reasoning parts in the current chunk (thought: true)
+    // 2. There are multiple parts with images (multimodal content)
+    // 3. There are thoughtSignatures in parts (reasoning metadata attached to content)
+    // 4. There is a thoughtsTokenCount in metadata (indicates the response contains reasoning)
+    // 5. This is a Gemini 3 model with image generation (always use the new format for consistency)
+    // BUT NOT for:
+    // - The legacy single-image scenario
+    // - The grounding metadata scenario (uses legacy text + grounding events)
+    const shouldUseMultimodalProcessing =
+      (hasReasoningParts ||
+        (hasImageParts && parts.length > 1) ||
+        hasThoughtSignature ||
+        hasThoughtsInMetadata ||
+        isGemini3Model) &&
+      !isSingleImageWithFinish &&
+      !hasGroundingMetadata;
+
+    // Process multimodal parts (text and images in reasoning or content)
+    if (
+      shouldUseMultimodalProcessing &&
+      Array.isArray(candidate.content?.parts) &&
+      candidate.content.parts.length > 0
+    ) {
+      const results: StreamProtocolChunk[] = [];
+
       for (const part of candidate.content.parts) {
+        // 1. Reasoning text part
         if (part && part.text && part.thought === true) {
-          return { data: part.text, id: context.id, type: 'reasoning' };
+          results.push({
+            data: {
+              content: part.text,
+              inReasoning: true,
+              partType: 'text',
+              thoughtSignature: part.thoughtSignature,
+            } as StreamPartChunkData,
+            id: context.id,
+            type: 'reasoning_part',
+          });
+        }
+
+        // 2. Reasoning image part
+        else if (part && part.inlineData && part.thought === true) {
+          results.push({
+            data: {
+              content: part.inlineData.data,
+              inReasoning: true,
+              mimeType: part.inlineData.mimeType,
+              partType: 'image',
+              thoughtSignature: part.thoughtSignature,
+            } as StreamPartChunkData,
+            id: context.id,
+            type: 'reasoning_part',
+          });
+        }
+
+        // 3. Content text part
+        else if (part && part.text && !part.thought) {
+          results.push({
+            data: {
+              content: part.text,
+              partType: 'text',
+              thoughtSignature: part.thoughtSignature,
+            } as StreamPartChunkData,
+            id: context.id,
+            type: 'content_part',
+          });
+        }
+
+        // 4. Content image part
+        else if (part && part.inlineData && !part.thought) {
+          results.push({
+            data: {
+              content: part.inlineData.data,
+              mimeType: part.inlineData.mimeType,
+              partType: 'image',
+              thoughtSignature: part.thoughtSignature,
+            } as StreamPartChunkData,
+            id: context.id,
+            type: 'content_part',
+          });
+        }
+      }
+
+      // If we found multimodal parts, return them along with usage chunks
+      if (results.length > 0) {
+        if (candidate.finishReason && usageMetadata) {
+          results.push(...usageChunks);
         }
+        return results;
       }
     }
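Read on its own, the routing rule above distills to a single predicate. The following restates the diff's logic as a pure function for clarity — GooglePart is a simplified stand-in for the SDK's part type, and the parameter list is an assumption of this sketch, not the actual signature in index.ts:

// Standalone restatement of shouldUseMultimodalProcessing (sketch).
interface GooglePart {
  inlineData?: { data: string; mimeType: string };
  text?: string;
  thought?: boolean;
  thoughtSignature?: string;
}

const shouldUseMultimodalProcessing = (
  parts: GooglePart[],
  modelVersion: string,
  thoughtsTokenCount: number | undefined,
  finishReason: string | undefined,
  hasGroundingMetadata: boolean,
): boolean => {
  const hasReasoningParts = parts.some((p) => p.thought === true);
  const hasImageParts = parts.some((p) => p.inlineData);
  const hasThoughtSignature = parts.some((p) => p.thoughtSignature);
  const hasThoughtsInMetadata = (thoughtsTokenCount ?? 0) > 0;

  const isGemini25Plus =
    modelVersion.includes('gemini-2.5') || modelVersion.includes('gemini-3');
  const isGemini3Model =
    modelVersion.includes('gemini-3') || modelVersion.includes('image-preview');

  // Legacy path: a lone image part arriving with a finishReason on a
  // pre-2.5 model keeps the old base64_image event.
  const isSingleImageWithFinish =
    parts.length === 1 && hasImageParts && !hasReasoningParts && !!finishReason && !isGemini25Plus;

  return (
    (hasReasoningParts ||
      (hasImageParts && parts.length > 1) ||
      hasThoughtSignature ||
      hasThoughtsInMetadata ||
      isGemini3Model) &&
    !isSingleImageWithFinish &&
    !hasGroundingMetadata
  );
};

For example, the single-chunk fixture in the last test above (one thought part, one text part, one inline image, finishReason 'STOP' on gemini-3-pro-image-preview) satisfies the predicate via hasReasoningParts, which is why every part in that test surfaces as a reasoning_part or content_part event.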