@lobehub/lobehub 2.0.0-next.115 → 2.0.0-next.117
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +50 -0
- package/changelog/v1.json +18 -0
- package/package.json +1 -1
- package/packages/context-engine/src/processors/MessageContent.ts +100 -6
- package/packages/context-engine/src/processors/__tests__/MessageContent.test.ts +239 -0
- package/packages/fetch-sse/src/fetchSSE.ts +30 -0
- package/packages/model-bank/src/aiModels/bedrock.ts +30 -2
- package/packages/model-runtime/src/const/models.ts +62 -24
- package/packages/model-runtime/src/core/contextBuilders/anthropic.ts +14 -0
- package/packages/model-runtime/src/core/contextBuilders/google.test.ts +78 -24
- package/packages/model-runtime/src/core/contextBuilders/google.ts +10 -2
- package/packages/model-runtime/src/core/parameterResolver.test.ts +34 -50
- package/packages/model-runtime/src/core/parameterResolver.ts +0 -41
- package/packages/model-runtime/src/core/streams/google/google-ai.test.ts +451 -20
- package/packages/model-runtime/src/core/streams/google/index.ts +113 -3
- package/packages/model-runtime/src/core/streams/protocol.ts +19 -0
- package/packages/model-runtime/src/index.ts +1 -0
- package/packages/model-runtime/src/providers/anthropic/index.ts +20 -32
- package/packages/model-runtime/src/providers/anthropic/resolveMaxTokens.ts +35 -0
- package/packages/model-runtime/src/providers/bedrock/index.test.ts +5 -7
- package/packages/model-runtime/src/providers/bedrock/index.ts +50 -11
- package/packages/types/src/message/common/base.ts +26 -0
- package/packages/types/src/message/common/metadata.ts +7 -0
- package/packages/utils/src/index.ts +1 -0
- package/packages/utils/src/multimodalContent.ts +25 -0
- package/src/components/Thinking/index.tsx +3 -3
- package/src/features/ChatList/Messages/Assistant/DisplayContent.tsx +44 -0
- package/src/features/ChatList/Messages/Assistant/MessageBody.tsx +96 -0
- package/src/features/ChatList/Messages/Assistant/Reasoning/index.tsx +26 -13
- package/src/features/ChatList/Messages/Assistant/index.tsx +8 -6
- package/src/features/ChatList/Messages/Default.tsx +4 -7
- package/src/features/ChatList/components/RichContentRenderer.tsx +35 -0
- package/src/store/agent/slices/chat/selectors/chatConfig.ts +4 -3
- package/src/store/chat/slices/aiChat/actions/streamingExecutor.ts +244 -17
- package/packages/const/src/models.ts +0 -93
- package/src/features/ChatList/Messages/Assistant/MessageContent.tsx +0 -78
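The central change in this release is a new pair of stream events, `content_part` and `reasoning_part`, emitted by the Google Generative AI stream transformer. As an orientation aid only, the sketch below shows the rough shape of the `StreamPartChunkData` payload those events carry, inferred from the payloads asserted in the test diff that follows; the actual definition lives in `packages/model-runtime/src/core/streams/protocol.ts` (+19 -0), which this diff view does not expand, so field optionality here is an assumption.

```ts
// Sketch only: the real definition is in packages/model-runtime/src/core/streams/protocol.ts
// (+19 -0, not expanded in this diff); fields are inferred from the payloads asserted below.
interface StreamPartChunkData {
  /** text content, or base64-encoded image data for image parts */
  content: string;
  /** e.g. 'image/jpeg' or 'image/png' — present on image parts */
  mimeType?: string;
  /** true when the part belongs to the model's reasoning ("thought") output */
  inReasoning?: boolean;
  partType: 'text' | 'image';
  /** Gemini thought signature attached to the part, when the API provides one */
  thoughtSignature?: string;
}

// Wire format, as asserted by the updated tests:
//   event: content_part
//   data: {"content":"Here is my answer","partType":"text","thoughtSignature":"sig123"}
//   event: reasoning_part
//   data: {"content":"Analyzing the request...","inReasoning":true,"partType":"text"}
```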
package/packages/model-runtime/src/core/streams/google/google-ai.test.ts

@@ -251,16 +251,16 @@ describe('GoogleGenerativeAIStream', () => {
     expect(chunks).toEqual(
       [
         'id: chat_1',
-        'event:
-        'data: "234"\n',
+        'event: content_part',
+        'data: {"content":"234","partType":"text"}\n',
 
         'id: chat_1',
         'event: text',
         'data: ""\n',
 
         'id: chat_1',
-        'event:
-        `data: "567890\\n"\n`,
+        'event: content_part',
+        `data: {"content":"567890\\n","partType":"text"}\n`,
         // stop
         'id: chat_1',
         'event: stop',
@@ -376,20 +376,20 @@ describe('GoogleGenerativeAIStream', () => {
     expect(chunks).toEqual(
       [
         'id: chat_1',
-        'event:
-        'data: "**Understanding the Conditional Logic**\\n\\n"\n',
+        'event: reasoning_part',
+        'data: {"content":"**Understanding the Conditional Logic**\\n\\n","inReasoning":true,"partType":"text"}\n',
 
         'id: chat_1',
-        'event:
-        `data: "**Finalizing Interpretation**\\n\\n"\n`,
+        'event: reasoning_part',
+        `data: {"content":"**Finalizing Interpretation**\\n\\n","inReasoning":true,"partType":"text"}\n`,
 
         'id: chat_1',
-        'event:
-        `data: "简单来说,"\n`,
+        'event: content_part',
+        `data: {"content":"简单来说,","partType":"text"}\n`,
 
         'id: chat_1',
-        'event:
-        `data: "文本内容。"\n`,
+        'event: content_part',
+        `data: {"content":"文本内容。","partType":"text"}\n`,
         // stop
         'id: chat_1',
         'event: stop',
@@ -471,12 +471,12 @@ describe('GoogleGenerativeAIStream', () => {
     expect(chunks).toEqual(
       [
         'id: chat_1',
-        'event:
-        'data: "234"\n',
+        'event: content_part',
+        'data: {"content":"234","partType":"text"}\n',
 
         'id: chat_1',
-        'event:
-        `data: "567890\\n"\n`,
+        'event: content_part',
+        `data: {"content":"567890\\n","partType":"text"}\n`,
         // stop
         'id: chat_1',
         'event: stop',
@@ -1166,8 +1166,8 @@ describe('GoogleGenerativeAIStream', () => {
       expect(chunks).toEqual(
         [
           'id: chat_1',
-          'event:
-          'data: "你好!很高兴为你服务。请问有什么我可以帮你的吗?\\n\\n无论是回答问题、协助写作、翻译,还是随便聊聊,我都随时待命!"\n',
+          'event: content_part',
+          'data: {"content":"你好!很高兴为你服务。请问有什么我可以帮你的吗?\\n\\n无论是回答问题、协助写作、翻译,还是随便聊聊,我都随时待命!","partType":"text"}\n',
 
           'id: chat_1',
           'event: stop',
@@ -1286,8 +1286,8 @@ describe('GoogleGenerativeAIStream', () => {
       expect(chunks).toEqual(
         [
           'id: chat_1',
-          'event:
-          'data: "Here is my answer"\n',
+          'event: content_part',
+          'data: {"content":"Here is my answer","partType":"text","thoughtSignature":"sig123"}\n',
 
           'id: chat_1',
           'event: stop',
@@ -1300,4 +1300,435 @@ describe('GoogleGenerativeAIStream', () => {
       );
     });
   });
+
+  describe('Multimodal parts (reasoning_part and content_part)', () => {
+    it('should handle mixed reasoning text and reasoning image parts', async () => {
+      vi.spyOn(uuidModule, 'nanoid').mockReturnValueOnce('1');
+
+      const data = [
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    text: "**Clarifying the Core Concept**\n\nI'm now focusing on the visual metaphor. I plan to depict Agent Runtime as a software environment that manages and executes agents' tasks, similar to how an operating system functions. I aim to create an informative and intuitive infographic.\n\n\n",
+                    thought: true,
+                  },
+                ],
+                role: 'model',
+              },
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 9,
+            totalTokenCount: 9,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 9 }],
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+          responseId: 'MRYkaeWsL5bSjMcPlsLRiQo',
+        },
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    text: '**Developing Visual Representation**\n\nI\'m now iterating on the visual representation. The "command center" metaphor is proving useful. I\'m focusing on the interplay of the core components: the central engine coordinates perception, memory, planning, and action, with tools and plugins as extensions. The goal is to clearly show the flow of information through the system, from input to output, using visual cues. The aesthetic aims for a futuristic, tech-inspired look with glowing lines and circuit board elements, using a palette of blues, purples, and oranges.\n\n\n',
+                    thought: true,
+                  },
+                ],
+                role: 'model',
+              },
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 9,
+            totalTokenCount: 9,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 9 }],
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+          responseId: 'MRYkaeWsL5bSjMcPlsLRiQo',
+        },
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    text: "**Constructing the Architecture**\n\nI'm presently building out the architecture of the infographic. I've broken down \"Agent Runtime\" into its core components and I'm designing the visual relationships between them. The central engine will be the focal point, with modules for perception, memory, planning, action, and tools radiating outwards. My aim is to illustrate the workflow from input to output clearly. I'll utilize arrows to represent the flow of data and instructions between each module.\n\n\n",
+                    thought: true,
+                  },
+                ],
+                role: 'model',
+              },
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 9,
+            totalTokenCount: 9,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 9 }],
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+          responseId: 'MRYkaeWsL5bSjMcPlsLRiQo',
+        },
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    inlineData: {
+                      mimeType: 'image/jpeg',
+                      data: '/9j/4AAQSkZJRgABAQEBLAEsAAD/2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQBiUgnjeAMHrkrk/mMk8k57YpwnCsBtzkcgk+/ONuO3QE4JJ3ccuJ373EbEg5KgDkAg4xtySNrHv14HYm9ne7au7JpLRfm+vn63Emtk9FZWv1t+Ntt9L9Wj//2Q==',
+                    },
+                    thought: true,
+                  },
+                ],
+                role: 'model',
+              },
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 9,
+            totalTokenCount: 9,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 9 }],
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+          responseId: 'MRYkaeWsL5bSjMcPlsLRiQo',
+        },
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    text: '**Constructing an Infographic**\n\nI\'ve successfully created an infographic depicting an "Agent Runtime." The design employs a tech-inspired circuit board aesthetic, placing the core engine at the center. I\'ve clearly represented six essential modules: perception, memory, planning, action, tools, and learning. Arrows and text annotations vividly illustrate the data flow and processing.\n\n\n',
+                    thought: true,
+                  },
+                ],
+                role: 'model',
+              },
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 9,
+            totalTokenCount: 9,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 9 }],
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+          responseId: 'MRYkaeWsL5bSjMcPlsLRiQo',
+        },
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    text: "**Defining Agent Runtime Modules**\n\nI'm making progress clarifying the architecture of an \"Agent Runtime\" system. I've designed an infographic with a circuit board aesthetic, centered on the core engine. Six key modules are now visualized: perception, memory, planning, action, tools, and learning. I've incorporated arrows and annotations to show data flow effectively.\n\n\n",
+                    thought: true,
+                  },
+                ],
+                role: 'model',
+              },
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 9,
+            totalTokenCount: 9,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 9 }],
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+          responseId: 'MRYkaeWsL5bSjMcPlsLRiQo',
+        },
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    inlineData: {
+                      mimeType: 'image/jpeg',
+                      data: '/9j/4AAQSkZJRgABAQEBLAEsAAD/2wBDAAEBAQEBAQEBA2Q==',
+                    },
+                    thoughtSignature:
+                      'EueybArjsmwB0e2Kby+QPRkacnmPuV+CqMr6tiey3M5BHLHgIiggQOMeFmnKzsoux6PI6dQMgmdbXE1OTLLcWUmUD1CgFn+C2VdI09FpHrVhxVAtSk/zFVSlsjfCuANxtkP8tCDppVZqIya0QYjzg5K1fEO0m42CZX2/MHyqL8NjzR0lT8ENdoV3RSaK2tXqPH45uIb6nGeBSuX1n2EUMzO',
+                  },
+                ],
+                role: 'model',
+              },
+              finishReason: 'STOP',
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 9,
+            candidatesTokenCount: 1358,
+            totalTokenCount: 1728,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 9 }],
+            candidatesTokensDetails: [{ modality: 'IMAGE', tokenCount: 1120 }],
+            thoughtsTokenCount: 361,
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+          responseId: 'MRYkaeWsL5bSjMcPlsLRiQo',
+        },
+      ];
+
+      const mockGoogleStream = new ReadableStream({
+        start(controller) {
+          data.forEach((item) => {
+            controller.enqueue(item);
+          });
+          controller.close();
+        },
+      });
+
+      const protocolStream = GoogleGenerativeAIStream(mockGoogleStream);
+      const chunks = await decodeStreamChunks(protocolStream);
+
+      expect(chunks).toEqual(
+        [
+          // First reasoning text
+          'id: chat_1',
+          'event: reasoning_part',
+          'data: {"content":"**Clarifying the Core Concept**\\n\\nI\'m now focusing on the visual metaphor. I plan to depict Agent Runtime as a software environment that manages and executes agents\' tasks, similar to how an operating system functions. I aim to create an informative and intuitive infographic.\\n\\n\\n","inReasoning":true,"partType":"text"}\n',
+
+          // Second reasoning text
+          'id: chat_1',
+          'event: reasoning_part',
+          'data: {"content":"**Developing Visual Representation**\\n\\nI\'m now iterating on the visual representation. The \\"command center\\" metaphor is proving useful. I\'m focusing on the interplay of the core components: the central engine coordinates perception, memory, planning, and action, with tools and plugins as extensions. The goal is to clearly show the flow of information through the system, from input to output, using visual cues. The aesthetic aims for a futuristic, tech-inspired look with glowing lines and circuit board elements, using a palette of blues, purples, and oranges.\\n\\n\\n","inReasoning":true,"partType":"text"}\n',
+
+          // Third reasoning text
+          'id: chat_1',
+          'event: reasoning_part',
+          'data: {"content":"**Constructing the Architecture**\\n\\nI\'m presently building out the architecture of the infographic. I\'ve broken down \\"Agent Runtime\\" into its core components and I\'m designing the visual relationships between them. The central engine will be the focal point, with modules for perception, memory, planning, action, and tools radiating outwards. My aim is to illustrate the workflow from input to output clearly. I\'ll utilize arrows to represent the flow of data and instructions between each module.\\n\\n\\n","inReasoning":true,"partType":"text"}\n',
+
+          // First reasoning image
+          'id: chat_1',
+          'event: reasoning_part',
+          'data: {"content":"/9j/4AAQSkZJRgABAQEBLAEsAAD/2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQBiUgnjeAMHrkrk/mMk8k57YpwnCsBtzkcgk+/ONuO3QE4JJ3ccuJ373EbEg5KgDkAg4xtySNrHv14HYm9ne7au7JpLRfm+vn63Emtk9FZWv1t+Ntt9L9Wj//2Q==","inReasoning":true,"mimeType":"image/jpeg","partType":"image"}\n',
+
+          // Fourth reasoning text
+          'id: chat_1',
+          'event: reasoning_part',
+          'data: {"content":"**Constructing an Infographic**\\n\\nI\'ve successfully created an infographic depicting an \\"Agent Runtime.\\" The design employs a tech-inspired circuit board aesthetic, placing the core engine at the center. I\'ve clearly represented six essential modules: perception, memory, planning, action, tools, and learning. Arrows and text annotations vividly illustrate the data flow and processing.\\n\\n\\n","inReasoning":true,"partType":"text"}\n',
+
+          // Fifth reasoning text
+          'id: chat_1',
+          'event: reasoning_part',
+          'data: {"content":"**Defining Agent Runtime Modules**\\n\\nI\'m making progress clarifying the architecture of an \\"Agent Runtime\\" system. I\'ve designed an infographic with a circuit board aesthetic, centered on the core engine. Six key modules are now visualized: perception, memory, planning, action, tools, and learning. I\'ve incorporated arrows and annotations to show data flow effectively.\\n\\n\\n","inReasoning":true,"partType":"text"}\n',
+
+          // Content image (with thoughtSignature but not thought:true)
+          'id: chat_1',
+          'event: content_part',
+          'data: {"content":"/9j/4AAQSkZJRgABAQEBLAEsAAD/2wBDAAEBAQEBAQEBA2Q==","mimeType":"image/jpeg","partType":"image","thoughtSignature":"EueybArjsmwB0e2Kby+QPRkacnmPuV+CqMr6tiey3M5BHLHgIiggQOMeFmnKzsoux6PI6dQMgmdbXE1OTLLcWUmUD1CgFn+C2VdI09FpHrVhxVAtSk/zFVSlsjfCuANxtkP8tCDppVZqIya0QYjzg5K1fEO0m42CZX2/MHyqL8NjzR0lT8ENdoV3RSaK2tXqPH45uIb6nGeBSuX1n2EUMzO"}\n',
+
+          // stop
+          'id: chat_1',
+          'event: stop',
+          'data: "STOP"\n',
+
+          // usage
+          'id: chat_1',
+          'event: usage',
+          'data: {"inputTextTokens":9,"outputImageTokens":1120,"outputReasoningTokens":361,"outputTextTokens":238,"totalInputTokens":9,"totalOutputTokens":1719,"totalTokens":1728}\n',
+        ].map((i) => i + '\n'),
+      );
+    });
+
+    it('should handle content text and image parts without reasoning', async () => {
+      vi.spyOn(uuidModule, 'nanoid').mockReturnValueOnce('1');
+
+      const data = [
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    text: 'This is the description: ',
+                  },
+                ],
+                role: 'model',
+              },
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 5,
+            totalTokenCount: 5,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 5 }],
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+        },
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    inlineData: {
+                      mimeType: 'image/png',
+                      data: 'iVBORw0KGgoAAAANSUhEUgAAAAUA',
+                    },
+                  },
+                ],
+                role: 'model',
+              },
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 5,
+            totalTokenCount: 5,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 5 }],
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+        },
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    text: ' an example.',
+                  },
+                ],
+                role: 'model',
+              },
+              finishReason: 'STOP',
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 5,
+            candidatesTokenCount: 10,
+            totalTokenCount: 15,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 5 }],
+            candidatesTokensDetails: [{ modality: 'TEXT', tokenCount: 10 }],
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+        },
+      ];
+
+      const mockGoogleStream = new ReadableStream({
+        start(controller) {
+          data.forEach((item) => {
+            controller.enqueue(item);
+          });
+          controller.close();
+        },
+      });
+
+      const protocolStream = GoogleGenerativeAIStream(mockGoogleStream);
+      const chunks = await decodeStreamChunks(protocolStream);
+
+      expect(chunks).toEqual(
+        [
+          'id: chat_1',
+          'event: content_part',
+          'data: {"content":"This is the description: ","partType":"text"}\n',
+
+          'id: chat_1',
+          'event: content_part',
+          'data: {"content":"iVBORw0KGgoAAAANSUhEUgAAAAUA","mimeType":"image/png","partType":"image"}\n',
+
+          'id: chat_1',
+          'event: content_part',
+          'data: {"content":" an example.","partType":"text"}\n',
+
+          'id: chat_1',
+          'event: stop',
+          'data: "STOP"\n',
+
+          'id: chat_1',
+          'event: usage',
+          'data: {"inputTextTokens":5,"outputImageTokens":0,"outputTextTokens":10,"totalInputTokens":5,"totalOutputTokens":10,"totalTokens":15}\n',
+        ].map((i) => i + '\n'),
+      );
+    });
+
+    it('should handle mixed reasoning and content parts in single chunk', async () => {
+      vi.spyOn(uuidModule, 'nanoid').mockReturnValueOnce('1');
+
+      const data = [
+        {
+          candidates: [
+            {
+              content: {
+                parts: [
+                  {
+                    text: 'Analyzing the request...',
+                    thought: true,
+                  },
+                  {
+                    text: 'Here is the answer: ',
+                  },
+                  {
+                    inlineData: {
+                      mimeType: 'image/png',
+                      data: 'base64data',
+                    },
+                  },
+                ],
+                role: 'model',
+              },
+              finishReason: 'STOP',
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 10,
+            candidatesTokenCount: 20,
+            totalTokenCount: 30,
+            promptTokensDetails: [{ modality: 'TEXT', tokenCount: 10 }],
+            thoughtsTokenCount: 5,
+          },
+          modelVersion: 'gemini-3-pro-image-preview',
+        },
+      ];
+
+      const mockGoogleStream = new ReadableStream({
+        start(controller) {
+          data.forEach((item) => {
+            controller.enqueue(item);
+          });
+          controller.close();
+        },
+      });
+
+      const protocolStream = GoogleGenerativeAIStream(mockGoogleStream);
+      const chunks = await decodeStreamChunks(protocolStream);
+
+      expect(chunks).toEqual(
+        [
+          'id: chat_1',
+          'event: reasoning_part',
+          'data: {"content":"Analyzing the request...","inReasoning":true,"partType":"text"}\n',
+
+          'id: chat_1',
+          'event: content_part',
+          'data: {"content":"Here is the answer: ","partType":"text"}\n',
+
+          'id: chat_1',
+          'event: content_part',
+          'data: {"content":"base64data","mimeType":"image/png","partType":"image"}\n',
+
+          'id: chat_1',
+          'event: stop',
+          'data: "STOP"\n',
+
+          'id: chat_1',
+          'event: usage',
+          'data: {"inputTextTokens":10,"outputImageTokens":0,"outputReasoningTokens":5,"outputTextTokens":20,"totalInputTokens":10,"totalOutputTokens":25,"totalTokens":30}\n',
+        ].map((i) => i + '\n'),
+      );
+    });
+  });
 });
package/packages/model-runtime/src/core/streams/google/index.ts

@@ -7,6 +7,7 @@ import { convertGoogleAIUsage } from '../../usageConverters/google-ai';
 import {
   ChatPayloadForTransformStream,
   StreamContext,
+  StreamPartChunkData,
   StreamProtocolChunk,
   StreamToolCallChunkData,
   createCallbacksTransformer,
@@ -114,12 +114,121 @@ const transformGoogleGenerativeAIStream = (
       .join('') || '';
 
   if (candidate) {
-    //
-
+    // Check if this response contains reasoning or multimodal content
+    const parts = candidate.content?.parts || [];
+    const hasReasoningParts = parts.some((p: any) => p.thought === true);
+    const hasImageParts = parts.some((p: any) => p.inlineData);
+    const hasThoughtSignature = parts.some((p: any) => p.thoughtSignature);
+    const hasThoughtsInMetadata = (usageMetadata as any)?.thoughtsTokenCount > 0;
+
+    // Check model version to determine if new format should be used
+    const modelVersion = (chunk as any).modelVersion || '';
+    const isGemini25Plus = modelVersion.includes('gemini-2.5') || modelVersion.includes('gemini-3');
+    const isGemini3Model =
+      modelVersion.includes('gemini-3') || modelVersion.includes('image-preview');
+
+    // Check if this is the old single-image scenario (single image part with finishReason)
+    // This should use the legacy base64_image event format (only for gemini-2.0 and earlier)
+    const isSingleImageWithFinish =
+      parts.length === 1 &&
+      hasImageParts &&
+      !hasReasoningParts &&
+      candidate.finishReason &&
+      !isGemini25Plus;
+
+    // Check if this has grounding metadata (should use legacy text + grounding events)
+    const hasGroundingMetadata = !!candidate.groundingMetadata?.groundingChunks;
+
+    // Use content_part/reasoning_part events when:
+    // 1. There are reasoning parts in current chunk (thought: true)
+    // 2. There are multiple parts with images (multimodal content)
+    // 3. There are thoughtSignature in parts (reasoning metadata attached to content)
+    // 4. There is thoughtsTokenCount in metadata (indicates response contains reasoning)
+    // 5. This is Gemini 3 model with image generation (always use new format for consistency)
+    // BUT NOT for:
+    // - The legacy single-image scenario
+    // - Grounding metadata scenario (uses legacy text + grounding events)
+    const shouldUseMultimodalProcessing =
+      (hasReasoningParts ||
+        (hasImageParts && parts.length > 1) ||
+        hasThoughtSignature ||
+        hasThoughtsInMetadata ||
+        isGemini3Model) &&
+      !isSingleImageWithFinish &&
+      !hasGroundingMetadata;
+
+    // Process multimodal parts (text and images in reasoning or content)
+    if (
+      shouldUseMultimodalProcessing &&
+      Array.isArray(candidate.content?.parts) &&
+      candidate.content.parts.length > 0
+    ) {
+      const results: StreamProtocolChunk[] = [];
+
       for (const part of candidate.content.parts) {
+        // 1. Reasoning text part
         if (part && part.text && part.thought === true) {
-
+          results.push({
+            data: {
+              content: part.text,
+              inReasoning: true,
+              partType: 'text',
+              thoughtSignature: part.thoughtSignature,
+            } as StreamPartChunkData,
+            id: context.id,
+            type: 'reasoning_part',
+          });
+        }
+
+        // 2. Reasoning image part
+        else if (part && part.inlineData && part.thought === true) {
+          results.push({
+            data: {
+              content: part.inlineData.data,
+              inReasoning: true,
+              mimeType: part.inlineData.mimeType,
+              partType: 'image',
+              thoughtSignature: part.thoughtSignature,
+            } as StreamPartChunkData,
+            id: context.id,
+            type: 'reasoning_part',
+          });
+        }
+
+        // 3. Content text part
+        else if (part && part.text && !part.thought) {
+          results.push({
+            data: {
+              content: part.text,
+              partType: 'text',
+              thoughtSignature: part.thoughtSignature,
+            } as StreamPartChunkData,
+            id: context.id,
+            type: 'content_part',
+          });
+        }
+
+        // 4. Content image part
+        else if (part && part.inlineData && !part.thought) {
+          results.push({
+            data: {
+              content: part.inlineData.data,
+              mimeType: part.inlineData.mimeType,
+              partType: 'image',
+              thoughtSignature: part.thoughtSignature,
+            } as StreamPartChunkData,
+            id: context.id,
+            type: 'content_part',
+          });
+        }
+      }
+
+      // If we found multimodal parts, return them with usage chunks
+      if (results.length > 0) {
+        if (candidate.finishReason && usageMetadata) {
+          results.push(...usageChunks);
         }
+        return results;
       }
     }
 
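For readers wiring a client against these events, a minimal, hypothetical dispatcher is sketched below. The event names (`reasoning_part`, `content_part`, `stop`) and payload fields match what the updated tests assert; the dispatcher itself, including the local `StreamPartChunkData` type alias and the logging, is illustrative and not part of the package.

```ts
// Hypothetical consumer sketch; event names and payload fields mirror the tests above,
// but this dispatcher is not code from @lobehub/lobehub.
type StreamPartChunkData = {
  content: string;
  partType: 'text' | 'image';
  mimeType?: string;
  inReasoning?: boolean;
  thoughtSignature?: string;
};

function handleEvent(event: string, data: unknown) {
  switch (event) {
    case 'reasoning_part':
    case 'content_part': {
      const part = data as StreamPartChunkData;
      if (part.partType === 'image') {
        // image parts carry base64 data plus a mimeType such as 'image/jpeg'
        console.log(`${event}: <${part.mimeType}, ${part.content.length} base64 chars>`);
      } else {
        console.log(`${event}: ${part.content}`);
      }
      break;
    }
    case 'stop': {
      console.log(`finished: ${data}`); // e.g. "STOP"
      break;
    }
  }
}

// Example: feed it payloads of the shape shown in the tests above
handleEvent('content_part', { content: 'Here is the answer: ', partType: 'text' });
handleEvent('stop', 'STOP');
```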