@lobehub/chat 1.131.4 → 1.132.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/CHANGELOG.md +25 -0
  2. package/changelog/v1.json +9 -0
  3. package/package.json +1 -1
  4. package/packages/context-engine/src/processors/MessageContent.ts +45 -10
  5. package/packages/context-engine/src/processors/__tests__/MessageContent.test.ts +179 -1
  6. package/packages/database/src/models/message.ts +9 -1
  7. package/packages/model-bank/src/aiModels/google.ts +7 -0
  8. package/packages/model-runtime/src/providers/google/index.ts +31 -8
  9. package/packages/model-runtime/src/types/chat.ts +6 -0
  10. package/packages/prompts/src/prompts/files/index.test.ts +148 -3
  11. package/packages/prompts/src/prompts/files/index.ts +17 -5
  12. package/packages/prompts/src/prompts/files/video.ts +17 -0
  13. package/packages/types/src/agent/index.ts +1 -1
  14. package/packages/types/src/message/chat.ts +2 -4
  15. package/packages/types/src/message/index.ts +1 -0
  16. package/packages/types/src/message/video.ts +5 -0
  17. package/packages/utils/src/client/index.ts +1 -0
  18. package/packages/utils/src/client/videoValidation.test.ts +53 -0
  19. package/packages/utils/src/client/videoValidation.ts +21 -0
  20. package/packages/utils/src/parseModels.ts +4 -0
  21. package/src/app/[variants]/(main)/chat/(workspace)/@conversation/features/ChatInput/useSend.ts +9 -4
  22. package/src/components/ModelSelect/index.tsx +14 -2
  23. package/src/features/ChatInput/ActionBar/Upload/ClientMode.tsx +7 -0
  24. package/src/features/ChatInput/ActionBar/Upload/ServerMode.tsx +29 -3
  25. package/src/features/ChatInput/components/UploadDetail/UploadStatus.tsx +1 -1
  26. package/src/features/Conversation/Messages/Assistant/index.tsx +4 -1
  27. package/src/features/Conversation/Messages/User/VideoFileListViewer.tsx +31 -0
  28. package/src/features/Conversation/Messages/User/index.tsx +3 -1
  29. package/src/hooks/useModelSupportVideo.ts +10 -0
  30. package/src/locales/default/chat.ts +4 -0
  31. package/src/locales/default/components.ts +1 -0
  32. package/src/services/chat/contextEngineering.test.ts +0 -1
  33. package/src/services/chat/contextEngineering.ts +3 -1
  34. package/src/services/chat/helper.ts +4 -0
  35. package/src/services/upload.ts +1 -1
  36. package/src/store/aiInfra/slices/aiModel/selectors.ts +7 -0
  37. package/src/store/chat/slices/aiChat/actions/generateAIChatV2.ts +22 -0
  38. package/src/store/chat/slices/message/action.ts +15 -14
package/CHANGELOG.md CHANGED
@@ -2,6 +2,31 @@
2
2
 
3
3
  # Changelog
4
4
 
5
+ ## [Version 1.132.0](https://github.com/lobehub/lobe-chat/compare/v1.131.4...v1.132.0)
6
+
7
+ <sup>Released on **2025-09-21**</sup>
8
+
9
+ #### ✨ Features
10
+
11
+ - **misc**: Support google video understanding.
12
+
13
+ <br/>
14
+
15
+ <details>
16
+ <summary><kbd>Improvements and Fixes</kbd></summary>
17
+
18
+ #### What's improved
19
+
20
+ - **misc**: Support google video understanding, closes [#8761](https://github.com/lobehub/lobe-chat/issues/8761) ([f02d43b](https://github.com/lobehub/lobe-chat/commit/f02d43b))
21
+
22
+ </details>
23
+
24
+ <div align="right">
25
+
26
+ [![](https://img.shields.io/badge/-BACK_TO_TOP-151515?style=flat-square)](#readme-top)
27
+
28
+ </div>
29
+
5
30
  ### [Version 1.131.4](https://github.com/lobehub/lobe-chat/compare/v1.131.3...v1.131.4)
6
31
 
7
32
  <sup>Released on **2025-09-21**</sup>
package/changelog/v1.json CHANGED
@@ -1,4 +1,13 @@
1
1
  [
2
+ {
3
+ "children": {
4
+ "features": [
5
+ "Support google video understanding."
6
+ ]
7
+ },
8
+ "date": "2025-09-21",
9
+ "version": "1.132.0"
10
+ },
2
11
  {
3
12
  "children": {
4
13
  "improvements": [
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lobehub/chat",
3
- "version": "1.131.4",
3
+ "version": "1.132.0",
4
4
  "description": "Lobe Chat - an open-source, high-performance chatbot framework that supports speech synthesis, multimodal, and extensible Function Call plugin system. Supports one-click free deployment of your private ChatGPT/LLM web application.",
5
5
  "keywords": [
6
6
  "framework",
package/packages/context-engine/src/processors/MessageContent.ts CHANGED
@@ -17,6 +17,8 @@ export interface FileContextConfig {
17
17
  export interface MessageContentConfig {
18
18
  /** File context configuration */
19
19
  fileContext?: FileContextConfig;
20
+ /** Function to check if video is supported */
21
+ isCanUseVideo?: (model: string, provider: string) => boolean | undefined;
20
22
  /** Function to check if vision is supported */
21
23
  isCanUseVision?: (model: string, provider: string) => boolean | undefined;
22
24
  /** Model name */
@@ -33,7 +35,10 @@ export interface UserMessageContentPart {
33
35
  signature?: string;
34
36
  text?: string;
35
37
  thinking?: string;
36
- type: 'text' | 'image_url' | 'thinking';
38
+ type: 'text' | 'image_url' | 'thinking' | 'video_url';
39
+ video_url?: {
40
+ url: string;
41
+ };
37
42
  }
38
43
 
39
44
  /**
@@ -104,12 +109,13 @@ export class MessageContentProcessor extends BaseProcessor {
104
109
  * Process user message content
105
110
  */
106
111
  private async processUserMessage(message: any): Promise<any> {
107
- // Check if images or files need processing
112
+ // Check if images, videos or files need processing
108
113
  const hasImages = message.imageList && message.imageList.length > 0;
114
+ const hasVideos = message.videoList && message.videoList.length > 0;
109
115
  const hasFiles = message.fileList && message.fileList.length > 0;
110
116
 
111
- // If no images and files, return plain text content directly
112
- if (!hasImages && !hasFiles) {
117
+ // If no images, videos and files, return plain text content directly
118
+ if (!hasImages && !hasVideos && !hasFiles) {
113
119
  return {
114
120
  ...message,
115
121
  content: message.content,
@@ -121,12 +127,13 @@ export class MessageContentProcessor extends BaseProcessor {
121
127
  // Add text content
122
128
  let textContent = message.content || '';
123
129
 
124
- // Add file context (if file context is enabled and has files or images)
125
- if ((hasFiles || hasImages) && this.config.fileContext?.enabled) {
130
+ // Add file context (if file context is enabled and has files, images or videos)
131
+ if ((hasFiles || hasImages || hasVideos) && this.config.fileContext?.enabled) {
126
132
  const filesContext = filesPrompts({
127
133
  addUrl: this.config.fileContext.includeFileUrl ?? true,
128
134
  fileList: message.fileList,
129
- imageList: message.imageList,
135
+ imageList: message.imageList || [],
136
+ videoList: message.videoList || [],
130
137
  });
131
138
 
132
139
  if (filesContext) {
@@ -148,17 +155,26 @@ export class MessageContentProcessor extends BaseProcessor {
148
155
  contentParts.push(...imageContentParts);
149
156
  }
150
157
 
158
+ // Process video content
159
+ if (hasVideos && this.config.isCanUseVideo?.(this.config.model, this.config.provider)) {
160
+ const videoContentParts = await this.processVideoList(message.videoList || []);
161
+ contentParts.push(...videoContentParts);
162
+ }
163
+
151
164
  // 明确返回的字段,只保留必要的消息字段
152
- const hasFileContext = (hasFiles || hasImages) && this.config.fileContext?.enabled;
165
+ const hasFileContext = (hasFiles || hasImages || hasVideos) && this.config.fileContext?.enabled;
153
166
  const hasVisionContent =
154
167
  hasImages && this.config.isCanUseVision?.(this.config.model, this.config.provider);
168
+ const hasVideoContent =
169
+ hasVideos && this.config.isCanUseVideo?.(this.config.model, this.config.provider);
155
170
 
156
- // 如果只有文本内容且没有添加文件上下文也没有视觉内容,返回纯文本
171
+ // 如果只有文本内容且没有添加文件上下文也没有视觉/视频内容,返回纯文本
157
172
  if (
158
173
  contentParts.length === 1 &&
159
174
  contentParts[0].type === 'text' &&
160
175
  !hasFileContext &&
161
- !hasVisionContent
176
+ !hasVisionContent &&
177
+ !hasVideoContent
162
178
  ) {
163
179
  return {
164
180
  content: contentParts[0].text,
@@ -274,6 +290,22 @@ export class MessageContentProcessor extends BaseProcessor {
274
290
  );
275
291
  }
276
292
 
293
+ /**
294
+ * 处理视频列表
295
+ */
296
+ private async processVideoList(videoList: any[]): Promise<UserMessageContentPart[]> {
297
+ if (!videoList || videoList.length === 0) {
298
+ return [];
299
+ }
300
+
301
+ return videoList.map((video) => {
302
+ return {
303
+ type: 'video_url',
304
+ video_url: { url: video.url },
305
+ } as UserMessageContentPart;
306
+ });
307
+ }
308
+
277
309
  /**
278
310
  * 验证内容部分格式
279
311
  */
@@ -290,6 +322,9 @@ export class MessageContentProcessor extends BaseProcessor {
290
322
  case 'thinking': {
291
323
  return !!(part.thinking && part.signature);
292
324
  }
325
+ case 'video_url': {
326
+ return !!(part.video_url && part.video_url.url);
327
+ }
293
328
  default: {
294
329
  return false;
295
330
  }
package/packages/context-engine/src/processors/__tests__/MessageContent.test.ts CHANGED
@@ -1,4 +1,4 @@
1
- import { ChatImageItem, ChatMessage } from '@lobechat/types';
1
+ import { ChatImageItem, ChatMessage, ChatVideoItem } from '@lobechat/types';
2
2
  import { describe, expect, it, vi } from 'vitest';
3
3
 
4
4
  import type { PipelineContext } from '../../types';
@@ -26,6 +26,7 @@ const createContext = (messages: ChatMessage[]): PipelineContext => ({
26
26
  });
27
27
 
28
28
  const mockIsCanUseVision = vi.fn();
29
+ const mockIsCanUseVideo = vi.fn();
29
30
 
30
31
  describe('MessageContentProcessor', () => {
31
32
  describe('Image processing functionality', () => {
@@ -391,4 +392,181 @@ describe('MessageContentProcessor', () => {
391
392
  expect(result.metadata.assistantMessagesProcessed).toBe(1);
392
393
  });
393
394
  });
395
+
396
+ describe('Video processing functionality', () => {
397
+ it('should return empty video content parts if model cannot use video', async () => {
398
+ mockIsCanUseVideo.mockReturnValue(false);
399
+
400
+ const processor = new MessageContentProcessor({
401
+ model: 'any-model',
402
+ provider: 'any-provider',
403
+ isCanUseVideo: mockIsCanUseVideo,
404
+ fileContext: { enabled: false },
405
+ });
406
+
407
+ const messages: ChatMessage[] = [
408
+ {
409
+ id: 'test',
410
+ role: 'user',
411
+ content: 'Hello',
412
+ videoList: [{ url: 'video_url', alt: 'test video', id: 'test' } as ChatVideoItem],
413
+ createdAt: Date.now(),
414
+ updatedAt: Date.now(),
415
+ meta: {},
416
+ },
417
+ ];
418
+
419
+ const result = await processor.process(createContext(messages));
420
+
421
+ // Should return plain text when video is not supported
422
+ expect(result.messages[0].content).toBe('Hello');
423
+ });
424
+
425
+ it('should process videos if model can use video', async () => {
426
+ mockIsCanUseVideo.mockReturnValue(true);
427
+
428
+ const processor = new MessageContentProcessor({
429
+ model: 'gpt-4-vision',
430
+ provider: 'openai',
431
+ isCanUseVideo: mockIsCanUseVideo,
432
+ fileContext: { enabled: false },
433
+ });
434
+
435
+ const messages: ChatMessage[] = [
436
+ {
437
+ id: 'test',
438
+ role: 'user',
439
+ content: 'Hello',
440
+ videoList: [
441
+ { url: 'http://example.com/video.mp4', alt: 'test video', id: 'test1' },
442
+ { url: 'http://example.com/video2.mp4', alt: 'test video 2', id: 'test2' },
443
+ ] as ChatVideoItem[],
444
+ createdAt: Date.now(),
445
+ updatedAt: Date.now(),
446
+ meta: {},
447
+ },
448
+ ];
449
+
450
+ const result = await processor.process(createContext(messages));
451
+
452
+ const content = result.messages[0].content as any[];
453
+ expect(content).toHaveLength(3); // text + 2 videos
454
+ expect(content[0].type).toBe('text');
455
+ expect(content[0].text).toBe('Hello');
456
+ expect(content[1].type).toBe('video_url');
457
+ expect(content[1].video_url.url).toBe('http://example.com/video.mp4');
458
+ expect(content[2].type).toBe('video_url');
459
+ expect(content[2].video_url.url).toBe('http://example.com/video2.mp4');
460
+ });
461
+
462
+ it('should handle video disabled scenario correctly', async () => {
463
+ mockIsCanUseVideo.mockReturnValue(false);
464
+
465
+ const processor = new MessageContentProcessor({
466
+ model: 'text-model',
467
+ provider: 'openai',
468
+ isCanUseVideo: mockIsCanUseVideo,
469
+ fileContext: { enabled: false },
470
+ });
471
+
472
+ const messages: ChatMessage[] = [
473
+ {
474
+ id: 'test',
475
+ role: 'user',
476
+ content: 'Analyze this video',
477
+ videoList: [
478
+ { url: 'http://example.com/video.mp4', alt: 'test video', id: 'test' },
479
+ ] as ChatVideoItem[],
480
+ createdAt: Date.now(),
481
+ updatedAt: Date.now(),
482
+ meta: {},
483
+ },
484
+ ];
485
+
486
+ const result = await processor.process(createContext(messages));
487
+
488
+ // Should return plain text only when video not supported
489
+ expect(result.messages[0].content).toBe('Analyze this video');
490
+ });
491
+
492
+ it('should include videos in file context when enabled', async () => {
493
+ mockIsCanUseVideo.mockReturnValue(false); // Video processing disabled but file context enabled
494
+
495
+ const processor = new MessageContentProcessor({
496
+ model: 'gpt-4',
497
+ provider: 'openai',
498
+ isCanUseVideo: mockIsCanUseVideo,
499
+ fileContext: { enabled: true, includeFileUrl: true },
500
+ });
501
+
502
+ const messages: ChatMessage[] = [
503
+ {
504
+ id: 'test',
505
+ role: 'user',
506
+ content: 'Hello',
507
+ videoList: [
508
+ {
509
+ id: 'video1',
510
+ url: 'http://example.com/video.mp4',
511
+ alt: 'Test video',
512
+ },
513
+ ] as ChatVideoItem[],
514
+ createdAt: Date.now(),
515
+ updatedAt: Date.now(),
516
+ meta: {},
517
+ },
518
+ ];
519
+
520
+ const result = await processor.process(createContext(messages));
521
+
522
+ // Should return structured content when has videos and file context enabled
523
+ expect(Array.isArray(result.messages[0].content)).toBe(true);
524
+ const content = result.messages[0].content as any[];
525
+ expect(content).toHaveLength(1);
526
+ expect(content[0].type).toBe('text');
527
+ expect(content[0].text).toContain('SYSTEM CONTEXT');
528
+ expect(content[0].text).toContain('Hello');
529
+ });
530
+
531
+ it('should handle mixed images and videos correctly', async () => {
532
+ mockIsCanUseVision.mockReturnValue(true);
533
+ mockIsCanUseVideo.mockReturnValue(true);
534
+
535
+ const processor = new MessageContentProcessor({
536
+ model: 'gpt-4-vision',
537
+ provider: 'openai',
538
+ isCanUseVideo: mockIsCanUseVideo,
539
+ isCanUseVision: mockIsCanUseVision,
540
+ fileContext: { enabled: false },
541
+ });
542
+
543
+ const messages: ChatMessage[] = [
544
+ {
545
+ id: 'test',
546
+ role: 'user',
547
+ content: 'Analyze these media files',
548
+ imageList: [
549
+ { url: 'http://example.com/image.jpg', alt: 'test image', id: 'img1' },
550
+ ] as ChatImageItem[],
551
+ videoList: [
552
+ { url: 'http://example.com/video.mp4', alt: 'test video', id: 'vid1' },
553
+ ] as ChatVideoItem[],
554
+ createdAt: Date.now(),
555
+ updatedAt: Date.now(),
556
+ meta: {},
557
+ },
558
+ ];
559
+
560
+ const result = await processor.process(createContext(messages));
561
+
562
+ const content = result.messages[0].content as any[];
563
+ expect(content).toHaveLength(3); // text + image + video
564
+ expect(content[0].type).toBe('text');
565
+ expect(content[0].text).toBe('Analyze these media files');
566
+ expect(content[1].type).toBe('image_url');
567
+ expect(content[1].image_url.url).toBe('http://example.com/image.jpg');
568
+ expect(content[2].type).toBe('video_url');
569
+ expect(content[2].video_url.url).toBe('http://example.com/video.mp4');
570
+ });
571
+ });
394
572
  });
package/packages/database/src/models/message.ts CHANGED
@@ -5,6 +5,7 @@ import {
5
5
  ChatTTS,
6
6
  ChatToolPayload,
7
7
  ChatTranslate,
8
+ ChatVideoItem,
8
9
  CreateMessageParams,
9
10
  MessageItem,
10
11
  ModelRankItem,
@@ -175,7 +176,10 @@ export class MessageModel {
175
176
  }
176
177
 
177
178
  const imageList = relatedFileList.filter((i) => (i.fileType || '').startsWith('image'));
178
- const fileList = relatedFileList.filter((i) => !(i.fileType || '').startsWith('image'));
179
+ const videoList = relatedFileList.filter((i) => (i.fileType || '').startsWith('video'));
180
+ const fileList = relatedFileList.filter(
181
+ (i) => !(i.fileType || '').startsWith('image') && !(i.fileType || '').startsWith('video'),
182
+ );
179
183
 
180
184
  // 3. get relative file chunks
181
185
  const chunksList = await this.db
@@ -251,6 +255,10 @@ export class MessageModel {
251
255
  ragQuery: messageQuery?.rewriteQuery,
252
256
  ragQueryId: messageQuery?.id,
253
257
  ragRawQuery: messageQuery?.userQuery,
258
+ videoList: videoList
259
+ .filter((relation) => relation.messageId === item.id)
260
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
261
+ .map<ChatVideoItem>(({ id, url, name }) => ({ alt: name!, id, url })),
254
262
  } as unknown as ChatMessage;
255
263
  },
256
264
  );
package/packages/model-bank/src/aiModels/google.ts CHANGED
@@ -7,6 +7,7 @@ const googleChatModels: AIChatModelCard[] = [
7
7
  functionCall: true,
8
8
  reasoning: true,
9
9
  search: true,
10
+ video: true,
10
11
  vision: true,
11
12
  },
12
13
  contextWindowTokens: 1_048_576 + 65_536,
@@ -60,6 +61,7 @@ const googleChatModels: AIChatModelCard[] = [
60
61
  functionCall: true,
61
62
  reasoning: true,
62
63
  search: true,
64
+ video: true,
63
65
  vision: true,
64
66
  },
65
67
  contextWindowTokens: 1_048_576 + 65_536,
@@ -112,6 +114,7 @@ const googleChatModels: AIChatModelCard[] = [
112
114
  functionCall: true,
113
115
  reasoning: true,
114
116
  search: true,
117
+ video: true,
115
118
  vision: true,
116
119
  },
117
120
  contextWindowTokens: 1_048_576 + 65_536,
@@ -163,6 +166,7 @@ const googleChatModels: AIChatModelCard[] = [
163
166
  functionCall: true,
164
167
  reasoning: true,
165
168
  search: true,
169
+ video: true,
166
170
  vision: true,
167
171
  },
168
172
  contextWindowTokens: 1_048_576 + 65_536,
@@ -191,6 +195,7 @@ const googleChatModels: AIChatModelCard[] = [
191
195
  functionCall: true,
192
196
  reasoning: true,
193
197
  search: true,
198
+ video: true,
194
199
  vision: true,
195
200
  },
196
201
  contextWindowTokens: 1_048_576 + 65_536,
@@ -240,6 +245,7 @@ const googleChatModels: AIChatModelCard[] = [
240
245
  functionCall: true,
241
246
  reasoning: true,
242
247
  search: true,
248
+ video: true,
243
249
  vision: true,
244
250
  },
245
251
  contextWindowTokens: 1_048_576 + 65_536,
@@ -267,6 +273,7 @@ const googleChatModels: AIChatModelCard[] = [
267
273
  functionCall: true,
268
274
  reasoning: true,
269
275
  search: true,
276
+ video: true,
270
277
  vision: true,
271
278
  },
272
279
  contextWindowTokens: 1_048_576 + 65_536,
package/packages/model-runtime/src/providers/google/index.ts CHANGED
@@ -439,10 +439,7 @@ export class LobeGoogleAI implements LobeRuntimeAI {
439
439
  }
440
440
 
441
441
  return {
442
- inlineData: {
443
- data: base64,
444
- mimeType: mimeType || 'image/png',
445
- },
442
+ inlineData: { data: base64, mimeType: mimeType || 'image/png' },
446
443
  };
447
444
  }
448
445
 
@@ -450,15 +447,41 @@ export class LobeGoogleAI implements LobeRuntimeAI {
450
447
  const { base64, mimeType } = await imageUrlToBase64(content.image_url.url);
451
448
 
452
449
  return {
453
- inlineData: {
454
- data: base64,
455
- mimeType,
456
- },
450
+ inlineData: { data: base64, mimeType },
457
451
  };
458
452
  }
459
453
 
460
454
  throw new TypeError(`currently we don't support image url: ${content.image_url.url}`);
461
455
  }
456
+
457
+ case 'video_url': {
458
+ const { mimeType, base64, type } = parseDataUri(content.video_url.url);
459
+
460
+ if (type === 'base64') {
461
+ if (!base64) {
462
+ throw new TypeError("Video URL doesn't contain base64 data");
463
+ }
464
+
465
+ return {
466
+ inlineData: { data: base64, mimeType: mimeType || 'video/mp4' },
467
+ };
468
+ }
469
+
470
+ if (type === 'url') {
471
+ // For video URLs, we need to fetch and convert to base64
472
+ // Note: This might need size/duration limits for practical use
473
+ const response = await fetch(content.video_url.url);
474
+ const arrayBuffer = await response.arrayBuffer();
475
+ const base64 = Buffer.from(arrayBuffer).toString('base64');
476
+ const mimeType = response.headers.get('content-type') || 'video/mp4';
477
+
478
+ return {
479
+ inlineData: { data: base64, mimeType },
480
+ };
481
+ }
482
+
483
+ throw new TypeError(`currently we don't support video url: ${content.video_url.url}`);
484
+ }
462
485
  }
463
486
  };
464
487
 
package/packages/model-runtime/src/types/chat.ts CHANGED
@@ -21,9 +21,15 @@ interface UserMessageContentPartImage {
21
21
  type: 'image_url';
22
22
  }
23
23
 
24
+ interface UserMessageContentPartVideo {
25
+ type: 'video_url';
26
+ video_url: { url: string };
27
+ }
28
+
24
29
  export type UserMessageContentPart =
25
30
  | UserMessageContentPartText
26
31
  | UserMessageContentPartImage
32
+ | UserMessageContentPartVideo
27
33
  | UserMessageContentPartThinking;
28
34
 
29
35
  export interface OpenAIChatMessage {