@idk500/video-vision-mcp 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,319 @@
1
+ /**
2
+ * Vision + text client backed by an OpenAI-compatible endpoint.
3
+ *
4
+ * Originally this file targeted Tencent Cloud Hunyuan. It has been rewritten
5
+ * to call a generic OpenAI-compatible Chat Completions endpoint so any
6
+ * provider that accepts `image_url` content parts can serve as the "eyes" —
7
+ * by default Zhipu Bigmodel's free `glm-4.6v-flash` model.
8
+ *
9
+ * The class name (`HunyuanClient`) and its public method signatures are kept
10
+ * identical on purpose: the rest of the codebase (video-processor.ts,
11
+ * index.ts) depends only on this surface, so changing the backend does not
12
+ * require touching callers.
13
+ */
14
+ import fs from 'fs/promises';
15
+ import fetch from 'node-fetch';
16
/** Base URL of the default OpenAI-compatible backend (no trailing slash). */
const DEFAULT_ENDPOINT = 'https://open.bigmodel.cn/api/paas/v4';

/** Model used for image analysis when nothing else is configured. */
const DEFAULT_VISION_MODEL = 'glm-4.6v-flash';

/** Model used for text-only generation when nothing else is configured. */
const DEFAULT_TEXT_MODEL = 'glm-4.6';

/** Largest image, in megabytes, that will be sent to the vision endpoint. */
const MAX_IMAGE_MB = 5;

/** Total number of attempts made for one request (network failure, 429, 5xx). */
const MAX_RETRIES = 4;

/** Backoff unit in milliseconds; attempt N waits N times this long. */
const RETRY_BASE_MS = 3 * 1000;
24
/**
 * Client for an OpenAI-compatible Chat Completions endpoint, used for image
 * analysis and plain text generation.
 *
 * The class name and method signatures are kept from the earlier Hunyuan
 * implementation so existing callers keep working. Every analysis method
 * resolves to `{ content, usage: { promptTokens, completionTokens, totalTokens } }`.
 */
export class HunyuanClient {
    /** Upper bound on images sent by `analyzeImagesInSingleRequest`. */
    static MAX_IMAGES_PER_REQUEST = 4;
    /** API key sent as the Bearer token; may be undefined until configured. */
    apiKey;
    /** Base URL of the backend, stored without trailing slashes. */
    endpoint;
    /** Model name used for image requests. */
    visionModel;
    /** Model name used for text-only requests. */
    textModel;
    /**
     * @param config Optional settings: `secretId` (the API key), `endpoint`,
     *   `visionModel`, `textModel`. Missing or falsy values fall back to the
     *   module defaults.
     */
    constructor(config) {
        // `secretId` carries the API key for the OpenAI-compatible backend.
        this.apiKey = config?.secretId;
        this.endpoint = (config?.endpoint || DEFAULT_ENDPOINT).replace(/\/+$/, '');
        this.visionModel = config?.visionModel || DEFAULT_VISION_MODEL;
        this.textModel = config?.textModel || DEFAULT_TEXT_MODEL;
    }
    /**
     * Replace the stored API key at runtime.
     *
     * @param secretId The API key.
     * @param _secretKey Accepted for signature compatibility and ignored.
     */
    setCredentials(secretId, _secretKey) {
        this.apiKey = secretId;
    }
    /**
     * Pick the API key to use: per-call override, then the stored key, then
     * the `VISION_API_KEY` environment variable.
     *
     * @param override Optional per-call key.
     * @returns The first non-empty key found.
     * @throws Error with setup instructions when no key is available.
     */
    resolveApiKey(override) {
        const key = override || this.apiKey || process.env.VISION_API_KEY;
        if (!key) {
            throw new Error([
                '视觉模型 API Key 未配置。',
                '',
                '请通过以下任一方式提供:',
                '1. 环境变量 VISION_API_KEY',
                '2. 调用参数 apiKey / 历史字段 secretId',
                '3. 启动参数(见 index.ts)',
                '',
                '默认后端:智谱 Bigmodel ' + DEFAULT_VISION_MODEL + '(OpenAI 兼容)。',
                '获取 key:https://open.bigmodel.cn/usercenter/apikeys',
            ].join('\n'));
        }
        return key;
    }
    /**
     * @returns The `VISION_ENDPOINT` environment variable if set, otherwise
     *   the configured endpoint; trailing slashes are stripped either way.
     */
    resolveEndpoint() {
        return (process.env.VISION_ENDPOINT || this.endpoint).replace(/\/+$/, '');
    }
    /**
     * Pick the vision model. Note the precedence: the `VISION_MODEL`
     * environment variable wins over `override`, which wins over the
     * configured model.
     *
     * @param override Optional per-call model name.
     */
    resolveVisionModel(override) {
        return process.env.VISION_MODEL || override || this.visionModel;
    }
    /**
     * Pick the text model. Note the precedence: the `TEXT_MODEL` environment
     * variable wins over `override`, which wins over the configured model.
     *
     * @param override Optional per-call model name.
     */
    resolveTextModel(override) {
        return process.env.TEXT_MODEL || override || this.textModel;
    }
    /**
     * Map a file path's extension (case-insensitive) to a MIME type.
     *
     * @param imagePath Path or file name of the image.
     * @returns The matching `image/*` type, or `image/jpeg` for anything
     *   unrecognised (including paths without an extension).
     */
    mimeFromExt(imagePath) {
        const ext = imagePath.toLowerCase().split('.').pop();
        switch (ext) {
            case 'png':
                return 'image/png';
            case 'webp':
                return 'image/webp';
            case 'bmp':
                return 'image/bmp';
            case 'gif':
                return 'image/gif';
            case 'jpg':
            case 'jpeg':
            default:
                return 'image/jpeg';
        }
    }
    /**
     * Read an image file and return its contents as a bare base64 string
     * (no `data:` prefix), which is the form placed in `image_url.url`.
     *
     * The size limit is checked from file metadata before the file is read,
     * so an oversized file is rejected without loading it into memory.
     *
     * @param imagePath Path of the image file.
     * @returns Base64 encoding of the file contents.
     * @throws Error when the path is missing, inaccessible, not a regular
     *   file, or larger than `MAX_IMAGE_MB`.
     */
    async imageToBase64(imagePath) {
        let stats;
        try {
            stats = await fs.stat(imagePath);
        }
        catch {
            throw new Error(`图片文件不存在或无法访问: ${imagePath}`);
        }
        if (!stats.isFile()) {
            // Directories and other non-files cannot be read as an image.
            throw new Error(`图片文件不存在或无法访问: ${imagePath}`);
        }
        const bytesPerMB = 1024 * 1024;
        const assertWithinLimit = (sizeMB) => {
            if (sizeMB > MAX_IMAGE_MB) {
                throw new Error(`图片文件过大 (${sizeMB.toFixed(2)}MB),请使用小于 ${MAX_IMAGE_MB}MB 的图片。文件: ${imagePath}`);
            }
        };
        assertWithinLimit(stats.size / bytesPerMB);
        const buf = await fs.readFile(imagePath);
        // Re-check against the bytes actually read in case the file changed.
        const sizeMB = buf.length / bytesPerMB;
        assertWithinLimit(sizeMB);
        // The MIME type is only used for the diagnostic log line.
        const mimeType = this.mimeFromExt(imagePath);
        console.error(`图片 ${imagePath} 转换完成 - 大小: ${sizeMB.toFixed(2)}MB, 格式: ${mimeType}`);
        return buf.toString('base64');
    }
    /**
     * Resolve after `ms` milliseconds; used for retry backoff and throttling.
     *
     * @param ms Delay in milliseconds.
     */
    sleep(ms) {
        return new Promise((resolve) => setTimeout(resolve, ms));
    }
    /**
     * Convert a parsed Chat Completions response into the result shape
     * returned by the public methods, defaulting missing fields.
     *
     * @param result Parsed response object.
     * @returns `{ content, usage }` with empty content and zero counts as defaults.
     */
    toAnalysisResult(result) {
        const usage = result.usage ?? {};
        return {
            content: result.choices?.[0]?.message?.content ?? '',
            usage: {
                promptTokens: usage.prompt_tokens ?? 0,
                completionTokens: usage.completion_tokens ?? 0,
                totalTokens: usage.total_tokens ?? 0,
            },
        };
    }
    /**
     * POST `body` to `${endpoint}/chat/completions`, retrying network
     * failures, HTTP 429 and HTTP 5xx with linear backoff
     * (`RETRY_BASE_MS * attempt`) for up to `MAX_RETRIES` attempts.
     *
     * @param body Request payload; serialised with `JSON.stringify`.
     * @param apiKey Key sent as `Authorization: Bearer <key>`.
     * @param endpoint Base URL without a trailing slash.
     * @returns The parsed JSON response object.
     * @throws Error when the network keeps failing, the final response is
     *   not OK, the body is not a JSON object, or it carries an `error` field.
     */
    async chatCompletions(body, apiKey, endpoint) {
        let lastError = null;
        for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
            const canRetry = attempt < MAX_RETRIES;
            const wait = RETRY_BASE_MS * attempt;
            let response;
            try {
                response = await fetch(`${endpoint}/chat/completions`, {
                    method: 'POST',
                    headers: {
                        'Content-Type': 'application/json',
                        Authorization: `Bearer ${apiKey}`,
                    },
                    body: JSON.stringify(body),
                });
            }
            catch (err) {
                lastError = err instanceof Error ? err : new Error(String(err));
                console.error(`网络请求失败 (尝试 ${attempt}/${MAX_RETRIES}): ${lastError.message}`);
                if (canRetry) {
                    await this.sleep(wait);
                    continue;
                }
                throw new Error(`视觉模型网络请求失败 (${endpoint}): ${lastError.message}\n请检查网络连接和端点配置。`);
            }
            // A response body can be consumed only once, so it is read here a
            // single time and reused for logging and for the final error.
            let raw;
            if (response.status === 429 || response.status >= 500) {
                raw = await response.text().catch(() => '');
                const outcome = canRetry ? `${wait}ms 后重试` : '已达最大重试次数';
                if (response.status === 429) {
                    console.error(`模型访问量过大 (429),${outcome} (${attempt}/${MAX_RETRIES})`);
                    lastError = new Error('模型当前访问量过大,请稍后再试');
                }
                else {
                    console.error(`服务端错误 (HTTP ${response.status}),${outcome} (${attempt}/${MAX_RETRIES}): ${raw.slice(0, 120)}`);
                    lastError = new Error(`视觉模型服务端错误 (HTTP ${response.status})`);
                }
                if (canRetry) {
                    await this.sleep(wait);
                    continue;
                }
                // Retries exhausted: fall through and report the HTTP error.
            }
            else {
                raw = await response.text();
            }
            if (!response.ok) {
                // Prefer the server's own message when the body is JSON.
                let detail = raw;
                try {
                    const errBody = JSON.parse(raw);
                    detail = errBody.error?.message || errBody.message || raw;
                }
                catch {
                    /* keep raw */
                }
                throw new Error(`视觉模型请求失败 (HTTP ${response.status} ${response.statusText}):\n${detail}\n\n请检查:\n1. API Key 是否正确且未过期\n2. 模型名称是否正确 (视觉: ${body.model})\n3. 端点配置 (${endpoint})\n4. 账户余额/套餐是否包含该模型`);
            }
            let parsed;
            try {
                parsed = JSON.parse(raw);
            }
            catch {
                throw new Error(`视觉模型返回了非 JSON 响应: ${raw.slice(0, 200)}`);
            }
            if (parsed === null || typeof parsed !== 'object') {
                // Valid JSON that is not an object cannot be a completion.
                throw new Error(`视觉模型返回了非 JSON 响应: ${raw.slice(0, 200)}`);
            }
            if (parsed.error) {
                throw new Error(`视觉模型错误: ${JSON.stringify(parsed.error)}`);
            }
            return parsed;
        }
        // Only reachable when MAX_RETRIES allows no attempt at all.
        throw lastError || new Error('视觉模型请求失败');
    }
    /**
     * Analyze one image with the vision model.
     *
     * @param imagePath Path of the image file.
     * @param prompt Instruction sent alongside the image.
     * @param apiKeyOverride Optional per-call API key.
     * @returns `{ content, usage }` for the model's reply.
     */
    async analyzeImage(imagePath, prompt = '请描述这张图片的内容。', apiKeyOverride) {
        console.error(`开始分析图片: ${imagePath}`);
        const apiKey = this.resolveApiKey(apiKeyOverride);
        const endpoint = this.resolveEndpoint();
        const model = this.resolveVisionModel();
        const base64 = await this.imageToBase64(imagePath);
        const result = await this.chatCompletions({
            model,
            // Disable reasoning/thinking so we get a direct answer cheaply.
            thinking: { type: 'disabled' },
            messages: [
                {
                    role: 'user',
                    content: [
                        { type: 'text', text: prompt },
                        { type: 'image_url', image_url: { url: base64 } },
                    ],
                },
            ],
            max_tokens: 1024,
        }, apiKey, endpoint);
        const out = this.toAnalysisResult(result);
        console.error(`图片分析完成 - Token 使用: ${out.usage.totalTokens} (提示: ${out.usage.promptTokens}, 回复: ${out.usage.completionTokens})`);
        return out;
    }
    /**
     * Analyze images one after another. A failed image does not abort the
     * batch: its slot holds a result whose content starts with "❌" and whose
     * usage counts are zero.
     *
     * @param imagePaths Paths of the images, in order.
     * @param prompt Instruction sent with every image.
     * @param apiKeyOverride Optional per-call API key.
     * @returns One result per input path, in the same order.
     */
    async analyzeImageBatch(imagePaths, prompt = '请描述这张图片的内容。', apiKeyOverride) {
        const results = [];
        const total = imagePaths.length;
        // Counted directly rather than inferred from the "❌" prefix, so a
        // model reply that happens to start with that character still counts.
        let success = 0;
        console.error(`开始批量分析 ${total} 张图片`);
        for (let i = 0; i < total; i++) {
            const imagePath = imagePaths[i];
            try {
                console.error(`分析进度: ${i + 1}/${total} - ${imagePath}`);
                const result = await this.analyzeImage(imagePath, prompt, apiKeyOverride);
                results.push(result);
                success++;
                // Pause between images to stay under the backend's rate limit.
                if (i < total - 1) {
                    await this.sleep(1200);
                }
            }
            catch (error) {
                const msg = error instanceof Error ? error.message : String(error);
                console.error(`图片分析失败 ${imagePath}:`, msg);
                results.push({
                    content: `❌ 分析失败: ${msg}`,
                    usage: { promptTokens: 0, completionTokens: 0, totalTokens: 0 },
                });
                console.error(`跳过失败的图片,继续处理剩余图片...`);
            }
        }
        const totalTokens = results.reduce((s, r) => s + r.usage.totalTokens, 0);
        console.error(`批量分析完成 - 成功: ${success}/${total}, 总 Token: ${totalTokens}`);
        return results;
    }
    /**
     * Analyze several images in one request so the model sees them together.
     * Only the first `HunyuanClient.MAX_IMAGES_PER_REQUEST` paths are sent;
     * any extra paths are ignored and a notice is logged.
     *
     * @param imagePaths Paths of the images, in order.
     * @param prompt Instruction sent alongside the images.
     * @param apiKeyOverride Optional per-call API key.
     * @returns `{ content, usage }` for the model's reply.
     */
    async analyzeImagesInSingleRequest(imagePaths, prompt = '请描述这些图片的内容。', apiKeyOverride) {
        const apiKey = this.resolveApiKey(apiKeyOverride);
        const endpoint = this.resolveEndpoint();
        const model = this.resolveVisionModel();
        const maxImages = HunyuanClient.MAX_IMAGES_PER_REQUEST;
        const limited = imagePaths.slice(0, maxImages);
        if (imagePaths.length > limited.length) {
            // Make the truncation visible instead of dropping images silently.
            console.error(`单次请求最多分析 ${maxImages} 张图片,已忽略其余 ${imagePaths.length - limited.length} 张`);
        }
        const images = await Promise.all(limited.map((p) => this.imageToBase64(p)));
        const content = [
            { type: 'text', text: prompt },
            ...images.map((b64) => ({ type: 'image_url', image_url: { url: b64 } })),
        ];
        const result = await this.chatCompletions({
            model,
            thinking: { type: 'disabled' },
            messages: [{ role: 'user', content }],
            max_tokens: 2048,
        }, apiKey, endpoint);
        return this.toAnalysisResult(result);
    }
    /**
     * Generate text from a prompt using the text model (no image input).
     *
     * @param prompt User prompt.
     * @param modelOverride Optional model name; see `resolveTextModel` for
     *   how it ranks against the environment variable.
     * @param apiKeyOverride Optional per-call API key.
     * @returns `{ content, usage }` for the model's reply.
     */
    async generateText(prompt, modelOverride, apiKeyOverride) {
        const apiKey = this.resolveApiKey(apiKeyOverride);
        const endpoint = this.resolveEndpoint();
        const model = this.resolveTextModel(modelOverride);
        console.error(`开始生成文本内容,使用模型: ${model}`);
        const result = await this.chatCompletions({
            model,
            messages: [{ role: 'user', content: prompt }],
            max_tokens: 4096,
        }, apiKey, endpoint);
        const out = this.toAnalysisResult(result);
        console.error(`文本生成完成 - Token 使用: ${out.usage.totalTokens} (提示: ${out.usage.promptTokens}, 回复: ${out.usage.completionTokens})`);
        return out;
    }
}
@@ -0,0 +1,24 @@
1
+ #!/usr/bin/env node
2
/**
 * Ambient declaration of the `VideoMCPServer` class. Only the type surface
 * is declared here; the implementation lives in the compiled module.
 */
declare class VideoMCPServer {
    // Internal collaborators; their types are not exposed by this declaration.
    private server;
    private videoProcessor;
    private frameExtractor;
    // Optional credentials/region. NOTE(review): presumably copied from the
    // constructor options of the same names — confirm against the implementation.
    private secretId?;
    private secretKey?;
    private region?;
    /**
     * @param options Optional settings; every field is an optional string.
     */
    constructor(options?: {
        secretId?: string;
        secretKey?: string;
        region?: string;
    });
    // Private members below are declared by name only; their signatures and
    // behavior are not visible from this declaration.
    private setupToolHandlers;
    private formatError;
    private handleExtractFrames;
    private handleAnalyzeVideo;
    private handleAnalyzeImageBatch;
    private handleGetVideoInfo;
    private handleGenerateVideoScript;
    private handleGenerateImageScript;
    /** The only public method; returns a promise that resolves with no value. */
    run(): Promise<void>;
}
export { VideoMCPServer };