@hsiehchenwei/mcp-gemini-transcriber 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/server.mjs ADDED
@@ -0,0 +1,1954 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * MCP Gemini Transcriber Server (Node.js)
4
+ * 音訊轉逐字稿 MCP 工具,基於成功的 transcribe_gemini.mjs 腳本
5
+ */
6
+
7
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
import { z } from 'zod';
import { GoogleGenAI, createUserContent, createPartFromUri } from '@google/genai';
import { execSync, execFileSync, spawn } from 'child_process';
import { createHash } from 'crypto';
import { promises as fs } from 'fs';
import { tmpdir } from 'os';
import path from 'path';
import { config } from 'dotenv';
17
+
18
+ // 載入環境變數
19
+ config();
20
+
21
+ // 取得工作區根目錄
22
+ const WORKSPACE_ROOT = process.env.CURSOR_WORKSPACE_ROOT || process.env.WORKSPACE_ROOT || process.cwd();
23
+
24
/**
 * Resolve a user-supplied path against the workspace root.
 * Absolute paths are returned untouched; relative paths are anchored
 * at WORKSPACE_ROOT.
 *
 * @param {string} inputPath - absolute or workspace-relative path
 * @returns {string} an absolute path
 */
function resolvePath(inputPath) {
  return path.isAbsolute(inputPath)
    ? inputPath
    : path.resolve(WORKSPACE_ROOT, inputPath);
}
33
+
34
// Tuning constants
const SEGMENT_DURATION = 300; // segment length: 5 minutes (300 s)
const OVERLAP_SECONDS = 10; // overlap between speaker-mode segments (one sentence is roughly 3-10 s)
const MAX_WORKERS = 25; // max concurrent segment transcriptions in fast/parallel mode
const MAX_RETRIES = 2; // max attempts per segment before giving up
const RETRY_BASE_DELAY = 2000; // base retry delay in ms (doubled on each attempt)
const MIN_TIMEOUT = 30000; // floor for the dynamic per-segment timeout (ms)
const MAX_TIMEOUT = 180000; // ceiling for the dynamic per-segment timeout (ms)
const MODEL_NAME = 'gemini-3-flash-preview'; // default Gemini model id

// Supported file extensions
const SUPPORTED_AUDIO_FORMATS = ['.mp3', '.m4a', '.wav', '.webm', '.ogg', '.flac', '.aiff', '.aac'];
const SUPPORTED_IMAGE_FORMATS = ['.png', '.jpg', '.jpeg', '.webp', '.heic', '.heif'];
47
+
48
/**
 * Derive the per-segment timeout (in ms) from the audio duration in
 * seconds: 1000 ms per second of audio, clamped to
 * [MIN_TIMEOUT, MAX_TIMEOUT].
 *
 * @param {number} durationSeconds - segment length in seconds
 * @returns {number} timeout in milliseconds
 */
function calculateTimeout(durationSeconds) {
  const proportional = durationSeconds * 1000;
  if (proportional < MIN_TIMEOUT) return MIN_TIMEOUT;
  if (proportional > MAX_TIMEOUT) return MAX_TIMEOUT;
  return proportional;
}
54
+
55
/**
 * Race a promise against a timeout.
 *
 * @param {Promise} promise - the operation to guard
 * @param {number} ms - timeout in milliseconds
 * @param {string} [message] - error message used when the timeout fires
 * @returns {Promise} settles like `promise`, or rejects with
 *   Error(message) once `ms` milliseconds have elapsed
 */
function withTimeout(promise, ms, message = 'Operation timed out') {
  let timer;
  const timeout = new Promise((_, reject) => {
    timer = setTimeout(() => reject(new Error(message)), ms);
  });
  // Clear the timer once the race settles; otherwise a pending
  // setTimeout keeps the Node event loop alive for up to `ms` after
  // the operation has already finished.
  return Promise.race([promise, timeout]).finally(() => clearTimeout(timer));
}
64
+
65
/**
 * Probe an audio file's duration in seconds via ffprobe.
 *
 * @param {string} audioPath - path to the audio file
 * @returns {Promise<number>} duration in seconds; 0 when ffprobe is
 *   missing, the file is unreadable, or the output cannot be parsed
 */
async function getAudioDuration(audioPath) {
  try {
    // execFileSync passes the path as a real argv entry, so paths with
    // quotes, spaces or shell metacharacters cannot break the command
    // (the original string-interpolated execSync call could).
    const result = execFileSync(
      'ffprobe',
      ['-v', 'quiet', '-show_entries', 'format=duration', '-of', 'json', audioPath],
      { encoding: 'utf8' }
    );
    const data = JSON.parse(result);
    const duration = Number.parseFloat(data.format.duration);
    // Guard against a missing/garbled duration field producing NaN.
    return Number.isFinite(duration) ? duration : 0;
  } catch {
    return 0; // best-effort: callers treat 0 as "unknown / don't split"
  }
}
80
+
81
/**
 * Split an audio file into back-to-back segments (fast mode, no
 * overlap between consecutive segments).
 *
 * Files no longer than 1.5x SEGMENT_DURATION are returned as a single
 * pseudo-segment pointing at the original file. Longer files are cut
 * into SEGMENT_DURATION-second chunks under a fresh temp directory,
 * with all ffmpeg processes spawned concurrently.
 *
 * @param {string} audioPath - path to the source audio file
 * @returns {Promise<Array<{index:number, path:string, offset:number,
 *   duration:number, isOriginal?:boolean}>>} segments sorted by index;
 *   `offset` is the segment's start time (s) within the original file
 * @throws if any ffmpeg child exits non-zero or cannot be spawned
 */
async function splitAudioFast(audioPath) {
  const duration = await getAudioDuration(audioPath);

  // Too short to be worth splitting — return the original as one segment.
  // NOTE(review): getAudioDuration returns 0 on probe failure, which also
  // lands here, so an unprobeable file is sent whole.
  if (duration <= SEGMENT_DURATION * 1.5) {
    return [{ index: 0, path: audioPath, offset: 0, duration, isOriginal: true }];
  }

  const tempDir = path.join(tmpdir(), `audio_segments_${Date.now()}`);
  await fs.mkdir(tempDir, { recursive: true });

  const numSegments = Math.ceil(duration / SEGMENT_DURATION);
  console.error(`📐 [快速模式] 音檔 ${(duration / 60).toFixed(1)} 分鐘,分割為 ${numSegments} 段...`);

  const ext = path.extname(audioPath);
  const splitPromises = [];

  for (let i = 0; i < numSegments; i++) {
    const startTime = i * SEGMENT_DURATION;
    const segmentPath = path.join(tempDir, `segment_${String(i).padStart(3, '0')}${ext}`);
    const segmentDuration = Math.min(SEGMENT_DURATION, duration - startTime);

    splitPromises.push(
      new Promise((resolve, reject) => {
        // `-c copy` stream-copies without re-encoding; cut points may snap
        // to the nearest keyframe depending on codec.
        const ffmpeg = spawn('ffmpeg', [
          '-y', '-i', audioPath,
          '-ss', String(startTime),
          '-t', String(segmentDuration),
          '-c', 'copy',
          segmentPath
        ], { stdio: 'pipe' });

        ffmpeg.on('close', (code) => {
          if (code === 0) {
            resolve({ index: i, path: segmentPath, offset: startTime, duration: segmentDuration });
          } else {
            reject(new Error(`FFmpeg failed for segment ${i}`));
          }
        });
        // 'error' fires when ffmpeg itself cannot be spawned (e.g. not installed).
        ffmpeg.on('error', reject);
      })
    );
  }

  const results = await Promise.all(splitPromises);
  console.error(`✅ 分割完成:${results.length} 段`);
  return results.sort((a, b) => a.index - b.index);
}
131
+
132
/**
 * Split an audio file into overlapping segments (parallel mode,
 * 5-second overlap between consecutive segments).
 *
 * The overlap lets the later pyramid merge deduplicate the seam
 * between two segments. Short files (<= 1.5x SEGMENT_DURATION) are
 * returned whole as a single pseudo-segment.
 *
 * @param {string} audioPath - path to the source audio file
 * @returns {Promise<Array<{index:number, path:string, offset:number,
 *   duration:number, hasOverlap?:boolean, isOriginal?:boolean}>>}
 *   segments sorted by index; `hasOverlap` is true for every segment
 *   after the first
 * @throws if any ffmpeg child exits non-zero or cannot be spawned
 */
async function splitAudioParallel(audioPath) {
  const duration = await getAudioDuration(audioPath);
  const PARALLEL_OVERLAP = 5; // parallel mode uses a 5-second overlap

  if (duration <= SEGMENT_DURATION * 1.5) {
    return [{ index: 0, path: audioPath, offset: 0, duration, isOriginal: true }];
  }

  const tempDir = path.join(tmpdir(), `audio_segments_${Date.now()}`);
  await fs.mkdir(tempDir, { recursive: true });

  // Each segment starts OVERLAP seconds before the previous one ends.
  const stepDuration = SEGMENT_DURATION - PARALLEL_OVERLAP;
  const numSegments = Math.ceil((duration - PARALLEL_OVERLAP) / stepDuration);
  console.error(`📐 [平行模式] 音檔 ${(duration / 60).toFixed(1)} 分鐘,分割為 ${numSegments} 段(overlap ${PARALLEL_OVERLAP}s)...`);

  const ext = path.extname(audioPath);
  const splitPromises = [];

  for (let i = 0; i < numSegments; i++) {
    const startTime = i * stepDuration;
    const segmentPath = path.join(tempDir, `segment_${String(i).padStart(3, '0')}${ext}`);
    const segmentDuration = Math.min(SEGMENT_DURATION, duration - startTime);

    splitPromises.push(
      new Promise((resolve, reject) => {
        // Stream copy (no re-encode); cut points may snap to keyframes.
        const ffmpeg = spawn('ffmpeg', [
          '-y', '-i', audioPath,
          '-ss', String(startTime),
          '-t', String(segmentDuration),
          '-c', 'copy',
          segmentPath
        ], { stdio: 'pipe' });

        ffmpeg.on('close', (code) => {
          if (code === 0) {
            resolve({
              index: i,
              path: segmentPath,
              offset: startTime,
              duration: segmentDuration,
              hasOverlap: i > 0
            });
          } else {
            reject(new Error(`FFmpeg failed for segment ${i}`));
          }
        });
        ffmpeg.on('error', reject);
      })
    );
  }

  const results = await Promise.all(splitPromises);
  console.error(`✅ 分割完成:${results.length} 段`);
  return results.sort((a, b) => a.index - b.index);
}
190
+
191
/**
 * Split an audio file into overlapping segments (speaker-ID mode,
 * OVERLAP_SECONDS of overlap between consecutive segments).
 *
 * The longer overlap (vs. parallel mode) gives the sequential
 * speaker-identification pass enough shared context to keep speaker
 * labels consistent across segment boundaries. Short files
 * (<= 1.5x SEGMENT_DURATION) are returned whole.
 *
 * @param {string} audioPath - path to the source audio file
 * @returns {Promise<Array<{index:number, path:string, offset:number,
 *   duration:number, hasOverlap?:boolean, isOriginal?:boolean}>>}
 *   segments sorted by index
 * @throws if any ffmpeg child exits non-zero or cannot be spawned
 */
async function splitAudioSpeaker(audioPath) {
  const duration = await getAudioDuration(audioPath);

  if (duration <= SEGMENT_DURATION * 1.5) {
    return [{ index: 0, path: audioPath, offset: 0, duration, isOriginal: true }];
  }

  const tempDir = path.join(tmpdir(), `audio_segments_${Date.now()}`);
  await fs.mkdir(tempDir, { recursive: true });

  // Each segment starts OVERLAP_SECONDS before the previous one ends.
  const stepDuration = SEGMENT_DURATION - OVERLAP_SECONDS;
  const numSegments = Math.ceil((duration - OVERLAP_SECONDS) / stepDuration);
  console.error(`📐 [語者識別模式] 音檔 ${(duration / 60).toFixed(1)} 分鐘,分割為 ${numSegments} 段(overlap ${OVERLAP_SECONDS}s)...`);

  const ext = path.extname(audioPath);
  const splitPromises = [];

  for (let i = 0; i < numSegments; i++) {
    const startTime = i * stepDuration;
    const segmentPath = path.join(tempDir, `segment_${String(i).padStart(3, '0')}${ext}`);
    const segmentDuration = Math.min(SEGMENT_DURATION, duration - startTime);

    splitPromises.push(
      new Promise((resolve, reject) => {
        // Stream copy (no re-encode); cut points may snap to keyframes.
        const ffmpeg = spawn('ffmpeg', [
          '-y', '-i', audioPath,
          '-ss', String(startTime),
          '-t', String(segmentDuration),
          '-c', 'copy',
          segmentPath
        ], { stdio: 'pipe' });

        ffmpeg.on('close', (code) => {
          if (code === 0) {
            resolve({
              index: i,
              path: segmentPath,
              offset: startTime,
              duration: segmentDuration,
              hasOverlap: i > 0
            });
          } else {
            reject(new Error(`FFmpeg failed for segment ${i}`));
          }
        });
        ffmpeg.on('error', reject);
      })
    );
  }

  const results = await Promise.all(splitPromises);
  console.error(`✅ 分割完成:${results.length} 段`);
  return results.sort((a, b) => a.index - b.index);
}
248
+
249
/**
 * Shift every [MM:SS] / [HH:MM:SS] timestamp in a transcript forward
 * by `offsetSeconds`. Results under one hour render as [MM:SS],
 * anything at or past the hour mark renders as [HH:MM:SS].
 *
 * @param {string} transcript - text containing bracketed timestamps
 * @param {number} offsetSeconds - seconds to add to each timestamp
 * @returns {string} transcript with rewritten timestamps
 */
function adjustTimestamps(transcript, offsetSeconds) {
  const pad = (n) => String(n).padStart(2, '0');
  return transcript.replace(/\[(\d{1,2}:\d{2}(?::\d{2})?)\]/g, (_full, timeStr) => {
    const fields = timeStr.split(':').map(Number);
    // Two fields means minutes:seconds; three means hours:minutes:seconds.
    const [hours, minutes, seconds] =
      fields.length === 3 ? fields : [0, fields[0], fields[1]];
    const total = hours * 3600 + minutes * 60 + seconds + offsetSeconds;
    const outH = Math.floor(total / 3600);
    const outM = Math.floor((total % 3600) / 60);
    const outS = total % 60;
    return outH === 0
      ? `[${pad(outM)}:${pad(outS)}]`
      : `[${pad(outH)}:${pad(outM)}:${pad(outS)}]`;
  });
}
277
+
278
/**
 * Collapse zero-hour timestamps: rewrite every [00:MM:SS] as [MM:SS]
 * so the whole transcript uses one consistent short format.
 *
 * @param {string} transcript
 * @returns {string} transcript with normalized timestamps
 */
function normalizeTimestamps(transcript) {
  const zeroHour = /\[00:(\d{2}:\d{2})\]/g;
  return transcript.replace(zeroHour, (_full, rest) => `[${rest}]`);
}
284
+
285
/**
 * Merge the overlapping region between two transcript chunks
 * (improved dedup).
 *
 * Compares the last 25 lines of `existingText` against the first 20
 * lines of `newText` (both stripped of timestamps/speaker labels) with
 * three strategies — full containment, shared >=10-char prefix, and
 * character-set Jaccard similarity > 0.7 — then drops the duplicated
 * head of `newText` at the best match point.
 *
 * @param {string} existingText - transcript accumulated so far
 * @param {string} newText - next chunk, which may repeat the overlap
 * @returns {string} concatenation with the duplicated lines removed
 */
function mergeOverlapText(existingText, newText) {
  if (!existingText) return newText;
  if (!newText) return existingText;

  const existingLines = existingText.trim().split('\n').filter(l => l.trim());
  const newLines = newText.trim().split('\n').filter(l => l.trim());

  // Strip timestamps and speaker labels so only spoken content is compared.
  const cleanLine = (line) => {
    return line
      .replace(/\[\d{1,2}:\d{2}(:\d{2})?\]/g, '') // drop timestamps
      .replace(/\*\*[^*]+\*\*[::]/g, '') // drop "**speaker**:"
      .replace(/^[^::]+[::]/g, '') // drop leading "speaker:"
      .trim();
  };

  // Jaccard similarity over the character sets of the two strings.
  // (Order-insensitive; good enough for near-identical sentences.)
  const similarity = (a, b) => {
    if (!a || !b) return 0;
    const setA = new Set(a.split(''));
    const setB = new Set(b.split(''));
    const intersection = new Set([...setA].filter(x => setB.has(x)));
    const union = new Set([...setA, ...setB]);
    return intersection.size / union.size;
  };

  // Cleaned last 25 lines of the existing text (very short lines skipped).
  const existingTail = existingLines.slice(-25);
  const existingCleanTail = existingTail.map(cleanLine).filter(l => l.length >= 5);

  // Scan the first 20 lines of the new text for the best resume point.
  let bestMergeIndex = 0;
  let bestScore = 0;

  const checkLines = Math.min(20, newLines.length);

  for (let i = 0; i < checkLines; i++) {
    const newClean = cleanLine(newLines[i]);
    if (newClean.length < 5) continue;

    for (const existClean of existingCleanTail) {
      // Strategy 1: one string fully contains the other.
      if (newClean.includes(existClean) || existClean.includes(newClean)) {
        if (i + 1 > bestMergeIndex) {
          bestMergeIndex = i + 1;
          bestScore = 1.0;
        }
        continue;
      }

      // Strategy 2: identical prefix of at least 10 chars (capped at 20).
      const minLen = Math.min(newClean.length, existClean.length, 20);
      if (minLen >= 10 && newClean.slice(0, minLen) === existClean.slice(0, minLen)) {
        if (i + 1 > bestMergeIndex) {
          bestMergeIndex = i + 1;
          bestScore = 0.9;
        }
        continue;
      }

      // Strategy 3: high character-set similarity (> 0.7).
      const sim = similarity(newClean, existClean);
      if (sim > 0.7 && sim > bestScore) {
        bestMergeIndex = i + 1;
        bestScore = sim;
      }
    }
  }

  // Overlap found: resume the new text after the duplicated lines.
  if (bestMergeIndex > 0) {
    console.error(` 🔗 去重:跳過新段落前 ${bestMergeIndex} 行(相似度 ${(bestScore * 100).toFixed(0)}%)`);
  }

  const mergedNew = newLines.slice(bestMergeIndex).join('\n');

  // Guard: if everything in the new chunk was duplicate, keep the existing text.
  if (!mergedNew.trim()) {
    return existingText.trim();
  }

  return existingText.trim() + '\n\n' + mergedNew;
}
372
+
373
/**
 * Collect the distinct speaker labels appearing in a transcript.
 *
 * Recognized shapes: "**Name**:", "[Name]:" and a bare "Name:" at the
 * start of a line (half- or full-width colon). Labels that look like
 * timestamps (e.g. "12:34") are skipped.
 *
 * @param {string} transcript
 * @returns {string[]} unique labels in first-seen order
 */
function extractSpeakers(transcript) {
  const patterns = [
    /\*\*([^*]+)\*\*[::]/g, // **speaker**:
    /\[([^\]]+)\][::]/g, // [speaker]:
    /^([^::\[\*\n]{1,20})[::]/gm, // speaker: at line start
  ];
  const looksLikeTimestamp = /^\d{1,2}:\d{2}/;
  const found = new Set();

  for (const pattern of patterns) {
    for (const m of transcript.matchAll(pattern)) {
      const label = m[1].trim();
      if (label && !looksLikeTimestamp.test(label)) {
        found.add(label);
      }
    }
  }
  return [...found];
}
396
+
397
/**
 * Unify speaker labels across a transcript via a Gemini call.
 *
 * Extracts the labels with extractSpeakers(), asks the model for an
 * old-label -> new-label mapping (JSON), then rewrites every
 * "**label**:", "[label]:" and line-leading "label:" occurrence.
 * Best-effort: returns the transcript unchanged on any failure or when
 * at most one label is present.
 *
 * @param {object} ai - GoogleGenAI client
 * @param {string} transcript
 * @param {string} model - Gemini model id
 * @returns {Promise<string>} transcript with unified labels
 */
async function normalizeSpeakers(ai, transcript, model) {
  const speakers = extractSpeakers(transcript);
  if (speakers.length <= 1) return transcript;

  console.error(`🔄 統一語者名稱(發現 ${speakers.length} 個語者標籤)...`);

  // Prompt asking the model to map duplicate labels to one canonical name.
  // Only the first 15k chars of the transcript are sent for context.
  const analysisPrompt = `分析以下逐字稿中的語者標籤,找出指向同一個人的不同標籤。

發現的語者標籤:${speakers.join(', ')}

逐字稿片段(用於判斷脈絡):
${transcript.slice(0, 15000)}

請輸出 JSON 格式的語者對應表,將所有標籤統一為最合適的名稱。
規則:
1. 只有在對話中【明確提到】的姓名才能使用
2. 如果沒有提到姓名,保持原始標籤(如「男1」「女1」)
3. 不要憑空創造姓名

只輸出 JSON,格式如:
{"舊標籤1": "新標籤1", "舊標籤2": "新標籤2", ...}`;

  try {
    const response = await ai.models.generateContent({
      model: model,
      contents: createUserContent([analysisPrompt]),
      config: { maxOutputTokens: 1024, temperature: 0.1 }
    });

    // Pull the first {...} blob out of the model's reply.
    const text = response.text;
    const jsonMatch = text.match(/\{[\s\S]*\}/);
    if (!jsonMatch) {
      console.error(' ⚠️ 無法解析語者對應表,保持原樣');
      return transcript;
    }

    const mapping = JSON.parse(jsonMatch[0]);
    console.error(` 📋 語者對應表:${JSON.stringify(mapping)}`);

    // Apply the mapping to each supported label format.
    let result = transcript;
    for (const [oldName, newName] of Object.entries(mapping)) {
      if (oldName === newName) continue;
      result = result.replace(new RegExp(`\\*\\*${escapeRegex(oldName)}\\*\\*([::])`, 'g'), `**${newName}**$1`);
      result = result.replace(new RegExp(`\\[${escapeRegex(oldName)}\\]([::])`, 'g'), `[${newName}]$1`);
      result = result.replace(new RegExp(`^${escapeRegex(oldName)}([::])`, 'gm'), `${newName}$1`);
    }

    console.error(' ✅ 語者名稱已統一');
    return result;

  } catch (error) {
    // Deliberate best-effort: never fail the transcription over this.
    console.error(` ⚠️ 語者統一失敗:${error.message},保持原樣`);
    return transcript;
  }
}
459
+
460
/**
 * Escape every regex metacharacter in `str` so it can be embedded
 * literally inside a RegExp source string.
 *
 * @param {string} str
 * @returns {string} regex-safe version of `str`
 */
function escapeRegex(str) {
  const metachars = /[.*+?^${}()|[\]\\]/g;
  return str.replace(metachars, '\\$&');
}
466
+
467
/**
 * Fast mode: plain transcription of a single segment (no speaker
 * identification, no emotion analysis).
 *
 * Uploads the segment file to the Gemini Files API, requests a
 * transcript, shifts timestamps by the segment's offset, and retries
 * with exponential backoff on failure. The uploaded file is deleted
 * (fire-and-forget) on both success and failure.
 *
 * @param {object} ai - GoogleGenAI client
 * @param {{index:number, path:string, offset:number, duration?:number}} segment
 * @param {string} model - Gemini model id
 * @param {number} [maxRetries]
 * @returns {Promise<{index:number, transcript:string, success:boolean,
 *   error?:string}>} never rejects; failures return success:false
 */
async function transcribeSingleSegmentFast(ai, segment, model, maxRetries = MAX_RETRIES) {
  const { index, path: segmentPath, offset, duration = SEGMENT_DURATION } = segment;
  let uploadedFile = null;
  const segmentTimeout = calculateTimeout(duration);

  const prompt = `你是專業語音轉錄員。【完整】轉錄這段音訊。

規則:
1. 完整轉錄,不可省略任何內容
2. 時間戳格式:[MM:SS]
3. 語者標籤:使用中性標籤 **語者 A**:、**語者 B**:、**語者 C**: 等
   - 根據聲音特徵區分不同的人(音色、語調、說話方式)
   - 不要預設性別,讓後續分析判斷
4. 保留語氣詞(嗯、啊、欸、對)
5. 使用繁體中文
6. 直接輸出逐字稿,不要前言

開始:`;

  for (let attempt = 0; attempt < maxRetries; attempt++) {
    try {
      // Upload, then transcribe; both guarded by the dynamic timeout.
      // NOTE(review): mimeType is hard-coded to audio/mpeg regardless of
      // the actual extension — confirm the Files API tolerates this for
      // wav/ogg/etc. inputs.
      uploadedFile = await withTimeout(
        ai.files.upload({ file: segmentPath, config: { mimeType: 'audio/mpeg' } }),
        segmentTimeout,
        `上傳超時 (${segmentTimeout/1000}s)`
      );

      const response = await withTimeout(
        ai.models.generateContent({
          model: model,
          contents: createUserContent([
            createPartFromUri(uploadedFile.uri, uploadedFile.mimeType),
            prompt
          ]),
          config: { maxOutputTokens: 65536, temperature: 0.1 }
        }),
        segmentTimeout,
        `轉錄超時 (${segmentTimeout/1000}s)`
      );

      // Fire-and-forget cleanup of the uploaded file.
      ai.files.delete({ name: uploadedFile.name }).catch(() => {});

      let transcript = response.text;

      // Rebase segment-local timestamps onto the whole-file timeline.
      if (offset > 0) {
        transcript = adjustTimestamps(transcript, offset);
      }
      // Normalize to the short [MM:SS] format where possible.
      transcript = normalizeTimestamps(transcript);

      return { index, transcript, success: true };

    } catch (error) {
      if (uploadedFile) {
        ai.files.delete({ name: uploadedFile.name }).catch(() => {});
        uploadedFile = null;
      }

      // Exponential backoff before the next attempt.
      if (attempt < maxRetries - 1) {
        const delay = RETRY_BASE_DELAY * Math.pow(2, attempt);
        console.error(` ⚠️ 第 ${index + 1} 段重試 (${attempt + 1}/${maxRetries})...`);
        await new Promise(r => setTimeout(r, delay));
        continue;
      }

      // Out of retries: report a placeholder transcript instead of throwing.
      return { index, transcript: '[此段無法轉錄]', success: false, error: error.message };
    }
  }
}
539
+
540
/**
 * Speaker mode: one Gemini call that transcribes a segment AND
 * identifies speakers AND analyzes emotion.
 *
 * The model is asked to emit three sections separated by
 * "===EMOTION===" and "===SPEAKERS===": transcript, emotion notes, and
 * a speaker-profile JSON. Known speaker profiles and the tail of the
 * previous segment can be passed in so labels stay consistent across
 * segments. Retries with exponential backoff; never rejects.
 *
 * @param {object} ai - GoogleGenAI client
 * @param {{index:number, path:string, offset:number, duration?:number}} segment
 * @param {string} model - Gemini model id
 * @param {Object<string, object>} [speakerProfiles] - labels already known
 * @param {string} [previousEnding] - tail of the previous transcript (last 400 chars used)
 * @param {number} [maxRetries]
 * @returns {Promise<{index:number, transcript:string, emotion?:string,
 *   success:boolean, speakerProfiles:object, error?:string}>}
 */
async function transcribeSingleSegmentSpeaker(ai, segment, model, speakerProfiles = {}, previousEnding = '', maxRetries = MAX_RETRIES) {
  const { index, path: segmentPath, offset, duration = SEGMENT_DURATION } = segment;
  let uploadedFile = null;
  const segmentTimeout = calculateTimeout(duration);

  // Render already-known speakers as "id(gender,name)" for the prompt.
  const speakerList = Object.entries(speakerProfiles)
    .map(([id, info]) => `${id}(${info.gender || '?'},${info.name || id})`)
    .join('、');

  // Single prompt requesting transcript + emotion analysis + speaker JSON.
  let prompt = `你是專業語音轉錄員。請【完整】轉錄這段音訊,並識別語者和分析情緒。

【輸出格式要求】:
1. 先輸出逐字稿,格式:[MM:SS] **語者標籤**:內容
2. 語者標籤規則:
   - 用性別+編號:**男1**、**女1**、**男2** 等
   - 只有對話中【明確提到】的姓名才能使用,不要憑空創造
   - 全程保持標籤一致,不要改變
3. 時間戳格式統一用 [MM:SS],不要用 [HH:MM:SS]
4. 完成逐字稿後,用 ===EMOTION=== 分隔,分析這段音訊的情緒:
   - 整體氛圍(輕鬆/認真/熱烈/沉思等)+ 強度(1-10)
   - 2-3 個關鍵情緒時刻:[MM:SS] 語者X:「具體語句」→ 情緒:XXX
   - 語者情緒特徵簡述(每位語者的情緒狀態和說話風格)
5. 最後用 ===SPEAKERS=== 分隔,輸出語者資訊 JSON

`;

  if (speakerList) {
    prompt += `【已識別的語者】:${speakerList}
請延續使用這些標籤,保持一致性。如發現新語者,新增編號。

`;
  }

  if (previousEnding) {
    prompt += `【前段結尾】(用於銜接):
${previousEnding.slice(-400)}

`;
  }

  prompt += `【輸出範例】:
[00:00] **男1**:今天我們來討論這本書。
[00:05] **女1**:對,我覺得作者的觀點很有趣。
[00:12] **男1**:嗯,特別是第三章。

===SPEAKERS===
{
  "男1": {"name": "男1", "gender": "男", "role": "?", "traits": "?"},
  "女1": {"name": "女1", "gender": "女", "role": "?", "traits": "?"}
}

開始轉錄:`;

  for (let attempt = 0; attempt < maxRetries; attempt++) {
    try {
      uploadedFile = await withTimeout(
        ai.files.upload({ file: segmentPath, config: { mimeType: 'audio/mpeg' } }),
        segmentTimeout,
        `上傳超時 (${segmentTimeout/1000}s)`
      );

      const response = await withTimeout(
        ai.models.generateContent({
          model: model,
          contents: createUserContent([
            createPartFromUri(uploadedFile.uri, uploadedFile.mimeType),
            prompt
          ]),
          config: { maxOutputTokens: 65536, temperature: 0.1 }
        }),
        segmentTimeout,
        `轉錄超時 (${segmentTimeout/1000}s)`
      );

      // Fire-and-forget cleanup of the uploaded file.
      ai.files.delete({ name: uploadedFile.name }).catch(() => {});

      const fullText = response.text;

      // Split the reply into transcript / emotion / speaker-JSON sections.
      let transcript = fullText;
      let emotion = '';
      let newSpeakerProfiles = {};

      const emotionSplit = fullText.split('===EMOTION===');
      if (emotionSplit.length >= 2) {
        transcript = emotionSplit[0].trim();
        const rest = emotionSplit[1];

        const speakerSplit = rest.split('===SPEAKERS===');
        if (speakerSplit.length >= 2) {
          emotion = speakerSplit[0].trim();
          const jsonPart = speakerSplit[1].trim();
          const jsonMatch = jsonPart.match(/\{[\s\S]*\}/);
          if (jsonMatch) {
            try {
              newSpeakerProfiles = JSON.parse(jsonMatch[0]);
              console.error(` 📋 識別 ${Object.keys(newSpeakerProfiles).length} 位語者`);
            } catch {
              console.error(` ⚠️ 語者 JSON 解析失敗`);
            }
          }
        } else {
          // Emotion section present but no speaker marker.
          emotion = rest.trim();
        }
      } else {
        // No emotion section — maybe only a speaker section.
        const speakerSplit = fullText.split('===SPEAKERS===');
        if (speakerSplit.length >= 2) {
          transcript = speakerSplit[0].trim();
          const jsonPart = speakerSplit[1].trim();
          const jsonMatch = jsonPart.match(/\{[\s\S]*\}/);
          if (jsonMatch) {
            try {
              newSpeakerProfiles = JSON.parse(jsonMatch[0]);
              console.error(` 📋 識別 ${Object.keys(newSpeakerProfiles).length} 位語者`);
            } catch {
              console.error(` ⚠️ 語者 JSON 解析失敗`);
            }
          }
        }
      }

      // Rebase segment-local timestamps onto the whole-file timeline
      // (both transcript and emotion notes).
      if (offset > 0) {
        transcript = adjustTimestamps(transcript, offset);
        if (emotion) {
          emotion = adjustTimestamps(emotion, offset);
        }
      }
      // Normalize [00:MM:SS] down to [MM:SS].
      transcript = normalizeTimestamps(transcript);
      if (emotion) {
        emotion = normalizeTimestamps(emotion);
      }

      return {
        index,
        transcript,
        emotion,
        success: true,
        speakerProfiles: newSpeakerProfiles
      };

    } catch (error) {
      if (uploadedFile) {
        ai.files.delete({ name: uploadedFile.name }).catch(() => {});
        uploadedFile = null;
      }

      // Exponential backoff before the next attempt.
      if (attempt < maxRetries - 1) {
        const delay = RETRY_BASE_DELAY * Math.pow(2, attempt);
        console.error(` ⚠️ 第 ${index + 1} 段重試 (${attempt + 1}/${maxRetries})...`);
        await new Promise(r => setTimeout(r, delay));
        continue;
      }

      // Out of retries: report a placeholder instead of throwing.
      return { index, transcript: '[此段無法轉錄]', success: false, error: error.message, speakerProfiles: {} };
    }
  }
}
707
+
708
/**
 * Reduce a per-segment emotion analysis to a one-line summary such as
 * "輕鬆,強度 7/10" (overall atmosphere + intensity). Falls back to a
 * generic label when nothing could be extracted.
 *
 * @param {string} emotionText - free-form emotion analysis text
 * @returns {string} short summary, never empty
 */
function extractEmotionBrief(emotionText) {
  const parts = [];

  for (const line of emotionText.split('\n')) {
    // Overall atmosphere: grab the value after the colon, up to a comma.
    const mentionsOverall = line.includes('整體氛圍') || line.includes('整體');
    if (mentionsOverall) {
      const atmosphere = line.match(/整體[^::]*[::]([^,,]+)/);
      if (atmosphere) parts.push(atmosphere[1].trim() + ',');
    }
    // Intensity: first run of digits on a line mentioning 強度; stop there.
    const digits = line.includes('強度') ? line.match(/(\d+)/) : null;
    if (digits) {
      parts.push(`強度 ${digits[1]}/10`);
      break;
    }
  }

  const brief = parts.join('');
  return brief || '情緒分析';
}
730
+
731
/**
 * Find the character index in `transcript` where an annotation for
 * time `targetOffset` (seconds) should be inserted.
 *
 * First tries an exact match on the formatted timestamp; failing that,
 * scans every [MM:SS] stamp and returns the position of the closest
 * one. Returns 0 when no timestamp exists at all.
 *
 * @param {string} transcript
 * @param {number} targetOffset - target time in seconds
 * @returns {number} 0-based character offset into `transcript`
 */
function findInsertPosition(transcript, targetOffset) {
  // NOTE(review): `formatTime` is not defined in this portion of the file —
  // presumably a module-level helper declared elsewhere; confirm it renders
  // targetOffset as MM:SS so the pattern below can match.
  const targetTime = formatTime(targetOffset);
  const targetTimePattern = new RegExp(`\\[${targetTime.replace(':', '\\:')}\\]`);

  // Exact match: return the byte offset of the start of the matching line.
  const lines = transcript.split('\n');
  for (let i = 0; i < lines.length; i++) {
    if (targetTimePattern.test(lines[i])) {
      let pos = 0;
      for (let j = 0; j < i; j++) {
        pos += lines[j].length + 1; // +1 for the newline
      }
      return pos;
    }
  }

  // No exact match: pick the [MM:SS] stamp closest in time.
  // NOTE(review): this fallback only matches two-field [MM:SS] stamps;
  // [HH:MM:SS] stamps are skipped here.
  const timePattern = /\[(\d{2}):(\d{2})\]/g;
  let bestPos = -1;
  let bestDiff = Infinity;

  let match;
  while ((match = timePattern.exec(transcript)) !== null) {
    const mins = parseInt(match[1]);
    const secs = parseInt(match[2]);
    const timeInSeconds = mins * 60 + secs;
    const diff = Math.abs(timeInSeconds - targetOffset);

    if (diff < bestDiff) {
      bestDiff = diff;
      bestPos = match.index;
    }
  }

  return bestPos > -1 ? bestPos : 0;
}
772
+
773
/**
 * Parallel mode: transcribe every segment concurrently (in batches of
 * MAX_WORKERS), persisting each successful segment to
 * `<tempDir>/segments/seg_NNN.md` as soon as it completes.
 *
 * Segments are processed independently (no previous-ending context),
 * so speaker labels are unified later by the pyramid merge.
 *
 * @param {object} ai - GoogleGenAI client
 * @param {Array<object>} segments - output of a splitAudio* function
 * @param {string} model - Gemini model id
 * @param {string} tempDir - scratch directory for per-segment files
 * @returns {Promise<Array<{index:number, transcript:string,
 *   speakerProfiles:object}>>} successful segments only, sorted by index
 */
async function transcribeSegmentsParallel(ai, segments, model, tempDir) {
  const segmentsDir = path.join(tempDir, 'segments');
  await fs.mkdir(segmentsDir, { recursive: true });

  console.error(`🚀 [平行模式] 平行轉錄 ${segments.length} 段(最多 ${MAX_WORKERS} 並行)...`);

  const results = [];
  const segmentData = [];

  // Process in batches so at most MAX_WORKERS requests run at once.
  for (let i = 0; i < segments.length; i += MAX_WORKERS) {
    const batch = segments.slice(i, i + MAX_WORKERS);
    const batchPromises = batch.map(async (segment) => {
      // No previousEnding: segments run concurrently, so there is no
      // earlier transcript to thread through.
      const result = await transcribeSingleSegmentSpeaker(ai, segment, model, {}, '');

      if (result.success) {
        // Persist the segment immediately so progress survives a crash.
        const segFile = path.join(segmentsDir, `seg_${String(segment.index + 1).padStart(3, '0')}.md`);
        const segContent = `# 段落 ${segment.index + 1}

## 語者資訊

${Object.entries(result.speakerProfiles).map(([id, info]) =>
  `- **${info.name || id}**:${[info.gender, info.role, info.traits].filter(Boolean).join(',')}`
).join('\n') || '(未識別)'}

## 逐字稿

${result.transcript}
`;
        await fs.writeFile(segFile, segContent, 'utf8');
        console.error(` ✅ 第 ${segment.index + 1}/${segments.length} 段完成,已儲存`);
      } else {
        console.error(` ⚠️ 第 ${segment.index + 1}/${segments.length} 段失敗`);
      }

      return result;
    });

    const batchResults = await Promise.all(batchPromises);
    results.push(...batchResults);
  }

  // Re-establish original order and keep only successful segments.
  results.sort((a, b) => a.index - b.index);

  for (const result of results) {
    if (result.success) {
      segmentData.push({
        index: result.index,
        transcript: result.transcript,
        speakerProfiles: result.speakerProfiles
      });
    }
  }

  console.error(`✅ 平行轉錄完成:${segmentData.length}/${segments.length} 段成功`);
  return segmentData;
}
836
+
837
/**
 * Merge per-segment speaker profiles into an accumulated map.
 *
 * Every speaker from both inputs is kept. For speakers present in
 * both, a field from `newProfiles` wins only when it is more specific
 * than what is already recorded (a real name over a bare id, or a
 * value where none existed).
 *
 * Neither input is mutated: the original implementation spread only
 * the top level and then wrote into the shared entry objects, silently
 * mutating the caller's `existing` profiles; entries are now copied.
 *
 * @param {Object<string, {name?:string, gender?:string, role?:string,
 *   traits?:string}>} existing - accumulated profiles
 * @param {Object<string, object>} newProfiles - profiles from a new segment
 * @returns {Object<string, object>} a fresh merged map
 */
function mergeSpeakerProfiles(existing, newProfiles) {
  const merged = {};
  for (const [id, info] of Object.entries(existing)) {
    merged[id] = { ...info }; // copy so updates below never touch the input
  }

  for (const [id, info] of Object.entries(newProfiles)) {
    if (!merged[id]) {
      // Previously unseen speaker: copy it in (no shared reference).
      merged[id] = { ...info };
      continue;
    }
    // Known speaker: only upgrade fields with more specific data.
    if (info.name && info.name !== id && (!merged[id].name || merged[id].name === id)) {
      merged[id].name = info.name;
    }
    if (info.role && !merged[id].role) {
      merged[id].role = info.role;
    }
    if (info.traits && !merged[id].traits) {
      merged[id].traits = info.traits;
    }
  }

  return merged;
}
863
+
864
/**
 * Parallel mode: pyramid-style merge of transcribed segments.
 *
 * Repeatedly merges adjacent pairs with mergeTwoSegments() until one
 * segment remains (log2(n) rounds). Each round's intermediate results
 * are written to `<tempDir>/round_N/merged_NNN.md`; an odd trailing
 * segment is carried into the next round unchanged.
 *
 * @param {object} ai - GoogleGenAI client
 * @param {Array<{transcript:string, speakerProfiles:object}>} segments
 * @param {string} model - Gemini model id
 * @param {string} tempDir - scratch directory for per-round files
 * @returns {Promise<{transcript:string, speakerProfiles:object}>}
 *   the fully merged transcript (empty result for zero segments)
 */
async function pyramidMerge(ai, segments, model, tempDir) {
  if (segments.length === 0) {
    return { transcript: '', speakerProfiles: {} };
  }
  if (segments.length === 1) {
    return segments[0];
  }

  let currentSegments = [...segments];
  let round = 1;

  while (currentSegments.length > 1) {
    const roundDir = path.join(tempDir, `round_${round}`);
    await fs.mkdir(roundDir, { recursive: true });

    console.error(` 🔄 第 ${round} 輪合併:${currentSegments.length} 段 → ${Math.ceil(currentSegments.length / 2)} 段`);

    const merged = [];

    // Merge adjacent pairs (sequentially — each merge is one API call).
    for (let i = 0; i < currentSegments.length; i += 2) {
      if (i + 1 < currentSegments.length) {
        const result = await mergeTwoSegments(ai, currentSegments[i], currentSegments[i + 1], model);
        merged.push(result);

        // Persist this round's intermediate result for crash recovery/debugging.
        const mergedFile = path.join(roundDir, `merged_${String(Math.floor(i / 2) + 1).padStart(3, '0')}.md`);
        const mergedContent = `# 合併結果 ${Math.floor(i / 2) + 1}(第 ${round} 輪)

## 語者資訊

${Object.entries(result.speakerProfiles).map(([id, info]) =>
  `- **${info.name || id}**:${[info.gender, info.role, info.traits].filter(Boolean).join(',')}`
).join('\n') || '(未識別)'}

## 逐字稿

${result.transcript}
`;
        await fs.writeFile(mergedFile, mergedContent, 'utf8');
        console.error(` ✅ 合併 ${i + 1} + ${i + 2}`);
      } else {
        // Odd count: the last segment advances to the next round as-is.
        merged.push(currentSegments[i]);
        console.error(` ⏭️ 段落 ${i + 1} 直接帶入下輪`);
      }
    }

    currentSegments = merged;
    round++;
  }

  console.error(`✅ 金字塔合併完成,共 ${round - 1} 輪`);
  return currentSegments[0];
}
923
+
924
/**
 * Parallel mode: merge two adjacent transcript segments via one Gemini
 * call, unifying their speaker labels and removing the ~5-second
 * overlap. Retries with exponential backoff; after all retries fail it
 * degrades to a local text merge (mergeOverlapText) plus a local
 * speaker-profile merge, so it never rejects.
 *
 * @param {object} ai - GoogleGenAI client
 * @param {{transcript:string, speakerProfiles:object}} segA - earlier segment
 * @param {{transcript:string, speakerProfiles:object}} segB - later segment
 * @param {string} model - Gemini model id
 * @param {number} [maxRetries]
 * @returns {Promise<{transcript:string, speakerProfiles:object}>}
 */
async function mergeTwoSegments(ai, segA, segB, model, maxRetries = 2) {
  // Input size guard (Gemini input limit is ~1M tokens / ~4M chars).
  const MAX_INPUT_LENGTH = 3000000; // ~3M chars, leaves a safety margin
  const segALen = segA.transcript.length;
  const segBLen = segB.transcript.length;
  const totalLen = segALen + segBLen;

  // When over budget, truncate each oversized side, keeping its head and
  // tail (the tail matters most for the overlap with the next segment).
  let transcriptA = segA.transcript;
  let transcriptB = segB.transcript;

  if (totalLen > MAX_INPUT_LENGTH) {
    const maxEach = Math.floor(MAX_INPUT_LENGTH / 2);
    if (segALen > maxEach) {
      const keepStart = Math.floor(maxEach * 0.4);
      const keepEnd = Math.floor(maxEach * 0.6);
      transcriptA = segA.transcript.slice(0, keepStart) +
        '\n\n[...中間內容省略...]\n\n' +
        segA.transcript.slice(-keepEnd);
      console.error(` ⚠️ 第一段過長(${segALen} 字元),截斷為 ${transcriptA.length} 字元`);
    }
    if (segBLen > maxEach) {
      const keepStart = Math.floor(maxEach * 0.4);
      const keepEnd = Math.floor(maxEach * 0.6);
      transcriptB = segB.transcript.slice(0, keepStart) +
        '\n\n[...中間內容省略...]\n\n' +
        segB.transcript.slice(-keepEnd);
      console.error(` ⚠️ 第二段過長(${segBLen} 字元),截斷為 ${transcriptB.length} 字元`);
    }
  }

  const prompt = `請合併以下兩段逐字稿,並統一語者標籤。

【第一段逐字稿】:
${transcriptA}

【第一段語者資訊】:
${JSON.stringify(segA.speakerProfiles, null, 2)}

【第二段逐字稿】:
${transcriptB}

【第二段語者資訊】:
${JSON.stringify(segB.speakerProfiles, null, 2)}

【合併要求】:
1. 合併兩段逐字稿,去除重複的 overlap 部分(約 5 秒重疊)
2. 統一語者標籤:
   - 如果兩段都有相同的語者(如「男1」),保持標籤一致
   - 如果兩段對同一人使用不同標籤(如第一段「男1」,第二段「語者A」),統一為最合適的標籤
   - 只有對話中【明確提到】的姓名才能使用,不要憑空創造
3. 保持時間戳順序正確
4. 時間戳格式統一用 [MM:SS]

【輸出格式】:
先輸出合併後的逐字稿,然後用 ===SPEAKERS=== 分隔,輸出統一的語者資訊 JSON。

合併後的逐字稿:`;

  // Dynamic timeout proportional to content length, clamped to 60-300 s.
  const estimatedTimeout = Math.max(60000, Math.min(300000, totalLen / 10));

  for (let attempt = 0; attempt < maxRetries; attempt++) {
    try {
      const response = await withTimeout(
        ai.models.generateContent({
          model: model,
          contents: createUserContent([prompt]),
          config: { maxOutputTokens: 65536, temperature: 0.1 }
        }),
        estimatedTimeout,
        `合併 API 超時(${Math.round(estimatedTimeout/1000)}s)`
      );

      const fullText = response.text;

      // Split the reply into transcript + speaker JSON.
      let mergedTranscript = fullText;
      let mergedSpeakers = {};

      const speakerSplit = fullText.split('===SPEAKERS===');
      if (speakerSplit.length >= 2) {
        mergedTranscript = speakerSplit[0].trim();
        const jsonPart = speakerSplit[1].trim();
        const jsonMatch = jsonPart.match(/\{[\s\S]*\}/);
        if (jsonMatch) {
          try {
            mergedSpeakers = JSON.parse(jsonMatch[0]);
          } catch {
            // Unparseable JSON: fall back to merging the two inputs locally.
            mergedSpeakers = mergeSpeakerProfiles(segA.speakerProfiles, segB.speakerProfiles);
          }
        }
        // NOTE(review): when the marker is present but no {...} follows,
        // mergedSpeakers stays {} rather than falling back to the local
        // merge — possibly unintended; confirm.
      } else {
        // No speaker section in the reply: merge the inputs locally.
        mergedSpeakers = mergeSpeakerProfiles(segA.speakerProfiles, segB.speakerProfiles);
      }

      // Normalize [00:MM:SS] down to [MM:SS].
      mergedTranscript = normalizeTimestamps(mergedTranscript);

      return {
        transcript: mergedTranscript,
        speakerProfiles: mergedSpeakers
      };
    } catch (error) {
      const errorDetails = {
        message: error.message,
        type: error.constructor.name,
        attempt: attempt + 1,
        totalLen: totalLen,
        segALen: segALen,
        segBLen: segBLen
      };

      console.error(` ⚠️ 合併失敗(嘗試 ${attempt + 1}/${maxRetries}):${error.message}`);
      console.error(` 詳細資訊:${JSON.stringify(errorDetails)}`);

      // Exponential backoff before the next attempt.
      if (attempt < maxRetries - 1) {
        const delay = RETRY_BASE_DELAY * Math.pow(2, attempt);
        console.error(` 等待 ${(delay/1000).toFixed(1)}s 後重試...`);
        await new Promise(r => setTimeout(r, delay));
        continue;
      }

      // All retries exhausted: local heuristic merge, no API involved.
      console.error(` ⚠️ 所有重試失敗,使用簡單合併(無 API 統一語者標籤)`);
      const simpleMerged = mergeOverlapText(segA.transcript, segB.transcript);
      return {
        transcript: simpleMerged,
        speakerProfiles: mergeSpeakerProfiles(segA.speakerProfiles, segB.speakerProfiles)
      };
    }
  }
}
1063
+
1064
/**
 * Persist the in-progress transcript to disk. Called after every completed
 * segment so a crash mid-run still leaves a usable partial result.
 *
 * @param {string} outPath - Destination Markdown file path (overwritten each call).
 * @param {string} transcript - Transcript text accumulated so far.
 * @param {Object<string, {name?: string, gender?: string, role?: string, traits?: string}>} speakerProfiles
 *   Speaker label -> profile info collected so far.
 * @param {{timestamp: string, title: string, duration: string, audioPath: string, completed: number, total: number}} metadata
 *   Frontmatter fields, including segment progress.
 * @returns {Promise<void>}
 */
async function saveIncrementalResult(outPath, transcript, speakerProfiles, metadata) {
  // Render one bullet per known speaker; fall back to a placeholder detail.
  const speakerLines = [];
  for (const [label, info] of Object.entries(speakerProfiles)) {
    const displayName = info.name || label;
    const detailText = [info.gender, info.role, info.traits].filter(Boolean).join(',');
    speakerLines.push(`- **${displayName}**:${detailText || '待識別'}`);
  }
  const speakerSection = speakerLines.join('\n');

  const markdown = `---
id: transcript-${metadata.timestamp}
title: ${metadata.title}
recorded_at: ${new Date().toISOString()}
duration: ${metadata.duration}
audio_file: ${metadata.audioPath}
segments_completed: ${metadata.completed}/${metadata.total}
mode: speaker
---

# ${metadata.title}

## 語者資訊

${speakerSection || '(識別中...)'}

## 逐字稿

${transcript}
`;

  await fs.writeFile(outPath, markdown, 'utf8');
}
1099
+
1100
/**
 * Fast mode: transcribe all audio segments in parallel and join the results
 * in order. Produces no speaker identification and no summary.
 *
 * @param {string} audioPath - Path to the source audio file.
 * @param {string|null} outputPath - Destination .md path; defaults to audioPath with a .md extension.
 * @param {string} model - Gemini model name.
 * @param {object} ai - GoogleGenAI client instance.
 * @param {number} durationMin - Audio duration in minutes (possibly size-estimated).
 * @returns {Promise<object>} { outputPath, duration, segments, summary, keywords }
 */
async function transcribeAudioFast(audioPath, outputPath, model, ai, durationMin) {
  const segments = await splitAudioFast(audioPath);
  let tempDir = null;

  // When splitting actually produced segment files (not the original file),
  // remember their directory so it can be cleaned up afterwards.
  if (segments.length > 1 && !segments[0].isOriginal) {
    tempDir = path.dirname(segments[0].path);
  }

  let transcript;

  if (segments.length === 1) {
    // Short audio: upload the whole file and transcribe in a single call.
    console.error('⬆️ 上傳中...');
    const uploadedFile = await ai.files.upload({
      file: audioPath,
      // NOTE(review): mimeType is hard-coded to audio/mpeg even for
      // wav/ogg/flac inputs — confirm upstream conversion guarantees MP3 here.
      config: { mimeType: 'audio/mpeg' }
    });
    console.error('✅ 上傳完成');

    const response = await ai.models.generateContent({
      model: model,
      contents: createUserContent([
        createPartFromUri(uploadedFile.uri, uploadedFile.mimeType),
        '你是專業語音轉錄員。【完整】轉錄這段音訊。\n' +
        '規則:完整轉錄不省略、標註時間戳[MM:SS]、區分發言者、保留語氣詞、繁體中文、直接輸出。\n' +
        '開始:'
      ]),
      config: { maxOutputTokens: 65536, temperature: 0.1 }
    });

    transcript = response.text;
    // Best-effort cleanup of the uploaded file; failure is non-fatal.
    try { await ai.files.delete({ name: uploadedFile.name }); } catch {}

  } else {
    // Multiple segments: transcribe in parallel batches of MAX_WORKERS.
    console.error(`🚀 [快速模式] 平行轉錄 ${segments.length} 段(最多 ${MAX_WORKERS} 並行)...`);
    const results = new Map();

    for (let i = 0; i < segments.length; i += MAX_WORKERS) {
      const batch = segments.slice(i, i + MAX_WORKERS);
      const batchPromises = batch.map(seg => transcribeSingleSegmentFast(ai, seg, model));
      const batchResults = await Promise.all(batchPromises);

      for (const r of batchResults) {
        results.set(r.index, r.transcript);
        console.error(` ✅ 第 ${r.index + 1}/${segments.length} 段完成`);
      }
    }

    // Re-join segment transcripts in their original order.
    const sortedIndices = Array.from(results.keys()).sort((a, b) => a - b);
    transcript = sortedIndices.map(i => results.get(i)).join('\n\n');
  }

  // Remove segment files created by the splitter.
  if (tempDir) {
    await fs.rm(tempDir, { recursive: true, force: true });
  }

  // Write the result as Markdown with YAML frontmatter.
  const outPath = outputPath || audioPath.replace(/\.[^/.]+$/, '.md');
  const formattedTimestamp = new Date().toISOString().slice(0, 19).replace(/:/g, '-');

  const content = `---
id: transcript-${formattedTimestamp}
title: ${path.basename(audioPath)}
recorded_at: ${new Date().toISOString()}
duration: ${Math.round(durationMin)} 分鐘
audio_file: ${audioPath}
segments: ${segments.length}
mode: fast
model: ${model}
---

# ${path.basename(audioPath, path.extname(audioPath))}

## 逐字稿

${transcript}
`;

  await fs.writeFile(outPath, content, 'utf8');
  console.error(`📄 [快速模式] 逐字稿已儲存: ${outPath}`);

  return {
    outputPath: outPath,
    duration: `${Math.round(durationMin)} 分鐘`,
    segments: segments.length,
    summary: '(快速模式不產生摘要)',
    keywords: ''
  };
}
1194
+
1195
/**
 * Speaker-identification mode: process segments sequentially, accumulating
 * speaker profiles and emotion analysis across segments. A single API call
 * per segment returns transcript + emotion + speaker info; the output file
 * is rewritten after every segment so progress is never lost.
 *
 * @param {string} audioPath - Path to the source audio file.
 * @param {string|null} outputPath - Destination .md path; defaults to audioPath with a .md extension.
 * @param {string} model - Gemini model name.
 * @param {object} ai - GoogleGenAI client instance.
 * @param {number} durationMin - Audio duration in minutes (possibly size-estimated).
 * @returns {Promise<object>} { outputPath, duration, segments, summary, keywords, speakerProfiles }
 */
async function transcribeAudioSpeaker(audioPath, outputPath, model, ai, durationMin) {
  const segments = await splitAudioSpeaker(audioPath);
  let tempDir = null;

  // Segment files (if any) live in a temp dir we must clean up later.
  if (segments.length > 1 && !segments[0].isOriginal) {
    tempDir = path.dirname(segments[0].path);
  }

  const outPath = outputPath || audioPath.replace(/\.[^/.]+$/, '.md');
  const formattedTimestamp = new Date().toISOString().slice(0, 19).replace(/:/g, '-');
  const metadata = {
    timestamp: formattedTimestamp,
    title: path.basename(audioPath, path.extname(audioPath)),
    duration: `${Math.round(durationMin)} 分鐘`,
    audioPath: audioPath,
    completed: 0,
    total: segments.length
  };

  let fullTranscript = '';
  let speakerProfiles = {};
  let emotionResults = []; // per-segment emotion analysis, collected as we go

  console.error(`🎯 [語者識別模式] 逐段轉錄 ${segments.length} 段(單一 API 呼叫)...`);

  for (const segment of segments) {
    console.error(` 📝 處理第 ${segment.index + 1}/${segments.length} 段...`);

    // Pass the tail of the transcript so far as context for continuity.
    const previousEnding = fullTranscript.slice(-500);

    // One API call returns transcript + emotion + speaker info together.
    const result = await transcribeSingleSegmentSpeaker(
      ai, segment, model, speakerProfiles, previousEnding
    );

    // A failed segment is skipped rather than aborting the whole run.
    if (!result.success) {
      console.error(` ⚠️ 第 ${segment.index + 1} 段失敗,跳過`);
      continue;
    }

    // Collect emotion analysis for the final summary / inline markers.
    if (result.emotion) {
      emotionResults.push({
        index: segment.index,
        time: Math.floor(segment.offset / 60),
        emotion: result.emotion,
        offset: segment.offset
      });
    }

    // Merge newly-identified speakers into the accumulated profiles.
    if (result.speakerProfiles && Object.keys(result.speakerProfiles).length > 0) {
      speakerProfiles = mergeSpeakerProfiles(speakerProfiles, result.speakerProfiles);
    }

    // Append the segment transcript, de-duplicating the overlap region
    // when the segment was cut with leading overlap.
    if (segment.hasOverlap && fullTranscript) {
      fullTranscript = mergeOverlapText(fullTranscript, result.transcript);
    } else {
      fullTranscript = fullTranscript ? fullTranscript + '\n\n' + result.transcript : result.transcript;
    }

    // Rewrite the output file so partial progress survives a crash.
    metadata.completed = segment.index + 1;
    await saveIncrementalResult(outPath, fullTranscript, speakerProfiles, metadata);
    console.error(` ✅ 第 ${segment.index + 1}/${segments.length} 段完成,已寫入檔案`);
  }

  // Remove the segment temp dir now that all segments are done.
  if (tempDir) {
    await fs.rm(tempDir, { recursive: true, force: true });
  }

  // Final summary over (a prefix of) the merged transcript.
  console.error('📝 產生摘要...');
  const summaryResponse = await ai.models.generateContent({
    model: model,
    contents: createUserContent([
      `根據以下逐字稿,產生 3-5 點重點摘要(繁體中文):\n\n${fullTranscript.slice(0, 30000)}`
    ]),
    config: { maxOutputTokens: 2048, temperature: 0.3 }
  });
  const summary = summaryResponse.text;

  // Short narrative emotion summary (single paragraph with quoted lines).
  // Failures here are logged and tolerated — the transcript still ships.
  let emotionSummary = '';
  if (emotionResults.length > 0) {
    console.error('💭 產生情緒摘要...');
    try {
      const emotionSummaryPrompt = `根據以下逐字稿和情緒分析結果,用一段文字(200-300字)描述整個對談的情緒流動:

1. 整體氛圍和情緒變化過程
2. 2-3 句經典語句(格式:[MM:SS] 語者X:「...」)
3. 關鍵情緒轉折點和高潮時刻

逐字稿片段:
${fullTranscript.slice(0, 20000)}

情緒分析結果:
${emotionResults.map(r => `[${r.time}分鐘] ${r.emotion.slice(0, 200)}`).join('\n\n')}

請用流暢的繁體中文寫成一段文字,不要使用列表或表格。`;

      const emotionSummaryResponse = await ai.models.generateContent({
        model: model,
        contents: createUserContent([emotionSummaryPrompt]),
        config: { maxOutputTokens: 1024, temperature: 0.3 }
      });
      emotionSummary = emotionSummaryResponse.text;
      console.error(' ✅ 情緒摘要完成');
    } catch (error) {
      console.error(` ⚠️ 情緒摘要生成失敗:${error.message}`);
    }
  }

  // Insert emotion-turning-point markers into the transcript body.
  let transcriptWithEmotions = fullTranscript;
  if (emotionResults.length > 0) {
    console.error('📍 在逐字稿中插入情緒轉折標記...');
    // NOTE: .sort() mutates emotionResults in place; harmless since the
    // unsorted order is not used afterwards.
    const sortedEmotions = emotionResults.sort((a, b) => a.index - b.index);

    // Insert back-to-front so earlier insertions don't shift later offsets.
    for (let i = sortedEmotions.length - 1; i >= 0; i--) {
      const emotion = sortedEmotions[i];
      const timeStr = formatTime(emotion.offset);
      const emotionBrief = extractEmotionBrief(emotion.emotion);

      // Locate the insertion point by matching the nearest timestamp.
      const insertPos = findInsertPosition(transcriptWithEmotions, emotion.offset);

      if (insertPos > -1) {
        const marker = `\n\n---\n**📍 [${timeStr}] 情緒轉折**:${emotionBrief}\n---\n\n`;
        transcriptWithEmotions =
          transcriptWithEmotions.slice(0, insertPos) +
          marker +
          transcriptWithEmotions.slice(insertPos);
      }
    }
    console.error(` ✅ 已插入 ${sortedEmotions.length} 個情緒標記`);
  }

  // Replace generic speaker labels with identified names, where known.
  console.error('🏷️ 替換語者標籤...');
  let finalTranscript = transcriptWithEmotions;
  let replacementCount = 0;

  // Longest labels first so e.g. "男10" is never partially matched as "男1".
  const sortedProfiles = Object.entries(speakerProfiles)
    .sort((a, b) => b[0].length - a[0].length);

  for (const [id, info] of sortedProfiles) {
    // Only replace when a distinct display name was identified.
    const displayName = info.name && info.name !== id ? info.name : null;

    if (displayName) {
      // Cover both "**label**:" and "[label]:" speaker-tag formats
      // (full-width and ASCII colons).
      const patterns = [
        new RegExp(`\\*\\*${escapeRegex(id)}\\*\\*([::])`, 'g'),
        new RegExp(`\\[${escapeRegex(id)}\\]([::])`, 'g'),
      ];

      for (const pattern of patterns) {
        const before = finalTranscript;
        finalTranscript = finalTranscript.replace(pattern, `**${displayName}**$1`);
        if (finalTranscript !== before) {
          replacementCount++;
        }
      }
    }
  }

  if (replacementCount > 0) {
    console.error(` ✅ 已替換 ${replacementCount} 處語者標籤`);
  }

  // Write the final document (frontmatter + summary + speakers + transcript).
  const speakerSection = Object.entries(speakerProfiles)
    .map(([id, info]) => {
      const name = info.name || id;
      const details = [info.gender, info.role, info.traits].filter(Boolean).join(',');
      return `- **${name}**:${details || '待識別'}`;
    })
    .join('\n');

  const finalContent = `---
id: transcript-${formattedTimestamp}
title: ${metadata.title}
recorded_at: ${new Date().toISOString()}
duration: ${metadata.duration}
audio_file: ${audioPath}
segments: ${segments.length}
mode: speaker
model: ${model}
---

# ${metadata.title}

## 摘要

${summary}

${emotionSummary ? `## 情緒流動\n\n${emotionSummary}\n` : ''}## 語者資訊

${speakerSection || '(未識別)'}

## 逐字稿

${finalTranscript}
`;

  await fs.writeFile(outPath, finalContent, 'utf8');
  console.error(`📄 [語者識別模式] 逐字稿已儲存: ${outPath}`);

  return {
    outputPath: outPath,
    duration: metadata.duration,
    segments: segments.length,
    summary,
    keywords: '',
    speakerProfiles
  };
}
1421
+
1422
/**
 * Parallel mode: transcribe all segments concurrently, then reconcile them
 * with a pyramid merge (pairwise merging until one transcript remains).
 * Intermediate results are kept in a temp directory for debugging; the
 * directory is intentionally preserved on failure.
 *
 * NOTE(review): this mode is not reachable from transcribeAudio(), which
 * only dispatches 'fast' and 'speaker' — confirm whether 'parallel' should
 * be wired into the dispatcher or removed.
 *
 * @param {string} audioPath - Path to the source audio file.
 * @param {string|null} outputPath - Destination .md path; defaults to audioPath with a .md extension.
 * @param {string} model - Gemini model name.
 * @param {object} ai - GoogleGenAI client instance.
 * @param {number} durationMin - Audio duration in minutes (possibly size-estimated).
 * @returns {Promise<object>} { outputPath, duration, segments, summary, keywords, speakerProfiles, tempDir }
 * @throws {Error} When every segment fails to transcribe, or any later step fails.
 */
async function transcribeAudioParallel(audioPath, outputPath, model, ai, durationMin) {
  // Scratch directory for per-stage intermediate files.
  const tempDir = path.join(tmpdir(), `parallel_${Date.now()}`);
  await fs.mkdir(tempDir, { recursive: true });

  try {
    // 1. Split the audio into segments.
    const segments = await splitAudioParallel(audioPath);
    let audioTempDir = null;

    if (segments.length > 1 && !segments[0].isOriginal) {
      audioTempDir = path.dirname(segments[0].path);
    }

    // 2. Transcribe all segments in parallel.
    const segmentData = await transcribeSegmentsParallel(ai, segments, model, tempDir);

    if (segmentData.length === 0) {
      throw new Error('所有段落轉錄失敗');
    }

    // 3. Pyramid merge: pairwise merge until a single transcript remains.
    console.error(`🔺 [平行模式] 開始金字塔合併...`);
    const finalResult = await pyramidMerge(ai, segmentData, model, tempDir);

    // 4. Persist the merged result into the temp dir (debugging aid).
    const finalTempFile = path.join(tempDir, 'final.md');
    const finalContent = `# 最終合併結果

## 語者資訊

${Object.entries(finalResult.speakerProfiles).map(([id, info]) => {
  const name = info.name || id;
  const details = [info.gender, info.role, info.traits].filter(Boolean).join(',');
  return `- **${name}**:${details || '待識別'}`;
}).join('\n') || '(未識別)'}

## 逐字稿

${finalResult.transcript}
`;
    await fs.writeFile(finalTempFile, finalContent, 'utf8');

    // 5. Generate the summary from (a prefix of) the merged transcript.
    console.error('📝 產生摘要...');
    const summaryResponse = await ai.models.generateContent({
      model: model,
      contents: createUserContent([
        `根據以下逐字稿,產生 3-5 點重點摘要(繁體中文):\n\n${finalResult.transcript.slice(0, 30000)}`
      ]),
      config: { maxOutputTokens: 2048, temperature: 0.3 }
    });
    const summary = summaryResponse.text;

    // 6. Write the final Markdown document.
    const outPath = outputPath || audioPath.replace(/\.[^/.]+$/, '.md');
    const formattedTimestamp = new Date().toISOString().slice(0, 19).replace(/:/g, '-');

    const speakerSection = Object.entries(finalResult.speakerProfiles)
      .map(([id, info]) => {
        const name = info.name || id;
        const details = [info.gender, info.role, info.traits].filter(Boolean).join(',');
        return `- **${name}**:${details || '待識別'}`;
      })
      .join('\n');

    const content = `---
id: transcript-${formattedTimestamp}
title: ${path.basename(audioPath, path.extname(audioPath))}
recorded_at: ${new Date().toISOString()}
duration: ${Math.round(durationMin)} 分鐘
audio_file: ${audioPath}
segments: ${segments.length}
mode: parallel
temp_dir: ${tempDir}
model: ${model}
---

# ${path.basename(audioPath, path.extname(audioPath))}

## 摘要

${summary}

## 語者資訊

${speakerSection || '(未識別)'}

## 逐字稿

${finalResult.transcript}
`;

    await fs.writeFile(outPath, content, 'utf8');
    console.error(`📄 [平行模式] 逐字稿已儲存: ${outPath}`);
    console.error(`📁 中間結果目錄: ${tempDir}`);

    // Clean up only the audio-segment temp dir; tempDir (intermediate
    // merge results) is deliberately kept and reported above.
    if (audioTempDir) {
      await fs.rm(audioTempDir, { recursive: true, force: true });
    }

    return {
      outputPath: outPath,
      duration: `${Math.round(durationMin)} 分鐘`,
      segments: segments.length,
      summary,
      keywords: '',
      speakerProfiles: finalResult.speakerProfiles,
      tempDir: tempDir
    };
  } catch (error) {
    // Keep the temp directory for post-mortem debugging, then rethrow.
    console.error(`❌ [平行模式] 處理失敗:${error.message}`);
    console.error(`📁 除錯目錄保留:${tempDir}`);
    throw error;
  }
}
1543
+
1544
+
1545
/**
 * Top-level transcription entry point: validates the API key, resolves the
 * effective mode (argument > DEFAULT_MODE env var > 'fast'), gathers file
 * statistics, then dispatches to the fast or speaker pipeline.
 *
 * @param {string} audioPath - Path to the source audio file.
 * @param {string|null} [outputPath=null] - Destination .md path, or null for the default.
 * @param {string} [model=MODEL_NAME] - Gemini model name.
 * @param {string|null} [mode=null] - 'fast' | 'speaker', or null to use the environment default.
 * @returns {Promise<object>} Pipeline result plus the actually-used `mode`.
 * @throws {Error} When GEMINI_API_KEY is not set.
 */
async function transcribeAudio(audioPath, outputPath = null, model = MODEL_NAME, mode = null) {
  const apiKey = process.env.GEMINI_API_KEY;
  if (!apiKey) {
    throw new Error('GEMINI_API_KEY 未設定');
  }

  // Resolve the effective mode without mutating the parameter.
  let effectiveMode = mode || process.env.DEFAULT_MODE || 'fast';
  if (effectiveMode !== 'fast' && effectiveMode !== 'speaker') {
    console.error(`⚠️ 無效的模式 "${effectiveMode}",使用預設值 "fast"`);
    effectiveMode = 'fast';
  }

  const ai = new GoogleGenAI({ apiKey });

  const stats = await fs.stat(audioPath);
  const fileSizeMB = stats.size / (1024 * 1024);
  const duration = await getAudioDuration(audioPath);
  // When the probe yields no duration, estimate roughly 2 minutes per MB.
  const durationMin = duration > 0 ? duration / 60 : fileSizeMB * 2;

  console.error(`📁 處理檔案: ${path.basename(audioPath)} (${fileSizeMB.toFixed(1)} MB, ${durationMin.toFixed(1)} 分鐘)`);

  const modeNames = {
    'fast': '快速模式',
    'speaker': '語者識別模式'
  };
  console.error(`🔧 模式: ${modeNames[effectiveMode] || '快速模式'}`);

  // Dispatch to the selected pipeline via a function reference.
  const pipeline = effectiveMode === 'fast' ? transcribeAudioFast : transcribeAudioSpeaker;
  const result = await pipeline(audioPath, outputPath, model, ai, durationMin);

  // Surface the mode that was actually used alongside the pipeline result.
  return { ...result, mode: effectiveMode };
}
1590
+
1591
/**
 * Describe an image with Gemini and save the description as a Markdown file.
 *
 * @param {string} imagePath - Path to the image file.
 * @param {string|null} [outputPath=null] - Destination .md path; defaults to imagePath with a .md extension.
 * @param {string} [detailLevel='detailed'] - 'simple' | 'normal' | 'detailed'
 *   (unknown values fall back to 'detailed').
 * @returns {Promise<{outputPath: string, description: string}>}
 * @throws {Error} When GEMINI_API_KEY is not set.
 */
async function describeImage(imagePath, outputPath = null, detailLevel = 'detailed') {
  const apiKey = process.env.GEMINI_API_KEY;
  if (!apiKey) {
    throw new Error('GEMINI_API_KEY 未設定');
  }

  const ai = new GoogleGenAI({ apiKey });

  const detailPrompts = {
    simple: '請用繁體中文簡單描述這張圖片的主要內容(50字以內)。',
    normal: '請用繁體中文描述這張圖片,包括主要物體、場景、顏色和構圖。',
    detailed: `請用繁體中文非常詳細地描述這張圖片,包括:
1. **主體描述**:圖片中的主要物體或人物
2. **場景環境**:背景、地點、氛圍
3. **視覺元素**:顏色、光線、構圖
4. **細節觀察**:任何值得注意的細節
5. **整體印象**:圖片傳達的情緒或訊息`
  };

  const prompt = detailPrompts[detailLevel] || detailPrompts.detailed;

  // FIX: derive the MIME type from the file extension instead of always
  // claiming image/jpeg — PNG/WebP/HEIC uploads were previously mislabeled.
  // Unknown extensions keep the old image/jpeg fallback for compatibility.
  const mimeByExt = {
    '.png': 'image/png',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.webp': 'image/webp',
    '.heic': 'image/heic',
    '.heif': 'image/heif'
  };
  const mimeType = mimeByExt[path.extname(imagePath).toLowerCase()] || 'image/jpeg';

  const uploadedFile = await ai.files.upload({
    file: imagePath,
    config: { mimeType }
  });

  const response = await ai.models.generateContent({
    model: MODEL_NAME,
    contents: createUserContent([
      createPartFromUri(uploadedFile.uri, uploadedFile.mimeType),
      prompt
    ]),
    config: { maxOutputTokens: 4096, temperature: 0.7 }
  });

  // Best-effort cleanup of the uploaded file; failure is non-fatal.
  try { await ai.files.delete({ name: uploadedFile.name }); } catch { /* ignore */ }

  const description = response.text;
  const outPath = outputPath || imagePath.replace(/\.[^/.]+$/, '.md');

  const content = `# 圖片描述

## 檔案資訊
- **圖片檔案**: ${path.basename(imagePath)}
- **處理時間**: ${new Date().toISOString()}
- **詳細程度**: ${detailLevel}

---

## 描述內容

${description}
`;

  await fs.writeFile(outPath, content, 'utf8');

  return { outputPath: outPath, description };
}
1652
+
1653
// Create the MCP server instance; the tool registrations below attach to it.
const server = new McpServer({
  name: 'gemini-transcriber',
  version: '1.0.0'
});
1658
+
1659
// Register tool: audio transcription.
// FIX: the former 'hybrid' response branch compared against an undefined
// identifier (`selectedMode`), throwing a ReferenceError — reported to the
// caller as "轉錄失敗: selectedMode is not defined" — whenever the mode was
// not 'fast'. The branch was also unreachable: the zod enum below and
// transcribeAudio() both restrict the mode to 'fast' | 'speaker'. The dead,
// broken branch has been removed.
server.tool(
  'transcribe_audio',
  '將音訊檔案轉為逐字稿。支援兩種模式:fast(快速模式,預設)、speaker(語者識別+情緒分析)。預設模式可透過環境變數 DEFAULT_MODE 設定。',
  {
    audio_path: z.string().describe('音訊檔案路徑(必填)'),
    output_path: z.string().optional().describe('輸出檔案路徑(選填)'),
    model: z.string().optional().describe('使用的模型(預設: gemini-3-flash-preview)'),
    mode: z.enum(['fast', 'speaker']).optional().describe('模式:fast(快速,預設)或 speaker(語者識別+情緒分析)。如未指定,使用環境變數 DEFAULT_MODE 或預設 fast')
  },
  async ({ audio_path, output_path, model, mode }) => {
    try {
      // Resolve relative paths against the workspace root.
      const resolvedAudioPath = resolvePath(audio_path);
      const resolvedOutputPath = output_path ? resolvePath(output_path) : null;

      try {
        await fs.access(resolvedAudioPath);
      } catch {
        return { content: [{ type: 'text', text: `❌ 找不到音訊檔案: ${resolvedAudioPath}` }] };
      }

      const ext = path.extname(resolvedAudioPath).toLowerCase();
      if (!SUPPORTED_AUDIO_FORMATS.includes(ext)) {
        return { content: [{ type: 'text', text: `❌ 不支援的音訊格式: ${ext}\n支援格式: ${SUPPORTED_AUDIO_FORMATS.join(', ')}` }] };
      }

      // When mode is omitted, transcribeAudio applies DEFAULT_MODE / 'fast'.
      const result = await transcribeAudio(resolvedAudioPath, resolvedOutputPath, model || MODEL_NAME, mode || undefined);

      // Report according to the mode that was actually used.
      const actualMode = result.mode || (mode || process.env.DEFAULT_MODE || 'fast');
      if (actualMode === 'fast') {
        return {
          content: [{
            type: 'text',
            text: `✅ [快速模式] 音訊轉錄完成!

📁 **輸出檔案**: ${result.outputPath}
⏱️ **總時長**: ${result.duration}
📊 **處理區塊**: ${result.segments} 個`
          }]
        };
      } else {
        const speakerInfo = result.speakerProfiles ?
          Object.entries(result.speakerProfiles)
            .map(([id, info]) => `- ${info.name || id}(${info.role || '?'})`)
            .join('\n') : '';

        return {
          content: [{
            type: 'text',
            text: `✅ [語者識別模式] 音訊轉錄完成!

📁 **輸出檔案**: ${result.outputPath}
⏱️ **總時長**: ${result.duration}
📊 **處理區塊**: ${result.segments} 個

## 語者資訊
${speakerInfo || '(未識別)'}

## 摘要
${result.summary}`
          }]
        };
      }
    } catch (error) {
      // MCP tools report failures as text content rather than throwing.
      return { content: [{ type: 'text', text: `❌ 轉錄失敗: ${error.message}` }] };
    }
  }
);
1751
+
1752
// Register tool: image description.
server.tool(
  'describe_image',
  '描述圖片內容。支援 png/jpg/jpeg/webp 等格式。',
  {
    image_path: z.string().describe('圖片檔案路徑(必填)'),
    output_path: z.string().optional().describe('輸出檔案路徑(選填)'),
    detail_level: z.enum(['simple', 'normal', 'detailed']).optional().describe('詳細程度(預設: detailed)')
  },
  async ({ image_path, output_path, detail_level }) => {
    try {
      // Resolve relative paths against the workspace root.
      const resolvedImagePath = resolvePath(image_path);
      const resolvedOutputPath = output_path ? resolvePath(output_path) : null;

      // Fail early with a readable message when the file does not exist.
      try {
        await fs.access(resolvedImagePath);
      } catch {
        return { content: [{ type: 'text', text: `❌ 找不到圖片檔案: ${resolvedImagePath}` }] };
      }

      const ext = path.extname(resolvedImagePath).toLowerCase();
      if (!SUPPORTED_IMAGE_FORMATS.includes(ext)) {
        return { content: [{ type: 'text', text: `❌ 不支援的圖片格式: ${ext}` }] };
      }

      const result = await describeImage(resolvedImagePath, resolvedOutputPath, detail_level || 'detailed');

      return {
        content: [{
          type: 'text',
          text: `✅ 圖片描述完成!

📁 **輸出檔案**: ${result.outputPath}

---

${result.description}`
        }]
      };
    } catch (error) {
      // MCP tools report failures as text content rather than throwing.
      return { content: [{ type: 'text', text: `❌ 描述失敗: ${error.message}` }] };
    }
  }
);
1797
+
1798
// Register tool: speaker analysis at a specific point in the audio.
// FIX: the temp-dir call used `os.tmpdir()`, but this module imports only
// `{ tmpdir }` from 'os' — `os` was undefined, so every invocation threw
// "分析失敗: os is not defined". Changed to the imported `tmpdir()`.
server.tool(
  'analyze_speaker',
  '分析音頻中特定時間點的語者資訊(性別、年齡、聲音特徵等)。可指定時間戳或文字內容來定位。',
  {
    audio_path: z.string().describe('音訊檔案路徑(必填)'),
    timestamp: z.string().optional().describe('時間戳,格式 MM:SS 或 HH:MM:SS(如 01:18)'),
    duration: z.number().optional().describe('分析的時長(秒),預設 30 秒'),
    question: z.string().optional().describe('自訂問題(如:講這句話的人是男生還是女生?)')
  },
  async ({ audio_path, timestamp, duration = 30, question }) => {
    try {
      const apiKey = process.env.GEMINI_API_KEY;
      if (!apiKey) {
        return { content: [{ type: 'text', text: '❌ GEMINI_API_KEY 未設定' }] };
      }

      const ai = new GoogleGenAI({ apiKey });
      const resolvedAudioPath = resolvePath(audio_path);

      try {
        await fs.access(resolvedAudioPath);
      } catch {
        return { content: [{ type: 'text', text: `❌ 找不到音訊檔案: ${resolvedAudioPath}` }] };
      }

      // Parse the MM:SS or HH:MM:SS timestamp into a second offset.
      let startSeconds = 0;
      if (timestamp) {
        const parts = timestamp.split(':').map(Number);
        if (parts.length === 2) {
          startSeconds = parts[0] * 60 + parts[1];
        } else if (parts.length === 3) {
          startSeconds = parts[0] * 3600 + parts[1] * 60 + parts[2];
        }
      }

      // Cut the requested window into a small mono MP3 via ffmpeg.
      const tempDir = await fs.mkdtemp(path.join(tmpdir(), 'speaker-analysis-'));
      const segmentPath = path.join(tempDir, 'segment.mp3');

      await new Promise((resolve, reject) => {
        const args = [
          '-y', '-i', resolvedAudioPath,
          '-ss', String(startSeconds),
          '-t', String(duration),
          '-acodec', 'libmp3lame', '-ab', '64k', '-ar', '16000', '-ac', '1',
          segmentPath
        ];
        const ffmpeg = spawn('ffmpeg', args);
        ffmpeg.on('close', code => code === 0 ? resolve() : reject(new Error(`ffmpeg 錯誤: ${code}`)));
        ffmpeg.on('error', reject);
      });

      // Upload the clip and ask Gemini about the speaker.
      const uploadedFile = await ai.files.upload({
        file: segmentPath,
        config: { mimeType: 'audio/mpeg' }
      });

      const defaultQuestion = `分析這段音頻中說話的人:

1. 性別(男/女)
2. 年齡範圍(如:20-30歲、30-40歲)
3. 聲音特徵(音調高低、說話速度、口音等)
4. 性格推測(根據說話方式推斷)
5. 情緒狀態(平靜、興奮、猶豫等)

請詳細描述這位語者的特徵。`;

      const response = await ai.models.generateContent({
        model: MODEL_NAME,
        contents: createUserContent([
          createPartFromUri(uploadedFile.uri, uploadedFile.mimeType),
          question || defaultQuestion
        ]),
        config: { maxOutputTokens: 2048, temperature: 0.1 }
      });

      // Clean up: remote file deletion is fire-and-forget; local dir removal awaited.
      ai.files.delete({ name: uploadedFile.name }).catch(() => {});
      await fs.rm(tempDir, { recursive: true, force: true });

      const timeRange = `${timestamp || '00:00'} - ${formatTime(startSeconds + duration)}`;

      return {
        content: [{
          type: 'text',
          text: `🎤 **語者分析結果**

📁 音訊檔案: ${path.basename(resolvedAudioPath)}
⏱️ 分析時段: ${timeRange}

---

${response.text}`
        }]
      };
    } catch (error) {
      return { content: [{ type: 'text', text: `❌ 分析失敗: ${error.message}` }] };
    }
  }
);
1901
+
1902
/**
 * Helper: format a second count as a zero-padded "MM:SS" string.
 * Minutes are not wrapped at 60 (e.g. 3600 -> "60:00").
 *
 * @param {number} seconds - Time offset in seconds.
 * @returns {string} Zero-padded "MM:SS".
 */
function formatTime(seconds) {
  const totalSecs = Math.floor(seconds);
  const pad = (n) => String(n).padStart(2, '0');
  return `${pad(Math.floor(totalSecs / 60))}:${pad(totalSecs % 60)}`;
}
1908
+
1909
// Register tool: list supported formats and modes.
// FIX: the previous listing contradicted the actual API — it advertised
// "speaker" as the default and a "hybrid" mode, while transcribe_audio
// only accepts 'fast' (the default) and 'speaker'. The text below is kept
// in sync with the transcribe_audio schema and transcribeAudio() dispatcher.
server.tool(
  'list_supported_formats',
  '列出支援的檔案格式和轉錄模式',
  {},
  async () => {
    return {
      content: [{
        type: 'text',
        text: `📋 **支援的檔案格式**

## 🎵 音訊格式
${SUPPORTED_AUDIO_FORMATS.join(', ')}

## 🖼️ 圖片格式
${SUPPORTED_IMAGE_FORMATS.join(', ')}

## 🔧 轉錄模式
- **fast**(預設):快速模式,平行處理,直接合併,適合快速驗證
- **speaker**:語者識別模式,逐段處理,累積語者資訊,每段即時寫入

## 🎤 語者分析工具
- **analyze_speaker**:分析特定時間點的語者(性別、年齡、聲音特徵)
  用法:analyze_speaker(audio_path, timestamp="01:18", duration=30)

## 🤖 可用模型
- gemini-3-flash-preview(預設)
- gemini-2.5-flash
- gemini-2.5-pro`
      }]
    };
  }
);
1943
+
1944
// Start the server.
/**
 * Connect the MCP server to a stdio transport and begin serving requests.
 * All human-readable logging in this file goes to stderr because stdout
 * carries the MCP protocol stream.
 */
async function main() {
  const transport = new StdioServerTransport();
  await server.connect(transport);
  console.error('🚀 MCP Gemini Transcriber Server 已啟動');
}

main().catch(err => {
  console.error('❌ 啟動失敗:', err);
  process.exit(1);
});