@hsiehchenwei/mcp-gemini-transcriber 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/package.json +3 -3
  2. package/server.mjs +255 -54
package/package.json CHANGED
@@ -1,11 +1,11 @@
1
1
  {
2
2
  "name": "@hsiehchenwei/mcp-gemini-transcriber",
3
- "version": "1.0.0",
3
+ "version": "1.1.0",
4
4
  "type": "module",
5
5
  "description": "MCP 音訊轉逐字稿工具(使用 Gemini API)- 支援語者識別與情緒分析",
6
6
  "main": "server.mjs",
7
7
  "bin": {
8
- "mcp-gemini-transcriber": "./server.mjs"
8
+ "mcp-gemini-transcriber": "server.mjs"
9
9
  },
10
10
  "scripts": {
11
11
  "start": "node server.mjs"
@@ -24,7 +24,7 @@
24
24
  "license": "MIT",
25
25
  "repository": {
26
26
  "type": "git",
27
- "url": "https://github.com/chenwei/MCPTools.git",
27
+ "url": "git+https://github.com/chenwei/MCPTools.git",
28
28
  "directory": "mcp-gemini-transcriber"
29
29
  },
30
30
  "dependencies": {
package/server.mjs CHANGED
@@ -45,6 +45,138 @@ const MODEL_NAME = 'gemini-3-flash-preview';
45
45
  const SUPPORTED_AUDIO_FORMATS = ['.mp3', '.m4a', '.wav', '.webm', '.ogg', '.flac', '.aiff', '.aac'];
46
46
  const SUPPORTED_IMAGE_FORMATS = ['.png', '.jpg', '.jpeg', '.webp', '.heic', '.heif'];
47
47
 
48
+ // Gemini 3 Flash Preview 價格(每 100 萬 tokens,美元)
49
+ const PRICING = {
50
+ input: {
51
+ text: 0.50, // 文字/圖片/影片
52
+ audio: 1.00 // 音訊
53
+ },
54
+ output: 3.00 // 輸出(含思考代幣)
55
+ };
56
+
57
+ /**
58
+ * Token 用量追蹤器
59
+ */
60
+ class UsageTracker {
61
+ constructor() {
62
+ this.reset();
63
+ }
64
+
65
+ reset() {
66
+ this.inputTokens = { text: 0, audio: 0, image: 0, video: 0 };
67
+ this.outputTokens = 0;
68
+ this.thoughtTokens = 0;
69
+ this.apiCalls = 0;
70
+ }
71
+
72
+ // 從 API response 中提取並累加 usage
73
+ addFromResponse(response) {
74
+ const usage = response?.usageMetadata;
75
+ if (!usage) return;
76
+
77
+ this.apiCalls++;
78
+
79
+ // 輸出 tokens(含思考)
80
+ this.outputTokens += usage.candidatesTokenCount || 0;
81
+ this.thoughtTokens += usage.thoughtsTokenCount || 0;
82
+
83
+ // 輸入 tokens(按 modality 分類)
84
+ if (usage.promptTokensDetails) {
85
+ for (const detail of usage.promptTokensDetails) {
86
+ const modality = (detail.modality || 'TEXT').toLowerCase();
87
+ const count = detail.tokenCount || 0;
88
+ if (modality === 'audio') {
89
+ this.inputTokens.audio += count;
90
+ } else if (modality === 'image') {
91
+ this.inputTokens.image += count;
92
+ } else if (modality === 'video') {
93
+ this.inputTokens.video += count;
94
+ } else {
95
+ this.inputTokens.text += count;
96
+ }
97
+ }
98
+ } else {
99
+ // fallback:沒有 details 時全部算 text
100
+ this.inputTokens.text += usage.promptTokenCount || 0;
101
+ }
102
+ }
103
+
104
+ // 計算費用(美元)
105
+ calculateCost() {
106
+ const inputCost =
107
+ (this.inputTokens.text + this.inputTokens.image + this.inputTokens.video) / 1_000_000 * PRICING.input.text +
108
+ this.inputTokens.audio / 1_000_000 * PRICING.input.audio;
109
+
110
+ // 輸出費用包含思考 tokens
111
+ const outputCost = (this.outputTokens + this.thoughtTokens) / 1_000_000 * PRICING.output;
112
+
113
+ return {
114
+ inputCost,
115
+ outputCost,
116
+ totalCost: inputCost + outputCost
117
+ };
118
+ }
119
+
120
+ // 取得摘要
121
+ getSummary() {
122
+ const cost = this.calculateCost();
123
+ const totalInput = this.inputTokens.text + this.inputTokens.audio + this.inputTokens.image + this.inputTokens.video;
124
+ const totalOutput = this.outputTokens + this.thoughtTokens;
125
+
126
+ return {
127
+ apiCalls: this.apiCalls,
128
+ tokens: {
129
+ input: {
130
+ text: this.inputTokens.text,
131
+ audio: this.inputTokens.audio,
132
+ image: this.inputTokens.image,
133
+ video: this.inputTokens.video,
134
+ total: totalInput
135
+ },
136
+ output: this.outputTokens,
137
+ thought: this.thoughtTokens,
138
+ totalOutput: totalOutput,
139
+ total: totalInput + totalOutput
140
+ },
141
+ cost: {
142
+ input: cost.inputCost,
143
+ output: cost.outputCost,
144
+ total: cost.totalCost,
145
+ formatted: `$${cost.totalCost.toFixed(6)}`
146
+ }
147
+ };
148
+ }
149
+
150
+ // 格式化輸出(用於顯示)
151
+ formatSummary() {
152
+ const summary = this.getSummary();
153
+ let lines = [];
154
+
155
+ lines.push(`📊 **API 使用量統計**`);
156
+ lines.push(`- API 呼叫次數:${summary.apiCalls} 次`);
157
+ lines.push(`- 輸入 Tokens:${summary.tokens.input.total.toLocaleString()}`);
158
+ if (summary.tokens.input.audio > 0) {
159
+ lines.push(` - 音訊:${summary.tokens.input.audio.toLocaleString()}`);
160
+ }
161
+ if (summary.tokens.input.text > 0) {
162
+ lines.push(` - 文字:${summary.tokens.input.text.toLocaleString()}`);
163
+ }
164
+ lines.push(`- 輸出 Tokens:${summary.tokens.totalOutput.toLocaleString()}`);
165
+ if (summary.tokens.thought > 0) {
166
+ lines.push(` - 思考:${summary.tokens.thought.toLocaleString()}`);
167
+ lines.push(` - 回應:${summary.tokens.output.toLocaleString()}`);
168
+ }
169
+ lines.push(`- 總 Tokens:${summary.tokens.total.toLocaleString()}`);
170
+ lines.push('');
171
+ lines.push(`💰 **預估費用**`);
172
+ lines.push(`- 輸入費用:$${summary.cost.input.toFixed(6)}`);
173
+ lines.push(`- 輸出費用:$${summary.cost.output.toFixed(6)}`);
174
+ lines.push(`- **總費用:${summary.cost.formatted}**`);
175
+
176
+ return lines.join('\n');
177
+ }
178
+ }
179
+
48
180
  /**
49
181
  * 計算動態 timeout
50
182
  */
@@ -62,6 +194,48 @@ function withTimeout(promise, ms, message = 'Operation timed out') {
62
194
  ]);
63
195
  }
64
196
 
197
+ /**
198
+ * 檢查路徑是否包含非 ASCII 字元
199
+ */
200
+ function hasNonAscii(str) {
201
+ return /[^\x00-\x7F]/.test(str);
202
+ }
203
+
204
+ /**
205
+ * 安全上傳檔案(處理非 ASCII 路徑)
206
+ * @google/genai SDK 無法處理包含中文字元的檔案路徑
207
+ */
208
+ async function safeUploadFile(ai, filePath, mimeType, timeout) {
209
+ let tempPath = null;
210
+ let actualPath = filePath;
211
+
212
+ // 如果路徑包含非 ASCII 字元,複製到臨時路徑
213
+ if (hasNonAscii(filePath)) {
214
+ const tempDir = path.join(tmpdir(), `upload_${Date.now()}`);
215
+ await fs.mkdir(tempDir, { recursive: true });
216
+ const ext = path.extname(filePath);
217
+ tempPath = path.join(tempDir, `audio_${createHash('md5').update(filePath).digest('hex').slice(0, 8)}${ext}`);
218
+ await fs.copyFile(filePath, tempPath);
219
+ actualPath = tempPath;
220
+ console.error(` 📋 已複製到臨時路徑(避免中文路徑問題)`);
221
+ }
222
+
223
+ try {
224
+ const uploadedFile = await withTimeout(
225
+ ai.files.upload({ file: actualPath, config: { mimeType } }),
226
+ timeout,
227
+ `上傳超時 (${timeout/1000}s)`
228
+ );
229
+ return { uploadedFile, tempPath };
230
+ } catch (error) {
231
+ // 清理臨時檔案
232
+ if (tempPath) {
233
+ await fs.rm(path.dirname(tempPath), { recursive: true, force: true }).catch(() => {});
234
+ }
235
+ throw error;
236
+ }
237
+ }
238
+
65
239
  /**
66
240
  * 取得音檔長度(秒)
67
241
  */
@@ -467,7 +641,7 @@ function escapeRegex(str) {
467
641
  /**
468
642
  * 快速模式:簡單轉錄單一片段(不做語者識別)
469
643
  */
470
- async function transcribeSingleSegmentFast(ai, segment, model, maxRetries = MAX_RETRIES) {
644
+ async function transcribeSingleSegmentFast(ai, segment, model, tracker = null, maxRetries = MAX_RETRIES) {
471
645
  const { index, path: segmentPath, offset, duration = SEGMENT_DURATION } = segment;
472
646
  let uploadedFile = null;
473
647
  const segmentTimeout = calculateTimeout(duration);
@@ -486,13 +660,12 @@ async function transcribeSingleSegmentFast(ai, segment, model, maxRetries = MAX_
486
660
 
487
661
  開始:`;
488
662
 
663
+ let uploadTempPath = null;
489
664
  for (let attempt = 0; attempt < maxRetries; attempt++) {
490
665
  try {
491
- uploadedFile = await withTimeout(
492
- ai.files.upload({ file: segmentPath, config: { mimeType: 'audio/mpeg' } }),
493
- segmentTimeout,
494
- `上傳超時 (${segmentTimeout/1000}s)`
495
- );
666
+ const uploadResult = await safeUploadFile(ai, segmentPath, 'audio/mpeg', segmentTimeout);
667
+ uploadedFile = uploadResult.uploadedFile;
668
+ uploadTempPath = uploadResult.tempPath;
496
669
 
497
670
  const response = await withTimeout(
498
671
  ai.models.generateContent({
@@ -507,7 +680,13 @@ async function transcribeSingleSegmentFast(ai, segment, model, maxRetries = MAX_
507
680
  `轉錄超時 (${segmentTimeout/1000}s)`
508
681
  );
509
682
 
683
+ // 追蹤 token 用量
684
+ if (tracker) tracker.addFromResponse(response);
685
+
510
686
  ai.files.delete({ name: uploadedFile.name }).catch(() => {});
687
+ if (uploadTempPath) {
688
+ await fs.rm(path.dirname(uploadTempPath), { recursive: true, force: true }).catch(() => {});
689
+ }
511
690
 
512
691
  let transcript = response.text;
513
692
 
@@ -524,6 +703,10 @@ async function transcribeSingleSegmentFast(ai, segment, model, maxRetries = MAX_
524
703
  ai.files.delete({ name: uploadedFile.name }).catch(() => {});
525
704
  uploadedFile = null;
526
705
  }
706
+ if (uploadTempPath) {
707
+ await fs.rm(path.dirname(uploadTempPath), { recursive: true, force: true }).catch(() => {});
708
+ uploadTempPath = null;
709
+ }
527
710
 
528
711
  if (attempt < maxRetries - 1) {
529
712
  const delay = RETRY_BASE_DELAY * Math.pow(2, attempt);
@@ -541,7 +724,7 @@ async function transcribeSingleSegmentFast(ai, segment, model, maxRetries = MAX_
541
724
  * 語者識別模式:單一 API 呼叫同時轉錄 + 語者識別
542
725
  * 回傳格式包含逐字稿和語者資訊,減少 API 呼叫次數
543
726
  */
544
- async function transcribeSingleSegmentSpeaker(ai, segment, model, speakerProfiles = {}, previousEnding = '', maxRetries = MAX_RETRIES) {
727
+ async function transcribeSingleSegmentSpeaker(ai, segment, model, speakerProfiles = {}, previousEnding = '', tracker = null, maxRetries = MAX_RETRIES) {
545
728
  const { index, path: segmentPath, offset, duration = SEGMENT_DURATION } = segment;
546
729
  let uploadedFile = null;
547
730
  const segmentTimeout = calculateTimeout(duration);
@@ -596,13 +779,12 @@ ${previousEnding.slice(-400)}
596
779
 
597
780
  開始轉錄:`;
598
781
 
782
+ let uploadTempPath = null;
599
783
  for (let attempt = 0; attempt < maxRetries; attempt++) {
600
784
  try {
601
- uploadedFile = await withTimeout(
602
- ai.files.upload({ file: segmentPath, config: { mimeType: 'audio/mpeg' } }),
603
- segmentTimeout,
604
- `上傳超時 (${segmentTimeout/1000}s)`
605
- );
785
+ const uploadResult = await safeUploadFile(ai, segmentPath, 'audio/mpeg', segmentTimeout);
786
+ uploadedFile = uploadResult.uploadedFile;
787
+ uploadTempPath = uploadResult.tempPath;
606
788
 
607
789
  const response = await withTimeout(
608
790
  ai.models.generateContent({
@@ -617,7 +799,13 @@ ${previousEnding.slice(-400)}
617
799
  `轉錄超時 (${segmentTimeout/1000}s)`
618
800
  );
619
801
 
802
+ // 追蹤 token 用量
803
+ if (tracker) tracker.addFromResponse(response);
804
+
620
805
  ai.files.delete({ name: uploadedFile.name }).catch(() => {});
806
+ if (uploadTempPath) {
807
+ await fs.rm(path.dirname(uploadTempPath), { recursive: true, force: true }).catch(() => {});
808
+ }
621
809
 
622
810
  const fullText = response.text;
623
811
 
@@ -682,7 +870,7 @@ ${previousEnding.slice(-400)}
682
870
  return {
683
871
  index,
684
872
  transcript,
685
- emotion, // 新增
873
+ emotion,
686
874
  success: true,
687
875
  speakerProfiles: newSpeakerProfiles
688
876
  };
@@ -692,6 +880,10 @@ ${previousEnding.slice(-400)}
692
880
  ai.files.delete({ name: uploadedFile.name }).catch(() => {});
693
881
  uploadedFile = null;
694
882
  }
883
+ if (uploadTempPath) {
884
+ await fs.rm(path.dirname(uploadTempPath), { recursive: true, force: true }).catch(() => {});
885
+ uploadTempPath = null;
886
+ }
695
887
 
696
888
  if (attempt < maxRetries - 1) {
697
889
  const delay = RETRY_BASE_DELAY * Math.pow(2, attempt);
@@ -1103,6 +1295,7 @@ ${transcript}
1103
1295
  async function transcribeAudioFast(audioPath, outputPath, model, ai, durationMin) {
1104
1296
  const segments = await splitAudioFast(audioPath);
1105
1297
  let tempDir = null;
1298
+ const tracker = new UsageTracker();
1106
1299
 
1107
1300
  if (segments.length > 1 && !segments[0].isOriginal) {
1108
1301
  tempDir = path.dirname(segments[0].path);
@@ -1112,10 +1305,7 @@ async function transcribeAudioFast(audioPath, outputPath, model, ai, durationMin
1112
1305
 
1113
1306
  if (segments.length === 1) {
1114
1307
  console.error('⬆️ 上傳中...');
1115
- const uploadedFile = await ai.files.upload({
1116
- file: audioPath,
1117
- config: { mimeType: 'audio/mpeg' }
1118
- });
1308
+ const { uploadedFile, tempPath: uploadTempPath } = await safeUploadFile(ai, audioPath, 'audio/mpeg', 120000);
1119
1309
  console.error('✅ 上傳完成');
1120
1310
 
1121
1311
  const response = await ai.models.generateContent({
@@ -1129,8 +1319,12 @@ async function transcribeAudioFast(audioPath, outputPath, model, ai, durationMin
1129
1319
  config: { maxOutputTokens: 65536, temperature: 0.1 }
1130
1320
  });
1131
1321
 
1322
+ tracker.addFromResponse(response);
1132
1323
  transcript = response.text;
1133
1324
  try { await ai.files.delete({ name: uploadedFile.name }); } catch {}
1325
+ if (uploadTempPath) {
1326
+ await fs.rm(path.dirname(uploadTempPath), { recursive: true, force: true }).catch(() => {});
1327
+ }
1134
1328
 
1135
1329
  } else {
1136
1330
  // 平行處理
@@ -1139,7 +1333,7 @@ async function transcribeAudioFast(audioPath, outputPath, model, ai, durationMin
1139
1333
 
1140
1334
  for (let i = 0; i < segments.length; i += MAX_WORKERS) {
1141
1335
  const batch = segments.slice(i, i + MAX_WORKERS);
1142
- const batchPromises = batch.map(seg => transcribeSingleSegmentFast(ai, seg, model));
1336
+ const batchPromises = batch.map(seg => transcribeSingleSegmentFast(ai, seg, model, tracker));
1143
1337
  const batchResults = await Promise.all(batchPromises);
1144
1338
 
1145
1339
  for (const r of batchResults) {
@@ -1158,6 +1352,10 @@ async function transcribeAudioFast(audioPath, outputPath, model, ai, durationMin
1158
1352
  await fs.rm(tempDir, { recursive: true, force: true });
1159
1353
  }
1160
1354
 
1355
+ // 取得使用量摘要
1356
+ const usage = tracker.getSummary();
1357
+ console.error(`💰 費用:${usage.cost.formatted}(${usage.tokens.total.toLocaleString()} tokens)`);
1358
+
1161
1359
  // 儲存結果
1162
1360
  const outPath = outputPath || audioPath.replace(/\.[^/.]+$/, '.md');
1163
1361
  const formattedTimestamp = new Date().toISOString().slice(0, 19).replace(/:/g, '-');
@@ -1188,7 +1386,8 @@ ${transcript}
1188
1386
  duration: `${Math.round(durationMin)} 分鐘`,
1189
1387
  segments: segments.length,
1190
1388
  summary: '(快速模式不產生摘要)',
1191
- keywords: ''
1389
+ keywords: '',
1390
+ usage
1192
1391
  };
1193
1392
  }
1194
1393
 
@@ -1198,6 +1397,7 @@ ${transcript}
1198
1397
  async function transcribeAudioSpeaker(audioPath, outputPath, model, ai, durationMin) {
1199
1398
  const segments = await splitAudioSpeaker(audioPath);
1200
1399
  let tempDir = null;
1400
+ const tracker = new UsageTracker();
1201
1401
 
1202
1402
  if (segments.length > 1 && !segments[0].isOriginal) {
1203
1403
  tempDir = path.dirname(segments[0].path);
@@ -1228,7 +1428,7 @@ async function transcribeAudioSpeaker(audioPath, outputPath, model, ai, duration
1228
1428
 
1229
1429
  // 單一 API 呼叫:同時取得逐字稿 + 情緒分析 + 語者資訊
1230
1430
  const result = await transcribeSingleSegmentSpeaker(
1231
- ai, segment, model, speakerProfiles, previousEnding
1431
+ ai, segment, model, speakerProfiles, previousEnding, tracker
1232
1432
  );
1233
1433
 
1234
1434
  if (!result.success) {
@@ -1278,6 +1478,7 @@ async function transcribeAudioSpeaker(audioPath, outputPath, model, ai, duration
1278
1478
  ]),
1279
1479
  config: { maxOutputTokens: 2048, temperature: 0.3 }
1280
1480
  });
1481
+ tracker.addFromResponse(summaryResponse);
1281
1482
  const summary = summaryResponse.text;
1282
1483
 
1283
1484
  // 生成簡短情緒摘要(一段文字,包含經典句子)
@@ -1304,6 +1505,7 @@ ${emotionResults.map(r => `[${r.time}分鐘] ${r.emotion.slice(0, 200)}`).join('
1304
1505
  contents: createUserContent([emotionSummaryPrompt]),
1305
1506
  config: { maxOutputTokens: 1024, temperature: 0.3 }
1306
1507
  });
1508
+ tracker.addFromResponse(emotionSummaryResponse);
1307
1509
  emotionSummary = emotionSummaryResponse.text;
1308
1510
  console.error(' ✅ 情緒摘要完成');
1309
1511
  } catch (error) {
@@ -1409,13 +1611,18 @@ ${finalTranscript}
1409
1611
  await fs.writeFile(outPath, finalContent, 'utf8');
1410
1612
  console.error(`📄 [語者識別模式] 逐字稿已儲存: ${outPath}`);
1411
1613
 
1614
+ // 取得使用量摘要
1615
+ const usage = tracker.getSummary();
1616
+ console.error(`💰 費用:${usage.cost.formatted}(${usage.tokens.total.toLocaleString()} tokens)`);
1617
+
1412
1618
  return {
1413
1619
  outputPath: outPath,
1414
1620
  duration: metadata.duration,
1415
1621
  segments: segments.length,
1416
1622
  summary,
1417
1623
  keywords: '',
1418
- speakerProfiles
1624
+ speakerProfiles,
1625
+ usage
1419
1626
  };
1420
1627
  }
1421
1628
 
@@ -1612,10 +1819,7 @@ async function describeImage(imagePath, outputPath = null, detailLevel = 'detail
1612
1819
 
1613
1820
  const prompt = detailPrompts[detailLevel] || detailPrompts.detailed;
1614
1821
 
1615
- const uploadedFile = await ai.files.upload({
1616
- file: imagePath,
1617
- config: { mimeType: 'image/jpeg' }
1618
- });
1822
+ const { uploadedFile, tempPath: uploadTempPath } = await safeUploadFile(ai, imagePath, 'image/jpeg', 60000);
1619
1823
 
1620
1824
  const response = await ai.models.generateContent({
1621
1825
  model: MODEL_NAME,
@@ -1627,6 +1831,9 @@ async function describeImage(imagePath, outputPath = null, detailLevel = 'detail
1627
1831
  });
1628
1832
 
1629
1833
  try { await ai.files.delete({ name: uploadedFile.name }); } catch { /* ignore */ }
1834
+ if (uploadTempPath) {
1835
+ await fs.rm(path.dirname(uploadTempPath), { recursive: true, force: true }).catch(() => {});
1836
+ }
1630
1837
 
1631
1838
  const description = response.text;
1632
1839
  const outPath = outputPath || imagePath.replace(/\.[^/.]+$/, '.md');
@@ -1687,43 +1894,36 @@ server.tool(
1687
1894
 
1688
1895
  // 根據模式顯示不同結果(從結果中取得實際使用的模式)
1689
1896
  const actualMode = result.mode || (mode || process.env.DEFAULT_MODE || 'fast');
1690
- if (actualMode === 'fast') {
1691
- return {
1692
- content: [{
1693
- type: 'text',
1694
- text: `✅ [快速模式] 音訊轉錄完成!
1897
+
1898
+ // 格式化費用資訊(簡潔版)
1899
+ const usageInfo = result.usage ? `
1695
1900
 
1696
- 📁 **輸出檔案**: ${result.outputPath}
1697
- ⏱️ **總時長**: ${result.duration}
1698
- 📊 **處理區塊**: ${result.segments} 個`
1699
- }]
1700
- };
1701
- } else if (selectedMode === 'hybrid') {
1702
- const speakerInfo = result.speakerProfiles ?
1703
- Object.entries(result.speakerProfiles)
1704
- .map(([name, info]) => `- **${name}**(${[info.gender, info.role].filter(Boolean).join(',') || '?'})`)
1705
- .join('\n') : '';
1901
+ 💰 **費用統計**
1902
+ - 輸入 Tokens:${result.usage.tokens.input.total.toLocaleString()}
1903
+ - 輸出 Tokens:${result.usage.tokens.totalOutput.toLocaleString()}
1904
+ - **總費用:${result.usage.cost.formatted}**` : '';
1706
1905
 
1906
+ if (actualMode === 'fast') {
1707
1907
  return {
1708
1908
  content: [{
1709
1909
  type: 'text',
1710
- text: `✅ [Hybrid 模式] 音訊轉錄完成!
1910
+ text: `✅ [快速模式] 音訊轉錄完成!
1711
1911
 
1712
1912
  📁 **輸出檔案**: ${result.outputPath}
1713
1913
  ⏱️ **總時長**: ${result.duration}
1714
1914
  📊 **處理區塊**: ${result.segments} 個
1715
-
1716
- ## 語者資訊
1717
- ${speakerInfo || '(未識別)'}
1718
-
1719
- ## 摘要
1720
- ${result.summary}`
1915
+ ${usageInfo}`
1721
1916
  }]
1722
1917
  };
1723
1918
  } else {
1919
+ // speaker 模式
1724
1920
  const speakerInfo = result.speakerProfiles ?
1725
1921
  Object.entries(result.speakerProfiles)
1726
- .map(([id, info]) => `- ${info.name || id}(${info.role || '?'})`)
1922
+ .map(([id, info]) => {
1923
+ const name = info.name || id;
1924
+ const details = [info.gender, info.role, info.traits].filter(Boolean).join(',');
1925
+ return `- **${name}**:${details || '待識別'}`;
1926
+ })
1727
1927
  .join('\n') : '';
1728
1928
 
1729
1929
  return {
@@ -1739,7 +1939,8 @@ ${result.summary}`
1739
1939
  ${speakerInfo || '(未識別)'}
1740
1940
 
1741
1941
  ## 摘要
1742
- ${result.summary}`
1942
+ ${result.summary || '(無)'}
1943
+ ${usageInfo}`
1743
1944
  }]
1744
1945
  };
1745
1946
  }
@@ -1849,11 +2050,8 @@ server.tool(
1849
2050
  ffmpeg.on('error', reject);
1850
2051
  });
1851
2052
 
1852
- // 上傳並分析
1853
- const uploadedFile = await ai.files.upload({
1854
- file: segmentPath,
1855
- config: { mimeType: 'audio/mpeg' }
1856
- });
2053
+ // 上傳並分析(使用 safeUploadFile 處理中文路徑)
2054
+ const { uploadedFile, tempPath: uploadTempPath } = await safeUploadFile(ai, segmentPath, 'audio/mpeg', 60000);
1857
2055
 
1858
2056
  const defaultQuestion = `分析這段音頻中說話的人:
1859
2057
 
@@ -1876,6 +2074,9 @@ server.tool(
1876
2074
 
1877
2075
  // 清理
1878
2076
  ai.files.delete({ name: uploadedFile.name }).catch(() => {});
2077
+ if (uploadTempPath) {
2078
+ await fs.rm(path.dirname(uploadTempPath), { recursive: true, force: true }).catch(() => {});
2079
+ }
1879
2080
  await fs.rm(tempDir, { recursive: true, force: true });
1880
2081
 
1881
2082
  const timeRange = `${timestamp || '00:00'} - ${formatTime(startSeconds + duration)}`;