@ww_nero/media 1.0.5 → 1.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -13,7 +13,9 @@
13
13
  - `audio_file`: 音频文件的相对路径(相对于工作目录),支持 mp3/wav 格式
14
14
 
15
15
  **限制:**
16
- - 音频长度最大 1 分钟,超出需要外部分段处理后逐段识别
16
+ - 音频长度最大 1 分钟
17
+ - 文件大小不超过 5MB
18
+ - 超出限制需要外部分段处理后逐段识别
17
19
 
18
20
  **输出:**
19
21
  - 识别结果保存到工作目录下的 `transcribe.srt` 文件
@@ -22,22 +24,14 @@
22
24
 
23
25
  | 变量名 | 说明 | 必填 |
24
26
  |--------|------|------|
25
- | `ASR_API_KEY` | 阿里云 ASR API Key | 是 |
26
- | `ASR_UPLOAD_URL` | 上传接口的完整 URL,如 `http://server.domain.com/upload` | 是 |
27
+ | `ASR_API_KEY` | 阿里云 DashScope API Key | 是 |
27
28
 
28
- ## 依赖
29
-
30
- ### Python 依赖
29
+ ## 安装
31
30
 
32
31
  ```bash
33
- pip install dashscope requests
32
+ npm install
34
33
  ```
35
34
 
36
- ### 系统依赖
37
-
38
- - `ffprobe` (用于检测音频时长,来自 ffmpeg)
39
- - `python3`
40
-
41
35
  ## 配置示例
42
36
 
43
37
  ```json
@@ -47,31 +41,9 @@ pip install dashscope requests
47
41
  "command": "node",
48
42
  "args": ["/path/to/media/index.js"],
49
43
  "env": {
50
- "ASR_API_KEY": "your-api-key",
51
- "ASR_UPLOAD_URL": "http://server.domain.com/upload"
44
+ "ASR_API_KEY": "your-api-key"
52
45
  }
53
46
  }
54
47
  }
55
48
  }
56
49
  ```
57
-
58
- ## 上传接口要求
59
-
60
- 上传接口需符合以下规范(参考 demo_server 实现):
61
-
62
- **请求:**
63
- - 方法: `POST`
64
- - Content-Type: `multipart/form-data`
65
- - 字段名: `file`
66
-
67
- **响应(成功):**
68
- ```json
69
- {
70
- "message": "文件上传成功",
71
- "fileName": "abc123.mp3"
72
- }
73
- ```
74
-
75
- 上传的文件可通过 `{base_url}/{fileName}` 访问,例如:
76
- - 上传接口: `http://server.domain.com/upload`
77
- - 文件访问: `http://server.domain.com/abc123.mp3`
package/index.js CHANGED
@@ -2,7 +2,6 @@
2
2
 
3
3
  const fs = require('fs');
4
4
  const path = require('path');
5
- const { spawn } = require('child_process');
6
5
  const { Server } = require('@modelcontextprotocol/sdk/server/index.js');
7
6
  const { StdioServerTransport } = require('@modelcontextprotocol/sdk/server/stdio.js');
8
7
  const {
@@ -12,11 +11,185 @@ const {
12
11
 
13
12
  const ASR_API_KEY = process.env.ASR_API_KEY || '';
14
13
  const ASR_UPLOAD_URL = 'http://fsheep.com:10808/upload';
14
+ const ASR_SERVICE_URL = 'https://dashscope.aliyuncs.com/api/v1/services/audio/asr/transcription';
15
15
 
16
16
  const SUPPORTED_AUDIO_TYPES = ['.mp3', '.wav'];
17
- const MAX_AUDIO_DURATION_SECONDS = 60;
17
+ const MAX_FILE_SIZE_BYTES = 5 * 1024 * 1024; // 5MB
18
+ const LANGUAGE_HINTS = ['zh', 'en', 'ja'];
18
19
 
19
- const SCRIPTS_DIR = path.join(__dirname, 'scripts');
20
/**
 * Convert a millisecond offset into an SRT timestamp (`HH:MM:SS,mmm`).
 *
 * @param {number} ms - Offset in milliseconds. Fractional values are rounded
 *   to the nearest millisecond; negative or non-finite values are treated as 0
 *   so the output is always a well-formed SRT timestamp.
 * @returns {string} The timestamp, e.g. `01:02:03,004`.
 */
const msToSrtTime = (ms) => {
  const numeric = Number(ms);
  // SRT only allows three integer millisecond digits, so normalise up front.
  const totalMs = Number.isFinite(numeric) ? Math.max(0, Math.round(numeric)) : 0;
  const pad = (value, width) => String(value).padStart(width, '0');

  const hours = Math.floor(totalMs / 3600000);
  const minutes = Math.floor((totalMs % 3600000) / 60000);
  const seconds = Math.floor((totalMs % 60000) / 1000);
  const milliseconds = totalMs % 1000;

  return `${pad(hours, 2)}:${pad(minutes, 2)}:${pad(seconds, 2)},${pad(milliseconds, 3)}`;
};
30
+
31
/**
 * Convert ASR transcription results into the text of an SRT subtitle file.
 *
 * Walks every `item.transcription.transcripts[].sentences[]` entry and emits
 * one numbered cue per sentence that has non-empty text.
 *
 * @param {Array<{transcription?: {transcripts?: Array<{sentences?: Array<{begin_time?: number, end_time?: number, text?: string}>}>}}>} asrData
 *   Entries shaped like the objects built by `fetchTranscriptionResults`.
 *   Anything that is not an array is treated as "no results".
 * @returns {string} SRT content; an empty string when there is nothing to emit.
 */
const asrToSrt = (asrData) => {
  const items = Array.isArray(asrData) ? asrData : [];
  const cues = [];

  for (const item of items) {
    const transcripts = (item && item.transcription && item.transcription.transcripts) || [];

    for (const transcript of transcripts) {
      const sentences = (transcript && transcript.sentences) || [];

      for (const sentence of sentences) {
        if (!sentence) {
          continue;
        }

        // A blank line terminates an SRT cue, so drop empty lines inside the text.
        const text = String(sentence.text || '')
          .split(/\r?\n/)
          .map((line) => line.trim())
          .filter(Boolean)
          .join('\n');
        if (!text) {
          continue;
        }

        const beginTime = sentence.begin_time || 0;
        // Never emit a cue that ends before it starts.
        const endTime = Math.max(beginTime, sentence.end_time || 0);

        cues.push(`${cues.length + 1}\n${msToSrtTime(beginTime)} --> ${msToSrtTime(endTime)}\n${text}\n`);
      }
    }
  }

  return cues.join('\n');
};
63
+
64
/**
 * Upload an audio file to the upload service as `multipart/form-data`.
 *
 * The request is built with the runtime's global `FormData`/`Blob`, which the
 * built-in `fetch` serialises itself (including the multipart boundary
 * header). A `form-data` package instance is not a body type the built-in
 * `fetch` understands, so it must not be used here.
 *
 * @param {string} uploadUrl - Full URL of the upload endpoint.
 * @param {string} audioPath - Local path of the audio file to upload.
 * @returns {Promise<string>} The `data.path` value returned by the server.
 * @throws {Error} If the file cannot be read, the request fails or times out,
 *   the server answers with a non-2xx status, or the response body does not
 *   look like `{ success: true, data: { path } }`.
 */
const uploadAudio = async (uploadUrl, audioPath) => {
  const fileBuffer = await fs.promises.readFile(audioPath);
  const fileName = path.basename(audioPath);

  const formData = new FormData();
  formData.append('file', new Blob([fileBuffer]), fileName);

  const response = await fetch(uploadUrl, {
    method: 'POST',
    body: formData,
    // Bound the upload so a stalled server cannot hang the tool call forever.
    signal: AbortSignal.timeout(60 * 1000),
  });

  if (!response.ok) {
    throw new Error(`上传失败: ${response.status} - ${await response.text()}`);
  }

  const data = await response.json();

  if (!data || !data.success || !data.data || !data.data.path) {
    throw new Error(`上传响应格式错误: ${JSON.stringify(data)}`);
  }

  return data.data.path;
};
93
+
94
/**
 * Build the public URL of an uploaded file from the upload endpoint URL and
 * the path returned by the upload service.
 *
 * Only the scheme and host (including port) of `uploadUrl` are kept; its own
 * path is discarded. A slash is inserted when `filePath` does not start with
 * one, so the path can never be glued directly onto the host name.
 *
 * @param {string} uploadUrl - Full URL of the upload endpoint.
 * @param {string} filePath - Server-side path of the uploaded file, e.g. `/tmp/xxx.wav`.
 * @returns {string} The URL at which the file is expected to be served.
 * @throws {TypeError} If `uploadUrl` is not a valid absolute URL.
 */
const getStaticUrl = (uploadUrl, filePath) => {
  const { protocol, host } = new URL(uploadUrl);
  const resourcePath = String(filePath);
  const separator = resourcePath.startsWith('/') ? '' : '/';
  return `${protocol}//${host}${separator}${resourcePath}`;
};
101
+
102
/**
 * Submit an asynchronous transcription task to the DashScope ASR service.
 *
 * Sends a POST to `ASR_SERVICE_URL` with the `X-DashScope-Async: enable`
 * header, requesting the `paraformer-v2` model for channel 0 with the
 * module-level `LANGUAGE_HINTS`.
 *
 * @param {string[]} fileUrls - URLs of the audio files to transcribe.
 * @param {string} apiKey - DashScope API key, sent as a Bearer token.
 * @returns {Promise<string>} The id of the created task (`output.task_id`).
 * @throws {Error} If the API key or URL list is empty, the service responds
 *   with a non-2xx status, or the response carries no task id.
 */
const submitAsrTask = async (fileUrls, apiKey) => {
  // Fail fast with an actionable message instead of sending an empty Bearer token.
  if (!apiKey) {
    throw new Error('未配置 ASR_API_KEY,无法提交 ASR 任务');
  }
  if (!Array.isArray(fileUrls) || fileUrls.length === 0) {
    throw new Error('ASR 任务提交失败: 未提供音频文件 URL');
  }

  const payload = {
    model: 'paraformer-v2',
    input: { file_urls: fileUrls },
    parameters: {
      channel_id: [0],
      language_hints: LANGUAGE_HINTS,
    },
  };

  const response = await fetch(ASR_SERVICE_URL, {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${apiKey}`,
      'Content-Type': 'application/json',
      'X-DashScope-Async': 'enable',
    },
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    const errorText = await response.text();
    throw new Error(`ASR 任务提交失败: ${response.status} - ${errorText}`);
  }

  const data = await response.json();
  const taskId = data && data.output && data.output.task_id;

  if (!taskId) {
    throw new Error(`ASR 响应格式错误: ${JSON.stringify(data)}`);
  }

  return taskId;
};
136
+
137
/**
 * Poll the DashScope task endpoint until the ASR task reaches a final state.
 *
 * @param {string} taskId - Task id returned when the task was submitted.
 * @param {string} apiKey - DashScope API key, sent as a Bearer token.
 * @param {number} [timeoutMs=300000] - Maximum total time to keep polling.
 * @param {number} [pollIntervalMs=500] - Delay between two status queries.
 * @returns {Promise<Array>} `output.results` of the succeeded task, or an
 *   empty array when the field is missing.
 * @throws {Error} If a status query returns a non-2xx response, the task ends
 *   in a non-success final state (`FAILED`, `CANCELED`, `UNKNOWN`), or the
 *   timeout elapses first.
 */
const waitForTaskComplete = async (taskId, apiKey, timeoutMs = 5 * 60 * 1000, pollIntervalMs = 500) => {
  const taskUrl = `https://dashscope.aliyuncs.com/api/v1/tasks/${encodeURIComponent(taskId)}`;
  const deadline = Date.now() + timeoutMs;
  // States after which further polling cannot produce a result.
  const terminalFailures = ['FAILED', 'CANCELED', 'UNKNOWN'];

  while (Date.now() < deadline) {
    const response = await fetch(taskUrl, {
      method: 'GET',
      headers: {
        'Authorization': `Bearer ${apiKey}`,
      },
    });

    if (!response.ok) {
      throw new Error(`查询任务状态失败: ${response.status}`);
    }

    const data = await response.json();
    const output = (data && data.output) || {};
    const status = output.task_status;

    if (status === 'SUCCEEDED') {
      return output.results || [];
    }

    if (terminalFailures.includes(status)) {
      // Surface whatever diagnostics the service provided instead of a bare failure.
      const summary = status === 'FAILED' ? 'ASR 识别任务失败' : `ASR 识别任务失败(状态: ${status})`;
      const detail = [output.code, output.message].filter(Boolean).join(' - ');
      throw new Error(detail ? `${summary}: ${detail}` : summary);
    }

    await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
  }

  throw new Error('ASR 识别超时');
};
171
+
172
/**
 * Download the transcription JSON for every succeeded subtask.
 *
 * This is best-effort: subtasks that did not succeed, have no
 * `transcription_url`, or whose download returns a non-2xx status are skipped
 * and reported on stderr, so an empty result can be diagnosed. Downloads run
 * in parallel; the returned entries keep the order of `results`.
 *
 * @param {Array<{subtask_status?: string, transcription_url?: string, file_url?: string}>} results
 *   The `results` array of a finished ASR task. Anything that is not an array
 *   is treated as empty.
 * @returns {Promise<Array<{file_url: string, transcription: object}>>}
 *   One entry per successfully downloaded transcription.
 */
const fetchTranscriptionResults = async (results) => {
  const subtasks = Array.isArray(results) ? results : [];

  const downloaded = await Promise.all(
    subtasks.map(async (result) => {
      if (!result || result.subtask_status !== 'SUCCEEDED' || !result.transcription_url) {
        console.error(`[media] 跳过未成功的子任务: ${JSON.stringify(result)}`);
        return null;
      }

      const response = await fetch(result.transcription_url);
      if (!response.ok) {
        console.error(`[media] 获取转写结果失败: ${response.status} (${result.file_url})`);
        return null;
      }

      return {
        file_url: result.file_url,
        transcription: await response.json(),
      };
    }),
  );

  return downloaded.filter((entry) => entry !== null);
};
20
193
 
21
194
  /**
22
195
  * 将 Windows 路径转换为 WSL 路径,或反之
@@ -115,175 +288,6 @@ const resolveAudioFile = (workingDir, rawPath) => {
115
288
  return resolved;
116
289
  };
117
290
 
118
- /**
119
- * 将路径转换为 WSL 路径格式
120
- */
121
- const toWslPath = (targetPath) => {
122
- if (targetPath.startsWith('/mnt/')) {
123
- return targetPath;
124
- }
125
- const winMatch = targetPath.match(/^([a-zA-Z]):\\(.*)$/);
126
- if (winMatch) {
127
- const drive = winMatch[1].toLowerCase();
128
- const rest = winMatch[2].replace(/\\/g, '/');
129
- return `/mnt/${drive}/${rest}`;
130
- }
131
- return targetPath;
132
- };
133
-
134
- /**
135
- * 检查是否运行在 WSL 环境中
136
- */
137
- const isWslEnvironment = () => {
138
- try {
139
- if (process.platform === 'linux') {
140
- const release = fs.readFileSync('/proc/version', 'utf8');
141
- return release.toLowerCase().includes('microsoft');
142
- }
143
- } catch {
144
- // ignore
145
- }
146
- return false;
147
- };
148
-
149
- /**
150
- * 获取音频时长(秒)
151
- */
152
- const getAudioDuration = (audioPath) => {
153
- return new Promise((resolve) => {
154
- const wslPath = toWslPath(audioPath);
155
- const isWsl = isWslEnvironment();
156
-
157
- let cmd, args;
158
- if (isWsl || process.platform === 'linux') {
159
- cmd = 'ffprobe';
160
- args = [
161
- '-v', 'error',
162
- '-show_entries', 'format=duration',
163
- '-of', 'default=noprint_wrappers=1:nokey=1',
164
- wslPath
165
- ];
166
- } else {
167
- cmd = 'ffprobe';
168
- args = [
169
- '-v', 'error',
170
- '-show_entries', 'format=duration',
171
- '-of', 'default=noprint_wrappers=1:nokey=1',
172
- audioPath
173
- ];
174
- }
175
-
176
- const child = spawn(cmd, args, {
177
- stdio: ['pipe', 'pipe', 'pipe'],
178
- shell: false
179
- });
180
-
181
- let stdout = '';
182
- let stderr = '';
183
-
184
- child.stdout.on('data', (data) => {
185
- stdout += data.toString();
186
- });
187
-
188
- child.stderr.on('data', (data) => {
189
- stderr += data.toString();
190
- });
191
-
192
- child.on('close', (code) => {
193
- if (code === 0 && stdout.trim()) {
194
- const duration = parseFloat(stdout.trim());
195
- if (!isNaN(duration)) {
196
- resolve(duration);
197
- return;
198
- }
199
- }
200
- // 如果获取失败,返回 -1 表示未知(不阻止执行)
201
- resolve(-1);
202
- });
203
-
204
- child.on('error', () => {
205
- resolve(-1);
206
- });
207
- });
208
- };
209
-
210
- /**
211
- * 执行 Python 脚本进行 ASR 识别
212
- */
213
- const runAsrScript = (audioPath, outputPath, uploadUrl, apiKey) => {
214
- return new Promise((resolve, reject) => {
215
- const scriptPath = path.join(SCRIPTS_DIR, 'asr_srt.py');
216
- const wslScriptPath = toWslPath(scriptPath);
217
- const wslAudioPath = toWslPath(audioPath);
218
- const wslOutputPath = toWslPath(outputPath);
219
-
220
- // 构建 python 命令字符串
221
- const pythonCommand = [
222
- 'python',
223
- `"${wslScriptPath}"`,
224
- '--audio', `"${wslAudioPath}"`,
225
- '--output', `"${wslOutputPath}"`,
226
- '--upload-url', `"${uploadUrl}"`,
227
- '--api-key', `"${apiKey}"`
228
- ].join(' ');
229
-
230
- const isWsl = isWslEnvironment();
231
-
232
- // 参考 bash.js 的调用方式,通过 bash -ic 执行命令
233
- cmd = 'bash';
234
- args = ['-ic', pythonCommand];
235
-
236
- const child = spawn(cmd, args, {
237
- stdio: ['pipe', 'pipe', 'pipe'],
238
- shell: false,
239
- env: {
240
- ...process.env,
241
- ASR_API_KEY: apiKey
242
- }
243
- });
244
-
245
- let stdout = '';
246
- let stderr = '';
247
- const maxOutput = 64 * 1024;
248
-
249
- child.stdout.on('data', (data) => {
250
- const chunk = data.toString();
251
- stdout += chunk;
252
- if (stdout.length > maxOutput) {
253
- stdout = stdout.slice(-maxOutput);
254
- }
255
- });
256
-
257
- child.stderr.on('data', (data) => {
258
- const chunk = data.toString();
259
- stderr += chunk;
260
- if (stderr.length > maxOutput) {
261
- stderr = stderr.slice(-maxOutput);
262
- }
263
- });
264
-
265
- const timeout = setTimeout(() => {
266
- child.kill('SIGKILL');
267
- reject(new Error('ASR 识别超时(超过 5 分钟)'));
268
- }, 5 * 60 * 1000);
269
-
270
- child.on('close', (code) => {
271
- clearTimeout(timeout);
272
- if (code === 0) {
273
- resolve(stdout.trim());
274
- } else {
275
- const errorMsg = stderr.trim() || stdout.trim() || `进程退出码: ${code}`;
276
- reject(new Error(errorMsg));
277
- }
278
- });
279
-
280
- child.on('error', (err) => {
281
- clearTimeout(timeout);
282
- reject(new Error(`执行脚本失败: ${err.message}`));
283
- });
284
- });
285
- };
286
-
287
291
  /**
288
292
  * ASR 语音识别
289
293
  */
@@ -296,20 +300,38 @@ const asr = async ({ working_directory, audio_file }) => {
296
300
  const workingDir = resolveWorkingDirectory(working_directory);
297
301
  const audioPath = resolveAudioFile(workingDir, audio_file);
298
302
 
299
- // 检查音频时长
300
- const duration = await getAudioDuration(audioPath);
301
- if (duration > 0 && duration > MAX_AUDIO_DURATION_SECONDS) {
303
+ // 检查文件大小
304
+ const fileSize = fs.statSync(audioPath).size;
305
+ if (fileSize > MAX_FILE_SIZE_BYTES) {
302
306
  throw new Error(
303
- `音频时长 ${Math.round(duration)} 秒超过限制(最大 ${MAX_AUDIO_DURATION_SECONDS} 秒)。` +
307
+ `音频文件大小 ${(fileSize / 1024 / 1024).toFixed(2)}MB 超过限制(最大 5MB)。` +
304
308
  `请先对音频进行分段处理后再逐段识别。`
305
309
  );
306
310
  }
307
311
 
308
- // 输出文件路径
309
- const outputPath = path.join(workingDir, 'transcribe.srt');
312
+ // 1. 上传音频文件
313
+ const filePath = await uploadAudio(ASR_UPLOAD_URL, audioPath);
314
+
315
+ // 2. 构建静态资源 URL
316
+ const audioUrl = getStaticUrl(ASR_UPLOAD_URL, filePath);
317
+
318
+ // 3. 提交 ASR 任务
319
+ const taskId = await submitAsrTask([audioUrl], ASR_API_KEY);
320
+
321
+ // 4. 等待任务完成
322
+ const results = await waitForTaskComplete(taskId, ASR_API_KEY);
310
323
 
311
- // 执行 ASR 脚本
312
- await runAsrScript(audioPath, outputPath, ASR_UPLOAD_URL, ASR_API_KEY);
324
+ // 5. 获取转写结果
325
+ const transcriptions = await fetchTranscriptionResults(results);
326
+
327
+ if (!transcriptions.length) {
328
+ throw new Error('未获取到识别结果');
329
+ }
330
+
331
+ // 6. 转换为 SRT 格式并保存
332
+ const srtContent = asrToSrt(transcriptions);
333
+ const outputPath = path.join(workingDir, 'transcribe.srt');
334
+ fs.writeFileSync(outputPath, srtContent, 'utf-8');
313
335
 
314
336
  return 'transcribe.srt';
315
337
  };
@@ -317,7 +339,7 @@ const asr = async ({ working_directory, audio_file }) => {
317
339
  const server = new Server(
318
340
  {
319
341
  name: 'media',
320
- version: '1.0.5',
342
+ version: '1.0.6',
321
343
  },
322
344
  {
323
345
  capabilities: {
@@ -330,7 +352,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => ({
330
352
  tools: [
331
353
  {
332
354
  name: 'asr',
333
- description: '语音识别工具,将音频文件转换为带时间戳的 SRT 字幕文件。支持 mp3/wav 格式,音频长度限制 1 分钟以内。',
355
+ description: '语音识别工具,将音频文件转换为带时间戳的 SRT 字幕文件。',
334
356
  inputSchema: {
335
357
  type: 'object',
336
358
  properties: {
@@ -340,7 +362,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => ({
340
362
  },
341
363
  audio_file: {
342
364
  type: 'string',
343
- description: '音频文件的相对路径(相对于工作目录),支持 mp3/wav 格式',
365
+ description: '音频文件的相对路径(相对于工作目录),支持 mp3/wav 格式,音频长度限制 1 分钟以内,文件大小不超过 5MB',
344
366
  },
345
367
  },
346
368
  required: ['working_directory', 'audio_file'],
package/package.json CHANGED
@@ -1,16 +1,16 @@
1
1
  {
2
2
  "name": "@ww_nero/media",
3
- "version": "1.0.5",
3
+ "version": "1.0.6",
4
4
  "description": "MCP server for media processing, including ASR speech recognition",
5
5
  "main": "index.js",
6
6
  "bin": {
7
7
  "media": "index.js"
8
8
  },
9
9
  "files": [
10
- "index.js",
11
- "scripts"
10
+ "index.js"
12
11
  ],
13
12
  "dependencies": {
14
- "@modelcontextprotocol/sdk": "^1.22.0"
13
+ "@modelcontextprotocol/sdk": "^1.22.0",
14
+ "form-data": "^4.0.1"
15
15
  }
16
16
  }
@@ -1,227 +0,0 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- """
4
- ASR 语音识别脚本
5
- 上传音频文件到服务器,调用阿里云 ASR 接口进行识别,生成 SRT 字幕文件
6
- """
7
-
8
- import os
9
- import sys
10
- import json
11
- import time
12
- import argparse
13
- import requests
14
- from pathlib import Path
15
- from http import HTTPStatus
16
-
17
- # 尝试导入 dashscope,如果失败则提示安装
18
- try:
19
- import dashscope
20
- from dashscope.audio.asr import Transcription
21
- except ImportError:
22
- print("错误: 请先安装 dashscope 库: pip install dashscope", file=sys.stderr)
23
- sys.exit(1)
24
-
25
-
26
- def ms_to_srt_time(ms: int) -> str:
27
- """将毫秒转换为 SRT 时间格式 HH:MM:SS,mmm"""
28
- hours = ms // 3600000
29
- minutes = (ms % 3600000) // 60000
30
- seconds = (ms % 60000) // 1000
31
- milliseconds = ms % 1000
32
- return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
33
-
34
-
35
- def asr_to_srt(asr_data: list, output_srt_path: str):
36
- """
37
- 将 ASR 识别结果转换为 SRT 字幕文件
38
-
39
- Args:
40
- asr_data: ASR 识别结果列表
41
- output_srt_path: 输出的 SRT 文件路径
42
- """
43
- srt_entries = []
44
- subtitle_index = 1
45
-
46
- for item in asr_data:
47
- transcription = item.get('transcription', {})
48
- transcripts = transcription.get('transcripts', [])
49
-
50
- for transcript in transcripts:
51
- sentences = transcript.get('sentences', [])
52
-
53
- for sentence in sentences:
54
- begin_time = sentence.get('begin_time', 0)
55
- end_time = sentence.get('end_time', 0)
56
- text = sentence.get('text', '').strip()
57
-
58
- if text:
59
- start_str = ms_to_srt_time(begin_time)
60
- end_str = ms_to_srt_time(end_time)
61
-
62
- srt_entry = f"{subtitle_index}\n{start_str} --> {end_str}\n{text}\n"
63
- srt_entries.append(srt_entry)
64
- subtitle_index += 1
65
-
66
- with open(output_srt_path, 'w', encoding='utf-8') as f:
67
- f.write('\n'.join(srt_entries))
68
-
69
- return subtitle_index - 1
70
-
71
-
72
- def upload_audio(upload_url: str, audio_path: str) -> str:
73
- """
74
- 上传音频文件到服务器
75
-
76
- Args:
77
- upload_url: 上传接口的完整 URL
78
- audio_path: 本地音频文件路径
79
-
80
- Returns:
81
- 上传后的文件名
82
- """
83
- audio_path = Path(audio_path)
84
- if not audio_path.exists():
85
- raise FileNotFoundError(f"音频文件不存在: {audio_path}")
86
-
87
- with open(audio_path, 'rb') as f:
88
- files = {'file': (audio_path.name, f)}
89
- response = requests.post(upload_url, files=files, timeout=60)
90
-
91
- if response.status_code != 200:
92
- raise Exception(f"上传失败: {response.status_code} - {response.text}")
93
-
94
- data = response.json()
95
-
96
- # 响应格式: {'success': True, 'data': {'path': '/tmp/xxx.wav'}}
97
- if not data.get('success') or 'data' not in data or 'path' not in data['data']:
98
- raise Exception(f"上传响应格式错误: {data}")
99
-
100
- # 返回完整的相对路径,用于拼接到服务器地址后面
101
- return data['data']['path']
102
-
103
-
104
- def get_static_url(upload_url: str, file_path: str) -> str:
105
- """
106
- 根据上传接口 URL 和文件路径构建静态资源 URL
107
-
108
- Args:
109
- upload_url: 上传接口的完整 URL (如 http://server.domain.com/upload)
110
- file_path: 上传后返回的文件路径 (如 /tmp/xxx.wav)
111
-
112
- Returns:
113
- 静态资源的完整 URL
114
- """
115
- # 从上传 URL 中提取基础 URL
116
- # 例如: http://server.domain.com/upload -> http://server.domain.com
117
- from urllib.parse import urlparse, urlunparse
118
- parsed = urlparse(upload_url)
119
- base_url = urlunparse((parsed.scheme, parsed.netloc, '', '', '', ''))
120
- # file_path 已经是以 / 开头的完整相对路径,直接拼接
121
- return f"{base_url}{file_path}"
122
-
123
-
124
- def transcribe_audio(audio_url: str, api_key: str) -> list:
125
- """
126
- 调用阿里云 ASR 接口进行语音识别
127
-
128
- Args:
129
- audio_url: 音频文件的公网 URL
130
- api_key: 阿里云 ASR API Key
131
-
132
- Returns:
133
- 识别结果列表
134
- """
135
- dashscope.api_key = api_key
136
-
137
- # 发起异步识别请求
138
- transcribe_response = Transcription.async_call(
139
- model='paraformer-v2',
140
- file_urls=[audio_url],
141
- language_hints=['zh', 'en', 'ja']
142
- )
143
-
144
- if not transcribe_response or not hasattr(transcribe_response, 'output'):
145
- raise Exception("ASR 请求失败: 无效的响应")
146
-
147
- # 轮询等待识别完成
148
- while True:
149
- task_status = transcribe_response.output.task_status
150
- if task_status == 'SUCCEEDED' or task_status == 'FAILED':
151
- break
152
- time.sleep(1)
153
- transcribe_response = Transcription.fetch(task=transcribe_response.output.task_id)
154
-
155
- if transcribe_response.status_code != HTTPStatus.OK:
156
- raise Exception(f"ASR 识别失败: {transcribe_response.status_code}")
157
-
158
- if transcribe_response.output.task_status == 'FAILED':
159
- raise Exception("ASR 识别任务失败")
160
-
161
- # 获取识别结果
162
- results = transcribe_response.output.get('results', [])
163
- all_transcriptions = []
164
-
165
- for result in results:
166
- if result.get('subtask_status') == 'SUCCEEDED':
167
- transcription_url = result.get('transcription_url')
168
- if transcription_url:
169
- resp = requests.get(transcription_url, timeout=30)
170
- if resp.status_code == 200:
171
- transcription_data = resp.json()
172
- all_transcriptions.append({
173
- 'file_url': result.get('file_url'),
174
- 'transcription': transcription_data
175
- })
176
-
177
- return all_transcriptions
178
-
179
-
180
- def main():
181
- parser = argparse.ArgumentParser(description='ASR 语音识别并生成 SRT 字幕')
182
- parser.add_argument('--audio', required=True, help='音频文件路径')
183
- parser.add_argument('--output', required=True, help='输出 SRT 文件路径')
184
- parser.add_argument('--upload-url', required=True, help='上传接口的完整 URL')
185
- parser.add_argument('--api-key', help='ASR API Key (也可通过 ASR_API_KEY 环境变量设置)')
186
-
187
- args = parser.parse_args()
188
-
189
- # 获取 API Key
190
- api_key = args.api_key or os.environ.get('ASR_API_KEY')
191
- if not api_key:
192
- print("错误: 请通过 --api-key 参数或 ASR_API_KEY 环境变量提供 API Key", file=sys.stderr)
193
- sys.exit(1)
194
-
195
- try:
196
- # 1. 上传音频文件
197
- print(f"正在上传音频文件: {args.audio}")
198
- file_path = upload_audio(args.upload_url, args.audio)
199
- print(f"上传成功: {file_path}")
200
-
201
- # 2. 构建静态资源 URL
202
- audio_url = get_static_url(args.upload_url, file_path)
203
- print(f"音频 URL: {audio_url}")
204
-
205
- # 3. 调用 ASR 识别
206
- print("正在进行语音识别...")
207
- transcriptions = transcribe_audio(audio_url, api_key)
208
-
209
- if not transcriptions:
210
- print("警告: 未获取到识别结果", file=sys.stderr)
211
- sys.exit(1)
212
-
213
- # 4. 生成 SRT 文件
214
- subtitle_count = asr_to_srt(transcriptions, args.output)
215
- print(f"SRT 字幕文件已生成: {args.output}")
216
- print(f"共 {subtitle_count} 条字幕")
217
-
218
- except FileNotFoundError as e:
219
- print(f"错误: {e}", file=sys.stderr)
220
- sys.exit(1)
221
- except Exception as e:
222
- print(f"错误: {e}", file=sys.stderr)
223
- sys.exit(1)
224
-
225
-
226
- if __name__ == '__main__':
227
- main()