@ww_nero/media 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +77 -0
- package/index.js +392 -0
- package/package.json +16 -0
- package/scripts/asr_srt.py +231 -0
package/README.md
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# Media MCP Server
|
|
2
|
+
|
|
3
|
+
媒体处理 MCP 服务,提供语音识别等功能。
|
|
4
|
+
|
|
5
|
+
## 功能
|
|
6
|
+
|
|
7
|
+
### asr - 语音识别
|
|
8
|
+
|
|
9
|
+
将音频文件转换为带时间戳的 SRT 字幕文件。
|
|
10
|
+
|
|
11
|
+
**参数:**
|
|
12
|
+
- `working_directory`: 工作目录的绝对路径,识别结果将保存到此目录
|
|
13
|
+
- `audio_file`: 音频文件的相对路径(相对于工作目录),支持 mp3/wav 格式
|
|
14
|
+
|
|
15
|
+
**限制:**
|
|
16
|
+
- 音频时长不得超过 1 分钟;超出时需先在外部将音频分段,再逐段识别(若 ffprobe 无法检测到时长,则跳过该检查)
|
|
17
|
+
|
|
18
|
+
**输出:**
|
|
19
|
+
- 识别结果保存到工作目录下的 `transcribe.srt` 文件
|
|
20
|
+
|
|
21
|
+
## 环境变量
|
|
22
|
+
|
|
23
|
+
| 变量名 | 说明 | 必填 |
|
|
24
|
+
|--------|------|------|
|
|
25
|
+
| `ASR_API_KEY` | 阿里云 ASR API Key | 是 |
|
|
26
|
+
| `ASR_UPLOAD_URL` | 上传接口的完整 URL,如 `http://server.domain.com/upload` | 是 |
|
|
27
|
+
|
|
28
|
+
## 依赖
|
|
29
|
+
|
|
30
|
+
### Python 依赖
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
pip install dashscope requests
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
### 系统依赖
|
|
37
|
+
|
|
38
|
+
- `ffprobe` (用于检测音频时长,来自 ffmpeg)
|
|
39
|
+
- `python3`
|
|
40
|
+
|
|
41
|
+
## 配置示例
|
|
42
|
+
|
|
43
|
+
```json
|
|
44
|
+
{
|
|
45
|
+
"mcpServers": {
|
|
46
|
+
"media": {
|
|
47
|
+
"command": "node",
|
|
48
|
+
"args": ["/path/to/media/index.js"],
|
|
49
|
+
"env": {
|
|
50
|
+
"ASR_API_KEY": "your-api-key",
|
|
51
|
+
"ASR_UPLOAD_URL": "http://server.domain.com/upload"
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## 上传接口要求
|
|
59
|
+
|
|
60
|
+
上传接口需符合以下规范(参考 demo_server 实现):
|
|
61
|
+
|
|
62
|
+
**请求:**
|
|
63
|
+
- 方法: `POST`
|
|
64
|
+
- Content-Type: `multipart/form-data`
|
|
65
|
+
- 字段名: `file`
|
|
66
|
+
|
|
67
|
+
**响应(成功):**
|
|
68
|
+
```json
|
|
69
|
+
{
|
|
70
|
+
"message": "文件上传成功",
|
|
71
|
+
"fileName": "abc123.mp3"
|
|
72
|
+
}
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
上传的文件可通过 `{base_url}/{fileName}` 访问,例如:
|
|
76
|
+
- 上传接口: `http://server.domain.com/upload`
|
|
77
|
+
- 文件访问: `http://server.domain.com/abc123.mp3`
|
package/index.js
ADDED
|
@@ -0,0 +1,392 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
const fs = require('fs');
|
|
4
|
+
const path = require('path');
|
|
5
|
+
const { spawn } = require('child_process');
|
|
6
|
+
const { Server } = require('@modelcontextprotocol/sdk/server/index.js');
|
|
7
|
+
const { StdioServerTransport } = require('@modelcontextprotocol/sdk/server/stdio.js');
|
|
8
|
+
const {
|
|
9
|
+
CallToolRequestSchema,
|
|
10
|
+
ListToolsRequestSchema,
|
|
11
|
+
} = require('@modelcontextprotocol/sdk/types.js');
|
|
12
|
+
|
|
13
|
+
// Credentials and endpoint are taken from the environment; an unset variable
// becomes an empty string so later checks can use simple truthiness tests.
const ASR_API_KEY = process.env.ASR_API_KEY ?? '';
const ASR_UPLOAD_URL = process.env.ASR_UPLOAD_URL ?? '';

// Lower-case file extensions (including the dot) accepted as audio input.
const SUPPORTED_AUDIO_TYPES = ['.mp3', '.wav'];
// Longest audio clip, in seconds, that the asr tool accepts.
const MAX_AUDIO_DURATION_SECONDS = 60;

// Folder that holds the helper scripts shipped next to this file.
const SCRIPTS_DIR = path.join(__dirname, 'scripts');
|
|
20
|
+
|
|
21
|
+
/**
 * Translate a path between its WSL form (`/mnt/<drive>/...`) and its Windows
 * drive form (`<Drive>:\...`), in whichever direction applies.
 *
 * @param {string} filePath - Path to translate.
 * @returns {string|null} The translated path, or `null` when `filePath` is
 *   neither a WSL drive-mount path nor a Windows drive path.
 */
const convertPath = (filePath) => {
  // WSL -> Windows. The tail is optional so a bare drive root ("/mnt/c") is
  // translated too; "/mnt/cdrom/..." still does not match (single letter only).
  const wslMatch = filePath.match(/^\/mnt\/([a-zA-Z])(?:\/(.*))?$/);
  if (wslMatch) {
    const drive = wslMatch[1].toUpperCase();
    const rest = (wslMatch[2] ?? '').replace(/\//g, '\\');
    return `${drive}:\\${rest}`;
  }

  // Windows -> WSL. Windows paths may use either "\" or "/" as separator,
  // so both are accepted right after the drive letter.
  const winMatch = filePath.match(/^([a-zA-Z]):[\\/](.*)$/);
  if (winMatch) {
    const drive = winMatch[1].toLowerCase();
    const rest = winMatch[2].replace(/\\/g, '/');
    return `/mnt/${drive}/${rest}`;
  }

  return null;
};
|
|
41
|
+
|
|
42
|
+
/**
 * Find a spelling of `inputPath` that exists on this machine.
 *
 * The path is tried as given first; only when that fails is its WSL/Windows
 * counterpart (from `convertPath`) tried.
 *
 * @param {string} inputPath - Path to look up.
 * @returns {string|null} The first existing spelling, or `null` if none exists.
 */
const resolveExistingPath = (inputPath) => {
  if (!fs.existsSync(inputPath)) {
    const alternate = convertPath(inputPath);
    const alternateExists = Boolean(alternate) && fs.existsSync(alternate);
    return alternateExists ? alternate : null;
  }

  return inputPath;
};
|
|
57
|
+
|
|
58
|
+
/**
 * Validate the `working_directory` argument and resolve it to an existing folder.
 *
 * @param {string} rawPath - Absolute directory path, in POSIX or Windows drive form.
 * @returns {string} The existing directory path as returned by `resolveExistingPath`.
 * @throws {Error} When the value is missing, is not absolute, does not exist,
 *   or does not refer to a directory.
 */
const resolveWorkingDirectory = (rawPath) => {
  if (!rawPath || typeof rawPath !== 'string') {
    throw new Error('必须提供 working_directory');
  }

  // `path.isAbsolute` follows the host platform: on Linux/WSL it rejects
  // "C:\..." outright, which would make the Windows -> WSL translation done by
  // resolveExistingPath unreachable. Accept drive-letter paths explicitly.
  const isWindowsDrivePath = /^[a-zA-Z]:[\\/]/.test(rawPath);
  if (!path.isAbsolute(rawPath) && !isWindowsDrivePath) {
    throw new Error('working_directory 必须是绝对路径');
  }

  const resolved = resolveExistingPath(rawPath);
  if (!resolved) {
    throw new Error('工作目录不存在,请确认路径是否正确');
  }

  if (!fs.statSync(resolved).isDirectory()) {
    throw new Error('working_directory 必须是文件夹路径');
  }

  return resolved;
};
|
|
82
|
+
|
|
83
|
+
/**
 * Validate the `audio_file` argument and resolve it to an existing audio file.
 *
 * A relative value is joined onto `workingDir`; an absolute value (POSIX or
 * Windows drive form) is used as given.
 *
 * @param {string} workingDir - Already-resolved working directory.
 * @param {string} rawPath - Audio file path supplied by the caller.
 * @returns {string} The existing file path as returned by `resolveExistingPath`.
 * @throws {Error} When the value is empty, the file does not exist, the path is
 *   not a regular file, or its extension is not in `SUPPORTED_AUDIO_TYPES`.
 */
const resolveAudioFile = (workingDir, rawPath) => {
  const trimmed = typeof rawPath === 'string' ? rawPath.trim() : '';
  if (!trimmed) {
    throw new Error('音频文件路径不能为空');
  }

  // `path.isAbsolute` follows the host platform, so on Linux/WSL "C:\..." would
  // be treated as relative and glued onto workingDir. Detect drive-letter
  // paths explicitly so resolveExistingPath can translate them instead.
  const isWindowsDrivePath = /^[a-zA-Z]:[\\/]/.test(trimmed);
  const absolutePath = path.isAbsolute(trimmed) || isWindowsDrivePath
    ? trimmed
    : path.join(workingDir, trimmed);

  const resolved = resolveExistingPath(absolutePath);
  if (!resolved) {
    throw new Error(`音频文件不存在: ${trimmed}`);
  }

  if (!fs.statSync(resolved).isFile()) {
    throw new Error(`音频路径不是文件: ${trimmed}`);
  }

  // Extension matching is case-insensitive ("A.MP3" is accepted).
  const ext = path.extname(resolved).toLowerCase();
  if (!SUPPORTED_AUDIO_TYPES.includes(ext)) {
    throw new Error(`不支持的音频格式: ${ext || '未知'},仅支持 ${SUPPORTED_AUDIO_TYPES.join(', ')}`);
  }

  return resolved;
};
|
|
117
|
+
|
|
118
|
+
/**
 * Express a path in WSL form.
 *
 * Paths already under `/mnt/` and paths that are not Windows drive paths are
 * returned unchanged; `<Drive>:\...` and `<Drive>:/...` become `/mnt/<drive>/...`.
 *
 * @param {string} targetPath - Path to normalise.
 * @returns {string} The WSL-style path, or `targetPath` itself when no
 *   translation applies.
 */
const toWslPath = (targetPath) => {
  if (targetPath.startsWith('/mnt/')) {
    return targetPath;
  }

  // Accept both separators after the drive letter: a path such as
  // "C:/dir/a.mp3" is valid on Windows and must not reach WSL untranslated.
  const winMatch = targetPath.match(/^([a-zA-Z]):[\\/](.*)$/);
  if (!winMatch) {
    return targetPath;
  }

  const [, drive, rest] = winMatch;
  return `/mnt/${drive.toLowerCase()}/${rest.replace(/\\/g, '/')}`;
};
|
|
133
|
+
|
|
134
|
+
/**
 * Report whether this process runs inside WSL.
 *
 * The check is: the platform is Linux and `/proc/version` mentions
 * "microsoft" (case-insensitive). Any read failure counts as "not WSL".
 *
 * @returns {boolean} `true` only when both conditions hold.
 */
const isWslEnvironment = () => {
  if (process.platform !== 'linux') {
    return false;
  }

  try {
    const kernelBanner = fs.readFileSync('/proc/version', 'utf8');
    return kernelBanner.toLowerCase().includes('microsoft');
  } catch {
    // Unreadable /proc/version: treat as a non-WSL system.
    return false;
  }
};
|
|
148
|
+
|
|
149
|
+
/**
 * Measure the length of an audio file with `ffprobe`.
 *
 * The probe is best-effort: the returned promise never rejects. When ffprobe
 * cannot be started, exits with a non-zero code, or prints something that is
 * not a number, the result is `-1` so the caller can carry on without a
 * duration check.
 *
 * @param {string} audioPath - Path of the audio file to inspect.
 * @returns {Promise<number>} Duration in seconds, or `-1` when unknown.
 */
const getAudioDuration = (audioPath) =>
  new Promise((resolve) => {
    // On Linux (WSL included) the file is addressed by its WSL-style path;
    // elsewhere the path is passed through untouched.
    const onLinux = isWslEnvironment() || process.platform === 'linux';
    const probeTarget = onLinux ? toWslPath(audioPath) : audioPath;

    const child = spawn(
      'ffprobe',
      [
        '-v', 'error',
        '-show_entries', 'format=duration',
        '-of', 'default=noprint_wrappers=1:nokey=1',
        probeTarget,
      ],
      // Only stdout is consumed, so stdin and stderr are not piped: nothing is
      // left open or buffered without a reader.
      { stdio: ['ignore', 'pipe', 'ignore'], shell: false },
    );

    let stdout = '';
    child.stdout.on('data', (data) => {
      stdout += data.toString();
    });

    child.on('close', (code) => {
      const duration = code === 0 ? Number.parseFloat(stdout.trim()) : Number.NaN;
      resolve(Number.isNaN(duration) ? -1 : duration);
    });

    // Raised e.g. when the ffprobe binary is missing; settling twice is
    // harmless because a promise ignores every resolve after the first.
    child.on('error', () => {
      resolve(-1);
    });
  });
|
|
209
|
+
|
|
210
|
+
/**
 * Run `scripts/asr_srt.py` to transcribe one audio file into an SRT file.
 *
 * On Windows the script is started inside WSL (`wsl -e python3 ...`); on every
 * other platform `python3` is started directly. All file paths are handed over
 * in WSL form via `toWslPath`, which leaves ordinary POSIX paths untouched.
 *
 * @param {string} audioPath - Audio file to transcribe.
 * @param {string} outputPath - Where the SRT file should be written.
 * @param {string} uploadUrl - Full URL of the upload endpoint.
 * @param {string} apiKey - ASR API key.
 * @returns {Promise<string>} Trimmed stdout of the script (last 64 KiB at most).
 * @throws {Error} (as a rejection) When the script cannot be started, exits with
 *   a non-zero code, or runs longer than five minutes.
 */
const runAsrScript = (audioPath, outputPath, uploadUrl, apiKey) =>
  new Promise((resolve, reject) => {
    const scriptPath = path.join(SCRIPTS_DIR, 'asr_srt.py');
    const scriptArgs = [
      toWslPath(scriptPath),
      '--audio', toWslPath(audioPath),
      '--output', toWslPath(outputPath),
      '--upload-url', uploadUrl,
    ];

    let cmd;
    let args;
    if (process.platform === 'win32') {
      // The environment set below belongs to wsl.exe; it is not assumed to
      // reach the Linux-side python3, so here the key still travels as an
      // argument. NOTE(review): forwarding it through WSLENV would avoid that.
      cmd = 'wsl';
      args = ['-e', 'python3', ...scriptArgs, '--api-key', apiKey];
    } else {
      // Keep the secret out of argv, where other local users could read it
      // from the process list. The script falls back to the ASR_API_KEY
      // environment variable when --api-key is absent.
      cmd = 'python3';
      args = scriptArgs;
    }

    const child = spawn(cmd, args, {
      stdio: ['pipe', 'pipe', 'pipe'],
      shell: false,
      env: {
        ...process.env,
        ASR_API_KEY: apiKey,
      },
    });

    // Keep only the tail of each stream so a chatty script cannot grow memory
    // without bound.
    const maxOutput = 64 * 1024;
    const keepTail = (text) => (text.length > maxOutput ? text.slice(-maxOutput) : text);
    let stdout = '';
    let stderr = '';

    child.stdout.on('data', (data) => {
      stdout = keepTail(stdout + data.toString());
    });

    child.stderr.on('data', (data) => {
      stderr = keepTail(stderr + data.toString());
    });

    // The timeout, 'close' and 'error' can all fire for one child; only the
    // first of them decides the outcome.
    let settled = false;
    const settle = (finish, value) => {
      if (settled) {
        return;
      }
      settled = true;
      clearTimeout(timeout);
      finish(value);
    };

    const timeout = setTimeout(() => {
      child.kill('SIGKILL');
      settle(reject, new Error('ASR 识别超时(超过 5 分钟)'));
    }, 5 * 60 * 1000);

    child.on('close', (code) => {
      if (code === 0) {
        settle(resolve, stdout.trim());
        return;
      }
      const errorMsg = stderr.trim() || stdout.trim() || `进程退出码: ${code}`;
      settle(reject, new Error(errorMsg));
    });

    child.on('error', (err) => {
      settle(reject, new Error(`执行脚本失败: ${err.message}`));
    });
  });
|
|
295
|
+
|
|
296
|
+
/**
 * Transcribe one audio file into `transcribe.srt` inside the working directory.
 *
 * Steps: check configuration, validate both paths, refuse audio longer than
 * `MAX_AUDIO_DURATION_SECONDS` (skipped when the duration is unknown, i.e. not
 * positive), then run the ASR script.
 *
 * @param {{working_directory: string, audio_file: string}} params - Tool arguments.
 * @returns {Promise<string>} The name of the generated file, `transcribe.srt`.
 * @throws {Error} When configuration is missing, a path is invalid, the audio is
 *   too long, or the ASR script fails.
 */
const asr = async ({ working_directory, audio_file }) => {
  // Both settings are mandatory; checked in this order before any file access.
  const requiredSettings = [
    [ASR_API_KEY, '请配置 ASR_API_KEY 环境变量'],
    [ASR_UPLOAD_URL, '请配置 ASR_UPLOAD_URL 环境变量(完整的上传接口 URL,如 http://server.domain.com/upload)'],
  ];
  for (const [value, complaint] of requiredSettings) {
    if (!value) {
      throw new Error(complaint);
    }
  }

  const workingDir = resolveWorkingDirectory(working_directory);
  const audioPath = resolveAudioFile(workingDir, audio_file);

  const seconds = await getAudioDuration(audioPath);
  const exceedsLimit = seconds > 0 && seconds > MAX_AUDIO_DURATION_SECONDS;
  if (exceedsLimit) {
    const shown = Math.round(seconds);
    throw new Error(
      `音频时长 ${shown} 秒超过限制(最大 ${MAX_AUDIO_DURATION_SECONDS} 秒)。请先对音频进行分段处理后再逐段识别。`,
    );
  }

  const resultName = 'transcribe.srt';
  await runAsrScript(audioPath, path.join(workingDir, resultName), ASR_UPLOAD_URL, ASR_API_KEY);

  return resultName;
};
|
|
328
|
+
|
|
329
|
+
// The MCP server instance; it advertises the "tools" capability only.
const server = new Server(
  { name: 'media', version: '1.0.0' },
  { capabilities: { tools: {} } }
);

/**
 * Build the descriptor of the `asr` tool (name, description, input schema).
 * A fresh object is created on every call.
 */
const describeAsrTool = () => {
  const properties = {
    working_directory: {
      type: 'string',
      description: '工作目录的绝对路径,识别结果将保存到此目录下的 transcribe.srt 文件',
    },
    audio_file: {
      type: 'string',
      description: '音频文件的相对路径(相对于工作目录),支持 mp3/wav 格式',
    },
  };

  return {
    name: 'asr',
    description: '语音识别工具,将音频文件转换为带时间戳的 SRT 字幕文件。支持 mp3/wav 格式,音频长度限制 1 分钟以内。',
    inputSchema: {
      type: 'object',
      properties,
      required: ['working_directory', 'audio_file'],
    },
  };
};

// Tool listing: this server exposes exactly one tool.
server.setRequestHandler(ListToolsRequestSchema, async () => {
  return { tools: [describeAsrTool()] };
});
|
|
363
|
+
|
|
364
|
+
// Tool invocation. Every failure is reported to the client as a text result
// flagged with `isError` rather than thrown out of the handler.
server.setRequestHandler(CallToolRequestSchema, async (request) => {
  const { name, arguments: args } = request.params;

  try {
    if (name !== 'asr') {
      return {
        content: [{ type: 'text', text: `未知工具: ${name}` }],
        isError: true,
      };
    }

    // A request may carry no `arguments` at all; default to an empty object so
    // the caller gets the validation message below instead of a raw
    // "Cannot destructure property ..." TypeError text.
    const { working_directory, audio_file } = args ?? {};
    if (!working_directory || !audio_file) {
      throw new Error('必须同时提供 working_directory 和 audio_file 参数');
    }

    const filename = await asr({ working_directory, audio_file });
    return { content: [{ type: 'text', text: `语音识别完成,字幕文件已保存到工作目录下:${filename}` }] };
  } catch (error) {
    const message = error?.message || '未知错误';
    return { content: [{ type: 'text', text: message }], isError: true };
  }
});
|
|
386
|
+
|
|
387
|
+
/**
 * Connect the MCP server to a stdio transport.
 *
 * @returns {Promise<void>} Settles once `server.connect` has completed.
 */
const main = async () => {
  const transport = new StdioServerTransport();
  await server.connect(transport);
};

main().catch((error) => {
  console.error(error);
  // Report the startup failure through the exit status as well; logging alone
  // would let the process finish with status 0.
  process.exitCode = 1;
});
|
package/package.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@ww_nero/media",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "MCP server for media processing, including ASR speech recognition",
|
|
5
|
+
"main": "index.js",
|
|
6
|
+
"bin": {
|
|
7
|
+
"media": "index.js"
|
|
8
|
+
},
|
|
9
|
+
"files": [
|
|
10
|
+
"index.js",
|
|
11
|
+
"scripts"
|
|
12
|
+
],
|
|
13
|
+
"dependencies": {
|
|
14
|
+
"@modelcontextprotocol/sdk": "^1.22.0"
|
|
15
|
+
}
|
|
16
|
+
}
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
ASR 语音识别脚本
|
|
5
|
+
上传音频文件到服务器,调用阿里云 ASR 接口进行识别,生成 SRT 字幕文件
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
import sys
|
|
10
|
+
import json
|
|
11
|
+
import time
|
|
12
|
+
import argparse
|
|
13
|
+
import requests
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from http import HTTPStatus
|
|
16
|
+
|
|
17
|
+
# 尝试导入 dashscope,如果失败则提示安装
|
|
18
|
+
try:
|
|
19
|
+
import dashscope
|
|
20
|
+
from dashscope.audio.asr import Transcription
|
|
21
|
+
except ImportError:
|
|
22
|
+
print("错误: 请先安装 dashscope 库: pip install dashscope", file=sys.stderr)
|
|
23
|
+
sys.exit(1)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def ms_to_srt_time(ms: int) -> str:
    """Format a millisecond offset as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        ms: Offset in milliseconds. Floats are rounded to the nearest
            millisecond; negative values are clamped to zero, since SRT has no
            notation for negative times.

    Returns:
        The timestamp string. The hour field is at least two digits wide and
        grows as needed for offsets of 100 hours or more.
    """
    # Coerce first: the ":02d" format below rejects floats.
    total_ms = max(0, int(round(ms)))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def asr_to_srt(asr_data: list, output_srt_path: str):
    """Write recognised sentences to an SRT subtitle file.

    Every item of ``asr_data`` is read as
    ``item['transcription']['transcripts'][*]['sentences'][*]``; each sentence
    contributes ``begin_time``, ``end_time`` (formatted with
    ``ms_to_srt_time``) and ``text``. Missing keys fall back to empty
    containers, ``0`` or ``''``. Sentences whose text is blank after stripping
    are skipped and do not consume a subtitle number.

    Args:
        asr_data: List of recognition results.
        output_srt_path: Destination file, written as UTF-8 (overwritten).

    Returns:
        Number of subtitles written.
    """
    cues = []

    for item in asr_data:
        for transcript in item.get('transcription', {}).get('transcripts', []):
            for sentence in transcript.get('sentences', []):
                caption = sentence.get('text', '').strip()
                if not caption:
                    continue

                starts_at = ms_to_srt_time(sentence.get('begin_time', 0))
                ends_at = ms_to_srt_time(sentence.get('end_time', 0))
                # Numbering is 1-based and follows the cues kept so far.
                cues.append(f"{len(cues) + 1}\n{starts_at} --> {ends_at}\n{caption}\n")

    # Each cue ends with a newline, so joining with "\n" leaves the blank
    # separator line between consecutive cues.
    with open(output_srt_path, 'w', encoding='utf-8') as srt_file:
        srt_file.write('\n'.join(cues))

    return len(cues)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def upload_audio(upload_url: str, audio_path: str) -> str:
    """Upload an audio file and return the name the server stored it under.

    The file is sent as a multipart POST in the ``file`` field with a
    60-second timeout. A 200 response must carry a JSON object with
    ``fileName``. A 409 response is treated as "already uploaded" and is
    accepted when it also carries ``fileName``.

    Args:
        upload_url: Full URL of the upload endpoint.
        audio_path: Path of the local audio file.

    Returns:
        The ``fileName`` value from the server response.

    Raises:
        FileNotFoundError: ``audio_path`` does not exist.
        Exception: The server answered with an unexpected status code, or the
            response body does not contain ``fileName``.
    """
    audio_path = Path(audio_path)
    if not audio_path.exists():
        raise FileNotFoundError(f"音频文件不存在: {audio_path}")

    with open(audio_path, 'rb') as f:
        files = {'file': (audio_path.name, f)}
        response = requests.post(upload_url, files=files, timeout=60)

    # An error page or proxy response may not be JSON, or not a JSON object.
    # Parse leniently so the status-specific messages below are what the user
    # sees, not a decode/type error.
    try:
        data = response.json()
    except ValueError:
        data = None
    fields = data if isinstance(data, dict) else {}

    if response.status_code == 409:
        # The file already exists on the server; reuse its name when reported.
        if 'fileName' in fields:
            return fields['fileName']
        raise Exception(f"文件已存在: {fields.get('message', '未知错误')}")

    if response.status_code != 200:
        raise Exception(f"上传失败: {response.status_code} - {response.text}")

    if 'fileName' not in fields:
        detail = data if data is not None else response.text
        raise Exception(f"上传响应格式错误: {detail}")

    return fields['fileName']
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def get_static_url(upload_url: str, filename: str) -> str:
    """Build the download URL of an uploaded file.

    Only the scheme and host part of ``upload_url`` are kept; its path, query
    and fragment are dropped and ``filename`` is placed directly under the
    root, e.g. ``http://server.domain.com/upload`` + ``abc123.mp3`` ->
    ``http://server.domain.com/abc123.mp3``.

    Args:
        upload_url: Full URL of the upload endpoint.
        filename: Name of the file as returned by the upload.

    Returns:
        The full URL of the static resource.
    """
    from urllib.parse import urlparse

    # Reduce the URL to "<scheme>://<netloc>/" by blanking every other part.
    site_root = urlparse(upload_url)._replace(
        path='/', params='', query='', fragment=''
    ).geturl()
    trimmed_root = site_root.rstrip('/')
    return f"{trimmed_root}/{filename}"
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def transcribe_audio(audio_url: str, api_key: str, poll_interval: float = 1.0,
                     timeout: "float | None" = None) -> list:
    """Transcribe one audio URL with the DashScope ``paraformer-v2`` model.

    Submits an asynchronous transcription task, polls it until it reaches a
    terminal state, then downloads the transcription JSON of every sub-task
    that succeeded.

    Args:
        audio_url: URL of the audio file; it is handed to the ASR service,
            which presumably has to be able to fetch it.
        api_key: DashScope API key (assigned to ``dashscope.api_key``).
        poll_interval: Seconds to sleep between two status polls.
        timeout: Optional upper bound in seconds for the polling phase;
            ``None`` (default) polls without a deadline.

    Returns:
        A list of ``{'file_url': ..., 'transcription': ...}`` dicts, one per
        successful sub-task whose result could be downloaded. Failed sub-tasks
        and failed downloads are skipped.

    Raises:
        Exception: The request was rejected, a poll returned a non-OK status,
            the task ended in any state other than ``SUCCEEDED``, or the
            deadline passed.
    """
    dashscope.api_key = api_key

    # Submit the asynchronous recognition request.
    transcribe_response = Transcription.async_call(
        model='paraformer-v2',
        file_urls=[audio_url],
        language_hints=['zh', 'en', 'ja']
    )

    if not transcribe_response or not hasattr(transcribe_response, 'output'):
        raise Exception("ASR 请求失败: 无效的响应")

    # NOTE(review): terminal states as understood from the DashScope task
    # model; any other status (e.g. PENDING, RUNNING) keeps the poll going.
    # Confirm the list against the SDK before relying on it.
    terminal_states = ('SUCCEEDED', 'FAILED', 'CANCELED', 'UNKNOWN')
    deadline = None if timeout is None else time.monotonic() + timeout

    # Poll until the task finishes.
    while True:
        # Check the status code before touching `output`: a failed call is not
        # guaranteed to carry one.
        if transcribe_response.status_code != HTTPStatus.OK:
            raise Exception(f"ASR 识别失败: {transcribe_response.status_code}")

        task_status = transcribe_response.output.task_status
        if task_status in terminal_states:
            break
        if deadline is not None and time.monotonic() >= deadline:
            raise Exception(f"ASR 识别超时: 任务在 {timeout} 秒内未完成")

        time.sleep(poll_interval)
        transcribe_response = Transcription.fetch(task=transcribe_response.output.task_id)

    if task_status == 'FAILED':
        raise Exception("ASR 识别任务失败")
    if task_status != 'SUCCEEDED':
        raise Exception(f"ASR 识别任务未成功结束: {task_status}")

    # Collect the transcription of every successful sub-task.
    all_transcriptions = []
    for result in transcribe_response.output.get('results', []):
        if result.get('subtask_status') != 'SUCCEEDED':
            continue
        transcription_url = result.get('transcription_url')
        if not transcription_url:
            continue
        resp = requests.get(transcription_url, timeout=30)
        if resp.status_code == 200:
            all_transcriptions.append({
                'file_url': result.get('file_url'),
                'transcription': resp.json()
            })

    return all_transcriptions
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def main():
    """Command-line entry point: upload, transcribe, write the SRT file.

    Reads ``--audio``, ``--output``, ``--upload-url`` and the optional
    ``--api-key`` (falling back to the ``ASR_API_KEY`` environment variable).
    Progress goes to stdout, problems go to stderr, and any failure ends the
    process with exit status 1.
    """
    parser = argparse.ArgumentParser(description='ASR 语音识别并生成 SRT 字幕')
    cli_options = (
        ('--audio', {'required': True, 'help': '音频文件路径'}),
        ('--output', {'required': True, 'help': '输出 SRT 文件路径'}),
        ('--upload-url', {'required': True, 'help': '上传接口的完整 URL'}),
        ('--api-key', {'help': 'ASR API Key (也可通过 ASR_API_KEY 环境变量设置)'}),
    )
    for flag, settings in cli_options:
        parser.add_argument(flag, **settings)

    args = parser.parse_args()

    # The command-line value wins over the environment variable.
    api_key = args.api_key or os.environ.get('ASR_API_KEY')
    if not api_key:
        print("错误: 请通过 --api-key 参数或 ASR_API_KEY 环境变量提供 API Key", file=sys.stderr)
        sys.exit(1)

    try:
        # Step 1: upload the audio file.
        print(f"正在上传音频文件: {args.audio}")
        filename = upload_audio(args.upload_url, args.audio)
        print(f"上传成功: {filename}")

        # Step 2: derive the URL the uploaded file is served from.
        audio_url = get_static_url(args.upload_url, filename)
        print(f"音频 URL: {audio_url}")

        # Step 3: run the recognition.
        print("正在进行语音识别...")
        transcriptions = transcribe_audio(audio_url, api_key)

        if not transcriptions:
            print("警告: 未获取到识别结果", file=sys.stderr)
            sys.exit(1)

        # Step 4: write the subtitles.
        subtitle_count = asr_to_srt(transcriptions, args.output)
        print(f"SRT 字幕文件已生成: {args.output}")
        print(f"共 {subtitle_count} 条字幕")

    except Exception as e:
        # Covers FileNotFoundError as well; sys.exit() above raises SystemExit,
        # which is not an Exception and therefore passes through.
        print(f"错误: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()
|