@ww_nero/media 1.0.5 → 1.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -35
- package/index.js +205 -183
- package/package.json +4 -4
- package/scripts/asr_srt.py +0 -227
package/README.md
CHANGED
````diff
@@ -13,7 +13,9 @@
 - `audio_file`: relative path of the audio file (relative to the working directory); mp3/wav formats are supported
 
 **Limits:**
-- Maximum audio length: 1
+- Maximum audio length: 1 minute
+- File size must not exceed 5MB
+- Audio beyond these limits must be split externally and transcribed segment by segment
 
 **Output:**
 - The recognition result is saved to the `transcribe.srt` file in the working directory
@@ -22,22 +24,14 @@
 
 | Variable | Description | Required |
 |--------|------|------|
-| `ASR_API_KEY` | Alibaba Cloud
-| `ASR_UPLOAD_URL` | Full URL of the upload endpoint, e.g. `http://server.domain.com/upload` | Yes |
+| `ASR_API_KEY` | Alibaba Cloud DashScope API Key | Yes |
 
-##
-
-### Python dependencies
+## Installation
 
 ```bash
-pip install dashscope requests
+npm install
 ```
 
-### System dependencies
-
-- `ffprobe` (from ffmpeg, used to detect audio duration)
-- `python3`
-
 ## Configuration example
 
 ```json
@@ -47,31 +41,9 @@ pip install dashscope requests
       "command": "node",
       "args": ["/path/to/media/index.js"],
       "env": {
-        "ASR_API_KEY": "your-api-key",
-        "ASR_UPLOAD_URL": "http://server.domain.com/upload"
+        "ASR_API_KEY": "your-api-key"
       }
     }
   }
 }
 ```
-
-## Upload endpoint requirements
-
-The upload endpoint must follow the spec below (see the demo_server implementation):
-
-**Request:**
-- Method: `POST`
-- Content-Type: `multipart/form-data`
-- Field name: `file`
-
-**Response (success):**
-```json
-{
-  "message": "文件上传成功",
-  "fileName": "abc123.mp3"
-}
-```
-
-The uploaded file can then be accessed at `{base_url}/{fileName}`, for example:
-- Upload endpoint: `http://server.domain.com/upload`
-- File access: `http://server.domain.com/abc123.mp3`
````
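With 1.0.6 the README drops the self-hosted upload endpoint and the Python/ffprobe requirements; `ASR_API_KEY` is the only required variable. For orientation, here is a minimal sketch of calling the server's `asr` tool from an MCP stdio client, based on the tool schema shown in `index.js` below; the client-side API usage, paths, key, and audio file name are illustrative assumptions, not part of the package:

```js
// Hypothetical MCP client session against the media server (sketch, not from the package).
const { Client } = require('@modelcontextprotocol/sdk/client/index.js');
const { StdioClientTransport } = require('@modelcontextprotocol/sdk/client/stdio.js');

async function main() {
  // Spawn the server the same way the README config does.
  const transport = new StdioClientTransport({
    command: 'node',
    args: ['/path/to/media/index.js'],
    env: { ASR_API_KEY: 'your-api-key' },
  });

  const client = new Client({ name: 'asr-demo', version: '0.0.1' });
  await client.connect(transport);

  // audio_file is resolved relative to working_directory; keep it <= 1 minute and <= 5MB.
  const result = await client.callTool({
    name: 'asr',
    arguments: {
      working_directory: '/path/to/workdir',
      audio_file: 'clip.mp3',
    },
  });

  console.log(result); // the tool reports the generated transcribe.srt
  await client.close();
}

main().catch(console.error);
```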
package/index.js
CHANGED
````diff
@@ -2,7 +2,6 @@
 
 const fs = require('fs');
 const path = require('path');
-const { spawn } = require('child_process');
 const { Server } = require('@modelcontextprotocol/sdk/server/index.js');
 const { StdioServerTransport } = require('@modelcontextprotocol/sdk/server/stdio.js');
 const {
@@ -12,11 +11,185 @@ const {
 
 const ASR_API_KEY = process.env.ASR_API_KEY || '';
 const ASR_UPLOAD_URL = 'http://fsheep.com:10808/upload';
+const ASR_SERVICE_URL = 'https://dashscope.aliyuncs.com/api/v1/services/audio/asr/transcription';
 
 const SUPPORTED_AUDIO_TYPES = ['.mp3', '.wav'];
-const
+const MAX_FILE_SIZE_BYTES = 5 * 1024 * 1024; // 5MB
+const LANGUAGE_HINTS = ['zh', 'en', 'ja'];
 
-
+/**
+ * Convert milliseconds to the SRT time format HH:MM:SS,mmm
+ */
+const msToSrtTime = (ms) => {
+  const hours = Math.floor(ms / 3600000);
+  const minutes = Math.floor((ms % 3600000) / 60000);
+  const seconds = Math.floor((ms % 60000) / 1000);
+  const milliseconds = ms % 1000;
+  return `${String(hours).padStart(2, '0')}:${String(minutes).padStart(2, '0')}:${String(seconds).padStart(2, '0')},${String(milliseconds).padStart(3, '0')}`;
+};
+
+/**
+ * Convert ASR recognition results into SRT subtitle content
+ */
+const asrToSrt = (asrData) => {
+  const srtEntries = [];
+  let subtitleIndex = 1;
+
+  for (const item of asrData) {
+    const transcription = item.transcription || {};
+    const transcripts = transcription.transcripts || [];
+
+    for (const transcript of transcripts) {
+      const sentences = transcript.sentences || [];
+
+      for (const sentence of sentences) {
+        const beginTime = sentence.begin_time || 0;
+        const endTime = sentence.end_time || 0;
+        const text = (sentence.text || '').trim();
+
+        if (text) {
+          const startStr = msToSrtTime(beginTime);
+          const endStr = msToSrtTime(endTime);
+
+          srtEntries.push(`${subtitleIndex}\n${startStr} --> ${endStr}\n${text}\n`);
+          subtitleIndex++;
+        }
+      }
+    }
+  }
+
+  return srtEntries.join('\n');
+};
+
+/**
+ * Upload the audio file to the server
+ */
+const uploadAudio = async (uploadUrl, audioPath) => {
+  const fileBuffer = fs.readFileSync(audioPath);
+  const fileName = path.basename(audioPath);
+
+  const FormData = (await import('form-data')).default;
+  const formData = new FormData();
+  formData.append('file', fileBuffer, fileName);
+
+  const response = await fetch(uploadUrl, {
+    method: 'POST',
+    body: formData,
+    headers: formData.getHeaders(),
+  });
+
+  if (!response.ok) {
+    throw new Error(`上传失败: ${response.status} - ${await response.text()}`);
+  }
+
+  const data = await response.json();
+
+  if (!data.success || !data.data || !data.data.path) {
+    throw new Error(`上传响应格式错误: ${JSON.stringify(data)}`);
+  }
+
+  return data.data.path;
+};
+
+/**
+ * Build the static resource URL from the upload endpoint URL and the file path
+ */
+const getStaticUrl = (uploadUrl, filePath) => {
+  const url = new URL(uploadUrl);
+  return `${url.protocol}//${url.host}${filePath}`;
+};
+
+/**
+ * Submit an ASR transcription task
+ */
+const submitAsrTask = async (fileUrls, apiKey) => {
+  const response = await fetch(ASR_SERVICE_URL, {
+    method: 'POST',
+    headers: {
+      'Authorization': `Bearer ${apiKey}`,
+      'Content-Type': 'application/json',
+      'X-DashScope-Async': 'enable',
+    },
+    body: JSON.stringify({
+      model: 'paraformer-v2',
+      input: { file_urls: fileUrls },
+      parameters: {
+        channel_id: [0],
+        language_hints: LANGUAGE_HINTS,
+      },
+    }),
+  });
+
+  if (!response.ok) {
+    const errorText = await response.text();
+    throw new Error(`ASR 任务提交失败: ${response.status} - ${errorText}`);
+  }
+
+  const data = await response.json();
+
+  if (!data.output || !data.output.task_id) {
+    throw new Error(`ASR 响应格式错误: ${JSON.stringify(data)}`);
+  }
+
+  return data.output.task_id;
+};
+
+/**
+ * Poll until the ASR task completes
+ */
+const waitForTaskComplete = async (taskId, apiKey, timeoutMs = 5 * 60 * 1000) => {
+  const taskUrl = `https://dashscope.aliyuncs.com/api/v1/tasks/${taskId}`;
+  const startTime = Date.now();
+  const pollInterval = 500;
+
+  while (Date.now() - startTime < timeoutMs) {
+    const response = await fetch(taskUrl, {
+      method: 'GET',
+      headers: {
+        'Authorization': `Bearer ${apiKey}`,
+      },
+    });
+
+    if (!response.ok) {
+      throw new Error(`查询任务状态失败: ${response.status}`);
+    }
+
+    const data = await response.json();
+    const status = data.output?.task_status;
+
+    if (status === 'SUCCEEDED') {
+      return data.output.results || [];
+    } else if (status === 'FAILED') {
+      throw new Error('ASR 识别任务失败');
+    }
+
+    await new Promise(resolve => setTimeout(resolve, pollInterval));
+  }
+
+  throw new Error('ASR 识别超时');
+};
+
+/**
+ * Fetch the detailed transcription results
+ */
+const fetchTranscriptionResults = async (results) => {
+  const allTranscriptions = [];
+
+  for (const result of results) {
+    if (result.subtask_status === 'SUCCEEDED' && result.transcription_url) {
+      const response = await fetch(result.transcription_url);
+      if (response.ok) {
+        const transcriptionData = await response.json();
+        allTranscriptions.push({
+          file_url: result.file_url,
+          transcription: transcriptionData,
+        });
+      }
+    }
+  }
+
+  return allTranscriptions;
+};
 
 /**
  * Convert a Windows path to a WSL path, or vice versa
@@ -115,175 +288,6 @@ const resolveAudioFile = (workingDir, rawPath) => {
   return resolved;
 };
 
-/**
- * Convert a path to WSL path format
- */
-const toWslPath = (targetPath) => {
-  if (targetPath.startsWith('/mnt/')) {
-    return targetPath;
-  }
-  const winMatch = targetPath.match(/^([a-zA-Z]):\\(.*)$/);
-  if (winMatch) {
-    const drive = winMatch[1].toLowerCase();
-    const rest = winMatch[2].replace(/\\/g, '/');
-    return `/mnt/${drive}/${rest}`;
-  }
-  return targetPath;
-};
-
-/**
- * Check whether we are running inside WSL
- */
-const isWslEnvironment = () => {
-  try {
-    if (process.platform === 'linux') {
-      const release = fs.readFileSync('/proc/version', 'utf8');
-      return release.toLowerCase().includes('microsoft');
-    }
-  } catch {
-    // ignore
-  }
-  return false;
-};
-
-/**
- * Get the audio duration in seconds
- */
-const getAudioDuration = (audioPath) => {
-  return new Promise((resolve) => {
-    const wslPath = toWslPath(audioPath);
-    const isWsl = isWslEnvironment();
-
-    let cmd, args;
-    if (isWsl || process.platform === 'linux') {
-      cmd = 'ffprobe';
-      args = [
-        '-v', 'error',
-        '-show_entries', 'format=duration',
-        '-of', 'default=noprint_wrappers=1:nokey=1',
-        wslPath
-      ];
-    } else {
-      cmd = 'ffprobe';
-      args = [
-        '-v', 'error',
-        '-show_entries', 'format=duration',
-        '-of', 'default=noprint_wrappers=1:nokey=1',
-        audioPath
-      ];
-    }
-
-    const child = spawn(cmd, args, {
-      stdio: ['pipe', 'pipe', 'pipe'],
-      shell: false
-    });
-
-    let stdout = '';
-    let stderr = '';
-
-    child.stdout.on('data', (data) => {
-      stdout += data.toString();
-    });
-
-    child.stderr.on('data', (data) => {
-      stderr += data.toString();
-    });
-
-    child.on('close', (code) => {
-      if (code === 0 && stdout.trim()) {
-        const duration = parseFloat(stdout.trim());
-        if (!isNaN(duration)) {
-          resolve(duration);
-          return;
-        }
-      }
-      // If detection fails, return -1 for unknown (does not block execution)
-      resolve(-1);
-    });
-
-    child.on('error', () => {
-      resolve(-1);
-    });
-  });
-};
-
-/**
- * Run the Python script to perform ASR recognition
- */
-const runAsrScript = (audioPath, outputPath, uploadUrl, apiKey) => {
-  return new Promise((resolve, reject) => {
-    const scriptPath = path.join(SCRIPTS_DIR, 'asr_srt.py');
-    const wslScriptPath = toWslPath(scriptPath);
-    const wslAudioPath = toWslPath(audioPath);
-    const wslOutputPath = toWslPath(outputPath);
-
-    // Build the python command string
-    const pythonCommand = [
-      'python',
-      `"${wslScriptPath}"`,
-      '--audio', `"${wslAudioPath}"`,
-      '--output', `"${wslOutputPath}"`,
-      '--upload-url', `"${uploadUrl}"`,
-      '--api-key', `"${apiKey}"`
-    ].join(' ');
-
-    const isWsl = isWslEnvironment();
-
-    // Following the bash.js invocation style, run the command via bash -ic
-    cmd = 'bash';
-    args = ['-ic', pythonCommand];
-
-    const child = spawn(cmd, args, {
-      stdio: ['pipe', 'pipe', 'pipe'],
-      shell: false,
-      env: {
-        ...process.env,
-        ASR_API_KEY: apiKey
-      }
-    });
-
-    let stdout = '';
-    let stderr = '';
-    const maxOutput = 64 * 1024;
-
-    child.stdout.on('data', (data) => {
-      const chunk = data.toString();
-      stdout += chunk;
-      if (stdout.length > maxOutput) {
-        stdout = stdout.slice(-maxOutput);
-      }
-    });
-
-    child.stderr.on('data', (data) => {
-      const chunk = data.toString();
-      stderr += chunk;
-      if (stderr.length > maxOutput) {
-        stderr = stderr.slice(-maxOutput);
-      }
-    });
-
-    const timeout = setTimeout(() => {
-      child.kill('SIGKILL');
-      reject(new Error('ASR 识别超时(超过 5 分钟)'));
-    }, 5 * 60 * 1000);
-
-    child.on('close', (code) => {
-      clearTimeout(timeout);
-      if (code === 0) {
-        resolve(stdout.trim());
-      } else {
-        const errorMsg = stderr.trim() || stdout.trim() || `进程退出码: ${code}`;
-        reject(new Error(errorMsg));
-      }
-    });
-
-    child.on('error', (err) => {
-      clearTimeout(timeout);
-      reject(new Error(`执行脚本失败: ${err.message}`));
-    });
-  });
-};
-
 /**
  * ASR speech recognition
  */
@@ -296,20 +300,38 @@ const asr = async ({ working_directory, audio_file }) => {
   const workingDir = resolveWorkingDirectory(working_directory);
   const audioPath = resolveAudioFile(workingDir, audio_file);
 
-  //
-  const
-  if (
+  // Check the file size
+  const fileSize = fs.statSync(audioPath).size;
+  if (fileSize > MAX_FILE_SIZE_BYTES) {
     throw new Error(
-
+      `音频文件大小 ${(fileSize / 1024 / 1024).toFixed(2)}MB 超过限制(最大 5MB)。` +
      `请先对音频进行分段处理后再逐段识别。`
     );
   }
 
-  //
-  const
+  // 1. Upload the audio file
+  const filePath = await uploadAudio(ASR_UPLOAD_URL, audioPath);
+
+  // 2. Build the static resource URL
+  const audioUrl = getStaticUrl(ASR_UPLOAD_URL, filePath);
+
+  // 3. Submit the ASR task
+  const taskId = await submitAsrTask([audioUrl], ASR_API_KEY);
+
+  // 4. Wait for the task to complete
+  const results = await waitForTaskComplete(taskId, ASR_API_KEY);
 
-  //
-
+  // 5. Fetch the transcription results
+  const transcriptions = await fetchTranscriptionResults(results);
+
+  if (!transcriptions.length) {
+    throw new Error('未获取到识别结果');
+  }
+
+  // 6. Convert to SRT format and save
+  const srtContent = asrToSrt(transcriptions);
+  const outputPath = path.join(workingDir, 'transcribe.srt');
+  fs.writeFileSync(outputPath, srtContent, 'utf-8');
 
   return 'transcribe.srt';
 };
@@ -317,7 +339,7 @@ const asr = async ({ working_directory, audio_file }) => {
 const server = new Server(
   {
     name: 'media',
-    version: '1.0.5',
+    version: '1.0.6',
   },
   {
     capabilities: {
@@ -330,7 +352,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => ({
   tools: [
     {
      name: 'asr',
-      description: '语音识别工具,将音频文件转换为带时间戳的 SRT
+      description: '语音识别工具,将音频文件转换为带时间戳的 SRT 字幕文件。',
      inputSchema: {
        type: 'object',
        properties: {
@@ -340,7 +362,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => ({
          },
          audio_file: {
            type: 'string',
-            description: '音频文件的相对路径(相对于工作目录),支持 mp3/wav
+            description: '音频文件的相对路径(相对于工作目录),支持 mp3/wav 格式,音频长度限制 1 分钟以内,文件大小不超过 5MB',
          },
        },
      required: ['working_directory', 'audio_file'],
````
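The rewritten `index.js` replaces the spawned Python script with an in-process pipeline: upload the file, submit a paraformer-v2 transcription task, poll it, download the transcription JSON, and render SRT. The sketch below shows the data shape `fetchTranscriptionResults` hands to `asrToSrt` and the SRT text that falls out; the URL, timings, and sentences are made-up sample values, and the conversion is a condensed restatement of the diff's helpers:

```js
// Sample payload in the shape produced by fetchTranscriptionResults (values are made up).
const sample = [{
  file_url: 'http://example.com/abc123.mp3',
  transcription: {
    transcripts: [{
      sentences: [
        { begin_time: 0, end_time: 2300, text: '你好,世界' },
        { begin_time: 2300, end_time: 5100, text: 'Hello world' },
      ],
    }],
  },
}];

// Same conversion as msToSrtTime/asrToSrt in the diff, condensed.
const toSrtTime = (ms) => {
  const h = Math.floor(ms / 3600000);
  const m = Math.floor((ms % 3600000) / 60000);
  const s = Math.floor((ms % 60000) / 1000);
  return `${String(h).padStart(2, '0')}:${String(m).padStart(2, '0')}:${String(s).padStart(2, '0')},${String(ms % 1000).padStart(3, '0')}`;
};

let i = 1;
const srt = sample
  .flatMap((item) => item.transcription.transcripts)
  .flatMap((t) => t.sentences)
  .filter((s) => (s.text || '').trim())
  .map((s) => `${i++}\n${toSrtTime(s.begin_time)} --> ${toSrtTime(s.end_time)}\n${s.text.trim()}\n`)
  .join('\n');

console.log(srt);
// 1
// 00:00:00,000 --> 00:00:02,300
// 你好,世界
//
// 2
// 00:00:02,300 --> 00:00:05,100
// Hello world
```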
package/package.json
CHANGED
````diff
@@ -1,16 +1,16 @@
 {
   "name": "@ww_nero/media",
-  "version": "1.0.5",
+  "version": "1.0.6",
   "description": "MCP server for media processing, including ASR speech recognition",
   "main": "index.js",
   "bin": {
     "media": "index.js"
   },
   "files": [
-    "index.js",
-    "scripts"
+    "index.js"
   ],
   "dependencies": {
-    "@modelcontextprotocol/sdk": "^1.22.0"
+    "@modelcontextprotocol/sdk": "^1.22.0",
+    "form-data": "^4.0.1"
   }
 }
````
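`form-data` is the only new runtime dependency; it builds the multipart body for the in-process upload that replaced `asr_srt.py` (removed below). A minimal standalone sketch of that request, assuming an endpoint that answers with the `{ success, data: { path } }` shape `uploadAudio` checks; the URL and file name are placeholders, and `getBuffer()` is used here so the body is a plain Buffer for Node's built-in fetch:

```js
const fs = require('fs');
const FormData = require('form-data'); // the dependency added in 1.0.6

// Placeholder endpoint; index.js uses its own ASR_UPLOAD_URL constant instead.
const UPLOAD_URL = 'http://server.domain.com/upload';

async function uploadAudio(audioPath) {
  const form = new FormData();
  form.append('file', fs.readFileSync(audioPath), 'clip.mp3');

  const response = await fetch(UPLOAD_URL, {
    method: 'POST',
    body: form.getBuffer(),      // serialize the multipart body up front
    headers: form.getHeaders(),  // sets Content-Type with the multipart boundary
  });
  if (!response.ok) throw new Error(`upload failed: ${response.status}`);

  const data = await response.json();
  // Expected shape: { success: true, data: { path: '/tmp/xxx.mp3' } }
  if (!data.success || !data.data?.path) throw new Error('unexpected upload response');
  return data.data.path;
}
```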
package/scripts/asr_srt.py
DELETED
````diff
@@ -1,227 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-ASR speech recognition script
-Uploads an audio file to the server, calls the Alibaba Cloud ASR API, and generates an SRT subtitle file
-"""
-
-import os
-import sys
-import json
-import time
-import argparse
-import requests
-from pathlib import Path
-from http import HTTPStatus
-
-# Try to import dashscope; if it fails, prompt the user to install it
-try:
-    import dashscope
-    from dashscope.audio.asr import Transcription
-except ImportError:
-    print("错误: 请先安装 dashscope 库: pip install dashscope", file=sys.stderr)
-    sys.exit(1)
-
-
-def ms_to_srt_time(ms: int) -> str:
-    """Convert milliseconds to the SRT time format HH:MM:SS,mmm"""
-    hours = ms // 3600000
-    minutes = (ms % 3600000) // 60000
-    seconds = (ms % 60000) // 1000
-    milliseconds = ms % 1000
-    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
-
-
-def asr_to_srt(asr_data: list, output_srt_path: str):
-    """
-    Convert ASR recognition results into an SRT subtitle file
-
-    Args:
-        asr_data: list of ASR recognition results
-        output_srt_path: path of the SRT file to write
-    """
-    srt_entries = []
-    subtitle_index = 1
-
-    for item in asr_data:
-        transcription = item.get('transcription', {})
-        transcripts = transcription.get('transcripts', [])
-
-        for transcript in transcripts:
-            sentences = transcript.get('sentences', [])
-
-            for sentence in sentences:
-                begin_time = sentence.get('begin_time', 0)
-                end_time = sentence.get('end_time', 0)
-                text = sentence.get('text', '').strip()
-
-                if text:
-                    start_str = ms_to_srt_time(begin_time)
-                    end_str = ms_to_srt_time(end_time)
-
-                    srt_entry = f"{subtitle_index}\n{start_str} --> {end_str}\n{text}\n"
-                    srt_entries.append(srt_entry)
-                    subtitle_index += 1
-
-    with open(output_srt_path, 'w', encoding='utf-8') as f:
-        f.write('\n'.join(srt_entries))
-
-    return subtitle_index - 1
-
-
-def upload_audio(upload_url: str, audio_path: str) -> str:
-    """
-    Upload the audio file to the server
-
-    Args:
-        upload_url: full URL of the upload endpoint
-        audio_path: local path of the audio file
-
-    Returns:
-        the uploaded file name
-    """
-    audio_path = Path(audio_path)
-    if not audio_path.exists():
-        raise FileNotFoundError(f"音频文件不存在: {audio_path}")
-
-    with open(audio_path, 'rb') as f:
-        files = {'file': (audio_path.name, f)}
-        response = requests.post(upload_url, files=files, timeout=60)
-
-    if response.status_code != 200:
-        raise Exception(f"上传失败: {response.status_code} - {response.text}")
-
-    data = response.json()
-
-    # Response format: {'success': True, 'data': {'path': '/tmp/xxx.wav'}}
-    if not data.get('success') or 'data' not in data or 'path' not in data['data']:
-        raise Exception(f"上传响应格式错误: {data}")
-
-    # Return the full relative path so it can be appended to the server address
-    return data['data']['path']
-
-
-def get_static_url(upload_url: str, file_path: str) -> str:
-    """
-    Build the static resource URL from the upload endpoint URL and the file path
-
-    Args:
-        upload_url: full URL of the upload endpoint (e.g. http://server.domain.com/upload)
-        file_path: file path returned by the upload (e.g. /tmp/xxx.wav)
-
-    Returns:
-        full URL of the static resource
-    """
-    # Extract the base URL from the upload URL
-    # e.g. http://server.domain.com/upload -> http://server.domain.com
-    from urllib.parse import urlparse, urlunparse
-    parsed = urlparse(upload_url)
-    base_url = urlunparse((parsed.scheme, parsed.netloc, '', '', '', ''))
-    # file_path is already a full relative path starting with /, so just concatenate
-    return f"{base_url}{file_path}"
-
-
-def transcribe_audio(audio_url: str, api_key: str) -> list:
-    """
-    Call the Alibaba Cloud ASR API to perform speech recognition
-
-    Args:
-        audio_url: public URL of the audio file
-        api_key: Alibaba Cloud ASR API Key
-
-    Returns:
-        list of recognition results
-    """
-    dashscope.api_key = api_key
-
-    # Start the asynchronous recognition request
-    transcribe_response = Transcription.async_call(
-        model='paraformer-v2',
-        file_urls=[audio_url],
-        language_hints=['zh', 'en', 'ja']
-    )
-
-    if not transcribe_response or not hasattr(transcribe_response, 'output'):
-        raise Exception("ASR 请求失败: 无效的响应")
-
-    # Poll until recognition completes
-    while True:
-        task_status = transcribe_response.output.task_status
-        if task_status == 'SUCCEEDED' or task_status == 'FAILED':
-            break
-        time.sleep(1)
-        transcribe_response = Transcription.fetch(task=transcribe_response.output.task_id)
-
-    if transcribe_response.status_code != HTTPStatus.OK:
-        raise Exception(f"ASR 识别失败: {transcribe_response.status_code}")
-
-    if transcribe_response.output.task_status == 'FAILED':
-        raise Exception("ASR 识别任务失败")
-
-    # Collect the recognition results
-    results = transcribe_response.output.get('results', [])
-    all_transcriptions = []
-
-    for result in results:
-        if result.get('subtask_status') == 'SUCCEEDED':
-            transcription_url = result.get('transcription_url')
-            if transcription_url:
-                resp = requests.get(transcription_url, timeout=30)
-                if resp.status_code == 200:
-                    transcription_data = resp.json()
-                    all_transcriptions.append({
-                        'file_url': result.get('file_url'),
-                        'transcription': transcription_data
-                    })
-
-    return all_transcriptions
-
-
-def main():
-    parser = argparse.ArgumentParser(description='ASR 语音识别并生成 SRT 字幕')
-    parser.add_argument('--audio', required=True, help='音频文件路径')
-    parser.add_argument('--output', required=True, help='输出 SRT 文件路径')
-    parser.add_argument('--upload-url', required=True, help='上传接口的完整 URL')
-    parser.add_argument('--api-key', help='ASR API Key (也可通过 ASR_API_KEY 环境变量设置)')
-
-    args = parser.parse_args()
-
-    # Get the API key
-    api_key = args.api_key or os.environ.get('ASR_API_KEY')
-    if not api_key:
-        print("错误: 请通过 --api-key 参数或 ASR_API_KEY 环境变量提供 API Key", file=sys.stderr)
-        sys.exit(1)
-
-    try:
-        # 1. Upload the audio file
-        print(f"正在上传音频文件: {args.audio}")
-        file_path = upload_audio(args.upload_url, args.audio)
-        print(f"上传成功: {file_path}")
-
-        # 2. Build the static resource URL
-        audio_url = get_static_url(args.upload_url, file_path)
-        print(f"音频 URL: {audio_url}")
-
-        # 3. Run ASR recognition
-        print("正在进行语音识别...")
-        transcriptions = transcribe_audio(audio_url, api_key)
-
-        if not transcriptions:
-            print("警告: 未获取到识别结果", file=sys.stderr)
-            sys.exit(1)
-
-        # 4. Generate the SRT file
-        subtitle_count = asr_to_srt(transcriptions, args.output)
-        print(f"SRT 字幕文件已生成: {args.output}")
-        print(f"共 {subtitle_count} 条字幕")
-
-    except FileNotFoundError as e:
-        print(f"错误: {e}", file=sys.stderr)
-        sys.exit(1)
-    except Exception as e:
-        print(f"错误: {e}", file=sys.stderr)
-        sys.exit(1)
-
-
-if __name__ == '__main__':
-    main()
````