@ww_nero/media 1.0.9 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +15 -4
- package/index.js +39 -229
- package/package.json +7 -4
- package/utils/asr.js +233 -0
- package/utils/tts.js +133 -0
package/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# Media MCP Server
|
|
2
2
|
|
|
3
|
-
媒体处理 MCP
|
|
3
|
+
媒体处理 MCP 服务,提供语音识别和语音合成功能。
|
|
4
4
|
|
|
5
5
|
## 功能
|
|
6
6
|
|
|
@@ -18,13 +18,24 @@
|
|
|
18
18
|
- 超出限制需要外部分段处理后逐段识别
|
|
19
19
|
|
|
20
20
|
**输出:**
|
|
21
|
-
- 识别结果保存到工作目录下的 `
|
|
21
|
+
- 识别结果保存到工作目录下的 `asr_<timestamp>.srt` 文件
|
|
22
|
+
|
|
23
|
+
### tts - 语音合成
|
|
24
|
+
|
|
25
|
+
将文本转换为音频文件。
|
|
26
|
+
|
|
27
|
+
**参数:**
|
|
28
|
+
- `working_directory`: 工作目录的绝对路径,合成的音频文件将保存到此目录
|
|
29
|
+
- `text`: 需要合成语音的文本内容
|
|
30
|
+
|
|
31
|
+
**输出:**
|
|
32
|
+
- 合成结果保存到工作目录下的 `tts_<timestamp>.mp3` 文件
|
|
22
33
|
|
|
23
34
|
## 环境变量
|
|
24
35
|
|
|
25
36
|
| 变量名 | 说明 | 必填 |
|
|
26
37
|
|--------|------|------|
|
|
27
|
-
| `
|
|
38
|
+
| `DASHSCOPE_API_KEY` | 阿里云 DashScope API Key | 是 |
|
|
28
39
|
|
|
29
40
|
## 安装
|
|
30
41
|
|
|
@@ -41,7 +52,7 @@ npm install
|
|
|
41
52
|
"command": "node",
|
|
42
53
|
"args": ["/path/to/media/index.js"],
|
|
43
54
|
"env": {
|
|
44
|
-
"
|
|
55
|
+
"DASHSCOPE_API_KEY": "your-api-key"
|
|
45
56
|
}
|
|
46
57
|
}
|
|
47
58
|
}
|
package/index.js
CHANGED
|
@@ -9,185 +9,10 @@ const {
|
|
|
9
9
|
ListToolsRequestSchema,
|
|
10
10
|
} = require('@modelcontextprotocol/sdk/types.js');
|
|
11
11
|
|
|
12
|
-
const
|
|
13
|
-
const
|
|
14
|
-
const ASR_SERVICE_URL = 'https://dashscope.aliyuncs.com/api/v1/services/audio/asr/transcription';
|
|
12
|
+
const { asr, SUPPORTED_AUDIO_TYPES, MAX_FILE_SIZE_BYTES } = require('./utils/asr.js');
|
|
13
|
+
const { tts } = require('./utils/tts.js');
|
|
15
14
|
|
|
16
|
-
const
|
|
17
|
-
const MAX_FILE_SIZE_BYTES = 120 * 1024 * 1024; // 120MB
|
|
18
|
-
const LANGUAGE_HINTS = ['zh', 'en', 'ja'];
|
|
19
|
-
|
|
20
|
-
/**
|
|
21
|
-
* 将毫秒转换为 SRT 时间格式 HH:MM:SS,mmm
|
|
22
|
-
*/
|
|
23
|
-
const msToSrtTime = (ms) => {
|
|
24
|
-
const hours = Math.floor(ms / 3600000);
|
|
25
|
-
const minutes = Math.floor((ms % 3600000) / 60000);
|
|
26
|
-
const seconds = Math.floor((ms % 60000) / 1000);
|
|
27
|
-
const milliseconds = ms % 1000;
|
|
28
|
-
return `${String(hours).padStart(2, '0')}:${String(minutes).padStart(2, '0')}:${String(seconds).padStart(2, '0')},${String(milliseconds).padStart(3, '0')}`;
|
|
29
|
-
};
|
|
30
|
-
|
|
31
|
-
/**
|
|
32
|
-
* 将 ASR 识别结果转换为 SRT 字幕内容
|
|
33
|
-
*/
|
|
34
|
-
const asrToSrt = (asrData) => {
|
|
35
|
-
const srtEntries = [];
|
|
36
|
-
let subtitleIndex = 1;
|
|
37
|
-
|
|
38
|
-
for (const item of asrData) {
|
|
39
|
-
const transcription = item.transcription || {};
|
|
40
|
-
const transcripts = transcription.transcripts || [];
|
|
41
|
-
|
|
42
|
-
for (const transcript of transcripts) {
|
|
43
|
-
const sentences = transcript.sentences || [];
|
|
44
|
-
|
|
45
|
-
for (const sentence of sentences) {
|
|
46
|
-
const beginTime = sentence.begin_time || 0;
|
|
47
|
-
const endTime = sentence.end_time || 0;
|
|
48
|
-
const text = (sentence.text || '').trim();
|
|
49
|
-
|
|
50
|
-
if (text) {
|
|
51
|
-
const startStr = msToSrtTime(beginTime);
|
|
52
|
-
const endStr = msToSrtTime(endTime);
|
|
53
|
-
|
|
54
|
-
srtEntries.push(`${subtitleIndex}\n${startStr} --> ${endStr}\n${text}\n`);
|
|
55
|
-
subtitleIndex++;
|
|
56
|
-
}
|
|
57
|
-
}
|
|
58
|
-
}
|
|
59
|
-
}
|
|
60
|
-
|
|
61
|
-
return srtEntries.join('\n');
|
|
62
|
-
};
|
|
63
|
-
|
|
64
|
-
/**
|
|
65
|
-
* 上传音频文件到服务器
|
|
66
|
-
*/
|
|
67
|
-
const uploadAudio = async (uploadUrl, audioPath) => {
|
|
68
|
-
const fileBuffer = fs.readFileSync(audioPath);
|
|
69
|
-
const fileName = path.basename(audioPath);
|
|
70
|
-
|
|
71
|
-
const formData = new FormData();
|
|
72
|
-
formData.append('file', new Blob([fileBuffer]), fileName);
|
|
73
|
-
|
|
74
|
-
const response = await fetch(uploadUrl, {
|
|
75
|
-
method: 'POST',
|
|
76
|
-
body: formData,
|
|
77
|
-
});
|
|
78
|
-
|
|
79
|
-
if (!response.ok) {
|
|
80
|
-
throw new Error(`上传失败: ${response.status} - ${await response.text()}`);
|
|
81
|
-
}
|
|
82
|
-
|
|
83
|
-
const data = await response.json();
|
|
84
|
-
|
|
85
|
-
if (!data.success || !data.data || !data.data.path) {
|
|
86
|
-
throw new Error(`上传响应格式错误: ${JSON.stringify(data)}`);
|
|
87
|
-
}
|
|
88
|
-
|
|
89
|
-
return data.data.path;
|
|
90
|
-
};
|
|
91
|
-
|
|
92
|
-
/**
|
|
93
|
-
* 根据上传接口 URL 和文件路径构建静态资源 URL
|
|
94
|
-
*/
|
|
95
|
-
const getStaticUrl = (uploadUrl, filePath) => {
|
|
96
|
-
const url = new URL(uploadUrl);
|
|
97
|
-
return `${url.protocol}//${url.host}${filePath}`;
|
|
98
|
-
};
|
|
99
|
-
|
|
100
|
-
/**
|
|
101
|
-
* 提交 ASR 转写任务
|
|
102
|
-
*/
|
|
103
|
-
const submitAsrTask = async (fileUrls, apiKey) => {
|
|
104
|
-
const response = await fetch(ASR_SERVICE_URL, {
|
|
105
|
-
method: 'POST',
|
|
106
|
-
headers: {
|
|
107
|
-
'Authorization': `Bearer ${apiKey}`,
|
|
108
|
-
'Content-Type': 'application/json',
|
|
109
|
-
'X-DashScope-Async': 'enable',
|
|
110
|
-
},
|
|
111
|
-
body: JSON.stringify({
|
|
112
|
-
model: 'paraformer-v2',
|
|
113
|
-
input: { file_urls: fileUrls },
|
|
114
|
-
parameters: {
|
|
115
|
-
channel_id: [0],
|
|
116
|
-
language_hints: LANGUAGE_HINTS,
|
|
117
|
-
},
|
|
118
|
-
}),
|
|
119
|
-
});
|
|
120
|
-
|
|
121
|
-
if (!response.ok) {
|
|
122
|
-
const errorText = await response.text();
|
|
123
|
-
throw new Error(`ASR 任务提交失败: ${response.status} - ${errorText}`);
|
|
124
|
-
}
|
|
125
|
-
|
|
126
|
-
const data = await response.json();
|
|
127
|
-
|
|
128
|
-
if (!data.output || !data.output.task_id) {
|
|
129
|
-
throw new Error(`ASR 响应格式错误: ${JSON.stringify(data)}`);
|
|
130
|
-
}
|
|
131
|
-
|
|
132
|
-
return data.output.task_id;
|
|
133
|
-
};
|
|
134
|
-
|
|
135
|
-
/**
|
|
136
|
-
* 轮询等待 ASR 任务完成
|
|
137
|
-
*/
|
|
138
|
-
const waitForTaskComplete = async (taskId, apiKey, timeoutMs = 5 * 60 * 1000) => {
|
|
139
|
-
const taskUrl = `https://dashscope.aliyuncs.com/api/v1/tasks/${taskId}`;
|
|
140
|
-
const startTime = Date.now();
|
|
141
|
-
const pollInterval = 500;
|
|
142
|
-
|
|
143
|
-
while (Date.now() - startTime < timeoutMs) {
|
|
144
|
-
const response = await fetch(taskUrl, {
|
|
145
|
-
method: 'GET',
|
|
146
|
-
headers: {
|
|
147
|
-
'Authorization': `Bearer ${apiKey}`,
|
|
148
|
-
},
|
|
149
|
-
});
|
|
150
|
-
|
|
151
|
-
if (!response.ok) {
|
|
152
|
-
throw new Error(`查询任务状态失败: ${response.status}`);
|
|
153
|
-
}
|
|
154
|
-
|
|
155
|
-
const data = await response.json();
|
|
156
|
-
const status = data.output?.task_status;
|
|
157
|
-
|
|
158
|
-
if (status === 'SUCCEEDED') {
|
|
159
|
-
return data.output.results || [];
|
|
160
|
-
} else if (status === 'FAILED') {
|
|
161
|
-
throw new Error('ASR 识别任务失败');
|
|
162
|
-
}
|
|
163
|
-
|
|
164
|
-
await new Promise(resolve => setTimeout(resolve, pollInterval));
|
|
165
|
-
}
|
|
166
|
-
|
|
167
|
-
throw new Error('ASR 识别超时');
|
|
168
|
-
};
|
|
169
|
-
|
|
170
|
-
/**
|
|
171
|
-
* 获取转写结果详情
|
|
172
|
-
*/
|
|
173
|
-
const fetchTranscriptionResults = async (results) => {
|
|
174
|
-
const allTranscriptions = [];
|
|
175
|
-
|
|
176
|
-
for (const result of results) {
|
|
177
|
-
if (result.subtask_status === 'SUCCEEDED' && result.transcription_url) {
|
|
178
|
-
const response = await fetch(result.transcription_url);
|
|
179
|
-
if (response.ok) {
|
|
180
|
-
const transcriptionData = await response.json();
|
|
181
|
-
allTranscriptions.push({
|
|
182
|
-
file_url: result.file_url,
|
|
183
|
-
transcription: transcriptionData,
|
|
184
|
-
});
|
|
185
|
-
}
|
|
186
|
-
}
|
|
187
|
-
}
|
|
188
|
-
|
|
189
|
-
return allTranscriptions;
|
|
190
|
-
};
|
|
15
|
+
const DASHSCOPE_API_KEY = process.env.DASHSCOPE_API_KEY || '';
|
|
191
16
|
|
|
192
17
|
/**
|
|
193
18
|
* 将 Windows 路径转换为 WSL 路径,或反之
|
|
@@ -286,59 +111,10 @@ const resolveAudioFile = (workingDir, rawPath) => {
|
|
|
286
111
|
return resolved;
|
|
287
112
|
};
|
|
288
113
|
|
|
289
|
-
/**
|
|
290
|
-
* ASR 语音识别
|
|
291
|
-
*/
|
|
292
|
-
const asr = async ({ working_directory, audio_file }) => {
|
|
293
|
-
// 验证环境变量
|
|
294
|
-
if (!ASR_API_KEY) {
|
|
295
|
-
throw new Error('请配置 ASR_API_KEY 环境变量');
|
|
296
|
-
}
|
|
297
|
-
|
|
298
|
-
const workingDir = resolveWorkingDirectory(working_directory);
|
|
299
|
-
const audioPath = resolveAudioFile(workingDir, audio_file);
|
|
300
|
-
|
|
301
|
-
// 检查文件大小
|
|
302
|
-
const fileSize = fs.statSync(audioPath).size;
|
|
303
|
-
if (fileSize > MAX_FILE_SIZE_BYTES) {
|
|
304
|
-
throw new Error(
|
|
305
|
-
`音频文件大小 ${(fileSize / 1024 / 1024).toFixed(2)}MB 超过限制(最大 120MB)。` +
|
|
306
|
-
`请先对音频进行分段处理后再逐段识别。`
|
|
307
|
-
);
|
|
308
|
-
}
|
|
309
|
-
|
|
310
|
-
// 1. 上传音频文件
|
|
311
|
-
const filePath = await uploadAudio(ASR_UPLOAD_URL, audioPath);
|
|
312
|
-
|
|
313
|
-
// 2. 构建静态资源 URL
|
|
314
|
-
const audioUrl = getStaticUrl(ASR_UPLOAD_URL, filePath);
|
|
315
|
-
|
|
316
|
-
// 3. 提交 ASR 任务
|
|
317
|
-
const taskId = await submitAsrTask([audioUrl], ASR_API_KEY);
|
|
318
|
-
|
|
319
|
-
// 4. 等待任务完成
|
|
320
|
-
const results = await waitForTaskComplete(taskId, ASR_API_KEY);
|
|
321
|
-
|
|
322
|
-
// 5. 获取转写结果
|
|
323
|
-
const transcriptions = await fetchTranscriptionResults(results);
|
|
324
|
-
|
|
325
|
-
if (!transcriptions.length) {
|
|
326
|
-
throw new Error('未获取到识别结果');
|
|
327
|
-
}
|
|
328
|
-
|
|
329
|
-
// 6. 转换为 SRT 格式并保存
|
|
330
|
-
const srtContent = asrToSrt(transcriptions);
|
|
331
|
-
const filename = `asr_${Date.now()}.srt`;
|
|
332
|
-
const outputPath = path.join(workingDir, filename);
|
|
333
|
-
fs.writeFileSync(outputPath, srtContent, 'utf-8');
|
|
334
|
-
|
|
335
|
-
return filename;
|
|
336
|
-
};
|
|
337
|
-
|
|
338
114
|
const server = new Server(
|
|
339
115
|
{
|
|
340
116
|
name: 'media',
|
|
341
|
-
version: '1.0
|
|
117
|
+
version: '1.1.0',
|
|
342
118
|
},
|
|
343
119
|
{
|
|
344
120
|
capabilities: {
|
|
@@ -367,6 +143,24 @@ server.setRequestHandler(ListToolsRequestSchema, async () => ({
|
|
|
367
143
|
required: ['working_directory', 'audio_file'],
|
|
368
144
|
},
|
|
369
145
|
},
|
|
146
|
+
{
|
|
147
|
+
name: 'tts',
|
|
148
|
+
description: '语音合成工具,将文本转换为音频文件。',
|
|
149
|
+
inputSchema: {
|
|
150
|
+
type: 'object',
|
|
151
|
+
properties: {
|
|
152
|
+
working_directory: {
|
|
153
|
+
type: 'string',
|
|
154
|
+
description: '工作目录的绝对路径,合成的音频文件将保存到此目录',
|
|
155
|
+
},
|
|
156
|
+
text: {
|
|
157
|
+
type: 'string',
|
|
158
|
+
description: '需要合成语音的文本内容',
|
|
159
|
+
},
|
|
160
|
+
},
|
|
161
|
+
required: ['working_directory', 'text'],
|
|
162
|
+
},
|
|
163
|
+
},
|
|
370
164
|
],
|
|
371
165
|
}));
|
|
372
166
|
|
|
@@ -379,10 +173,26 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
379
173
|
if (!working_directory || !audio_file) {
|
|
380
174
|
throw new Error('必须同时提供 working_directory 和 audio_file 参数');
|
|
381
175
|
}
|
|
382
|
-
const
|
|
176
|
+
const workingDir = resolveWorkingDirectory(working_directory);
|
|
177
|
+
const audioPath = resolveAudioFile(workingDir, audio_file);
|
|
178
|
+
const filename = await asr({ workingDir, audioPath, apiKey: DASHSCOPE_API_KEY });
|
|
383
179
|
return { content: [{ type: 'text', text: `语音识别完成,字幕文件已保存到工作目录下:${filename}` }] };
|
|
384
180
|
}
|
|
385
181
|
|
|
182
|
+
if (name === 'tts') {
|
|
183
|
+
const { working_directory, text } = args;
|
|
184
|
+
if (!working_directory || !text) {
|
|
185
|
+
throw new Error('必须同时提供 working_directory 和 text 参数');
|
|
186
|
+
}
|
|
187
|
+
const workingDir = resolveWorkingDirectory(working_directory);
|
|
188
|
+
const filename = await tts({
|
|
189
|
+
workingDir,
|
|
190
|
+
text,
|
|
191
|
+
apiKey: DASHSCOPE_API_KEY,
|
|
192
|
+
});
|
|
193
|
+
return { content: [{ type: 'text', text: `语音合成完成,音频文件已保存到工作目录下:${filename}` }] };
|
|
194
|
+
}
|
|
195
|
+
|
|
386
196
|
return {
|
|
387
197
|
content: [{ type: 'text', text: `未知工具: ${name}` }],
|
|
388
198
|
isError: true,
|
package/package.json
CHANGED
|
@@ -1,15 +1,18 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ww_nero/media",
|
|
3
|
-
"version": "1.0.9",
|
|
4
|
-
"description": "MCP server for media processing, including ASR speech recognition",
|
|
3
|
+
"version": "1.1.0",
|
|
4
|
+
"description": "MCP server for media processing, including ASR speech recognition and TTS speech synthesis",
|
|
5
5
|
"main": "index.js",
|
|
6
6
|
"bin": {
|
|
7
7
|
"media": "index.js"
|
|
8
8
|
},
|
|
9
9
|
"files": [
|
|
10
|
-
"index.js"
|
|
10
|
+
"index.js",
|
|
11
|
+
"utils"
|
|
11
12
|
],
|
|
12
13
|
"dependencies": {
|
|
13
|
-
"@modelcontextprotocol/sdk": "^1.22.0"
|
|
14
|
+
"@modelcontextprotocol/sdk": "^1.22.0",
|
|
15
|
+
"uuid": "^13.0.0",
|
|
16
|
+
"ws": "^8.18.3"
|
|
14
17
|
}
|
|
15
18
|
}
|
package/utils/asr.js
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
const fs = require('fs');
|
|
2
|
+
const path = require('path');
|
|
3
|
+
|
|
4
|
+
const ASR_UPLOAD_URL = 'http://fsheep.com:10808/upload';
|
|
5
|
+
const ASR_SERVICE_URL = 'https://dashscope.aliyuncs.com/api/v1/services/audio/asr/transcription';
|
|
6
|
+
|
|
7
|
+
const SUPPORTED_AUDIO_TYPES = ['.mp3', '.wav'];
|
|
8
|
+
const MAX_FILE_SIZE_BYTES = 120 * 1024 * 1024; // 120MB
|
|
9
|
+
const LANGUAGE_HINTS = ['zh', 'en', 'ja'];
|
|
10
|
+
|
|
11
|
+
/**
 * Format a millisecond offset as an SRT timestamp (`HH:MM:SS,mmm`).
 *
 * The value is coerced to a number, rounded to the nearest whole millisecond
 * and clamped to zero when it is negative or not finite, so the result always
 * has exactly three millisecond digits and never contains a sign or a
 * fractional part.
 *
 * @param {number} ms - Offset from the start of the audio, in milliseconds.
 * @returns {string} Timestamp such as `01:02:03,004`. The hour field is padded
 *   to at least two digits and grows beyond that for very long offsets.
 */
const msToSrtTime = (ms) => {
  const numeric = Number(ms);
  const total = Number.isFinite(numeric) ? Math.max(0, Math.round(numeric)) : 0;

  const hours = Math.floor(total / 3600000);
  const minutes = Math.floor((total % 3600000) / 60000);
  const seconds = Math.floor((total % 60000) / 1000);
  const milliseconds = total % 1000;

  const pad = (value, width) => String(value).padStart(width, '0');
  return `${pad(hours, 2)}:${pad(minutes, 2)}:${pad(seconds, 2)},${pad(milliseconds, 3)}`;
};
|
|
21
|
+
|
|
22
|
+
/**
 * Build SRT subtitle text from fetched ASR transcriptions.
 *
 * Walks every `transcription.transcripts[].sentences[]` entry of each item,
 * skips sentences whose trimmed text is empty, and numbers the remaining
 * cues consecutively starting at 1. Missing `begin_time` / `end_time`
 * values are treated as 0.
 *
 * @param {Iterable<object>} asrData - Items carrying a `transcription` object.
 * @returns {string} Cues joined by a blank line; empty string when no
 *   sentence has text.
 */
const asrToSrt = (asrData) => {
  const cues = [];

  for (const entry of asrData) {
    const transcriptList = (entry.transcription || {}).transcripts || [];

    for (const transcript of transcriptList) {
      for (const sentence of transcript.sentences || []) {
        const body = (sentence.text || '').trim();
        if (!body) {
          continue;
        }

        const from = msToSrtTime(sentence.begin_time || 0);
        const to = msToSrtTime(sentence.end_time || 0);

        // The cue number is simply its 1-based position in the output.
        cues.push(`${cues.length + 1}\n${from} --> ${to}\n${body}\n`);
      }
    }
  }

  return cues.join('\n');
};
|
|
54
|
+
|
|
55
|
+
/**
 * Upload a local audio file to the upload endpoint as multipart form data.
 *
 * The file is read asynchronously so that reading a large file does not
 * block the event loop while the server is handling other requests.
 *
 * @param {string} uploadUrl - URL of the upload endpoint (receives a POST).
 * @param {string} audioPath - Path of the audio file on the local disk.
 * @returns {Promise<string>} The `data.path` value from the upload response.
 * @throws {Error} When the file cannot be read, the HTTP status is not OK,
 *   the body is not valid JSON, or the JSON lacks `success` / `data.path`.
 */
const uploadAudio = async (uploadUrl, audioPath) => {
  const fileBuffer = await fs.promises.readFile(audioPath);
  const fileName = path.basename(audioPath);

  const formData = new FormData();
  formData.append('file', new Blob([fileBuffer]), fileName);

  const response = await fetch(uploadUrl, {
    method: 'POST',
    body: formData,
  });

  if (!response.ok) {
    throw new Error(`上传失败: ${response.status} - ${await response.text()}`);
  }

  // Parse by hand so a non-JSON body (for example an HTML error page served
  // with a 2xx status) is reported with the same "bad response" error as a
  // JSON body of the wrong shape, instead of a bare SyntaxError.
  const rawBody = await response.text();
  let data;
  try {
    data = JSON.parse(rawBody);
  } catch (error) {
    throw new Error(`上传响应格式错误: ${rawBody}`, { cause: error });
  }

  if (!data?.success || !data.data || !data.data.path) {
    throw new Error(`上传响应格式错误: ${JSON.stringify(data)}`);
  }

  return data.data.path;
};
|
|
82
|
+
|
|
83
|
+
/**
 * Build the URL of an uploaded file from the upload endpoint's origin and
 * the path returned by the upload response.
 *
 * Only the scheme, host and port of `uploadUrl` are kept; its own path and
 * query string are discarded. A path that does not start with `/` gets one
 * prepended so the host and the path can never run together.
 *
 * @param {string} uploadUrl - Absolute URL of the upload endpoint.
 * @param {string} filePath - Server-side path of the uploaded file.
 * @returns {string} `<scheme>://<host[:port]><filePath>`.
 * @throws {TypeError} When `uploadUrl` is not a valid absolute URL.
 */
const getStaticUrl = (uploadUrl, filePath) => {
  const url = new URL(uploadUrl);
  const resourcePath = String(filePath);
  const absolutePath = resourcePath.startsWith('/') ? resourcePath : `/${resourcePath}`;
  return `${url.protocol}//${url.host}${absolutePath}`;
};
|
|
90
|
+
|
|
91
|
+
/**
 * Submit an asynchronous transcription task to the ASR service.
 *
 * Posts the file URLs to `ASR_SERVICE_URL` with the `paraformer-v2` model,
 * channel 0 and the module's `LANGUAGE_HINTS`, with the
 * `X-DashScope-Async: enable` header set.
 *
 * @param {string[]} fileUrls - URLs of the audio files to transcribe.
 * @param {string} apiKey - API key sent as a Bearer token.
 * @returns {Promise<string>} The `output.task_id` of the created task.
 * @throws {Error} When the HTTP status is not OK or the response carries no
 *   `output.task_id`.
 */
const submitAsrTask = async (fileUrls, apiKey) => {
  const requestHeaders = {
    'Authorization': `Bearer ${apiKey}`,
    'Content-Type': 'application/json',
    'X-DashScope-Async': 'enable',
  };
  const requestBody = {
    model: 'paraformer-v2',
    input: { file_urls: fileUrls },
    parameters: {
      channel_id: [0],
      language_hints: LANGUAGE_HINTS,
    },
  };

  const response = await fetch(ASR_SERVICE_URL, {
    method: 'POST',
    headers: requestHeaders,
    body: JSON.stringify(requestBody),
  });

  if (!response.ok) {
    const errorText = await response.text();
    throw new Error(`ASR 任务提交失败: ${response.status} - ${errorText}`);
  }

  const payload = await response.json();
  const hasTaskId = Boolean(payload.output && payload.output.task_id);
  if (!hasTaskId) {
    throw new Error(`ASR 响应格式错误: ${JSON.stringify(payload)}`);
  }

  return payload.output.task_id;
};
|
|
125
|
+
|
|
126
|
+
/**
 * Poll the task-status endpoint until the ASR task reaches a final state.
 *
 * The status is queried every 500 ms. `SUCCEEDED` returns the task's
 * results. `FAILED`, `CANCELED` and `UNKNOWN` stop polling immediately and
 * throw, including the `code` / `message` fields of the response when they
 * are present. Any other status keeps polling until the timeout elapses.
 *
 * NOTE(review): `CANCELED` / `UNKNOWN` as terminal states and the
 * `output.code` / `output.message` fields follow the DashScope task API
 * documentation - confirm against the current service behavior.
 *
 * @param {string} taskId - Task id returned when the task was submitted.
 * @param {string} apiKey - API key sent as a Bearer token.
 * @param {number} [timeoutMs=300000] - Maximum total time to keep polling.
 * @returns {Promise<Array>} `output.results`, or an empty array when absent.
 * @throws {Error} On a non-OK HTTP status, a failed/cancelled/unknown task,
 *   or when the timeout elapses first.
 */
const waitForTaskComplete = async (taskId, apiKey, timeoutMs = 5 * 60 * 1000) => {
  const taskUrl = `https://dashscope.aliyuncs.com/api/v1/tasks/${taskId}`;
  const pollInterval = 500;
  const terminalFailures = ['FAILED', 'CANCELED', 'UNKNOWN'];
  const deadline = Date.now() + timeoutMs;

  while (Date.now() < deadline) {
    const response = await fetch(taskUrl, {
      method: 'GET',
      headers: {
        'Authorization': `Bearer ${apiKey}`,
      },
    });

    if (!response.ok) {
      throw new Error(`查询任务状态失败: ${response.status}`);
    }

    const data = await response.json();
    const output = data?.output ?? {};
    const status = output.task_status;

    if (status === 'SUCCEEDED') {
      return output.results || [];
    }

    if (terminalFailures.includes(status)) {
      // Surface whatever diagnostics the service supplied instead of a bare
      // "failed", and do not keep polling a task that can no longer finish.
      const headline = status === 'FAILED' ? 'ASR 识别任务失败' : `ASR 识别任务失败 (${status})`;
      const detail = [output.code, output.message].filter(Boolean).join(' - ');
      throw new Error(detail ? `${headline}: ${detail}` : headline);
    }

    await new Promise((resolve) => setTimeout(resolve, pollInterval));
  }

  throw new Error('ASR 识别超时');
};
|
|
160
|
+
|
|
161
|
+
/**
 * Download the transcription JSON for every successful subtask.
 *
 * Only results whose `subtask_status` is `SUCCEEDED` and that carry a
 * `transcription_url` are requested. The downloads are independent of each
 * other, so they run concurrently; the output keeps the order of `results`.
 * A download that answers with a non-OK status is left out of the output
 * rather than failing the whole batch.
 *
 * @param {Iterable<object>} results - Subtask results of a finished task.
 * @returns {Promise<Array<{file_url: string, transcription: object}>>}
 *   One entry per transcription that was downloaded successfully.
 */
const fetchTranscriptionResults = async (results) => {
  const downloads = Array.from(results)
    .filter((result) => result.subtask_status === 'SUCCEEDED' && result.transcription_url)
    .map(async (result) => {
      const response = await fetch(result.transcription_url);
      if (!response.ok) {
        return null;
      }
      return {
        file_url: result.file_url,
        transcription: await response.json(),
      };
    });

  const settled = await Promise.all(downloads);
  return settled.filter((entry) => entry !== null);
};
|
|
182
|
+
|
|
183
|
+
/**
 * Transcribe an audio file and save the result as an SRT subtitle file.
 *
 * Steps: check the API key and the file size, upload the audio, derive its
 * URL from the upload endpoint, submit the transcription task, wait for it
 * to finish, download the transcriptions, then write them as
 * `asr_<timestamp>.srt` into the working directory.
 *
 * @param {object} options
 * @param {string} options.workingDir - Directory that receives the SRT file.
 * @param {string} options.audioPath - Path of the audio file to transcribe.
 * @param {string} options.apiKey - DashScope API key.
 * @returns {Promise<string>} Name (not full path) of the written SRT file.
 * @throws {Error} When the key is missing, the file exceeds
 *   `MAX_FILE_SIZE_BYTES`, any remote step fails, or no transcription could
 *   be downloaded.
 */
const asr = async ({ workingDir, audioPath, apiKey }) => {
  if (!apiKey) {
    throw new Error('请配置 DASHSCOPE_API_KEY 环境变量');
  }

  // Reject oversized files before spending time on the upload.
  const { size: byteCount } = fs.statSync(audioPath);
  if (byteCount > MAX_FILE_SIZE_BYTES) {
    const sizeInMb = (byteCount / 1024 / 1024).toFixed(2);
    throw new Error(`音频文件大小 ${sizeInMb}MB 超过限制(最大 120MB)。请先对音频进行分段处理后再逐段识别。`);
  }

  // Upload, then turn the returned server path into a fetchable URL.
  const uploadedPath = await uploadAudio(ASR_UPLOAD_URL, audioPath);
  const audioUrl = getStaticUrl(ASR_UPLOAD_URL, uploadedPath);

  // Run the transcription task to completion and collect its output.
  const taskId = await submitAsrTask([audioUrl], apiKey);
  const taskResults = await waitForTaskComplete(taskId, apiKey);
  const transcriptions = await fetchTranscriptionResults(taskResults);

  if (!transcriptions.length) {
    throw new Error('未获取到识别结果');
  }

  // Render the subtitles and store them under a timestamped name.
  const srtContent = asrToSrt(transcriptions);
  const filename = `asr_${Date.now()}.srt`;
  fs.writeFileSync(path.join(workingDir, filename), srtContent, 'utf-8');

  return filename;
};
|
|
228
|
+
|
|
229
|
+
module.exports = {
|
|
230
|
+
asr,
|
|
231
|
+
SUPPORTED_AUDIO_TYPES,
|
|
232
|
+
MAX_FILE_SIZE_BYTES,
|
|
233
|
+
};
|
package/utils/tts.js
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
const fs = require('fs');
|
|
2
|
+
const path = require('path');
|
|
3
|
+
const WebSocket = require('ws');
|
|
4
|
+
const { v4: uuidv4 } = require('uuid');
|
|
5
|
+
|
|
6
|
+
const TTS_WS_URL = 'wss://dashscope.aliyuncs.com/api-ws/v1/inference/';
|
|
7
|
+
|
|
8
|
+
/**
 * Synthesize speech for a piece of text and save it as an mp3 file.
 *
 * Opens a WebSocket to `TTS_WS_URL`, sends a single `run-task` message and
 * appends every binary frame it receives to `tts_<timestamp>.mp3` in the
 * working directory. Voice model, format, sample rate, volume, rate and
 * pitch are fixed.
 *
 * The returned promise resolves only after the output file has been fully
 * flushed, so the caller can read it straight away. When the task fails,
 * the connection errors or closes early, or the two-minute timeout elapses,
 * the socket is torn down, the timer is cleared and the partial output file
 * is removed before the promise rejects.
 *
 * @param {object} options
 * @param {string} options.workingDir - Directory that receives the audio file.
 * @param {string} options.text - Text to synthesize; trimmed before sending.
 * @param {string} options.apiKey - DashScope API key.
 * @returns {Promise<string>} Name (not full path) of the written audio file.
 * @throws {Error} When the key is missing, the text is empty, the output
 *   file cannot be written, or the synthesis fails or times out.
 */
const tts = async ({ workingDir, text, apiKey }) => {
  // Fixed synthesis parameters.
  const voice = 'sambert-zhimiao-emo-v1';
  const format = 'mp3';
  const sampleRate = 16000;
  const timeoutMs = 2 * 60 * 1000;

  if (!apiKey) {
    throw new Error('请配置 DASHSCOPE_API_KEY 环境变量');
  }

  if (!text || typeof text !== 'string' || !text.trim()) {
    throw new Error('合成文本不能为空');
  }

  const trimmedText = text.trim();
  const filename = `tts_${Date.now()}.${format}`;
  const outputPath = path.join(workingDir, filename);

  return new Promise((resolve, reject) => {
    // Create (or truncate) the file synchronously so an unwritable working
    // directory rejects immediately, before any connection is opened.
    fs.writeFileSync(outputPath, '');
    const fileStream = fs.createWriteStream(outputPath, { flags: 'a' });

    const ws = new WebSocket(TTS_WS_URL, {
      headers: {
        Authorization: `bearer ${apiKey}`,
        'X-DashScope-DataInspection': 'enable',
      },
    });

    let settled = false;
    let timer = null;

    // Stop the timeout and tear down the socket in whatever state it is in.
    const releaseConnection = () => {
      clearTimeout(timer);
      if (ws.readyState === WebSocket.OPEN) {
        ws.close();
      } else if (ws.readyState === WebSocket.CONNECTING) {
        ws.terminate();
      }
    };

    // Failure path: drop the partial file so callers never pick up a
    // truncated or empty mp3, then reject.
    const fail = (error) => {
      if (settled) {
        return;
      }
      settled = true;
      releaseConnection();
      fileStream.destroy();
      fs.rm(outputPath, { force: true }, () => reject(error));
    };

    // Success path: the promise is resolved by the stream's 'finish' event,
    // i.e. only once every buffered chunk has reached the file.
    const succeed = () => {
      if (settled) {
        return;
      }
      settled = true;
      releaseConnection();
      fileStream.end();
    };

    fileStream.on('finish', () => resolve(filename));
    fileStream.on('error', (error) => {
      const wrapped = new Error(`写入音频文件失败: ${error.message}`);
      if (settled) {
        // A write error while flushing after success must still reject.
        reject(wrapped);
      } else {
        fail(wrapped);
      }
    });

    ws.on('open', () => {
      const runTaskMessage = {
        header: {
          action: 'run-task',
          task_id: uuidv4(),
          streaming: 'out',
        },
        payload: {
          model: voice,
          task_group: 'audio',
          task: 'tts',
          function: 'SpeechSynthesizer',
          input: {
            text: trimmedText,
          },
          parameters: {
            text_type: 'PlainText',
            format,
            sample_rate: sampleRate,
            volume: 50,
            rate: 1,
            pitch: 1,
            word_timestamp_enabled: false,
            phoneme_timestamp_enabled: false,
          },
        },
      };
      ws.send(JSON.stringify(runTaskMessage));
    });

    ws.on('message', (data, isBinary) => {
      if (isBinary) {
        // Binary frames are audio data.
        fileStream.write(data);
        return;
      }

      let message;
      try {
        message = JSON.parse(data.toString());
      } catch {
        // Deliberately ignored: a text frame that is not JSON carries no
        // event this function acts on.
        return;
      }

      const event = message.header?.event;
      if (event === 'task-finished') {
        succeed();
      } else if (event === 'task-failed') {
        fail(new Error(message.header?.error_message || 'TTS 任务失败'));
      }
    });

    ws.on('error', (error) => {
      fail(new Error(`WebSocket 错误: ${error.message}`));
    });

    ws.on('close', () => {
      // No-op when the task already finished or failed.
      fail(new Error('WebSocket 连接意外关闭'));
    });

    timer = setTimeout(() => {
      fail(new Error('TTS 任务超时'));
    }, timeoutMs);
  });
};
|
|
130
|
+
|
|
131
|
+
module.exports = {
|
|
132
|
+
tts,
|
|
133
|
+
};
|