@xiaozhiclaw/provider-core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/aliyun-oss-file-upload-adapter.d.ts +44 -0
- package/dist/adapters/aliyun-oss-file-upload-adapter.js +96 -0
- package/dist/adapters/gemini-file-upload-adapter.d.ts +26 -0
- package/dist/adapters/gemini-file-upload-adapter.js +92 -0
- package/dist/adapters/hub-oss-file-upload-adapter.d.ts +29 -0
- package/dist/adapters/hub-oss-file-upload-adapter.js +53 -0
- package/dist/adapters/index.d.ts +10 -0
- package/dist/adapters/index.js +10 -0
- package/dist/adapters/openai-file-upload-adapter.d.ts +38 -0
- package/dist/adapters/openai-file-upload-adapter.js +56 -0
- package/dist/adapters/volcengine-file-upload-adapter.d.ts +24 -0
- package/dist/adapters/volcengine-file-upload-adapter.js +45 -0
- package/dist/builtin-providers.d.ts +8 -0
- package/dist/builtin-providers.js +2237 -0
- package/dist/constants.d.ts +1 -0
- package/dist/constants.js +1 -0
- package/dist/credentials.d.ts +1 -0
- package/dist/credentials.js +8 -0
- package/dist/debug-transport.d.ts +12 -0
- package/dist/debug-transport.js +99 -0
- package/dist/errors.d.ts +11 -0
- package/dist/errors.js +12 -0
- package/dist/events.d.ts +48 -0
- package/dist/events.js +1 -0
- package/dist/file-upload-service.d.ts +68 -0
- package/dist/file-upload-service.js +110 -0
- package/dist/gemini-schema-utils.d.ts +17 -0
- package/dist/gemini-schema-utils.js +76 -0
- package/dist/index.d.ts +37 -0
- package/dist/index.js +33 -0
- package/dist/llm-client.d.ts +43 -0
- package/dist/llm-client.js +217 -0
- package/dist/media-client.d.ts +42 -0
- package/dist/media-client.js +174 -0
- package/dist/media-transport.d.ts +176 -0
- package/dist/media-transport.js +16 -0
- package/dist/media.d.ts +2 -0
- package/dist/media.js +1 -0
- package/dist/model-detection.d.ts +22 -0
- package/dist/model-detection.js +28 -0
- package/dist/paths.d.ts +2 -0
- package/dist/paths.js +11 -0
- package/dist/provider-def.d.ts +220 -0
- package/dist/provider-def.js +9 -0
- package/dist/provider-registry.d.ts +51 -0
- package/dist/provider-registry.js +130 -0
- package/dist/provider-tool-api.d.ts +44 -0
- package/dist/provider-tool-api.js +9 -0
- package/dist/provider-variant-resolver.d.ts +35 -0
- package/dist/provider-variant-resolver.js +174 -0
- package/dist/retry.d.ts +37 -0
- package/dist/retry.js +71 -0
- package/dist/transport.d.ts +281 -0
- package/dist/transport.js +27 -0
- package/dist/transports/anthropic-messages.d.ts +65 -0
- package/dist/transports/anthropic-messages.js +1004 -0
- package/dist/transports/gemini-cache-api.d.ts +86 -0
- package/dist/transports/gemini-cache-api.js +141 -0
- package/dist/transports/gemini-file-api.d.ts +90 -0
- package/dist/transports/gemini-file-api.js +164 -0
- package/dist/transports/gemini-generatecontent.d.ts +56 -0
- package/dist/transports/gemini-generatecontent.js +688 -0
- package/dist/transports/gemini-lyria-realtime.d.ts +117 -0
- package/dist/transports/gemini-lyria-realtime.js +295 -0
- package/dist/transports/gemini-media.d.ts +53 -0
- package/dist/transports/gemini-media.js +383 -0
- package/dist/transports/media-resolve.d.ts +50 -0
- package/dist/transports/media-resolve.js +91 -0
- package/dist/transports/minimax-media.d.ts +56 -0
- package/dist/transports/minimax-media.js +433 -0
- package/dist/transports/openai-chat.d.ts +81 -0
- package/dist/transports/openai-chat.js +782 -0
- package/dist/transports/openai-media.d.ts +24 -0
- package/dist/transports/openai-media.js +118 -0
- package/dist/transports/openai-responses.d.ts +63 -0
- package/dist/transports/openai-responses.js +778 -0
- package/dist/transports/qwen-media.d.ts +59 -0
- package/dist/transports/qwen-media.js +411 -0
- package/dist/transports/realtime-transport.d.ts +183 -0
- package/dist/transports/realtime-transport.js +332 -0
- package/dist/transports/volcengine-grounding.d.ts +58 -0
- package/dist/transports/volcengine-grounding.js +69 -0
- package/dist/transports/volcengine-media.d.ts +94 -0
- package/dist/transports/volcengine-media.js +801 -0
- package/dist/transports/volcengine-responses.d.ts +64 -0
- package/dist/transports/volcengine-responses.js +797 -0
- package/dist/transports/zhipu-media.d.ts +82 -0
- package/dist/transports/zhipu-media.js +522 -0
- package/dist/transports/zhipu-tool-api.d.ts +35 -0
- package/dist/transports/zhipu-tool-api.js +126 -0
- package/dist/wire-types.d.ts +51 -0
- package/dist/wire-types.js +1 -0
- package/package.json +33 -0
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Qwen (DashScope) Media Transport.
|
|
3
|
+
*
|
|
4
|
+
* DashScope async task API pattern:
|
|
5
|
+
* Submit: POST /api/v1/services/aigc/<service>/generation (X-DashScope-Async: enable)
|
|
6
|
+
* Poll: GET /api/v1/tasks/{taskId}
|
|
7
|
+
*
|
|
8
|
+
* Auth: Authorization: Bearer $DASHSCOPE_API_KEY
|
|
9
|
+
* Docs: https://help.aliyun.com/zh/model-studio/developer-reference/
|
|
10
|
+
*
|
|
11
|
+
* Video models: wan2.7-t2v (text-to-video), wan2.7-i2v (image-to-video)
|
|
12
|
+
* TTS endpoints:
|
|
13
|
+
* Qwen3-TTS: /api/v1/services/aigc/multimodal-generation/generation
|
|
14
|
+
* CosyVoice: /api/v1/services/audio/tts/SpeechSynthesizer
|
|
15
|
+
* Rerank: /api/v1/services/rerank/text-rerank/text-rerank
|
|
16
|
+
* Embedding: /compatible-mode/v1/embeddings
|
|
17
|
+
*/
|
|
18
|
+
import type { AsyncMediaTransport, MediaRequest, MediaResult, MediaType } from "../media-transport.js";
|
|
19
|
+
/**
 * Construction options for the DashScope (Qwen) media transport.
 */
export interface QwenMediaConfig {
    /** Root URL of the DashScope service, for example "https://dashscope.aliyuncs.com". */
    baseUrl: string;
    /**
     * Optional request timeout in milliseconds. The implementation applies it
     * only when the caller passes no AbortSignal and falls back to 30 000 ms
     * when this field is omitted.
     */
    timeoutMs?: number;
}
|
|
24
|
+
/**
 * Public type surface of the DashScope (Qwen) media transport.
 *
 * `generate` is the single entry point for producing media; the remaining
 * public methods manage asynchronous DashScope tasks by ID.
 */
export declare class QwenMediaTransport implements AsyncMediaTransport {
    /** Media types this transport declares support for. */
    readonly supportedTypes: ReadonlyArray<MediaType>;
    private baseUrl;
    private timeoutMs;
    constructor(config: QwenMediaConfig);
    /**
     * Run one media request.
     *
     * @param request Describes what to generate.
     * @param apiKey  DashScope API key, sent as a Bearer token.
     * @param signal  Optional caller-owned abort signal.
     */
    generate(request: MediaRequest, apiKey: string, signal?: AbortSignal): Promise<MediaResult>;
    private generateTTS;
    private generateQwenTTS;
    private generateCosyVoiceTTS;
    private generateEmbedding;
    private generateRerank;
    private pollTask;
    private generateVideo;
    /**
     * Look up one task through the DashScope unified task endpoint.
     * GET /api/v1/tasks/{taskId}
     *
     * @returns `status`, a normalized status string, and `task`, the raw response body.
     */
    getTaskStatus(taskId: string, apiKey: string, signal?: AbortSignal): Promise<{
        status: string;
        task: Record<string, unknown>;
    }>;
    /**
     * List tasks — DashScope has a task list API.
     * GET /api/v1/tasks?page_no=1&page_size=20&status=RUNNING
     */
    listVideoTasks(apiKey: string, options?: {
        after?: string;
        limit?: number;
        status?: string;
    }, signal?: AbortSignal): Promise<Record<string, unknown>>;
    /**
     * Cancel/delete — DashScope does not have a public task cancellation API.
     */
    deleteVideoTask(_taskId: string, _apiKey: string, _signal?: AbortSignal): Promise<void>;
    private extractMediaUrl;
}
|
|
@@ -0,0 +1,411 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Qwen (DashScope) Media Transport.
|
|
3
|
+
*
|
|
4
|
+
* DashScope async task API pattern:
|
|
5
|
+
* Submit: POST /api/v1/services/aigc/<service>/generation (X-DashScope-Async: enable)
|
|
6
|
+
* Poll: GET /api/v1/tasks/{taskId}
|
|
7
|
+
*
|
|
8
|
+
* Auth: Authorization: Bearer $DASHSCOPE_API_KEY
|
|
9
|
+
* Docs: https://help.aliyun.com/zh/model-studio/developer-reference/
|
|
10
|
+
*
|
|
11
|
+
* Video models: wan2.7-t2v (text-to-video), wan2.7-i2v (image-to-video)
|
|
12
|
+
* TTS endpoints:
|
|
13
|
+
* Qwen3-TTS: /api/v1/services/aigc/multimodal-generation/generation
|
|
14
|
+
* CosyVoice: /api/v1/services/audio/tts/SpeechSynthesizer
|
|
15
|
+
* Rerank: /api/v1/services/rerank/text-rerank/text-rerank
|
|
16
|
+
* Embedding: /compatible-mode/v1/embeddings
|
|
17
|
+
*/
|
|
18
|
+
/** Pause between two consecutive task-status polls, in milliseconds (2 s). */
const POLL_INTERVAL_MS = 2 * 1000;
/** Default polling budget in milliseconds: 2 minutes, sized for TTS. */
const MAX_POLL_MS_TTS = 2 * 60 * 1000;
/** Polling budget for video tasks in milliseconds: 10 minutes. */
const MAX_POLL_MS_VIDEO = 10 * 60 * 1000;
/** Request timeout in milliseconds used when the config does not set one (30 s). */
const DEFAULT_TIMEOUT_MS = 30 * 1000;
|
|
22
|
+
export class QwenMediaTransport {
|
|
23
|
+
supportedTypes = ["tts", "video", "embedding", "rerank"];
|
|
24
|
+
baseUrl;
|
|
25
|
+
timeoutMs;
|
|
26
|
+
constructor(config) {
|
|
27
|
+
// Strip /apps/anthropic suffix 鈥?TTS uses a different path
|
|
28
|
+
this.baseUrl = config.baseUrl
|
|
29
|
+
.replace(/\/apps\/anthropic\/?$/, "")
|
|
30
|
+
.replace(/\/+$/, "");
|
|
31
|
+
this.timeoutMs = config.timeoutMs ?? DEFAULT_TIMEOUT_MS;
|
|
32
|
+
}
|
|
33
|
+
async generate(request, apiKey, signal) {
|
|
34
|
+
if (request.mediaType === "video") {
|
|
35
|
+
return this.generateVideo(request, apiKey, signal);
|
|
36
|
+
}
|
|
37
|
+
if (request.mediaType === "embedding") {
|
|
38
|
+
return this.generateEmbedding(request, apiKey, signal);
|
|
39
|
+
}
|
|
40
|
+
if (request.mediaType === "rerank") {
|
|
41
|
+
return this.generateRerank(request, apiKey, signal);
|
|
42
|
+
}
|
|
43
|
+
if (request.mediaType !== "tts") {
|
|
44
|
+
throw new Error(`QwenMediaTransport: unsupported mediaType "${request.mediaType}"`);
|
|
45
|
+
}
|
|
46
|
+
return this.generateTTS(request, apiKey, signal);
|
|
47
|
+
}
|
|
48
|
+
async generateTTS(request, apiKey, signal) {
|
|
49
|
+
const start = Date.now();
|
|
50
|
+
const text = request.text || request.prompt;
|
|
51
|
+
if (!text) {
|
|
52
|
+
throw new Error("QwenMediaTransport: text or prompt is required for TTS");
|
|
53
|
+
}
|
|
54
|
+
if ((request.model || "").startsWith("cosyvoice")) {
|
|
55
|
+
return this.generateCosyVoiceTTS(request, apiKey, signal, start, text);
|
|
56
|
+
}
|
|
57
|
+
return this.generateQwenTTS(request, apiKey, signal, start, text);
|
|
58
|
+
}
|
|
59
|
+
async generateQwenTTS(request, apiKey, signal, start, text) {
|
|
60
|
+
const submitUrl = `${this.baseUrl}/api/v1/services/aigc/multimodal-generation/generation`;
|
|
61
|
+
const body = {
|
|
62
|
+
model: request.model || "qwen3-tts-flash",
|
|
63
|
+
input: {
|
|
64
|
+
text,
|
|
65
|
+
voice: request.voice ?? "Cherry",
|
|
66
|
+
language_type: request.metadata?.languageType ?? "English",
|
|
67
|
+
},
|
|
68
|
+
};
|
|
69
|
+
const submitRes = await fetch(submitUrl, {
|
|
70
|
+
method: "POST",
|
|
71
|
+
headers: {
|
|
72
|
+
"Content-Type": "application/json",
|
|
73
|
+
Authorization: `Bearer ${apiKey}`,
|
|
74
|
+
},
|
|
75
|
+
body: JSON.stringify(body),
|
|
76
|
+
signal: signal ?? AbortSignal.timeout(this.timeoutMs),
|
|
77
|
+
});
|
|
78
|
+
if (!submitRes.ok) {
|
|
79
|
+
const errText = await submitRes.text().catch(() => "");
|
|
80
|
+
throw new Error(`DashScope Qwen TTS error ${submitRes.status}: ${errText}`);
|
|
81
|
+
}
|
|
82
|
+
const submitData = await submitRes.json();
|
|
83
|
+
if (submitData.code) {
|
|
84
|
+
throw new Error(`DashScope Qwen TTS rejected: ${submitData.code}: ${submitData.message ?? ""}`);
|
|
85
|
+
}
|
|
86
|
+
const audioUrl = submitData.output?.audio?.url;
|
|
87
|
+
const characters = submitData.usage?.characters ?? text.length;
|
|
88
|
+
return {
|
|
89
|
+
mediaUrls: audioUrl ? [audioUrl] : [],
|
|
90
|
+
model: request.model || "qwen3-tts-flash",
|
|
91
|
+
durationMs: Date.now() - start,
|
|
92
|
+
billingUnit: "per_character",
|
|
93
|
+
billingQuantity: characters,
|
|
94
|
+
metadata: {
|
|
95
|
+
usage: submitData.usage,
|
|
96
|
+
finishReason: submitData.output?.finish_reason,
|
|
97
|
+
requestId: submitData.request_id,
|
|
98
|
+
},
|
|
99
|
+
};
|
|
100
|
+
}
|
|
101
|
+
async generateCosyVoiceTTS(request, apiKey, signal, start, text) {
|
|
102
|
+
const submitUrl = `${this.baseUrl}/api/v1/services/audio/tts/SpeechSynthesizer`;
|
|
103
|
+
const body = {
|
|
104
|
+
model: request.model || "cosyvoice-v2",
|
|
105
|
+
input: { text },
|
|
106
|
+
parameters: {
|
|
107
|
+
voice: request.voice ?? "longxiaochun_v2",
|
|
108
|
+
format: request.audioFormat ?? "mp3",
|
|
109
|
+
sample_rate: request.metadata?.sampleRate ?? 24000,
|
|
110
|
+
},
|
|
111
|
+
};
|
|
112
|
+
const submitRes = await fetch(submitUrl, {
|
|
113
|
+
method: "POST",
|
|
114
|
+
headers: {
|
|
115
|
+
"Content-Type": "application/json",
|
|
116
|
+
Authorization: `Bearer ${apiKey}`,
|
|
117
|
+
},
|
|
118
|
+
body: JSON.stringify(body),
|
|
119
|
+
signal: signal ?? AbortSignal.timeout(this.timeoutMs),
|
|
120
|
+
});
|
|
121
|
+
if (!submitRes.ok) {
|
|
122
|
+
const errText = await submitRes.text().catch(() => "");
|
|
123
|
+
throw new Error(`DashScope CosyVoice TTS error ${submitRes.status}: ${errText}`);
|
|
124
|
+
}
|
|
125
|
+
const data = await submitRes.json();
|
|
126
|
+
if (data.code) {
|
|
127
|
+
throw new Error(`DashScope CosyVoice TTS rejected: ${data.code}: ${data.message ?? ""}`);
|
|
128
|
+
}
|
|
129
|
+
const audioUrl = data.output?.audio?.url;
|
|
130
|
+
const characters = data.usage?.characters ?? text.length;
|
|
131
|
+
return {
|
|
132
|
+
mediaUrls: audioUrl ? [audioUrl] : [],
|
|
133
|
+
model: request.model || "cosyvoice-v2",
|
|
134
|
+
durationMs: Date.now() - start,
|
|
135
|
+
billingUnit: "per_character",
|
|
136
|
+
billingQuantity: characters,
|
|
137
|
+
metadata: {
|
|
138
|
+
usage: data.usage,
|
|
139
|
+
finishReason: data.output?.finish_reason,
|
|
140
|
+
requestId: data.request_id,
|
|
141
|
+
},
|
|
142
|
+
};
|
|
143
|
+
}
|
|
144
|
+
async generateEmbedding(request, apiKey, signal) {
|
|
145
|
+
const start = Date.now();
|
|
146
|
+
const text = request.text || request.prompt;
|
|
147
|
+
if (!text)
|
|
148
|
+
throw new Error("QwenMediaTransport: text or prompt is required for embedding");
|
|
149
|
+
const url = `${this.baseUrl}/compatible-mode/v1/embeddings`;
|
|
150
|
+
const body = {
|
|
151
|
+
model: request.model || "text-embedding-v4",
|
|
152
|
+
input: text,
|
|
153
|
+
};
|
|
154
|
+
if (request.metadata?.dimensions)
|
|
155
|
+
body.dimensions = request.metadata.dimensions;
|
|
156
|
+
const res = await fetch(url, {
|
|
157
|
+
method: "POST",
|
|
158
|
+
headers: {
|
|
159
|
+
"Content-Type": "application/json",
|
|
160
|
+
Authorization: `Bearer ${apiKey}`,
|
|
161
|
+
},
|
|
162
|
+
body: JSON.stringify(body),
|
|
163
|
+
signal: signal ?? AbortSignal.timeout(this.timeoutMs),
|
|
164
|
+
});
|
|
165
|
+
if (!res.ok) {
|
|
166
|
+
const textBody = await res.text().catch(() => "");
|
|
167
|
+
throw new Error(`DashScope embedding error ${res.status}: ${textBody}`);
|
|
168
|
+
}
|
|
169
|
+
const data = await res.json();
|
|
170
|
+
const embeddings = (data.data ?? []).map(item => item.embedding).filter((item) => Array.isArray(item));
|
|
171
|
+
const totalTokens = data.usage?.total_tokens ?? data.usage?.prompt_tokens;
|
|
172
|
+
return {
|
|
173
|
+
mediaUrls: [],
|
|
174
|
+
model: data.model ?? request.model ?? "text-embedding-v4",
|
|
175
|
+
durationMs: Date.now() - start,
|
|
176
|
+
billingUnit: totalTokens !== undefined ? "per_token" : undefined,
|
|
177
|
+
billingQuantity: totalTokens,
|
|
178
|
+
metadata: {
|
|
179
|
+
embeddings,
|
|
180
|
+
dimensions: embeddings[0]?.length ?? 0,
|
|
181
|
+
usage: data.usage,
|
|
182
|
+
},
|
|
183
|
+
};
|
|
184
|
+
}
|
|
185
|
+
async generateRerank(request, apiKey, signal) {
|
|
186
|
+
const start = Date.now();
|
|
187
|
+
const query = request.prompt;
|
|
188
|
+
const documents = request.metadata?.documents;
|
|
189
|
+
if (!query)
|
|
190
|
+
throw new Error("QwenMediaTransport: prompt (query) is required for rerank");
|
|
191
|
+
if (!Array.isArray(documents))
|
|
192
|
+
throw new Error("QwenMediaTransport: metadata.documents is required for rerank");
|
|
193
|
+
const url = `${this.baseUrl}/api/v1/services/rerank/text-rerank/text-rerank`;
|
|
194
|
+
const body = {
|
|
195
|
+
model: request.model || "qwen3-rerank",
|
|
196
|
+
input: { query, documents },
|
|
197
|
+
parameters: {},
|
|
198
|
+
};
|
|
199
|
+
if (request.metadata?.topN !== undefined) {
|
|
200
|
+
body.parameters.top_n = request.metadata.topN;
|
|
201
|
+
}
|
|
202
|
+
const res = await fetch(url, {
|
|
203
|
+
method: "POST",
|
|
204
|
+
headers: {
|
|
205
|
+
"Content-Type": "application/json",
|
|
206
|
+
Authorization: `Bearer ${apiKey}`,
|
|
207
|
+
},
|
|
208
|
+
body: JSON.stringify(body),
|
|
209
|
+
signal: signal ?? AbortSignal.timeout(this.timeoutMs),
|
|
210
|
+
});
|
|
211
|
+
if (!res.ok) {
|
|
212
|
+
const textBody = await res.text().catch(() => "");
|
|
213
|
+
throw new Error(`DashScope rerank error ${res.status}: ${textBody}`);
|
|
214
|
+
}
|
|
215
|
+
const data = await res.json();
|
|
216
|
+
const totalTokens = data.usage?.total_tokens;
|
|
217
|
+
return {
|
|
218
|
+
mediaUrls: [],
|
|
219
|
+
model: request.model || "qwen3-rerank",
|
|
220
|
+
durationMs: Date.now() - start,
|
|
221
|
+
billingUnit: totalTokens !== undefined ? "per_token" : undefined,
|
|
222
|
+
billingQuantity: totalTokens,
|
|
223
|
+
metadata: {
|
|
224
|
+
results: (data.output?.results ?? []).map(r => ({
|
|
225
|
+
index: r.index,
|
|
226
|
+
relevanceScore: r.relevance_score,
|
|
227
|
+
document: r.document,
|
|
228
|
+
})),
|
|
229
|
+
usage: data.usage,
|
|
230
|
+
},
|
|
231
|
+
};
|
|
232
|
+
}
|
|
233
|
+
async pollTask(taskId, apiKey, signal, onProgress, maxPollMs = MAX_POLL_MS_TTS) {
|
|
234
|
+
const deadline = Date.now() + maxPollMs;
|
|
235
|
+
const pollUrl = `${this.baseUrl}/api/v1/tasks/${taskId}`;
|
|
236
|
+
while (Date.now() < deadline) {
|
|
237
|
+
signal?.throwIfAborted();
|
|
238
|
+
await sleep(POLL_INTERVAL_MS);
|
|
239
|
+
const res = await fetch(pollUrl, {
|
|
240
|
+
method: "GET",
|
|
241
|
+
headers: { Authorization: `Bearer ${apiKey}` },
|
|
242
|
+
signal: signal ?? AbortSignal.timeout(this.timeoutMs),
|
|
243
|
+
});
|
|
244
|
+
if (!res.ok) {
|
|
245
|
+
const text = await res.text().catch(() => "");
|
|
246
|
+
throw new Error(`DashScope TTS poll error ${res.status}: ${text}`);
|
|
247
|
+
}
|
|
248
|
+
const data = await res.json();
|
|
249
|
+
if (data.code) {
|
|
250
|
+
throw new Error(`DashScope poll rejected: ${data.code} 鈥?${data.message ?? ""}`);
|
|
251
|
+
}
|
|
252
|
+
const status = data.output?.task_status;
|
|
253
|
+
if (status === "SUCCEEDED") {
|
|
254
|
+
onProgress?.(100, "completed", taskId);
|
|
255
|
+
const mediaUrl = this.extractMediaUrl(data);
|
|
256
|
+
return mediaUrl ? [mediaUrl] : [];
|
|
257
|
+
}
|
|
258
|
+
if (status === "FAILED") {
|
|
259
|
+
throw new Error(`DashScope task failed: ${data.message ?? "unknown error"}`);
|
|
260
|
+
}
|
|
261
|
+
// PENDING / RUNNING 鈥?report progress and continue polling
|
|
262
|
+
const elapsed = Date.now() - (deadline - maxPollMs);
|
|
263
|
+
onProgress?.(Math.min(95, Math.round((elapsed / maxPollMs) * 100)), status ?? "running", taskId);
|
|
264
|
+
}
|
|
265
|
+
throw new Error("DashScope task timed out");
|
|
266
|
+
}
|
|
267
|
+
// 鈹€鈹€ Video Generation (qwen-ProviderMax 搂19-23) 鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€
|
|
268
|
+
// DashScope video uses the same async task pattern as TTS.
|
|
269
|
+
// Submit: POST /api/v1/services/aigc/generation/generation
|
|
270
|
+
// Poll: GET /api/v1/tasks/{taskId}
|
|
271
|
+
async generateVideo(request, apiKey, signal) {
|
|
272
|
+
const start = Date.now();
|
|
273
|
+
const model = request.model || "wan2.1-t2v-turbo";
|
|
274
|
+
// DashScope video request body
|
|
275
|
+
const input = {
|
|
276
|
+
prompt: request.prompt,
|
|
277
|
+
};
|
|
278
|
+
// Image-to-video: pass img_url
|
|
279
|
+
if (request.imageUrl) {
|
|
280
|
+
input.img_url = request.imageUrl;
|
|
281
|
+
}
|
|
282
|
+
else if (request.referenceImages?.length) {
|
|
283
|
+
// First-last-frame or reference mode
|
|
284
|
+
if (request.imageRoles) {
|
|
285
|
+
const firstIdx = request.imageRoles.indexOf("first_frame");
|
|
286
|
+
const lastIdx = request.imageRoles.indexOf("last_frame");
|
|
287
|
+
if (firstIdx >= 0)
|
|
288
|
+
input.img_url = request.referenceImages[firstIdx];
|
|
289
|
+
if (lastIdx >= 0)
|
|
290
|
+
input.tail_image_url = request.referenceImages[lastIdx];
|
|
291
|
+
// Reference images
|
|
292
|
+
const refIndices = request.imageRoles
|
|
293
|
+
.map((role, i) => role === "reference_image" ? i : -1)
|
|
294
|
+
.filter(i => i >= 0);
|
|
295
|
+
if (refIndices.length > 0) {
|
|
296
|
+
input.ref_image_url = request.referenceImages[refIndices[0]];
|
|
297
|
+
}
|
|
298
|
+
}
|
|
299
|
+
else {
|
|
300
|
+
input.img_url = request.referenceImages[0];
|
|
301
|
+
}
|
|
302
|
+
}
|
|
303
|
+
const parameters = {};
|
|
304
|
+
if (request.duration)
|
|
305
|
+
parameters.duration = request.duration;
|
|
306
|
+
if (request.resolution)
|
|
307
|
+
parameters.resolution = request.resolution;
|
|
308
|
+
if (request.aspectRatio)
|
|
309
|
+
parameters.ratio = request.aspectRatio;
|
|
310
|
+
if (request.metadata?.size)
|
|
311
|
+
parameters.size = request.metadata.size;
|
|
312
|
+
const submitUrl = `${this.baseUrl}/api/v1/services/aigc/generation/generation`;
|
|
313
|
+
const body = { model, input };
|
|
314
|
+
if (Object.keys(parameters).length > 0)
|
|
315
|
+
body.parameters = parameters;
|
|
316
|
+
const submitRes = await fetch(submitUrl, {
|
|
317
|
+
method: "POST",
|
|
318
|
+
headers: {
|
|
319
|
+
"Content-Type": "application/json",
|
|
320
|
+
Authorization: `Bearer ${apiKey}`,
|
|
321
|
+
"X-DashScope-Async": "enable",
|
|
322
|
+
},
|
|
323
|
+
body: JSON.stringify(body),
|
|
324
|
+
signal: signal ?? AbortSignal.timeout(this.timeoutMs),
|
|
325
|
+
});
|
|
326
|
+
if (!submitRes.ok) {
|
|
327
|
+
const errText = await submitRes.text().catch(() => "");
|
|
328
|
+
throw new Error(`DashScope video submit error ${submitRes.status}: ${errText}`);
|
|
329
|
+
}
|
|
330
|
+
const submitData = await submitRes.json();
|
|
331
|
+
if (submitData.code) {
|
|
332
|
+
throw new Error(`DashScope video submit rejected: ${submitData.code} 鈥?${submitData.message ?? ""}`);
|
|
333
|
+
}
|
|
334
|
+
const taskId = submitData.output?.task_id;
|
|
335
|
+
if (!taskId) {
|
|
336
|
+
throw new Error("DashScope video submit: no task_id in response");
|
|
337
|
+
}
|
|
338
|
+
// Poll for completion with longer timeout for video
|
|
339
|
+
const videoUrls = await this.pollTask(taskId, apiKey, signal, request.onProgress, MAX_POLL_MS_VIDEO);
|
|
340
|
+
return {
|
|
341
|
+
mediaUrls: videoUrls,
|
|
342
|
+
model,
|
|
343
|
+
durationMs: Date.now() - start,
|
|
344
|
+
taskId,
|
|
345
|
+
};
|
|
346
|
+
}
|
|
347
|
+
// 鈹€鈹€ AsyncMediaTransport: Task Management 鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€
|
|
348
|
+
/**
|
|
349
|
+
* Query a single task by ID using DashScope unified task endpoint.
|
|
350
|
+
* GET /api/v1/tasks/{taskId}
|
|
351
|
+
*/
|
|
352
|
+
async getTaskStatus(taskId, apiKey, signal) {
|
|
353
|
+
const url = `${this.baseUrl}/api/v1/tasks/${taskId}`;
|
|
354
|
+
const res = await fetch(url, {
|
|
355
|
+
method: "GET",
|
|
356
|
+
headers: { Authorization: `Bearer ${apiKey}` },
|
|
357
|
+
signal: signal ?? AbortSignal.timeout(this.timeoutMs),
|
|
358
|
+
});
|
|
359
|
+
if (!res.ok) {
|
|
360
|
+
const text = await res.text().catch(() => "");
|
|
361
|
+
throw new Error(`DashScope task status error ${res.status}: ${text}`);
|
|
362
|
+
}
|
|
363
|
+
const data = await res.json();
|
|
364
|
+
const output = data.output;
|
|
365
|
+
const rawStatus = output?.task_status ?? "unknown";
|
|
366
|
+
const normalizedStatus = rawStatus === "SUCCEEDED" ? "succeeded"
|
|
367
|
+
: rawStatus === "FAILED" ? "failed"
|
|
368
|
+
: rawStatus.toLowerCase();
|
|
369
|
+
return { status: normalizedStatus, task: data };
|
|
370
|
+
}
|
|
371
|
+
/**
|
|
372
|
+
* List tasks 鈥?DashScope has a task list API.
|
|
373
|
+
* GET /api/v1/tasks?page_no=1&page_size=20&status=RUNNING
|
|
374
|
+
*/
|
|
375
|
+
async listVideoTasks(apiKey, options, signal) {
|
|
376
|
+
const params = new URLSearchParams();
|
|
377
|
+
params.set("page_size", String(options?.limit ?? 20));
|
|
378
|
+
if (options?.status)
|
|
379
|
+
params.set("status", options.status.toUpperCase());
|
|
380
|
+
const url = `${this.baseUrl}/api/v1/tasks?${params.toString()}`;
|
|
381
|
+
const res = await fetch(url, {
|
|
382
|
+
method: "GET",
|
|
383
|
+
headers: { Authorization: `Bearer ${apiKey}` },
|
|
384
|
+
signal: signal ?? AbortSignal.timeout(this.timeoutMs),
|
|
385
|
+
});
|
|
386
|
+
if (!res.ok) {
|
|
387
|
+
// DashScope may not support task list 鈥?return empty gracefully
|
|
388
|
+
return { data: [] };
|
|
389
|
+
}
|
|
390
|
+
return await res.json();
|
|
391
|
+
}
|
|
392
|
+
/**
|
|
393
|
+
* Cancel/delete 鈥?DashScope does not have a public task cancellation API.
|
|
394
|
+
*/
|
|
395
|
+
async deleteVideoTask(_taskId, _apiKey, _signal) {
|
|
396
|
+
throw new Error("DashScope does not support task cancellation.");
|
|
397
|
+
}
|
|
398
|
+
extractMediaUrl(data) {
|
|
399
|
+
// DashScope returns different URL fields per media type
|
|
400
|
+
if (data.output?.video_url)
|
|
401
|
+
return data.output.video_url;
|
|
402
|
+
if (data.output?.audio_url)
|
|
403
|
+
return data.output.audio_url;
|
|
404
|
+
if (data.output?.results?.[0]?.url)
|
|
405
|
+
return data.output.results[0].url;
|
|
406
|
+
return undefined;
|
|
407
|
+
}
|
|
408
|
+
}
|
|
409
|
+
/**
 * Return a promise that resolves (with no value) once a timer of `ms`
 * milliseconds fires.
 */
function sleep(ms) {
    return new Promise((done) => {
        setTimeout(done, ms);
    });
}
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Realtime WebSocket Transport — bidirectional audio/voice streaming
|
|
3
|
+
* via the OpenAI Realtime API protocol (also compatible with GLM Realtime).
|
|
4
|
+
*
|
|
5
|
+
* ## Protocol: WebSocket JSON events
|
|
6
|
+
*
|
|
7
|
+
* Client → Server:
|
|
8
|
+
* - session.update: configure session (model, voice, tools, etc.)
|
|
9
|
+
* - input_audio_buffer.append: send audio chunks (base64 PCM16)
|
|
10
|
+
* - input_audio_buffer.commit: signal end of audio input
|
|
11
|
+
* - conversation.item.create: inject text/function_result items
|
|
12
|
+
* - response.create: request a model response
|
|
13
|
+
* - response.cancel: abort in-progress response
|
|
14
|
+
*
|
|
15
|
+
* Server → Client:
|
|
16
|
+
* - session.created: session initialized
|
|
17
|
+
* - session.updated: config acknowledged
|
|
18
|
+
* - input_audio_buffer.speech_started: VAD detected speech
|
|
19
|
+
* - input_audio_buffer.speech_stopped: VAD detected silence
|
|
20
|
+
* - response.created: response generation started
|
|
21
|
+
* - response.output_item.added: new output item (text/audio/function_call)
|
|
22
|
+
* - response.audio.delta: audio chunk (base64 PCM16)
|
|
23
|
+
* - response.audio_transcript.delta: transcript of generated speech
|
|
24
|
+
* - response.text.delta: text generation delta
|
|
25
|
+
* - response.function_call_arguments.delta: tool call args delta
|
|
26
|
+
* - response.function_call_arguments.done: tool call complete
|
|
27
|
+
* - response.output_item.done: output item finished
|
|
28
|
+
* - response.done: full response complete
|
|
29
|
+
* - error: server error
|
|
30
|
+
*
|
|
31
|
+
* ## Architecture
|
|
32
|
+
*
|
|
33
|
+
* RealtimeTransport manages a single persistent WebSocket connection per session.
|
|
34
|
+
* It exposes an event-driven API (AsyncGenerator) that the agent tool-loop
|
|
35
|
+
* can consume for voice-enabled interactions.
|
|
36
|
+
*
|
|
37
|
+
* Docs:
|
|
38
|
+
* - OpenAI: https://platform.openai.com/docs/api-reference/realtime
|
|
39
|
+
* - GLM: https://docs.bigmodel.cn/cn/guide/develop/realtime-api
|
|
40
|
+
*/
|
|
41
|
+
/**
 * Settings passed to the RealtimeTransport constructor.
 */
export interface RealtimeConfig {
    /** WebSocket endpoint, for example "wss://api.openai.com/v1/realtime". */
    baseUrl: string;
    /** Model name, for example "gpt-realtime-2" or "glm-realtime". */
    model: string;
    /** API key used to authenticate. */
    apiKey: string;
    /** Voice used for TTS output. */
    voice?: string;
    /** Accepted input modalities: "text", "audio", or both. */
    inputModalities?: ("text" | "audio")[];
    /** Produced output modalities: "text", "audio", or both. */
    outputModalities?: ("text" | "audio")[];
    /** Sampling temperature for generation. */
    temperature?: number;
    /** Tool definitions offered to the model for function calling. */
    tools?: RealtimeTool[];
    /** Voice Activity Detection mode. */
    vadMode?: "server_vad" | "none";
    /** VAD threshold in the range 0.0-1.0. */
    vadThreshold?: number;
    /** How the API key is sent: "header" (OpenAI) or "query" (GLM). */
    authMode?: "header" | "query";
}
|
|
65
|
+
/**
 * A function tool definition, as listed in `RealtimeConfig.tools`.
 */
export interface RealtimeTool {
    /** Tool kind; "function" is the only value allowed. */
    type: "function";
    /** Tool name. */
    name: string;
    /** Description of the tool. */
    description: string;
    /** Parameter description as a free-form object — presumably a JSON Schema; confirm against the provider docs. */
    parameters: Record<string, unknown>;
}
|
|
71
|
+
/**
 * Events yielded by `RealtimeTransport.events()`, as a discriminated union
 * tagged by `type`. Narrow on `type` before reading the other fields.
 */
export type RealtimeEvent =
    | { type: "session_created"; sessionId: string }
    | { type: "speech_started" }
    | { type: "speech_stopped"; audioEndMs: number }
    | { type: "audio_delta"; delta: string }
    | { type: "audio_transcript_delta"; delta: string }
    | { type: "text_delta"; delta: string }
    | { type: "function_call_start"; callId: string; name: string }
    | { type: "function_call_delta"; callId: string; delta: string }
    | { type: "function_call_done"; callId: string; name: string; arguments: string }
    | { type: "response_done"; usage?: RealtimeUsage }
    | { type: "error"; code: string; message: string }
    | { type: "closed"; code: number; reason: string };
|
|
113
|
+
/**
 * Token usage attached to a "response_done" event.
 */
export interface RealtimeUsage {
    /** Input token count. */
    inputTokens: number;
    /** Output token count. */
    outputTokens: number;
    /** Audio input token count, when present. */
    inputAudioTokens?: number;
    /** Audio output token count, when present. */
    outputAudioTokens?: number;
}
|
|
119
|
+
/**
 * Owns one persistent WebSocket connection used for real-time audio/voice
 * exchanges with an LLM provider.
 *
 * Typical use:
 * ```ts
 * const rt = new RealtimeTransport(config);
 * await rt.connect();
 *
 * // Stream audio in, then mark the end of the utterance
 * rt.appendAudio(base64Chunk);
 * rt.commitAudio();
 *
 * // Or send text instead
 * rt.sendText("Hello!");
 *
 * // Return a tool result to the model
 * rt.sendFunctionResult(callId, result);
 *
 * // Read server events as they arrive
 * for await (const event of rt.events()) {
 *   switch (event.type) {
 *     case "audio_delta": playAudio(event.delta); break;
 *     case "function_call_done": handleToolCall(event); break;
 *   }
 * }
 *
 * rt.close();
 * ```
 */
export declare class RealtimeTransport {
    private ws;
    private config;
    private eventQueue;
    private waiters;
    private closed;
    constructor(config: RealtimeConfig);
    /** Open the WebSocket connection and configure the session. */
    connect(): Promise<void>;
    /** Send one chunk of audio data (base64 PCM16). */
    appendAudio(base64Chunk: string): void;
    /** Mark the end of the audio input and trigger a response. */
    commitAudio(): void;
    /** Send a text message. */
    sendText(text: string): void;
    /** Hand the result of a function call back to the model. */
    sendFunctionResult(callId: string, output: string): void;
    /** Ask the model for a response (e.g. after sending text). */
    requestResponse(): void;
    /** Cancel a response that is still in progress. */
    cancelResponse(): void;
    /** Async iterator over the server events. */
    events(): AsyncGenerator<RealtimeEvent>;
    /** Close the WebSocket connection. */
    close(): void;
    private buildUrl;
    private sendSessionUpdate;
    private send;
    private push;
    private drainWaiters;
    /**
     * Parse a server-sent JSON event into our typed event(s).
     */
    private parseServerEvent;
}
|