@xiaozhiclaw/provider-core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. package/dist/adapters/aliyun-oss-file-upload-adapter.d.ts +44 -0
  2. package/dist/adapters/aliyun-oss-file-upload-adapter.js +96 -0
  3. package/dist/adapters/gemini-file-upload-adapter.d.ts +26 -0
  4. package/dist/adapters/gemini-file-upload-adapter.js +92 -0
  5. package/dist/adapters/hub-oss-file-upload-adapter.d.ts +29 -0
  6. package/dist/adapters/hub-oss-file-upload-adapter.js +53 -0
  7. package/dist/adapters/index.d.ts +10 -0
  8. package/dist/adapters/index.js +10 -0
  9. package/dist/adapters/openai-file-upload-adapter.d.ts +38 -0
  10. package/dist/adapters/openai-file-upload-adapter.js +56 -0
  11. package/dist/adapters/volcengine-file-upload-adapter.d.ts +24 -0
  12. package/dist/adapters/volcengine-file-upload-adapter.js +45 -0
  13. package/dist/builtin-providers.d.ts +8 -0
  14. package/dist/builtin-providers.js +2237 -0
  15. package/dist/constants.d.ts +1 -0
  16. package/dist/constants.js +1 -0
  17. package/dist/credentials.d.ts +1 -0
  18. package/dist/credentials.js +8 -0
  19. package/dist/debug-transport.d.ts +12 -0
  20. package/dist/debug-transport.js +99 -0
  21. package/dist/errors.d.ts +11 -0
  22. package/dist/errors.js +12 -0
  23. package/dist/events.d.ts +48 -0
  24. package/dist/events.js +1 -0
  25. package/dist/file-upload-service.d.ts +68 -0
  26. package/dist/file-upload-service.js +110 -0
  27. package/dist/gemini-schema-utils.d.ts +17 -0
  28. package/dist/gemini-schema-utils.js +76 -0
  29. package/dist/index.d.ts +37 -0
  30. package/dist/index.js +33 -0
  31. package/dist/llm-client.d.ts +43 -0
  32. package/dist/llm-client.js +217 -0
  33. package/dist/media-client.d.ts +42 -0
  34. package/dist/media-client.js +174 -0
  35. package/dist/media-transport.d.ts +176 -0
  36. package/dist/media-transport.js +16 -0
  37. package/dist/media.d.ts +2 -0
  38. package/dist/media.js +1 -0
  39. package/dist/model-detection.d.ts +22 -0
  40. package/dist/model-detection.js +28 -0
  41. package/dist/paths.d.ts +2 -0
  42. package/dist/paths.js +11 -0
  43. package/dist/provider-def.d.ts +220 -0
  44. package/dist/provider-def.js +9 -0
  45. package/dist/provider-registry.d.ts +51 -0
  46. package/dist/provider-registry.js +130 -0
  47. package/dist/provider-tool-api.d.ts +44 -0
  48. package/dist/provider-tool-api.js +9 -0
  49. package/dist/provider-variant-resolver.d.ts +35 -0
  50. package/dist/provider-variant-resolver.js +174 -0
  51. package/dist/retry.d.ts +37 -0
  52. package/dist/retry.js +71 -0
  53. package/dist/transport.d.ts +281 -0
  54. package/dist/transport.js +27 -0
  55. package/dist/transports/anthropic-messages.d.ts +65 -0
  56. package/dist/transports/anthropic-messages.js +1004 -0
  57. package/dist/transports/gemini-cache-api.d.ts +86 -0
  58. package/dist/transports/gemini-cache-api.js +141 -0
  59. package/dist/transports/gemini-file-api.d.ts +90 -0
  60. package/dist/transports/gemini-file-api.js +164 -0
  61. package/dist/transports/gemini-generatecontent.d.ts +56 -0
  62. package/dist/transports/gemini-generatecontent.js +688 -0
  63. package/dist/transports/gemini-lyria-realtime.d.ts +117 -0
  64. package/dist/transports/gemini-lyria-realtime.js +295 -0
  65. package/dist/transports/gemini-media.d.ts +53 -0
  66. package/dist/transports/gemini-media.js +383 -0
  67. package/dist/transports/media-resolve.d.ts +50 -0
  68. package/dist/transports/media-resolve.js +91 -0
  69. package/dist/transports/minimax-media.d.ts +56 -0
  70. package/dist/transports/minimax-media.js +433 -0
  71. package/dist/transports/openai-chat.d.ts +81 -0
  72. package/dist/transports/openai-chat.js +782 -0
  73. package/dist/transports/openai-media.d.ts +24 -0
  74. package/dist/transports/openai-media.js +118 -0
  75. package/dist/transports/openai-responses.d.ts +63 -0
  76. package/dist/transports/openai-responses.js +778 -0
  77. package/dist/transports/qwen-media.d.ts +59 -0
  78. package/dist/transports/qwen-media.js +411 -0
  79. package/dist/transports/realtime-transport.d.ts +183 -0
  80. package/dist/transports/realtime-transport.js +332 -0
  81. package/dist/transports/volcengine-grounding.d.ts +58 -0
  82. package/dist/transports/volcengine-grounding.js +69 -0
  83. package/dist/transports/volcengine-media.d.ts +94 -0
  84. package/dist/transports/volcengine-media.js +801 -0
  85. package/dist/transports/volcengine-responses.d.ts +64 -0
  86. package/dist/transports/volcengine-responses.js +797 -0
  87. package/dist/transports/zhipu-media.d.ts +82 -0
  88. package/dist/transports/zhipu-media.js +522 -0
  89. package/dist/transports/zhipu-tool-api.d.ts +35 -0
  90. package/dist/transports/zhipu-tool-api.js +126 -0
  91. package/dist/wire-types.d.ts +51 -0
  92. package/dist/wire-types.js +1 -0
  93. package/package.json +33 -0
@@ -0,0 +1,59 @@
1
+ /**
2
+ * Qwen (DashScope) Media Transport.
3
+ *
4
+ * DashScope async task API pattern:
5
+ * Submit: POST /api/v1/services/aigc/<service>/generation (X-DashScope-Async: enable)
6
+ * Poll: GET /api/v1/tasks/{taskId}
7
+ *
8
+ * Auth: Authorization: Bearer $DASHSCOPE_API_KEY
9
+ * Docs: https://help.aliyun.com/zh/model-studio/developer-reference/
10
+ *
11
+ * Video models: wan2.7-t2v (text-to-video), wan2.7-i2v (image-to-video)
12
+ * TTS endpoints:
13
+ * Qwen3-TTS: /api/v1/services/aigc/multimodal-generation/generation
14
+ * CosyVoice: /api/v1/services/audio/tts/SpeechSynthesizer
15
+ * Rerank: /api/v1/services/rerank/text-rerank/text-rerank
16
+ * Embedding: /compatible-mode/v1/embeddings
17
+ */
18
+ import type { AsyncMediaTransport, MediaRequest, MediaResult, MediaType } from "../media-transport.js";
19
+ export interface QwenMediaConfig { // Construction options for QwenMediaTransport.
20
+ /** Base URL, e.g. "https://dashscope.aliyuncs.com" */
21
+ baseUrl: string; // a trailing "/apps/anthropic" segment and trailing slashes are stripped by the constructor
22
+ timeoutMs?: number; // per-request timeout in ms (default 30000); only applied when the caller passes no AbortSignal
23
+ }
24
+ export declare class QwenMediaTransport implements AsyncMediaTransport {
25
+ readonly supportedTypes: readonly MediaType[]; // "tts", "video", "embedding", "rerank"
26
+ private baseUrl;
27
+ private timeoutMs;
28
+ constructor(config: QwenMediaConfig);
29
+ generate(request: MediaRequest, apiKey: string, signal?: AbortSignal): Promise<MediaResult>; // dispatches on request.mediaType; throws for unsupported types
30
+ private generateTTS;
31
+ private generateQwenTTS;
32
+ private generateCosyVoiceTTS;
33
+ private generateEmbedding;
34
+ private generateRerank;
35
+ private pollTask;
36
+ private generateVideo;
37
+ /**
38
+ * Query a single task by ID using DashScope unified task endpoint.
39
+ * GET /api/v1/tasks/{taskId}
40
+ */
41
+ getTaskStatus(taskId: string, apiKey: string, signal?: AbortSignal): Promise<{
42
+ status: string; // lower-cased DashScope task_status, or "unknown" when the response has none
43
+ task: Record<string, unknown>; // the raw JSON response body
44
+ }>;
45
+ /**
46
+ * List tasks — DashScope has a task list API.
47
+ * GET /api/v1/tasks?page_no=1&page_size=20&status=RUNNING
48
+ */
49
+ listVideoTasks(apiKey: string, options?: {
50
+ after?: string; // accepted but not sent by the implementation
51
+ limit?: number; // sent as page_size (default 20)
52
+ status?: string; // upper-cased before being sent
53
+ }, signal?: AbortSignal): Promise<Record<string, unknown>>; // resolves to { data: [] } on any non-OK HTTP response
54
+ /**
55
+ * Cancel/delete — DashScope does not have a public task cancellation API.
56
+ */
57
+ deleteVideoTask(_taskId: string, _apiKey: string, _signal?: AbortSignal): Promise<void>; // always rejects; all arguments are ignored
58
+ private extractMediaUrl;
59
+ }
@@ -0,0 +1,411 @@
1
+ /**
2
+ * Qwen (DashScope) Media Transport.
3
+ *
4
+ * DashScope async task API pattern:
5
+ * Submit: POST /api/v1/services/aigc/<service>/generation (X-DashScope-Async: enable)
6
+ * Poll: GET /api/v1/tasks/{taskId}
7
+ *
8
+ * Auth: Authorization: Bearer $DASHSCOPE_API_KEY
9
+ * Docs: https://help.aliyun.com/zh/model-studio/developer-reference/
10
+ *
11
+ * Video models: wan2.7-t2v (text-to-video), wan2.7-i2v (image-to-video)
12
+ * TTS endpoints:
13
+ * Qwen3-TTS: /api/v1/services/aigc/multimodal-generation/generation
14
+ * CosyVoice: /api/v1/services/audio/tts/SpeechSynthesizer
15
+ * Rerank: /api/v1/services/rerank/text-rerank/text-rerank
16
+ * Embedding: /compatible-mode/v1/embeddings
17
+ */
18
+ const POLL_INTERVAL_MS = 2_000;
19
+ const MAX_POLL_MS_TTS = 120_000; // 2 min for TTS
20
+ const MAX_POLL_MS_VIDEO = 600_000; // 10 min for video
21
+ const DEFAULT_TIMEOUT_MS = 30_000;
22
+ export class QwenMediaTransport {
23
+ supportedTypes = ["tts", "video", "embedding", "rerank"];
24
+ baseUrl;
25
+ timeoutMs;
26
+ constructor(config) {
27
+ // Strip /apps/anthropic suffix 鈥?TTS uses a different path
28
+ this.baseUrl = config.baseUrl
29
+ .replace(/\/apps\/anthropic\/?$/, "")
30
+ .replace(/\/+$/, "");
31
+ this.timeoutMs = config.timeoutMs ?? DEFAULT_TIMEOUT_MS;
32
+ }
33
+ async generate(request, apiKey, signal) {
34
+ if (request.mediaType === "video") {
35
+ return this.generateVideo(request, apiKey, signal);
36
+ }
37
+ if (request.mediaType === "embedding") {
38
+ return this.generateEmbedding(request, apiKey, signal);
39
+ }
40
+ if (request.mediaType === "rerank") {
41
+ return this.generateRerank(request, apiKey, signal);
42
+ }
43
+ if (request.mediaType !== "tts") {
44
+ throw new Error(`QwenMediaTransport: unsupported mediaType "${request.mediaType}"`);
45
+ }
46
+ return this.generateTTS(request, apiKey, signal);
47
+ }
48
+ async generateTTS(request, apiKey, signal) {
49
+ const start = Date.now();
50
+ const text = request.text || request.prompt;
51
+ if (!text) {
52
+ throw new Error("QwenMediaTransport: text or prompt is required for TTS");
53
+ }
54
+ if ((request.model || "").startsWith("cosyvoice")) {
55
+ return this.generateCosyVoiceTTS(request, apiKey, signal, start, text);
56
+ }
57
+ return this.generateQwenTTS(request, apiKey, signal, start, text);
58
+ }
59
+ async generateQwenTTS(request, apiKey, signal, start, text) {
60
+ const submitUrl = `${this.baseUrl}/api/v1/services/aigc/multimodal-generation/generation`;
61
+ const body = {
62
+ model: request.model || "qwen3-tts-flash",
63
+ input: {
64
+ text,
65
+ voice: request.voice ?? "Cherry",
66
+ language_type: request.metadata?.languageType ?? "English",
67
+ },
68
+ };
69
+ const submitRes = await fetch(submitUrl, {
70
+ method: "POST",
71
+ headers: {
72
+ "Content-Type": "application/json",
73
+ Authorization: `Bearer ${apiKey}`,
74
+ },
75
+ body: JSON.stringify(body),
76
+ signal: signal ?? AbortSignal.timeout(this.timeoutMs),
77
+ });
78
+ if (!submitRes.ok) {
79
+ const errText = await submitRes.text().catch(() => "");
80
+ throw new Error(`DashScope Qwen TTS error ${submitRes.status}: ${errText}`);
81
+ }
82
+ const submitData = await submitRes.json();
83
+ if (submitData.code) {
84
+ throw new Error(`DashScope Qwen TTS rejected: ${submitData.code}: ${submitData.message ?? ""}`);
85
+ }
86
+ const audioUrl = submitData.output?.audio?.url;
87
+ const characters = submitData.usage?.characters ?? text.length;
88
+ return {
89
+ mediaUrls: audioUrl ? [audioUrl] : [],
90
+ model: request.model || "qwen3-tts-flash",
91
+ durationMs: Date.now() - start,
92
+ billingUnit: "per_character",
93
+ billingQuantity: characters,
94
+ metadata: {
95
+ usage: submitData.usage,
96
+ finishReason: submitData.output?.finish_reason,
97
+ requestId: submitData.request_id,
98
+ },
99
+ };
100
+ }
101
+ async generateCosyVoiceTTS(request, apiKey, signal, start, text) {
102
+ const submitUrl = `${this.baseUrl}/api/v1/services/audio/tts/SpeechSynthesizer`;
103
+ const body = {
104
+ model: request.model || "cosyvoice-v2",
105
+ input: { text },
106
+ parameters: {
107
+ voice: request.voice ?? "longxiaochun_v2",
108
+ format: request.audioFormat ?? "mp3",
109
+ sample_rate: request.metadata?.sampleRate ?? 24000,
110
+ },
111
+ };
112
+ const submitRes = await fetch(submitUrl, {
113
+ method: "POST",
114
+ headers: {
115
+ "Content-Type": "application/json",
116
+ Authorization: `Bearer ${apiKey}`,
117
+ },
118
+ body: JSON.stringify(body),
119
+ signal: signal ?? AbortSignal.timeout(this.timeoutMs),
120
+ });
121
+ if (!submitRes.ok) {
122
+ const errText = await submitRes.text().catch(() => "");
123
+ throw new Error(`DashScope CosyVoice TTS error ${submitRes.status}: ${errText}`);
124
+ }
125
+ const data = await submitRes.json();
126
+ if (data.code) {
127
+ throw new Error(`DashScope CosyVoice TTS rejected: ${data.code}: ${data.message ?? ""}`);
128
+ }
129
+ const audioUrl = data.output?.audio?.url;
130
+ const characters = data.usage?.characters ?? text.length;
131
+ return {
132
+ mediaUrls: audioUrl ? [audioUrl] : [],
133
+ model: request.model || "cosyvoice-v2",
134
+ durationMs: Date.now() - start,
135
+ billingUnit: "per_character",
136
+ billingQuantity: characters,
137
+ metadata: {
138
+ usage: data.usage,
139
+ finishReason: data.output?.finish_reason,
140
+ requestId: data.request_id,
141
+ },
142
+ };
143
+ }
144
+ async generateEmbedding(request, apiKey, signal) {
145
+ const start = Date.now();
146
+ const text = request.text || request.prompt;
147
+ if (!text)
148
+ throw new Error("QwenMediaTransport: text or prompt is required for embedding");
149
+ const url = `${this.baseUrl}/compatible-mode/v1/embeddings`;
150
+ const body = {
151
+ model: request.model || "text-embedding-v4",
152
+ input: text,
153
+ };
154
+ if (request.metadata?.dimensions)
155
+ body.dimensions = request.metadata.dimensions;
156
+ const res = await fetch(url, {
157
+ method: "POST",
158
+ headers: {
159
+ "Content-Type": "application/json",
160
+ Authorization: `Bearer ${apiKey}`,
161
+ },
162
+ body: JSON.stringify(body),
163
+ signal: signal ?? AbortSignal.timeout(this.timeoutMs),
164
+ });
165
+ if (!res.ok) {
166
+ const textBody = await res.text().catch(() => "");
167
+ throw new Error(`DashScope embedding error ${res.status}: ${textBody}`);
168
+ }
169
+ const data = await res.json();
170
+ const embeddings = (data.data ?? []).map(item => item.embedding).filter((item) => Array.isArray(item));
171
+ const totalTokens = data.usage?.total_tokens ?? data.usage?.prompt_tokens;
172
+ return {
173
+ mediaUrls: [],
174
+ model: data.model ?? request.model ?? "text-embedding-v4",
175
+ durationMs: Date.now() - start,
176
+ billingUnit: totalTokens !== undefined ? "per_token" : undefined,
177
+ billingQuantity: totalTokens,
178
+ metadata: {
179
+ embeddings,
180
+ dimensions: embeddings[0]?.length ?? 0,
181
+ usage: data.usage,
182
+ },
183
+ };
184
+ }
185
+ async generateRerank(request, apiKey, signal) {
186
+ const start = Date.now();
187
+ const query = request.prompt;
188
+ const documents = request.metadata?.documents;
189
+ if (!query)
190
+ throw new Error("QwenMediaTransport: prompt (query) is required for rerank");
191
+ if (!Array.isArray(documents))
192
+ throw new Error("QwenMediaTransport: metadata.documents is required for rerank");
193
+ const url = `${this.baseUrl}/api/v1/services/rerank/text-rerank/text-rerank`;
194
+ const body = {
195
+ model: request.model || "qwen3-rerank",
196
+ input: { query, documents },
197
+ parameters: {},
198
+ };
199
+ if (request.metadata?.topN !== undefined) {
200
+ body.parameters.top_n = request.metadata.topN;
201
+ }
202
+ const res = await fetch(url, {
203
+ method: "POST",
204
+ headers: {
205
+ "Content-Type": "application/json",
206
+ Authorization: `Bearer ${apiKey}`,
207
+ },
208
+ body: JSON.stringify(body),
209
+ signal: signal ?? AbortSignal.timeout(this.timeoutMs),
210
+ });
211
+ if (!res.ok) {
212
+ const textBody = await res.text().catch(() => "");
213
+ throw new Error(`DashScope rerank error ${res.status}: ${textBody}`);
214
+ }
215
+ const data = await res.json();
216
+ const totalTokens = data.usage?.total_tokens;
217
+ return {
218
+ mediaUrls: [],
219
+ model: request.model || "qwen3-rerank",
220
+ durationMs: Date.now() - start,
221
+ billingUnit: totalTokens !== undefined ? "per_token" : undefined,
222
+ billingQuantity: totalTokens,
223
+ metadata: {
224
+ results: (data.output?.results ?? []).map(r => ({
225
+ index: r.index,
226
+ relevanceScore: r.relevance_score,
227
+ document: r.document,
228
+ })),
229
+ usage: data.usage,
230
+ },
231
+ };
232
+ }
233
+ async pollTask(taskId, apiKey, signal, onProgress, maxPollMs = MAX_POLL_MS_TTS) {
234
+ const deadline = Date.now() + maxPollMs;
235
+ const pollUrl = `${this.baseUrl}/api/v1/tasks/${taskId}`;
236
+ while (Date.now() < deadline) {
237
+ signal?.throwIfAborted();
238
+ await sleep(POLL_INTERVAL_MS);
239
+ const res = await fetch(pollUrl, {
240
+ method: "GET",
241
+ headers: { Authorization: `Bearer ${apiKey}` },
242
+ signal: signal ?? AbortSignal.timeout(this.timeoutMs),
243
+ });
244
+ if (!res.ok) {
245
+ const text = await res.text().catch(() => "");
246
+ throw new Error(`DashScope TTS poll error ${res.status}: ${text}`);
247
+ }
248
+ const data = await res.json();
249
+ if (data.code) {
250
+ throw new Error(`DashScope poll rejected: ${data.code} 鈥?${data.message ?? ""}`);
251
+ }
252
+ const status = data.output?.task_status;
253
+ if (status === "SUCCEEDED") {
254
+ onProgress?.(100, "completed", taskId);
255
+ const mediaUrl = this.extractMediaUrl(data);
256
+ return mediaUrl ? [mediaUrl] : [];
257
+ }
258
+ if (status === "FAILED") {
259
+ throw new Error(`DashScope task failed: ${data.message ?? "unknown error"}`);
260
+ }
261
+ // PENDING / RUNNING 鈥?report progress and continue polling
262
+ const elapsed = Date.now() - (deadline - maxPollMs);
263
+ onProgress?.(Math.min(95, Math.round((elapsed / maxPollMs) * 100)), status ?? "running", taskId);
264
+ }
265
+ throw new Error("DashScope task timed out");
266
+ }
267
+ // 鈹€鈹€ Video Generation (qwen-ProviderMax 搂19-23) 鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€
268
+ // DashScope video uses the same async task pattern as TTS.
269
+ // Submit: POST /api/v1/services/aigc/generation/generation
270
+ // Poll: GET /api/v1/tasks/{taskId}
271
+ async generateVideo(request, apiKey, signal) {
272
+ const start = Date.now();
273
+ const model = request.model || "wan2.1-t2v-turbo";
274
+ // DashScope video request body
275
+ const input = {
276
+ prompt: request.prompt,
277
+ };
278
+ // Image-to-video: pass img_url
279
+ if (request.imageUrl) {
280
+ input.img_url = request.imageUrl;
281
+ }
282
+ else if (request.referenceImages?.length) {
283
+ // First-last-frame or reference mode
284
+ if (request.imageRoles) {
285
+ const firstIdx = request.imageRoles.indexOf("first_frame");
286
+ const lastIdx = request.imageRoles.indexOf("last_frame");
287
+ if (firstIdx >= 0)
288
+ input.img_url = request.referenceImages[firstIdx];
289
+ if (lastIdx >= 0)
290
+ input.tail_image_url = request.referenceImages[lastIdx];
291
+ // Reference images
292
+ const refIndices = request.imageRoles
293
+ .map((role, i) => role === "reference_image" ? i : -1)
294
+ .filter(i => i >= 0);
295
+ if (refIndices.length > 0) {
296
+ input.ref_image_url = request.referenceImages[refIndices[0]];
297
+ }
298
+ }
299
+ else {
300
+ input.img_url = request.referenceImages[0];
301
+ }
302
+ }
303
+ const parameters = {};
304
+ if (request.duration)
305
+ parameters.duration = request.duration;
306
+ if (request.resolution)
307
+ parameters.resolution = request.resolution;
308
+ if (request.aspectRatio)
309
+ parameters.ratio = request.aspectRatio;
310
+ if (request.metadata?.size)
311
+ parameters.size = request.metadata.size;
312
+ const submitUrl = `${this.baseUrl}/api/v1/services/aigc/generation/generation`;
313
+ const body = { model, input };
314
+ if (Object.keys(parameters).length > 0)
315
+ body.parameters = parameters;
316
+ const submitRes = await fetch(submitUrl, {
317
+ method: "POST",
318
+ headers: {
319
+ "Content-Type": "application/json",
320
+ Authorization: `Bearer ${apiKey}`,
321
+ "X-DashScope-Async": "enable",
322
+ },
323
+ body: JSON.stringify(body),
324
+ signal: signal ?? AbortSignal.timeout(this.timeoutMs),
325
+ });
326
+ if (!submitRes.ok) {
327
+ const errText = await submitRes.text().catch(() => "");
328
+ throw new Error(`DashScope video submit error ${submitRes.status}: ${errText}`);
329
+ }
330
+ const submitData = await submitRes.json();
331
+ if (submitData.code) {
332
+ throw new Error(`DashScope video submit rejected: ${submitData.code} 鈥?${submitData.message ?? ""}`);
333
+ }
334
+ const taskId = submitData.output?.task_id;
335
+ if (!taskId) {
336
+ throw new Error("DashScope video submit: no task_id in response");
337
+ }
338
+ // Poll for completion with longer timeout for video
339
+ const videoUrls = await this.pollTask(taskId, apiKey, signal, request.onProgress, MAX_POLL_MS_VIDEO);
340
+ return {
341
+ mediaUrls: videoUrls,
342
+ model,
343
+ durationMs: Date.now() - start,
344
+ taskId,
345
+ };
346
+ }
347
+ // 鈹€鈹€ AsyncMediaTransport: Task Management 鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€
348
+ /**
349
+ * Query a single task by ID using DashScope unified task endpoint.
350
+ * GET /api/v1/tasks/{taskId}
351
+ */
352
+ async getTaskStatus(taskId, apiKey, signal) {
353
+ const url = `${this.baseUrl}/api/v1/tasks/${taskId}`;
354
+ const res = await fetch(url, {
355
+ method: "GET",
356
+ headers: { Authorization: `Bearer ${apiKey}` },
357
+ signal: signal ?? AbortSignal.timeout(this.timeoutMs),
358
+ });
359
+ if (!res.ok) {
360
+ const text = await res.text().catch(() => "");
361
+ throw new Error(`DashScope task status error ${res.status}: ${text}`);
362
+ }
363
+ const data = await res.json();
364
+ const output = data.output;
365
+ const rawStatus = output?.task_status ?? "unknown";
366
+ const normalizedStatus = rawStatus === "SUCCEEDED" ? "succeeded"
367
+ : rawStatus === "FAILED" ? "failed"
368
+ : rawStatus.toLowerCase();
369
+ return { status: normalizedStatus, task: data };
370
+ }
371
+ /**
372
+ * List tasks 鈥?DashScope has a task list API.
373
+ * GET /api/v1/tasks?page_no=1&page_size=20&status=RUNNING
374
+ */
375
+ async listVideoTasks(apiKey, options, signal) {
376
+ const params = new URLSearchParams();
377
+ params.set("page_size", String(options?.limit ?? 20));
378
+ if (options?.status)
379
+ params.set("status", options.status.toUpperCase());
380
+ const url = `${this.baseUrl}/api/v1/tasks?${params.toString()}`;
381
+ const res = await fetch(url, {
382
+ method: "GET",
383
+ headers: { Authorization: `Bearer ${apiKey}` },
384
+ signal: signal ?? AbortSignal.timeout(this.timeoutMs),
385
+ });
386
+ if (!res.ok) {
387
+ // DashScope may not support task list 鈥?return empty gracefully
388
+ return { data: [] };
389
+ }
390
+ return await res.json();
391
+ }
392
+ /**
393
+ * Cancel/delete 鈥?DashScope does not have a public task cancellation API.
394
+ */
395
+ async deleteVideoTask(_taskId, _apiKey, _signal) {
396
+ throw new Error("DashScope does not support task cancellation.");
397
+ }
398
+ extractMediaUrl(data) {
399
+ // DashScope returns different URL fields per media type
400
+ if (data.output?.video_url)
401
+ return data.output.video_url;
402
+ if (data.output?.audio_url)
403
+ return data.output.audio_url;
404
+ if (data.output?.results?.[0]?.url)
405
+ return data.output.results[0].url;
406
+ return undefined;
407
+ }
408
+ }
409
+ function sleep(ms) {
410
+ return new Promise(resolve => setTimeout(resolve, ms));
411
+ }
@@ -0,0 +1,183 @@
1
+ /**
2
+ * Realtime WebSocket Transport — bidirectional audio/voice streaming
3
+ * via the OpenAI Realtime API protocol (also compatible with GLM Realtime).
4
+ *
5
+ * ## Protocol: WebSocket JSON events
6
+ *
7
+ * Client → Server:
8
+ * - session.update: configure session (model, voice, tools, etc.)
9
+ * - input_audio_buffer.append: send audio chunks (base64 PCM16)
10
+ * - input_audio_buffer.commit: signal end of audio input
11
+ * - conversation.item.create: inject text/function_result items
12
+ * - response.create: request a model response
13
+ * - response.cancel: abort in-progress response
14
+ *
15
+ * Server → Client:
16
+ * - session.created: session initialized
17
+ * - session.updated: config acknowledged
18
+ * - input_audio_buffer.speech_started: VAD detected speech
19
+ * - input_audio_buffer.speech_stopped: VAD detected silence
20
+ * - response.created: response generation started
21
+ * - response.output_item.added: new output item (text/audio/function_call)
22
+ * - response.audio.delta: audio chunk (base64 PCM16)
23
+ * - response.audio_transcript.delta: transcript of generated speech
24
+ * - response.text.delta: text generation delta
25
+ * - response.function_call_arguments.delta: tool call args delta
26
+ * - response.function_call_arguments.done: tool call complete
27
+ * - response.output_item.done: output item finished
28
+ * - response.done: full response complete
29
+ * - error: server error
30
+ *
31
+ * ## Architecture
32
+ *
33
+ * RealtimeTransport manages a single persistent WebSocket connection per session.
34
+ * It exposes an event-driven API (AsyncGenerator) that the agent tool-loop
35
+ * can consume for voice-enabled interactions.
36
+ *
37
+ * Docs:
38
+ * - OpenAI: https://platform.openai.com/docs/api-reference/realtime
39
+ * - GLM: https://docs.bigmodel.cn/cn/guide/develop/realtime-api
40
+ */
41
+ export interface RealtimeConfig { // Options object accepted by the RealtimeTransport constructor.
42
+ /** WebSocket endpoint (e.g. "wss://api.openai.com/v1/realtime") */
43
+ baseUrl: string;
44
+ /** Model to use (e.g. "gpt-realtime-2", "glm-realtime") */
45
+ model: string;
46
+ /** API key */
47
+ apiKey: string;
48
+ /** Voice for TTS output */
49
+ voice?: string;
50
+ /** Input modalities: "text", "audio", or both */
51
+ inputModalities?: Array<"text" | "audio">;
52
+ /** Output modalities: "text", "audio", or both */
53
+ outputModalities?: Array<"text" | "audio">;
54
+ /** Temperature for generation */
55
+ temperature?: number;
56
+ /** Tool definitions for function calling */
57
+ tools?: RealtimeTool[];
58
+ /** Voice Activity Detection mode */
59
+ vadMode?: "server_vad" | "none";
60
+ /** VAD threshold (0.0-1.0) */
61
+ vadThreshold?: number; // NOTE(review): the default used when omitted is not visible in this declaration
62
+ /** Auth type: "header" (OpenAI) or "query" (GLM) */
63
+ authMode?: "header" | "query"; // NOTE(review): the default when omitted is not visible here — confirm in the implementation
64
+ }
65
+ export interface RealtimeTool { // Function tool definition; element type of RealtimeConfig.tools.
66
+ type: "function"; // the only tool kind this declaration allows
67
+ name: string;
68
+ description: string;
69
+ parameters: Record<string, unknown>; // presumably a JSON Schema object describing the arguments — TODO confirm
70
+ }
71
+ export type RealtimeEvent = { // Union discriminated by "type"; element type yielded by RealtimeTransport.events().
72
+ type: "session_created";
73
+ sessionId: string;
74
+ } | {
75
+ type: "speech_started";
76
+ } | {
77
+ type: "speech_stopped";
78
+ audioEndMs: number; // presumably milliseconds, per the "Ms" suffix — TODO confirm
79
+ } | {
80
+ type: "audio_delta";
81
+ delta: string; // presumably a base64 PCM16 chunk, mirroring response.audio.delta — TODO confirm
82
+ } | {
83
+ type: "audio_transcript_delta";
84
+ delta: string;
85
+ } | {
86
+ type: "text_delta";
87
+ delta: string;
88
+ } | {
89
+ type: "function_call_start";
90
+ callId: string;
91
+ name: string;
92
+ } | {
93
+ type: "function_call_delta";
94
+ callId: string;
95
+ delta: string;
96
+ } | {
97
+ type: "function_call_done";
98
+ callId: string; // same type as the callId parameter of RealtimeTransport.sendFunctionResult
99
+ name: string;
100
+ arguments: string; // presumably the complete JSON-encoded argument string — TODO confirm
101
+ } | {
102
+ type: "response_done";
103
+ usage?: RealtimeUsage;
104
+ } | {
105
+ type: "error";
106
+ code: string;
107
+ message: string;
108
+ } | {
109
+ type: "closed";
110
+ code: number; // presumably the WebSocket close code — TODO confirm
111
+ reason: string;
112
+ };
113
+ export interface RealtimeUsage { // Token usage optionally attached to the "response_done" RealtimeEvent.
114
+ inputTokens: number;
115
+ outputTokens: number;
116
+ inputAudioTokens?: number; // NOTE(review): whether this is a subset of inputTokens is not visible here
117
+ outputAudioTokens?: number; // NOTE(review): whether this is a subset of outputTokens is not visible here
118
+ }
119
+ /**
120
+ * Manages a persistent WebSocket connection for real-time audio/voice
121
+ * interactions with an LLM provider.
122
+ *
123
+ * Usage:
124
+ * ```ts
125
+ * const rt = new RealtimeTransport(config);
126
+ * rt.connect();
127
+ *
128
+ * // Send audio
129
+ * rt.appendAudio(base64Chunk);
130
+ * rt.commitAudio();
131
+ *
132
+ * // Or send text
133
+ * rt.sendText("Hello!");
134
+ *
135
+ * // Submit function results
136
+ * rt.sendFunctionResult(callId, result);
137
+ *
138
+ * // Consume events
139
+ * for await (const event of rt.events()) {
140
+ * switch (event.type) {
141
+ * case "audio_delta": playAudio(event.delta); break;
142
+ * case "function_call_done": handleToolCall(event); break;
143
+ * }
144
+ * }
145
+ *
146
+ * rt.close();
147
+ * ```
148
+ */
149
+ export declare class RealtimeTransport {
150
+ private ws;
151
+ private config;
152
+ private eventQueue;
153
+ private waiters;
154
+ private closed;
155
+ constructor(config: RealtimeConfig);
156
+ /** Open WebSocket connection and configure session. */
157
+ connect(): Promise<void>; // returns a Promise; NOTE(review): the usage example calls it without await — confirm whether sending before it resolves is safe
158
+ /** Send audio data (base64 PCM16). */
159
+ appendAudio(base64Chunk: string): void;
160
+ /** Mark end of audio input and trigger response. */
161
+ commitAudio(): void;
162
+ /** Send a text message. */
163
+ sendText(text: string): void;
164
+ /** Submit a function call result back to the model. */
165
+ sendFunctionResult(callId: string, output: string): void; // callId presumably comes from a "function_call_done" event — TODO confirm
166
+ /** Trigger a model response (e.g. after sending text). */
167
+ requestResponse(): void;
168
+ /** Cancel an in-progress response. */
169
+ cancelResponse(): void;
170
+ /** Async iterator of server events. */
171
+ events(): AsyncGenerator<RealtimeEvent>;
172
+ /** Close the WebSocket connection. */
173
+ close(): void;
174
+ private buildUrl;
175
+ private sendSessionUpdate;
176
+ private send;
177
+ private push;
178
+ private drainWaiters;
179
+ /**
180
+ * Parse a server-sent JSON event into our typed event(s).
181
+ */
182
+ private parseServerEvent;
183
+ }