@xiaozhiclaw/provider-core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/aliyun-oss-file-upload-adapter.d.ts +44 -0
- package/dist/adapters/aliyun-oss-file-upload-adapter.js +96 -0
- package/dist/adapters/gemini-file-upload-adapter.d.ts +26 -0
- package/dist/adapters/gemini-file-upload-adapter.js +92 -0
- package/dist/adapters/hub-oss-file-upload-adapter.d.ts +29 -0
- package/dist/adapters/hub-oss-file-upload-adapter.js +53 -0
- package/dist/adapters/index.d.ts +10 -0
- package/dist/adapters/index.js +10 -0
- package/dist/adapters/openai-file-upload-adapter.d.ts +38 -0
- package/dist/adapters/openai-file-upload-adapter.js +56 -0
- package/dist/adapters/volcengine-file-upload-adapter.d.ts +24 -0
- package/dist/adapters/volcengine-file-upload-adapter.js +45 -0
- package/dist/builtin-providers.d.ts +8 -0
- package/dist/builtin-providers.js +2237 -0
- package/dist/constants.d.ts +1 -0
- package/dist/constants.js +1 -0
- package/dist/credentials.d.ts +1 -0
- package/dist/credentials.js +8 -0
- package/dist/debug-transport.d.ts +12 -0
- package/dist/debug-transport.js +99 -0
- package/dist/errors.d.ts +11 -0
- package/dist/errors.js +12 -0
- package/dist/events.d.ts +48 -0
- package/dist/events.js +1 -0
- package/dist/file-upload-service.d.ts +68 -0
- package/dist/file-upload-service.js +110 -0
- package/dist/gemini-schema-utils.d.ts +17 -0
- package/dist/gemini-schema-utils.js +76 -0
- package/dist/index.d.ts +37 -0
- package/dist/index.js +33 -0
- package/dist/llm-client.d.ts +43 -0
- package/dist/llm-client.js +217 -0
- package/dist/media-client.d.ts +42 -0
- package/dist/media-client.js +174 -0
- package/dist/media-transport.d.ts +176 -0
- package/dist/media-transport.js +16 -0
- package/dist/media.d.ts +2 -0
- package/dist/media.js +1 -0
- package/dist/model-detection.d.ts +22 -0
- package/dist/model-detection.js +28 -0
- package/dist/paths.d.ts +2 -0
- package/dist/paths.js +11 -0
- package/dist/provider-def.d.ts +220 -0
- package/dist/provider-def.js +9 -0
- package/dist/provider-registry.d.ts +51 -0
- package/dist/provider-registry.js +130 -0
- package/dist/provider-tool-api.d.ts +44 -0
- package/dist/provider-tool-api.js +9 -0
- package/dist/provider-variant-resolver.d.ts +35 -0
- package/dist/provider-variant-resolver.js +174 -0
- package/dist/retry.d.ts +37 -0
- package/dist/retry.js +71 -0
- package/dist/transport.d.ts +281 -0
- package/dist/transport.js +27 -0
- package/dist/transports/anthropic-messages.d.ts +65 -0
- package/dist/transports/anthropic-messages.js +1004 -0
- package/dist/transports/gemini-cache-api.d.ts +86 -0
- package/dist/transports/gemini-cache-api.js +141 -0
- package/dist/transports/gemini-file-api.d.ts +90 -0
- package/dist/transports/gemini-file-api.js +164 -0
- package/dist/transports/gemini-generatecontent.d.ts +56 -0
- package/dist/transports/gemini-generatecontent.js +688 -0
- package/dist/transports/gemini-lyria-realtime.d.ts +117 -0
- package/dist/transports/gemini-lyria-realtime.js +295 -0
- package/dist/transports/gemini-media.d.ts +53 -0
- package/dist/transports/gemini-media.js +383 -0
- package/dist/transports/media-resolve.d.ts +50 -0
- package/dist/transports/media-resolve.js +91 -0
- package/dist/transports/minimax-media.d.ts +56 -0
- package/dist/transports/minimax-media.js +433 -0
- package/dist/transports/openai-chat.d.ts +81 -0
- package/dist/transports/openai-chat.js +782 -0
- package/dist/transports/openai-media.d.ts +24 -0
- package/dist/transports/openai-media.js +118 -0
- package/dist/transports/openai-responses.d.ts +63 -0
- package/dist/transports/openai-responses.js +778 -0
- package/dist/transports/qwen-media.d.ts +59 -0
- package/dist/transports/qwen-media.js +411 -0
- package/dist/transports/realtime-transport.d.ts +183 -0
- package/dist/transports/realtime-transport.js +332 -0
- package/dist/transports/volcengine-grounding.d.ts +58 -0
- package/dist/transports/volcengine-grounding.js +69 -0
- package/dist/transports/volcengine-media.d.ts +94 -0
- package/dist/transports/volcengine-media.js +801 -0
- package/dist/transports/volcengine-responses.d.ts +64 -0
- package/dist/transports/volcengine-responses.js +797 -0
- package/dist/transports/zhipu-media.d.ts +82 -0
- package/dist/transports/zhipu-media.js +522 -0
- package/dist/transports/zhipu-tool-api.d.ts +35 -0
- package/dist/transports/zhipu-tool-api.js +126 -0
- package/dist/wire-types.d.ts +51 -0
- package/dist/wire-types.js +1 -0
- package/package.json +33 -0
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Realtime WebSocket Transport — bidirectional audio/voice streaming
|
|
3
|
+
* via the OpenAI Realtime API protocol (also compatible with GLM Realtime).
|
|
4
|
+
*
|
|
5
|
+
* ## Protocol: WebSocket JSON events
|
|
6
|
+
*
|
|
7
|
+
* Client → Server:
|
|
8
|
+
* - session.update: configure session (model, voice, tools, etc.)
|
|
9
|
+
* - input_audio_buffer.append: send audio chunks (base64 PCM16)
|
|
10
|
+
* - input_audio_buffer.commit: signal end of audio input
|
|
11
|
+
* - conversation.item.create: inject text/function_result items
|
|
12
|
+
* - response.create: request a model response
|
|
13
|
+
* - response.cancel: abort in-progress response
|
|
14
|
+
*
|
|
15
|
+
* Server → Client:
|
|
16
|
+
* - session.created: session initialized
|
|
17
|
+
* - session.updated: config acknowledged
|
|
18
|
+
* - input_audio_buffer.speech_started: VAD detected speech
|
|
19
|
+
* - input_audio_buffer.speech_stopped: VAD detected silence
|
|
20
|
+
* - response.created: response generation started
|
|
21
|
+
* - response.output_item.added: new output item (text/audio/function_call)
|
|
22
|
+
* - response.audio.delta: audio chunk (base64 PCM16)
|
|
23
|
+
* - response.audio_transcript.delta: transcript of generated speech
|
|
24
|
+
* - response.text.delta: text generation delta
|
|
25
|
+
* - response.function_call_arguments.delta: tool call args delta
|
|
26
|
+
* - response.function_call_arguments.done: tool call complete
|
|
27
|
+
* - response.output_item.done: output item finished
|
|
28
|
+
* - response.done: full response complete
|
|
29
|
+
* - error: server error
|
|
30
|
+
*
|
|
31
|
+
* ## Architecture
|
|
32
|
+
*
|
|
33
|
+
* RealtimeTransport manages a single persistent WebSocket connection per session.
|
|
34
|
+
* It exposes an event-driven API (AsyncGenerator) that the agent tool-loop
|
|
35
|
+
* can consume for voice-enabled interactions.
|
|
36
|
+
*
|
|
37
|
+
* Docs:
|
|
38
|
+
* - OpenAI: https://platform.openai.com/docs/api-reference/realtime
|
|
39
|
+
* - GLM: https://docs.bigmodel.cn/cn/guide/develop/realtime-api
|
|
40
|
+
*/
|
|
41
|
+
// ── Transport ────────────────────────────────────────────────────────────────
|
|
42
|
+
/**
 * Manages a persistent WebSocket connection for real-time audio/voice
 * interactions with an LLM provider.
 *
 * Server messages are translated by `parseServerEvent()` into transport
 * events and buffered until a consumer of `events()` picks them up.
 *
 * Usage:
 * ```ts
 * const rt = new RealtimeTransport(config);
 * await rt.connect(); // every send* method throws until the socket is open
 *
 * // Send audio
 * rt.appendAudio(base64Chunk);
 * rt.commitAudio();
 *
 * // Or send text
 * rt.sendText("Hello!");
 * rt.requestResponse();
 *
 * // Submit function results
 * rt.sendFunctionResult(callId, result);
 *
 * // Consume events
 * for await (const event of rt.events()) {
 *   switch (event.type) {
 *     case "audio_delta": playAudio(event.delta); break;
 *     case "function_call_done": handleToolCall(event); break;
 *   }
 * }
 *
 * rt.close();
 * ```
 */
export class RealtimeTransport {
    /** Underlying socket; stays null until `connect()` creates it. */
    ws = null;
    /** Configuration object handed to the constructor. */
    config;
    /** Events that arrived while no `events()` consumer was waiting. */
    eventQueue = [];
    /** Resolvers of `events()` consumers currently blocked on the next event. */
    waiters = [];
    /** Set once the socket closed or `close()` was called. */
    closed = false;
    /**
     * @param config Transport settings. This class reads `baseUrl`, `model`,
     *   `apiKey`, `authMode`, `voice`, `outputModalities`, `temperature`,
     *   `tools`, `vadMode` and `vadThreshold` from it.
     */
    constructor(config) {
        this.config = config;
    }
    /**
     * Open the WebSocket connection and send the initial `session.update`.
     *
     * Resolves once the socket is open and the session configuration was sent.
     * Rejects when the socket errors or closes before it opened, or when
     * sending the session configuration throws.
     *
     * @throws Error when no global `WebSocket` implementation is available.
     */
    async connect() {
        const url = this.buildUrl();
        // Use native WebSocket (Node 22+ has global WebSocket).
        // For older Node, set globalThis.WebSocket from the 'ws' package before calling.
        const WS = globalThis.WebSocket;
        if (!WS)
            throw new Error("WebSocket not available. Node 22+ required or polyfill globalThis.WebSocket.");
        // Pass auth via the sub-protocol trick (OpenAI) or via URL query (GLM, see buildUrl).
        const protocols = this.config.authMode === "header"
            ? ["realtime", `openai-insecure-api-key.${this.config.apiKey}`, "openai-beta.realtime-v1"]
            : undefined;
        const ws = new WS(url, protocols);
        this.ws = ws;
        ws.onmessage = (event) => {
            try {
                const data = typeof event.data === "string" ? event.data : String(event.data);
                const msg = JSON.parse(data);
                for (const ev of this.parseServerEvent(msg)) {
                    this.push(ev);
                }
            }
            catch {
                // Covers both invalid JSON and payloads parseServerEvent cannot read.
                this.push({ type: "error", code: "parse_error", message: "Failed to parse server event" });
            }
        };
        // Wait for the connection to be established.
        await new Promise((resolve, reject) => {
            let opened = false;
            ws.onopen = () => {
                opened = true;
                try {
                    this.sendSessionUpdate();
                }
                catch (err) {
                    // Without this the exception would vanish inside the event
                    // handler and connect() would never settle.
                    reject(err);
                    return;
                }
                resolve();
            };
            // A single handler serves both phases. Assigning onerror twice (once to
            // push an event, once to reject) would leave only the last assignment
            // active, and errors after the handshake would be silently dropped.
            ws.onerror = () => {
                if (opened) {
                    this.push({ type: "error", code: "ws_error", message: "WebSocket connection error" });
                }
                else {
                    reject(new Error("WebSocket connection failed"));
                }
            };
            ws.onclose = (event) => {
                this.push({ type: "closed", code: event.code, reason: event.reason });
                this.closed = true;
                this.drainWaiters();
                // A close before open must not leave connect() pending forever.
                // Rejecting an already-settled promise is a no-op.
                if (!opened) {
                    reject(new Error("WebSocket connection failed"));
                }
            };
        });
    }
    /** Send one audio chunk (base64 PCM16) as `input_audio_buffer.append`. */
    appendAudio(base64Chunk) {
        this.send({
            type: "input_audio_buffer.append",
            audio: base64Chunk,
        });
    }
    /**
     * Mark the end of audio input by sending `input_audio_buffer.commit`.
     * This method does not send `response.create`; call `requestResponse()`
     * when a response has to be requested explicitly.
     */
    commitAudio() {
        this.send({ type: "input_audio_buffer.commit" });
    }
    /**
     * Add a user text message to the conversation. No response is requested;
     * follow up with `requestResponse()`.
     */
    sendText(text) {
        this.send({
            type: "conversation.item.create",
            item: {
                type: "message",
                role: "user",
                content: [{ type: "input_text", text }],
            },
        });
    }
    /**
     * Submit a function call result back to the model and immediately request
     * a new response.
     *
     * @param callId The `callId` of the function call being answered.
     * @param output The result, forwarded unchanged as the item's `output`.
     */
    sendFunctionResult(callId, output) {
        this.send({
            type: "conversation.item.create",
            item: {
                type: "function_call_output",
                call_id: callId,
                output,
            },
        });
        // Request a new response after submitting the result.
        this.send({ type: "response.create" });
    }
    /** Trigger a model response (e.g. after sending text). */
    requestResponse() {
        this.send({ type: "response.create" });
    }
    /** Cancel an in-progress response. */
    cancelResponse() {
        this.send({ type: "response.cancel" });
    }
    /**
     * Async iterator of server events.
     *
     * Yields buffered events first and then waits for new ones. The iterator
     * ends once the transport is closed and the buffer is empty.
     */
    async *events() {
        while (!this.closed || this.eventQueue.length > 0) {
            if (this.eventQueue.length > 0) {
                yield this.eventQueue.shift();
            }
            else {
                // Park until push() delivers an event or drainWaiters() signals the end.
                const event = await new Promise((resolve) => {
                    this.waiters.push(resolve);
                });
                if (event.done)
                    return;
                yield event.value;
            }
        }
    }
    /** Close the WebSocket connection and release every waiting consumer. */
    close() {
        this.closed = true;
        if (this.ws && this.ws.readyState !== WebSocket.CLOSED) {
            this.ws.close(1000, "client_close");
        }
        this.drainWaiters();
    }
    // ── Private ──────────────────────────────────────────────────────────────
    /**
     * Build the connection URL from `config.baseUrl`. The model is always
     * appended as a query parameter; in "query" auth mode the API key is
     * appended as `token` as well.
     */
    buildUrl() {
        let url = this.config.baseUrl;
        if (this.config.authMode === "query") {
            // GLM style: pass model + API key as query params.
            const sep = url.includes("?") ? "&" : "?";
            url += `${sep}model=${encodeURIComponent(this.config.model)}&token=${encodeURIComponent(this.config.apiKey)}`;
        }
        else {
            // OpenAI style: model in query, auth via sub-protocol.
            const sep = url.includes("?") ? "&" : "?";
            url += `${sep}model=${encodeURIComponent(this.config.model)}`;
        }
        return url;
    }
    /**
     * Send the `session.update` message derived from the config. Defaults
     * applied here: voice "alloy", modalities ["text", "audio"], temperature
     * 0.8, VAD threshold 0.5. The instructions string and the VAD padding and
     * silence durations are fixed values.
     */
    sendSessionUpdate() {
        this.send({
            type: "session.update",
            session: {
                model: this.config.model,
                voice: this.config.voice ?? "alloy",
                modalities: this.config.outputModalities ?? ["text", "audio"],
                instructions: "You are a helpful assistant.",
                temperature: this.config.temperature ?? 0.8,
                tools: this.config.tools?.map(t => ({
                    type: t.type,
                    name: t.name,
                    description: t.description,
                    parameters: t.parameters,
                })) ?? [],
                // vadMode "none" disables turn detection; anything else uses server VAD.
                turn_detection: this.config.vadMode === "none"
                    ? null
                    : {
                        type: "server_vad",
                        threshold: this.config.vadThreshold ?? 0.5,
                        prefix_padding_ms: 300,
                        silence_duration_ms: 500,
                    },
            },
        });
    }
    /**
     * Serialize `msg` as JSON and send it.
     * @throws Error when the socket does not exist or is not open.
     */
    send(msg) {
        if (!this.ws || this.ws.readyState !== WebSocket.OPEN) {
            throw new Error("WebSocket not connected");
        }
        this.ws.send(JSON.stringify(msg));
    }
    /** Hand `event` to the longest-waiting consumer, or buffer it when nobody waits. */
    push(event) {
        if (this.waiters.length > 0) {
            const waiter = this.waiters.shift();
            waiter({ value: event, done: false });
        }
        else {
            this.eventQueue.push(event);
        }
    }
    /** Resolve every waiting consumer with an end-of-stream marker. */
    drainWaiters() {
        for (const waiter of this.waiters) {
            waiter({ value: undefined, done: true });
        }
        this.waiters.length = 0;
    }
    /**
     * Parse a server-sent JSON event into our typed event(s).
     *
     * @param msg The decoded server message.
     * @returns Zero or more transport events; acknowledgement-style and
     *   unknown message types produce an empty array.
     */
    parseServerEvent(msg) {
        const type = msg.type;
        switch (type) {
            case "session.created":
                return [{ type: "session_created", sessionId: String(msg.session?.id ?? "") }];
            case "input_audio_buffer.speech_started":
                return [{ type: "speech_started" }];
            case "input_audio_buffer.speech_stopped":
                return [{ type: "speech_stopped", audioEndMs: Number(msg.audio_end_ms ?? 0) }];
            case "response.audio.delta":
                return [{ type: "audio_delta", delta: String(msg.delta ?? "") }];
            case "response.audio_transcript.delta":
                return [{ type: "audio_transcript_delta", delta: String(msg.delta ?? "") }];
            case "response.text.delta":
                return [{ type: "text_delta", delta: String(msg.delta ?? "") }];
            case "response.function_call_arguments.delta": {
                return [{
                        type: "function_call_delta",
                        callId: String(msg.call_id ?? ""),
                        delta: String(msg.delta ?? ""),
                    }];
            }
            case "response.function_call_arguments.done": {
                return [{
                        type: "function_call_done",
                        callId: String(msg.call_id ?? ""),
                        name: String(msg.name ?? ""),
                        arguments: String(msg.arguments ?? ""),
                    }];
            }
            case "response.output_item.added": {
                const item = msg.item;
                if (item?.type === "function_call") {
                    return [{
                            type: "function_call_start",
                            callId: String(item.call_id ?? ""),
                            name: String(item.name ?? ""),
                        }];
                }
                return [];
            }
            case "response.done": {
                const response = msg.response;
                const usage = response?.usage;
                return [{
                        type: "response_done",
                        usage: usage ? {
                            inputTokens: usage.input_tokens ?? 0,
                            outputTokens: usage.output_tokens ?? 0,
                            // The token detail objects are part of `usage`. The lookup on
                            // `response` is kept only as a fallback for payloads that
                            // place them one level up.
                            inputAudioTokens: usage.input_token_details?.audio_tokens
                                ?? response?.input_token_details?.audio_tokens,
                            outputAudioTokens: usage.output_token_details?.audio_tokens
                                ?? response?.output_token_details?.audio_tokens,
                        } : undefined,
                    }];
            }
            case "error": {
                const error = msg.error;
                return [{
                        type: "error",
                        code: String(error?.code ?? "unknown"),
                        message: String(error?.message ?? "Unknown error"),
                    }];
            }
            // Ignored events (acknowledgements, intermediate states)
            case "session.updated":
            case "response.created":
            case "response.output_item.done":
            case "conversation.item.created":
            case "input_audio_buffer.committed":
            case "input_audio_buffer.cleared":
            case "rate_limits.updated":
                return [];
            default:
                // Unknown event — silently ignore for forward compatibility
                return [];
        }
    }
}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Volcengine Grounding — spatial coordinate parser (volcengine-ProviderMax §14).
|
|
3
|
+
*
|
|
4
|
+
* Parses model-emitted spatial reference tags from text output:
|
|
5
|
+
* - <bbox>x_min y_min x_max y_max</bbox> → bounding box
|
|
6
|
+
* - <point>x y</point> → single point
|
|
7
|
+
* - <polygon>x1 y1 x2 y2 ...</polygon> → polygon vertices
|
|
8
|
+
*
|
|
9
|
+
* All coordinates are in normalized 1000×1000 space, range [0, 999].
|
|
10
|
+
* Use `bboxToPixels()` / `pointToPixels()` to convert to actual image pixel coordinates.
|
|
11
|
+
*/
|
|
12
|
+
/**
 * A spatial reference, discriminated by `type`:
 * - "bbox": two corner coordinates (`x1`, `y1`) and (`x2`, `y2`)
 * - "point": a single coordinate (`x`, `y`)
 * - "polygon": a list of vertices
 *
 * Every variant carries `space: "normalized_1000"`.
 */
export type SpatialReference =
    | { type: "bbox"; x1: number; y1: number; x2: number; y2: number; space: "normalized_1000" }
    | { type: "point"; x: number; y: number; space: "normalized_1000" }
    | { type: "polygon"; points: { x: number; y: number }[]; space: "normalized_1000" };
|
|
32
|
+
/** Bounding box in image pixel coordinates, given by two corners. */
export interface PixelBbox {
    /** X of the first corner. */
    x1: number;
    /** Y of the first corner. */
    y1: number;
    /** X of the second corner. */
    x2: number;
    /** Y of the second corner. */
    y2: number;
}
|
|
38
|
+
/** A single point in image pixel coordinates. */
export interface PixelPoint {
    /** Horizontal pixel coordinate. */
    x: number;
    /** Vertical pixel coordinate. */
    y: number;
}
|
|
42
|
+
/**
 * Extract all spatial references from model output text.
 *
 * @param text Model output that may contain grounding tags.
 * @returns The parsed references, or an empty array if no grounding tags are found.
 */
export declare function parseGroundingTags(text: string): SpatialReference[];
|
|
47
|
+
/**
 * Convert a normalized 1000×1000 bounding box to pixel coordinates.
 *
 * @param ref The "bbox" variant of a spatial reference.
 * @param width Image width in pixels.
 * @param height Image height in pixels.
 */
export declare function bboxToPixels(
    ref: Extract<SpatialReference, { type: "bbox" }>,
    width: number,
    height: number,
): PixelBbox;
|
|
53
|
+
/**
 * Convert a normalized 1000×1000 point to pixel coordinates.
 *
 * @param ref The "point" variant of a spatial reference.
 * @param width Image width in pixels.
 * @param height Image height in pixels.
 */
export declare function pointToPixels(
    ref: Extract<SpatialReference, { type: "point" }>,
    width: number,
    height: number,
): PixelPoint;
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Volcengine Grounding — spatial coordinate parser (volcengine-ProviderMax §14).
|
|
3
|
+
*
|
|
4
|
+
* Parses model-emitted spatial reference tags from text output:
|
|
5
|
+
* - <bbox>x_min y_min x_max y_max</bbox> → bounding box
|
|
6
|
+
* - <point>x y</point> → single point
|
|
7
|
+
* - <polygon>x1 y1 x2 y2 ...</polygon> → polygon vertices
|
|
8
|
+
*
|
|
9
|
+
* All coordinates are in normalized 1000×1000 space, range [0, 999].
|
|
10
|
+
* Use `bboxToPixels()` / `pointToPixels()` to convert to actual image pixel coordinates.
|
|
11
|
+
*/
|
|
12
|
+
// ── Parsing ──────────────────────────────────────────────────────────────────
|
|
13
|
+
// Tag patterns. Each captures the whitespace-separated run of digits inside the tag.
const BBOX_RE = /<bbox>\s*(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s*<\/bbox>/g;
const POINT_RE = /<point>\s*(\d+)\s+(\d+)\s*<\/point>/g;
const POLYGON_RE = /<polygon>\s*([\d\s]+?)\s*<\/polygon>/g;
/**
 * Collect every spatial reference found in model output text.
 *
 * The result lists all bounding boxes first, then all points, then all
 * polygons; within each group the order follows the text. A tag is skipped
 * when any of its coordinates fails `isValidCoord`. A polygon additionally
 * needs an even count of at least four numbers.
 *
 * @param text Model output to scan.
 * @returns The parsed references; empty when no valid grounding tag is present.
 */
export function parseGroundingTags(text) {
    const found = [];
    for (const match of text.matchAll(BBOX_RE)) {
        const corners = match.slice(1, 5).map(Number);
        if (!corners.every(isValidCoord)) {
            continue;
        }
        const [x1, y1, x2, y2] = corners;
        found.push({ type: "bbox", x1, y1, x2, y2, space: "normalized_1000" });
    }
    for (const match of text.matchAll(POINT_RE)) {
        const x = Number(match[1]);
        const y = Number(match[2]);
        if (!(isValidCoord(x) && isValidCoord(y))) {
            continue;
        }
        found.push({ type: "point", x, y, space: "normalized_1000" });
    }
    for (const match of text.matchAll(POLYGON_RE)) {
        const values = match[1].trim().split(/\s+/).map(Number);
        const wellFormed = values.length >= 4
            && values.length % 2 === 0
            && values.every(isValidCoord);
        if (!wellFormed) {
            continue;
        }
        // Pair consecutive numbers up as (x, y) vertices.
        const points = Array.from({ length: values.length / 2 }, (_, i) => ({
            x: values[2 * i],
            y: values[2 * i + 1],
        }));
        found.push({ type: "polygon", points, space: "normalized_1000" });
    }
    return found;
}
|
|
46
|
+
// ── Coordinate Conversion ────────────────────────────────────────────────────
|
|
47
|
+
/**
 * Convert a normalized 1000×1000 bounding box to pixel coordinates.
 *
 * Each coordinate is divided by 1000, multiplied by the matching image
 * dimension (x by `width`, y by `height`) and rounded with `Math.round`.
 *
 * @param ref Bounding box with `x1`, `y1`, `x2`, `y2` in normalized space.
 * @param width Image width in pixels.
 * @param height Image height in pixels.
 * @returns The box corners in pixel coordinates.
 */
export function bboxToPixels(ref, width, height) {
    const toPixels = (coord, extent) => Math.round(coord / 1000 * extent);
    const { x1, y1, x2, y2 } = ref;
    return {
        x1: toPixels(x1, width),
        y1: toPixels(y1, height),
        x2: toPixels(x2, width),
        y2: toPixels(y2, height),
    };
}
|
|
58
|
+
/**
 * Convert a normalized 1000×1000 point to pixel coordinates.
 *
 * `x` is scaled by `width` and `y` by `height` (value / 1000 * dimension),
 * and each result is rounded with `Math.round`.
 *
 * @param ref Point with `x` and `y` in normalized space.
 * @param width Image width in pixels.
 * @param height Image height in pixels.
 * @returns The point in pixel coordinates.
 */
export function pointToPixels(ref, width, height) {
    const pixelX = Math.round(ref.x / 1000 * width);
    const pixelY = Math.round(ref.y / 1000 * height);
    return { x: pixelX, y: pixelY };
}
|
|
67
|
+
/**
 * Check whether `n` is a usable normalized coordinate: an integer within
 * the inclusive range 0 to 999.
 */
function isValidCoord(n) {
    if (!Number.isInteger(n)) {
        return false;
    }
    return n >= 0 && n <= 999;
}
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Volcengine Media Transport — Doubao Seedream (image), Seedance (video), 3D generation.
|
|
3
|
+
*
|
|
4
|
+
* API reference:
|
|
5
|
+
* Image: POST /v3/images/generations (sync)
|
|
6
|
+
* Video: POST /v3/contents/generations/tasks (async job)
|
|
7
|
+
* 3D: POST /v3/contents/generations/tasks (async job, same endpoint as video)
|
|
8
|
+
*
|
|
9
|
+
* Auth: Authorization: Bearer $ARK_API_KEY
|
|
10
|
+
* Docs: https://www.volcengine.com/docs/82379/1330310
|
|
11
|
+
* https://www.volcengine.com/docs/82379/1874993 (3D)
|
|
12
|
+
*/
|
|
13
|
+
import type { AsyncMediaTransport, MediaRequest, MediaResult, MediaType } from "../media-transport.js";
|
|
14
|
+
/** Construction options for `VolcengineMediaTransport`. */
export interface VolcengineMediaConfig {
    /** Base URL, e.g. "https://ark.cn-beijing.volces.com/api" */
    baseUrl: string;
    /** Optional timeout in milliseconds. NOTE(review): scope and default are not visible in this declaration. */
    timeoutMs?: number;
}
|
|
19
|
+
/**
 * Media transport for Volcengine. Besides the `AsyncMediaTransport`
 * contract it exposes task management for the generation-task endpoint
 * and helpers for the Files API.
 */
export declare class VolcengineMediaTransport implements AsyncMediaTransport {
    /** Media types this transport declares support for. */
    readonly supportedTypes: ReadonlyArray<MediaType>;
    private baseUrl;
    private timeoutMs;
    /** @param config Base URL and optional timeout. */
    constructor(config: VolcengineMediaConfig);
    /**
     * Run a media generation request.
     * NOTE(review): presumably dispatches to the private generate* helpers
     * by request type — confirm against the implementation.
     */
    generate(request: MediaRequest, apiKey: string, signal?: AbortSignal): Promise<MediaResult>;
    private generateEmbedding;
    /**
     * Check if this transport can handle a given operation.
     * Video edit/merge/upscale are routed through the same video endpoint.
     */
    canHandle(request: MediaRequest): boolean;
    private generateImage;
    /**
     * Parse streaming image SSE — yields progressive image quality upgrades.
     * Final event contains the full-quality image URL.
     */
    private parseStreamingImage;
    private generateVideo;
    private generate3D;
    /**
     * Query a single video generation task by ID.
     * GET /v3/contents/generations/tasks/{taskId}
     */
    getTaskStatus(taskId: string, apiKey: string, signal?: AbortSignal): Promise<{
        status: string;
        task: Record<string, unknown>;
    }>;
    /**
     * List video generation tasks with optional filters.
     * GET /v3/contents/generations/tasks
     */
    listVideoTasks(
        apiKey: string,
        options?: { after?: string; limit?: number; status?: string },
        signal?: AbortSignal,
    ): Promise<Record<string, unknown>>;
    /**
     * Cancel or delete a video generation task.
     * DELETE /v3/contents/generations/tasks/{taskId}
     */
    deleteVideoTask(taskId: string, apiKey: string, signal?: AbortSignal): Promise<void>;
    /**
     * Upload a file to Volcengine Files API for reuse in multimodal requests.
     * POST /v3/files
     */
    uploadFile(
        file: Blob | Buffer,
        apiKey: string,
        options?: { purpose?: string; filename?: string },
        signal?: AbortSignal,
    ): Promise<{
        id: string;
        status: string;
    }>;
    /**
     * Get file info by ID.
     * GET /v3/files/{fileId}
     */
    getFile(fileId: string, apiKey: string, signal?: AbortSignal): Promise<Record<string, unknown>>;
    /**
     * List uploaded files.
     * GET /v3/files
     */
    listFiles(
        apiKey: string,
        options?: { after?: string; limit?: number; purpose?: string; order?: "asc" | "desc" },
        signal?: AbortSignal,
    ): Promise<Record<string, unknown>>;
    /**
     * Delete a file.
     * DELETE /v3/files/{fileId}
     */
    deleteFile(fileId: string, apiKey: string, signal?: AbortSignal): Promise<void>;
    private submitTask;
    private pollTask;
}
|