qlogicagent 1.0.1 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent.js +6 -6
- package/dist/cli.js +181 -172
- package/dist/index.js +181 -172
- package/dist/orchestration.js +1 -1
- package/dist/types/agent/tool-loop.d.ts +2 -0
- package/dist/types/agent/types.d.ts +46 -1
- package/dist/types/cli/stdio-server.d.ts +10 -0
- package/dist/types/cli/tool-bootstrap.d.ts +13 -1
- package/dist/types/llm/index.d.ts +1 -1
- package/dist/types/llm/llm-client.d.ts +1 -1
- package/dist/types/llm/media-client.d.ts +3 -4
- package/dist/types/llm/media-transport.d.ts +75 -4
- package/dist/types/llm/provider-def.d.ts +124 -3
- package/dist/types/llm/provider-registry.d.ts +5 -0
- package/dist/types/llm/provider-tool-api.d.ts +44 -0
- package/dist/types/llm/retry.d.ts +37 -0
- package/dist/types/llm/transport.d.ts +157 -2
- package/dist/types/llm/transports/anthropic-messages.d.ts +7 -0
- package/dist/types/llm/transports/minimax-media.d.ts +5 -0
- package/dist/types/llm/transports/openai-chat.d.ts +44 -3
- package/dist/types/llm/transports/realtime-transport.d.ts +183 -0
- package/dist/types/llm/transports/volcengine-grounding.d.ts +58 -0
- package/dist/types/llm/transports/volcengine-media.d.ts +50 -0
- package/dist/types/llm/transports/volcengine-responses.d.ts +60 -0
- package/dist/types/llm/transports/zhipu-media.d.ts +60 -0
- package/dist/types/llm/transports/zhipu-tool-api.d.ts +35 -0
- package/dist/types/orchestration/tool-cascade.d.ts +40 -0
- package/dist/types/orchestration/tool-loop/tool-schema.d.ts +1 -1
- package/dist/types/protocol/methods.d.ts +19 -0
- package/dist/types/skills/memory/memory-extractor.d.ts +1 -1
- package/dist/types/skills/tools/file-management-tool.d.ts +90 -0
- package/dist/types/skills/tools/image-generate-tool.d.ts +13 -1
- package/dist/types/skills/tools/music-generate-tool.d.ts +25 -0
- package/dist/types/skills/tools/stt-tool.d.ts +33 -0
- package/dist/types/skills/tools/three-d-generate-tool.d.ts +45 -0
- package/dist/types/skills/tools/tts-tool.d.ts +12 -0
- package/dist/types/skills/tools/video-edit-tool.d.ts +5 -2
- package/dist/types/skills/tools/video-generate-tool.d.ts +102 -2
- package/dist/types/skills/tools/video-merge-tool.d.ts +1 -1
- package/dist/types/skills/tools/video-upscale-tool.d.ts +1 -1
- package/dist/types/skills/tools/voice-clone-tool.d.ts +40 -0
- package/package.json +1 -1
|
@@ -9,17 +9,146 @@
|
|
|
9
9
|
* - Anthropic Messages API
|
|
10
10
|
*/
|
|
11
11
|
import type { ChatMessage, ToolDefinition } from "../agent/types.js";
|
|
12
|
+
export type StructuredOutputConfig = {
|
|
13
|
+
mode: "json_object";
|
|
14
|
+
} | {
|
|
15
|
+
mode: "json_schema";
|
|
16
|
+
name: string;
|
|
17
|
+
schema: Record<string, unknown>;
|
|
18
|
+
strict?: boolean;
|
|
19
|
+
};
|
|
20
|
+
export interface CachingConfig {
|
|
21
|
+
type: "enabled" | "disabled";
|
|
22
|
+
/** Enable prefix caching mode (§20.3). Requires store=true and stream=false. */
|
|
23
|
+
prefix?: boolean;
|
|
24
|
+
}
|
|
25
|
+
export type ContextEdit = {
|
|
26
|
+
type: "clear_thinking";
|
|
27
|
+
keep?: "all" | {
|
|
28
|
+
type: "thinking_turns";
|
|
29
|
+
value: number;
|
|
30
|
+
};
|
|
31
|
+
} | {
|
|
32
|
+
type: "clear_tool_uses";
|
|
33
|
+
trigger?: {
|
|
34
|
+
type: "tool_uses";
|
|
35
|
+
value: number;
|
|
36
|
+
};
|
|
37
|
+
keep?: {
|
|
38
|
+
type: "tool_uses";
|
|
39
|
+
value: number;
|
|
40
|
+
};
|
|
41
|
+
excludeTools?: string[];
|
|
42
|
+
clearToolInput?: boolean;
|
|
43
|
+
};
|
|
44
|
+
export interface ContextManagementConfig {
|
|
45
|
+
edits: ContextEdit[];
|
|
46
|
+
}
|
|
12
47
|
export interface LLMRequest {
|
|
13
48
|
model: string;
|
|
14
49
|
messages: ChatMessage[];
|
|
15
50
|
tools?: ToolDefinition[];
|
|
16
|
-
toolChoice?: "auto" | "none" | "required"
|
|
51
|
+
toolChoice?: "auto" | "none" | "required" | {
|
|
52
|
+
type: "function";
|
|
53
|
+
name: string;
|
|
54
|
+
};
|
|
17
55
|
temperature?: number;
|
|
56
|
+
/** Nucleus sampling: controls diversity via cumulative probability cutoff. */
|
|
57
|
+
topP?: number;
|
|
18
58
|
maxTokens?: number;
|
|
19
59
|
reasoning?: {
|
|
20
|
-
effort: "low" | "medium" | "high";
|
|
60
|
+
effort: "minimal" | "low" | "medium" | "high";
|
|
61
|
+
/** Request encrypted original reasoning content (Volcengine §17.7). */
|
|
62
|
+
includeEncryptedReasoning?: boolean;
|
|
21
63
|
};
|
|
64
|
+
/** Volcengine: max builtin tool calls per turn (§19.15). */
|
|
65
|
+
maxToolCalls?: number;
|
|
66
|
+
/**
|
|
67
|
+
* DeepSeek prefix completion: force model to continue from this prefix.
|
|
68
|
+
* Requires `/beta` endpoint; adds a trailing assistant message with `prefix: true`.
|
|
69
|
+
*/
|
|
70
|
+
prefixMessage?: string;
|
|
71
|
+
/**
|
|
72
|
+
* Model requires streaming — disable non-streaming fallback in transports.
|
|
73
|
+
* When true, transports must NOT fall back to non-streaming requests on failure.
|
|
74
|
+
* Set for models like QwQ/Omni where the provider rejects non-streaming calls.
|
|
75
|
+
*/
|
|
76
|
+
streamRequired?: boolean;
|
|
77
|
+
/**
|
|
78
|
+
* Disable injection of provider-native builtin tools (web_search, code_interpreter)
|
|
79
|
+
* for this specific request. Allows session-level control over GLM/Kimi builtin tools.
|
|
80
|
+
*/
|
|
81
|
+
disableBuiltinTools?: boolean;
|
|
82
|
+
/**
|
|
83
|
+
* Volcengine builtin tools to inject (web_search, image_process, knowledge_search).
|
|
84
|
+
* Each entry specifies a tool type and optional config.
|
|
85
|
+
* These are platform-executed tools requiring beta headers.
|
|
86
|
+
*/
|
|
87
|
+
builtinTools?: Array<{
|
|
88
|
+
type: "builtin_web_search" | "builtin_image_process" | "builtin_knowledge_search" | "builtin_doubao_app";
|
|
89
|
+
config?: Record<string, unknown>;
|
|
90
|
+
}>;
|
|
91
|
+
/**
|
|
92
|
+
* Server-side context continuation via response chain (§5).
|
|
93
|
+
* When set, the server automatically includes previous context,
|
|
94
|
+
* so messages[] only needs to contain the NEW user message.
|
|
95
|
+
*/
|
|
96
|
+
previousResponseId?: string;
|
|
97
|
+
/**
|
|
98
|
+
* Control server-side storage of this request's input/output (§5.1).
|
|
99
|
+
* Default: true (server stores for 3 days).
|
|
100
|
+
*/
|
|
101
|
+
store?: boolean;
|
|
102
|
+
/** Expiration time for stored response (Unix seconds, max 7 days from now) */
|
|
103
|
+
storeExpireAt?: number;
|
|
104
|
+
/**
|
|
105
|
+
* Per-turn system instruction augmentation (§8).
|
|
106
|
+
* Temporarily overlays persona or adds constraints for this turn only.
|
|
107
|
+
* NOTE: Incompatible with caching — do not use both together.
|
|
108
|
+
*/
|
|
109
|
+
instructions?: string;
|
|
110
|
+
/**
|
|
111
|
+
* Structured output format (§16).
|
|
112
|
+
* Forces model to produce JSON conforming to the specified schema.
|
|
113
|
+
*/
|
|
114
|
+
structuredOutput?: StructuredOutputConfig;
|
|
115
|
+
/**
|
|
116
|
+
* Caching configuration (§20).
|
|
117
|
+
* Controls prefix/session caching behavior.
|
|
118
|
+
* NOTE: Incompatible with instructions, json_schema, and builtin tools.
|
|
119
|
+
*/
|
|
120
|
+
caching?: CachingConfig;
|
|
121
|
+
/**
|
|
122
|
+
* Context management edits (§21, beta).
|
|
123
|
+
* Server-side trimming of historical thinking chains and tool call traces.
|
|
124
|
+
*/
|
|
125
|
+
contextManagement?: ContextManagementConfig;
|
|
22
126
|
}
|
|
127
|
+
/**
|
|
128
|
+
* FIM completion request — DeepSeek Beta Completions API.
|
|
129
|
+
* POST /beta/v1/completions with prompt + suffix.
|
|
130
|
+
* Only works with non-thinking mode.
|
|
131
|
+
*/
|
|
132
|
+
export interface FIMRequest {
|
|
133
|
+
model: string;
|
|
134
|
+
/** Text before the cursor (prefix context) */
|
|
135
|
+
prompt: string;
|
|
136
|
+
/** Text after the cursor (suffix context) */
|
|
137
|
+
suffix?: string;
|
|
138
|
+
/** Max tokens to generate for the infill */
|
|
139
|
+
maxTokens?: number;
|
|
140
|
+
/** Sampling temperature */
|
|
141
|
+
temperature?: number;
|
|
142
|
+
/** Stop sequences */
|
|
143
|
+
stop?: string[];
|
|
144
|
+
}
|
|
145
|
+
export type FIMChunk = {
|
|
146
|
+
type: "delta";
|
|
147
|
+
text: string;
|
|
148
|
+
} | {
|
|
149
|
+
type: "done";
|
|
150
|
+
finishReason: string;
|
|
151
|
+
};
|
|
23
152
|
export type LLMChunk = {
|
|
24
153
|
type: "delta";
|
|
25
154
|
text: string;
|
|
@@ -43,6 +172,27 @@ export type LLMChunk = {
|
|
|
43
172
|
reasoningTokens?: number;
|
|
44
173
|
cacheReadTokens?: number;
|
|
45
174
|
cacheCreationTokens?: number;
|
|
175
|
+
} | {
|
|
176
|
+
type: "response_id";
|
|
177
|
+
id: string;
|
|
178
|
+
} | {
|
|
179
|
+
/** Informational status from platform-executed builtin tools (web_search, image_process). */
|
|
180
|
+
type: "builtin_tool_status";
|
|
181
|
+
toolType: string;
|
|
182
|
+
event: string;
|
|
183
|
+
data?: Record<string, unknown>;
|
|
184
|
+
} | {
|
|
185
|
+
/** Web search citation annotations from Volcengine web_search results. */
|
|
186
|
+
type: "annotations";
|
|
187
|
+
annotations: Array<{
|
|
188
|
+
type: string;
|
|
189
|
+
url?: string;
|
|
190
|
+
title?: string;
|
|
191
|
+
[key: string]: unknown;
|
|
192
|
+
}>;
|
|
193
|
+
} | {
|
|
194
|
+
type: "error";
|
|
195
|
+
message: string;
|
|
46
196
|
} | {
|
|
47
197
|
type: "done";
|
|
48
198
|
finishReason: string;
|
|
@@ -58,6 +208,11 @@ export interface LLMTransport {
|
|
|
58
208
|
* apiKey is passed explicitly (from agent.turn.config, not env).
|
|
59
209
|
*/
|
|
60
210
|
stream(request: LLMRequest, apiKey: string, signal?: AbortSignal): AsyncGenerator<LLMChunk>;
|
|
211
|
+
/**
|
|
212
|
+
* FIM (Fill-In-Middle) completion — optional capability.
|
|
213
|
+
* Only implemented by providers that support it (DeepSeek /beta endpoint).
|
|
214
|
+
*/
|
|
215
|
+
complete?(request: FIMRequest, apiKey: string, signal?: AbortSignal): AsyncGenerator<FIMChunk>;
|
|
61
216
|
}
|
|
62
217
|
/**
|
|
63
218
|
* Accumulate tool_call_delta chunks into complete ToolCall objects.
|
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
* - signature_delta handling for thinking blocks
|
|
13
13
|
*/
|
|
14
14
|
import type { LLMChunk, LLMRequest, LLMTransport } from "../transport.js";
|
|
15
|
+
import type { ProviderQuirks } from "../provider-def.js";
|
|
15
16
|
export interface AnthropicTransportConfig {
|
|
16
17
|
baseUrl: string;
|
|
17
18
|
/** anthropic-version header (default "2023-06-01") */
|
|
@@ -24,6 +25,10 @@ export interface AnthropicTransportConfig {
|
|
|
24
25
|
enablePromptCaching?: boolean;
|
|
25
26
|
/** Max retry attempts on transient errors (default 3) */
|
|
26
27
|
maxRetries?: number;
|
|
28
|
+
/** Omit temperature when it equals 0 — MiniMax rejects temperature=0 */
|
|
29
|
+
omitZeroTemperature?: boolean;
|
|
30
|
+
/** Provider-specific quirks for conditional logic (CC/altcode parity) */
|
|
31
|
+
quirks?: ProviderQuirks;
|
|
27
32
|
}
|
|
28
33
|
export declare class AnthropicMessagesTransport implements LLMTransport {
|
|
29
34
|
private baseUrl;
|
|
@@ -32,6 +37,8 @@ export declare class AnthropicMessagesTransport implements LLMTransport {
|
|
|
32
37
|
private streamIdleTimeoutMs;
|
|
33
38
|
private enablePromptCaching;
|
|
34
39
|
private maxRetries;
|
|
40
|
+
private omitZeroTemperature;
|
|
41
|
+
private quirks;
|
|
35
42
|
constructor(config: AnthropicTransportConfig);
|
|
36
43
|
stream(request: LLMRequest, apiKey: string, signal?: AbortSignal): AsyncGenerator<LLMChunk>;
|
|
37
44
|
/**
|
|
@@ -18,4 +18,9 @@ export declare class MiniMaxMediaTransport implements MediaTransport {
|
|
|
18
18
|
constructor(config: MiniMaxMediaConfig);
|
|
19
19
|
generate(request: MediaRequest, apiKey: string, signal?: AbortSignal): Promise<MediaResult>;
|
|
20
20
|
private pollTask;
|
|
21
|
+
/**
|
|
22
|
+
* Generate lyrics from a text prompt via MiniMax Lyrics Generation API.
|
|
23
|
+
* POST /v1/lyrics_generation — returns structured lyrics with tags.
|
|
24
|
+
*/
|
|
25
|
+
generateLyrics(prompt: string, apiKey: string, signal?: AbortSignal): Promise<string>;
|
|
21
26
|
}
|
|
@@ -11,7 +11,8 @@
|
|
|
11
11
|
*
|
|
12
12
|
* Adapted from admin-infer-proxy-client.ts SSE logic + Hermes openai_chat.py transport.
|
|
13
13
|
*/
|
|
14
|
-
import type { LLMChunk, LLMRequest, LLMTransport } from "../transport.js";
|
|
14
|
+
import type { LLMChunk, LLMRequest, LLMTransport, FIMRequest, FIMChunk } from "../transport.js";
|
|
15
|
+
import type { ProviderQuirks } from "../provider-def.js";
|
|
15
16
|
export interface OpenAIChatTransportConfig {
|
|
16
17
|
baseUrl: string;
|
|
17
18
|
/** Additional headers (e.g. for specific providers) */
|
|
@@ -22,6 +23,8 @@ export interface OpenAIChatTransportConfig {
|
|
|
22
23
|
supportsStreamOptions?: boolean;
|
|
23
24
|
/** Whether to omit temperature when it equals 0 (e.g. Moonshot rejects 0) */
|
|
24
25
|
omitZeroTemperature?: boolean;
|
|
26
|
+
/** Provider-specific quirks (CC/altcode parity) */
|
|
27
|
+
quirks?: ProviderQuirks;
|
|
25
28
|
}
|
|
26
29
|
export declare class OpenAIChatTransport implements LLMTransport {
|
|
27
30
|
private baseUrl;
|
|
@@ -29,8 +32,46 @@ export declare class OpenAIChatTransport implements LLMTransport {
|
|
|
29
32
|
private timeoutMs;
|
|
30
33
|
private supportsStreamOptions;
|
|
31
34
|
private omitZeroTemperature;
|
|
35
|
+
private quirks;
|
|
36
|
+
private cumulativeReasoningLen;
|
|
37
|
+
private cumulativeContentLen;
|
|
32
38
|
constructor(config: OpenAIChatTransportConfig);
|
|
33
39
|
stream(request: LLMRequest, apiKey: string, signal?: AbortSignal): AsyncGenerator<LLMChunk>;
|
|
34
|
-
private
|
|
35
|
-
|
|
40
|
+
private fetchAndStream;
|
|
41
|
+
/**
|
|
42
|
+
* Handle non-streaming JSON response from providers that ignore stream:true.
|
|
43
|
+
* Synthesize the same LLMChunk events a streaming response would produce.
|
|
44
|
+
*/
|
|
45
|
+
private handleNonStreamingResponse;
|
|
46
|
+
/**
|
|
47
|
+
* Parse SSE stream with 90s idle watchdog (CC parity).
|
|
48
|
+
* If no data arrives within STREAM_IDLE_TIMEOUT_MS, throw to trigger retry.
|
|
49
|
+
*/
|
|
50
|
+
private parseSSEStreamWithWatchdog;
|
|
51
|
+
private processChunk;
|
|
52
|
+
/**
|
|
53
|
+
* FIM completion via /beta/v1/completions.
|
|
54
|
+
* Only works with DeepSeek (requires supportsPrefixCompletion quirk).
|
|
55
|
+
* Non-thinking mode only; max completion 4K tokens.
|
|
56
|
+
*/
|
|
57
|
+
complete(request: FIMRequest, apiKey: string, signal?: AbortSignal): AsyncGenerator<FIMChunk>;
|
|
58
|
+
/**
|
|
59
|
+
* Upload a file for use in conversations (Kimi File API).
|
|
60
|
+
* Returns a file_id that can be referenced in user messages.
|
|
61
|
+
* POST /v1/files with multipart/form-data.
|
|
62
|
+
*/
|
|
63
|
+
uploadFile(fileBlob: Blob, filename: string, purpose: string, apiKey: string, signal?: AbortSignal): Promise<{
|
|
64
|
+
fileId: string;
|
|
65
|
+
filename: string;
|
|
66
|
+
bytes: number;
|
|
67
|
+
}>;
|
|
68
|
+
/**
|
|
69
|
+
* Get file content/status — GET /v1/files/{file_id}
|
|
70
|
+
*/
|
|
71
|
+
getFileInfo(fileId: string, apiKey: string, signal?: AbortSignal): Promise<{
|
|
72
|
+
id: string;
|
|
73
|
+
filename: string;
|
|
74
|
+
bytes: number;
|
|
75
|
+
status: string;
|
|
76
|
+
}>;
|
|
36
77
|
}
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Realtime WebSocket Transport — bidirectional audio/voice streaming
|
|
3
|
+
* via the OpenAI Realtime API protocol (also compatible with GLM Realtime).
|
|
4
|
+
*
|
|
5
|
+
* ## Protocol: WebSocket JSON events
|
|
6
|
+
*
|
|
7
|
+
* Client → Server:
|
|
8
|
+
* - session.update: configure session (model, voice, tools, etc.)
|
|
9
|
+
* - input_audio_buffer.append: send audio chunks (base64 PCM16)
|
|
10
|
+
* - input_audio_buffer.commit: signal end of audio input
|
|
11
|
+
* - conversation.item.create: inject text/function_result items
|
|
12
|
+
* - response.create: request a model response
|
|
13
|
+
* - response.cancel: abort in-progress response
|
|
14
|
+
*
|
|
15
|
+
* Server → Client:
|
|
16
|
+
* - session.created: session initialized
|
|
17
|
+
* - session.updated: config acknowledged
|
|
18
|
+
* - input_audio_buffer.speech_started: VAD detected speech
|
|
19
|
+
* - input_audio_buffer.speech_stopped: VAD detected silence
|
|
20
|
+
* - response.created: response generation started
|
|
21
|
+
* - response.output_item.added: new output item (text/audio/function_call)
|
|
22
|
+
* - response.audio.delta: audio chunk (base64 PCM16)
|
|
23
|
+
* - response.audio_transcript.delta: transcript of generated speech
|
|
24
|
+
* - response.text.delta: text generation delta
|
|
25
|
+
* - response.function_call_arguments.delta: tool call args delta
|
|
26
|
+
* - response.function_call_arguments.done: tool call complete
|
|
27
|
+
* - response.output_item.done: output item finished
|
|
28
|
+
* - response.done: full response complete
|
|
29
|
+
* - error: server error
|
|
30
|
+
*
|
|
31
|
+
* ## Architecture
|
|
32
|
+
*
|
|
33
|
+
* RealtimeTransport manages a single persistent WebSocket connection per session.
|
|
34
|
+
* It exposes an event-driven API (AsyncGenerator) that the agent tool-loop
|
|
35
|
+
* can consume for voice-enabled interactions.
|
|
36
|
+
*
|
|
37
|
+
* Docs:
|
|
38
|
+
* - OpenAI: https://platform.openai.com/docs/api-reference/realtime
|
|
39
|
+
* - GLM: https://docs.bigmodel.cn/cn/guide/develop/realtime-api
|
|
40
|
+
*/
|
|
41
|
+
export interface RealtimeConfig {
|
|
42
|
+
/** WebSocket endpoint (e.g. "wss://api.openai.com/v1/realtime") */
|
|
43
|
+
baseUrl: string;
|
|
44
|
+
/** Model to use (e.g. "gpt-4o-realtime-preview", "glm-realtime") */
|
|
45
|
+
model: string;
|
|
46
|
+
/** API key */
|
|
47
|
+
apiKey: string;
|
|
48
|
+
/** Voice for TTS output */
|
|
49
|
+
voice?: string;
|
|
50
|
+
/** Input modalities: "text", "audio", or both */
|
|
51
|
+
inputModalities?: Array<"text" | "audio">;
|
|
52
|
+
/** Output modalities: "text", "audio", or both */
|
|
53
|
+
outputModalities?: Array<"text" | "audio">;
|
|
54
|
+
/** Temperature for generation */
|
|
55
|
+
temperature?: number;
|
|
56
|
+
/** Tool definitions for function calling */
|
|
57
|
+
tools?: RealtimeTool[];
|
|
58
|
+
/** Voice Activity Detection mode */
|
|
59
|
+
vadMode?: "server_vad" | "none";
|
|
60
|
+
/** VAD threshold (0.0-1.0) */
|
|
61
|
+
vadThreshold?: number;
|
|
62
|
+
/** Auth type: "header" (OpenAI) or "query" (GLM) */
|
|
63
|
+
authMode?: "header" | "query";
|
|
64
|
+
}
|
|
65
|
+
export interface RealtimeTool {
|
|
66
|
+
type: "function";
|
|
67
|
+
name: string;
|
|
68
|
+
description: string;
|
|
69
|
+
parameters: Record<string, unknown>;
|
|
70
|
+
}
|
|
71
|
+
export type RealtimeEvent = {
|
|
72
|
+
type: "session_created";
|
|
73
|
+
sessionId: string;
|
|
74
|
+
} | {
|
|
75
|
+
type: "speech_started";
|
|
76
|
+
} | {
|
|
77
|
+
type: "speech_stopped";
|
|
78
|
+
audioEndMs: number;
|
|
79
|
+
} | {
|
|
80
|
+
type: "audio_delta";
|
|
81
|
+
delta: string;
|
|
82
|
+
} | {
|
|
83
|
+
type: "audio_transcript_delta";
|
|
84
|
+
delta: string;
|
|
85
|
+
} | {
|
|
86
|
+
type: "text_delta";
|
|
87
|
+
delta: string;
|
|
88
|
+
} | {
|
|
89
|
+
type: "function_call_start";
|
|
90
|
+
callId: string;
|
|
91
|
+
name: string;
|
|
92
|
+
} | {
|
|
93
|
+
type: "function_call_delta";
|
|
94
|
+
callId: string;
|
|
95
|
+
delta: string;
|
|
96
|
+
} | {
|
|
97
|
+
type: "function_call_done";
|
|
98
|
+
callId: string;
|
|
99
|
+
name: string;
|
|
100
|
+
arguments: string;
|
|
101
|
+
} | {
|
|
102
|
+
type: "response_done";
|
|
103
|
+
usage?: RealtimeUsage;
|
|
104
|
+
} | {
|
|
105
|
+
type: "error";
|
|
106
|
+
code: string;
|
|
107
|
+
message: string;
|
|
108
|
+
} | {
|
|
109
|
+
type: "closed";
|
|
110
|
+
code: number;
|
|
111
|
+
reason: string;
|
|
112
|
+
};
|
|
113
|
+
export interface RealtimeUsage {
|
|
114
|
+
inputTokens: number;
|
|
115
|
+
outputTokens: number;
|
|
116
|
+
inputAudioTokens?: number;
|
|
117
|
+
outputAudioTokens?: number;
|
|
118
|
+
}
|
|
119
|
+
/**
|
|
120
|
+
* Manages a persistent WebSocket connection for real-time audio/voice
|
|
121
|
+
* interactions with an LLM provider.
|
|
122
|
+
*
|
|
123
|
+
* Usage:
|
|
124
|
+
* ```ts
|
|
125
|
+
* const rt = new RealtimeTransport(config);
|
|
126
|
+
* rt.connect();
|
|
127
|
+
*
|
|
128
|
+
* // Send audio
|
|
129
|
+
* rt.appendAudio(base64Chunk);
|
|
130
|
+
* rt.commitAudio();
|
|
131
|
+
*
|
|
132
|
+
* // Or send text
|
|
133
|
+
* rt.sendText("Hello!");
|
|
134
|
+
*
|
|
135
|
+
* // Submit function results
|
|
136
|
+
* rt.sendFunctionResult(callId, result);
|
|
137
|
+
*
|
|
138
|
+
* // Consume events
|
|
139
|
+
* for await (const event of rt.events()) {
|
|
140
|
+
* switch (event.type) {
|
|
141
|
+
* case "audio_delta": playAudio(event.delta); break;
|
|
142
|
+
* case "function_call_done": handleToolCall(event); break;
|
|
143
|
+
* }
|
|
144
|
+
* }
|
|
145
|
+
*
|
|
146
|
+
* rt.close();
|
|
147
|
+
* ```
|
|
148
|
+
*/
|
|
149
|
+
export declare class RealtimeTransport {
|
|
150
|
+
private ws;
|
|
151
|
+
private config;
|
|
152
|
+
private eventQueue;
|
|
153
|
+
private waiters;
|
|
154
|
+
private closed;
|
|
155
|
+
constructor(config: RealtimeConfig);
|
|
156
|
+
/** Open WebSocket connection and configure session. */
|
|
157
|
+
connect(): Promise<void>;
|
|
158
|
+
/** Send audio data (base64 PCM16). */
|
|
159
|
+
appendAudio(base64Chunk: string): void;
|
|
160
|
+
/** Mark end of audio input and trigger response. */
|
|
161
|
+
commitAudio(): void;
|
|
162
|
+
/** Send a text message. */
|
|
163
|
+
sendText(text: string): void;
|
|
164
|
+
/** Submit a function call result back to the model. */
|
|
165
|
+
sendFunctionResult(callId: string, output: string): void;
|
|
166
|
+
/** Trigger a model response (e.g. after sending text). */
|
|
167
|
+
requestResponse(): void;
|
|
168
|
+
/** Cancel an in-progress response. */
|
|
169
|
+
cancelResponse(): void;
|
|
170
|
+
/** Async iterator of server events. */
|
|
171
|
+
events(): AsyncGenerator<RealtimeEvent>;
|
|
172
|
+
/** Close the WebSocket connection. */
|
|
173
|
+
close(): void;
|
|
174
|
+
private buildUrl;
|
|
175
|
+
private sendSessionUpdate;
|
|
176
|
+
private send;
|
|
177
|
+
private push;
|
|
178
|
+
private drainWaiters;
|
|
179
|
+
/**
|
|
180
|
+
* Parse a server-sent JSON event into our typed event(s).
|
|
181
|
+
*/
|
|
182
|
+
private parseServerEvent;
|
|
183
|
+
}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Volcengine Grounding — spatial coordinate parser (volcengine-ProviderMax §14).
|
|
3
|
+
*
|
|
4
|
+
* Parses model-emitted spatial reference tags from text output:
|
|
5
|
+
* - <bbox>x_min y_min x_max y_max</bbox> → bounding box
|
|
6
|
+
* - <point>x y</point> → single point
|
|
7
|
+
* - <polygon>x1 y1 x2 y2 ...</polygon> → polygon vertices
|
|
8
|
+
*
|
|
9
|
+
* All coordinates are in normalized 1000×1000 space, range [0, 999].
|
|
10
|
+
* Use `toPixelCoords()` to convert to actual image pixel coordinates.
|
|
11
|
+
*/
|
|
12
|
+
export type SpatialReference = {
|
|
13
|
+
type: "bbox";
|
|
14
|
+
x1: number;
|
|
15
|
+
y1: number;
|
|
16
|
+
x2: number;
|
|
17
|
+
y2: number;
|
|
18
|
+
space: "normalized_1000";
|
|
19
|
+
} | {
|
|
20
|
+
type: "point";
|
|
21
|
+
x: number;
|
|
22
|
+
y: number;
|
|
23
|
+
space: "normalized_1000";
|
|
24
|
+
} | {
|
|
25
|
+
type: "polygon";
|
|
26
|
+
points: Array<{
|
|
27
|
+
x: number;
|
|
28
|
+
y: number;
|
|
29
|
+
}>;
|
|
30
|
+
space: "normalized_1000";
|
|
31
|
+
};
|
|
32
|
+
export interface PixelBbox {
|
|
33
|
+
x1: number;
|
|
34
|
+
y1: number;
|
|
35
|
+
x2: number;
|
|
36
|
+
y2: number;
|
|
37
|
+
}
|
|
38
|
+
export interface PixelPoint {
|
|
39
|
+
x: number;
|
|
40
|
+
y: number;
|
|
41
|
+
}
|
|
42
|
+
/**
|
|
43
|
+
* Extract all spatial references from model output text.
|
|
44
|
+
* Returns an empty array if no grounding tags are found.
|
|
45
|
+
*/
|
|
46
|
+
export declare function parseGroundingTags(text: string): SpatialReference[];
|
|
47
|
+
/**
|
|
48
|
+
* Convert a normalized 1000×1000 bounding box to pixel coordinates.
|
|
49
|
+
*/
|
|
50
|
+
export declare function bboxToPixels(ref: Extract<SpatialReference, {
|
|
51
|
+
type: "bbox";
|
|
52
|
+
}>, width: number, height: number): PixelBbox;
|
|
53
|
+
/**
|
|
54
|
+
* Convert a normalized 1000×1000 point to pixel coordinates.
|
|
55
|
+
*/
|
|
56
|
+
export declare function pointToPixels(ref: Extract<SpatialReference, {
|
|
57
|
+
type: "point";
|
|
58
|
+
}>, width: number, height: number): PixelPoint;
|
|
@@ -27,8 +27,58 @@ export declare class VolcengineMediaTransport implements MediaTransport {
|
|
|
27
27
|
*/
|
|
28
28
|
canHandle(request: MediaRequest): boolean;
|
|
29
29
|
private generateImage;
|
|
30
|
+
/**
|
|
31
|
+
* Parse streaming image SSE — yields progressive image quality upgrades.
|
|
32
|
+
* Final event contains the full-quality image URL.
|
|
33
|
+
*/
|
|
34
|
+
private parseStreamingImage;
|
|
30
35
|
private generateVideo;
|
|
31
36
|
private generate3D;
|
|
37
|
+
/**
|
|
38
|
+
* List video generation tasks with optional filters.
|
|
39
|
+
* GET /v3/contents/generations/tasks
|
|
40
|
+
*/
|
|
41
|
+
listVideoTasks(apiKey: string, options?: {
|
|
42
|
+
after?: string;
|
|
43
|
+
limit?: number;
|
|
44
|
+
status?: string;
|
|
45
|
+
}, signal?: AbortSignal): Promise<Record<string, unknown>>;
|
|
46
|
+
/**
|
|
47
|
+
* Cancel or delete a video generation task.
|
|
48
|
+
* DELETE /v3/contents/generations/tasks/{taskId}
|
|
49
|
+
*/
|
|
50
|
+
deleteVideoTask(taskId: string, apiKey: string, signal?: AbortSignal): Promise<void>;
|
|
51
|
+
/**
|
|
52
|
+
* Upload a file to Volcengine Files API for reuse in multimodal requests.
|
|
53
|
+
* POST /v3/files
|
|
54
|
+
*/
|
|
55
|
+
uploadFile(file: Blob | Buffer, apiKey: string, options?: {
|
|
56
|
+
purpose?: string;
|
|
57
|
+
filename?: string;
|
|
58
|
+
}, signal?: AbortSignal): Promise<{
|
|
59
|
+
id: string;
|
|
60
|
+
status: string;
|
|
61
|
+
}>;
|
|
62
|
+
/**
|
|
63
|
+
* Get file info by ID.
|
|
64
|
+
* GET /v3/files/{fileId}
|
|
65
|
+
*/
|
|
66
|
+
getFile(fileId: string, apiKey: string, signal?: AbortSignal): Promise<Record<string, unknown>>;
|
|
67
|
+
/**
|
|
68
|
+
* List uploaded files.
|
|
69
|
+
* GET /v3/files
|
|
70
|
+
*/
|
|
71
|
+
listFiles(apiKey: string, options?: {
|
|
72
|
+
after?: string;
|
|
73
|
+
limit?: number;
|
|
74
|
+
purpose?: string;
|
|
75
|
+
order?: "asc" | "desc";
|
|
76
|
+
}, signal?: AbortSignal): Promise<Record<string, unknown>>;
|
|
77
|
+
/**
|
|
78
|
+
* Delete a file.
|
|
79
|
+
* DELETE /v3/files/{fileId}
|
|
80
|
+
*/
|
|
81
|
+
deleteFile(fileId: string, apiKey: string, signal?: AbortSignal): Promise<void>;
|
|
32
82
|
private submitTask;
|
|
33
83
|
private pollTask;
|
|
34
84
|
}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Volcengine Responses API Transport — SSE streaming implementation.
|
|
3
|
+
*
|
|
4
|
+
* Implements the Volcengine Ark Responses API (`/api/v3/responses`),
|
|
5
|
+
* which is the officially recommended primary path for Doubao LLM text generation
|
|
6
|
+
* (250615+ models: doubao-seed-2.0 series).
|
|
7
|
+
*
|
|
8
|
+
* Key differences from OpenAI Chat Completions:
|
|
9
|
+
* - Endpoint: POST {baseUrl}/v3/responses
|
|
10
|
+
* - Request body uses `input` (not `messages`), `instructions`, `thinking`, `reasoning`
|
|
11
|
+
* - SSE events: response.output_text.delta, response.reasoning_summary_text.delta,
|
|
12
|
+
* response.function_call_arguments.delta, response.completed, etc.
|
|
13
|
+
* - Tool calling: function_call / function_call_output with call_id
|
|
14
|
+
* - Context persistence: previous_response_id for server-side session continuation
|
|
15
|
+
* - Deep thinking: thinking.type (enabled/disabled/auto) + reasoning.effort
|
|
16
|
+
*
|
|
17
|
+
* Docs: https://www.volcengine.com/docs/82379/1399008
|
|
18
|
+
*/
|
|
19
|
+
import type { LLMChunk, LLMRequest, LLMTransport } from "../transport.js";
|
|
20
|
+
import type { ProviderQuirks } from "../provider-def.js";
|
|
21
|
+
export interface VolcengineResponsesTransportConfig {
|
|
22
|
+
baseUrl: string;
|
|
23
|
+
extraHeaders?: Record<string, string>;
|
|
24
|
+
timeoutMs?: number;
|
|
25
|
+
quirks?: ProviderQuirks;
|
|
26
|
+
}
|
|
27
|
+
export declare class VolcengineResponsesTransport implements LLMTransport {
|
|
28
|
+
private baseUrl;
|
|
29
|
+
private extraHeaders;
|
|
30
|
+
private timeoutMs;
|
|
31
|
+
private quirks;
|
|
32
|
+
constructor(config: VolcengineResponsesTransportConfig);
|
|
33
|
+
stream(request: LLMRequest, apiKey: string, signal?: AbortSignal): AsyncGenerator<LLMChunk>;
|
|
34
|
+
/**
|
|
35
|
+
* Resolve known Volcengine Responses API incompatibilities:
|
|
36
|
+
* - instructions + caching → drop caching (§20.7)
|
|
37
|
+
* - caching + json_schema → downgrade to json_object (§20.10)
|
|
38
|
+
* - caching + builtin_web_search/image_process → drop those builtin tools
|
|
39
|
+
* Returns a shallow copy with fields adjusted; never mutates the original.
|
|
40
|
+
*/
|
|
41
|
+
private resolveConstraints;
|
|
42
|
+
private buildRequestBody;
|
|
43
|
+
private fetchAndStream;
|
|
44
|
+
private handleNonStreamingResponse;
|
|
45
|
+
/**
|
|
46
|
+
* Parse Volcengine Responses API SSE stream.
|
|
47
|
+
*
|
|
48
|
+
* Event format: "event: <type>\ndata: <json>\n\n"
|
|
49
|
+
* Key events:
|
|
50
|
+
* - response.output_text.delta → text content delta
|
|
51
|
+
* - response.reasoning_summary_text.delta → thinking/reasoning text
|
|
52
|
+
* - response.function_call_arguments.delta → tool call arguments streaming
|
|
53
|
+
* - response.output_item.added → new output item started
|
|
54
|
+
* - response.output_item.done → output item completed
|
|
55
|
+
* - response.completed → full response complete with usage
|
|
56
|
+
* - response.failed → error
|
|
57
|
+
*/
|
|
58
|
+
private parseSSEStream;
|
|
59
|
+
private processEvent;
|
|
60
|
+
}
|