voice-agent-ai-sdk 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +231 -0
- package/dist/VoiceAgent.d.ts +242 -0
- package/dist/VoiceAgent.d.ts.map +1 -0
- package/dist/VoiceAgent.js +1093 -0
- package/dist/VoiceAgent.js.map +1 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +6 -0
- package/dist/index.js.map +1 -0
- package/dist/utils/StreamBuffer.d.ts +1 -0
- package/dist/utils/StreamBuffer.d.ts.map +1 -0
- package/dist/utils/StreamBuffer.js +2 -0
- package/dist/utils/StreamBuffer.js.map +1 -0
- package/package.json +57 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Bijit Mondal
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
# voice-agent-ai-sdk
|
|
2
|
+
|
|
3
|
+
[voice-agent-ai-sdk on npm](https://www.npmjs.com/package/voice-agent-ai-sdk)
|
|
4
|
+
|
|
5
|
+
Streaming voice/text agent SDK built on [AI SDK](https://sdk.vercel.ai/) with optional WebSocket transport.
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- **Streaming text generation** via AI SDK `streamText` with multi-step tool calling.
|
|
10
|
+
- **Chunked streaming TTS** — text is split at sentence boundaries and converted to speech in parallel as the LLM streams, giving low time-to-first-audio.
|
|
11
|
+
- **Audio transcription** via AI SDK `experimental_transcribe` (e.g. Whisper).
|
|
12
|
+
- **Barge-in / interruption** — user speech cancels both the in-flight LLM stream and pending TTS, saving tokens and latency.
|
|
13
|
+
- **Memory management** — configurable sliding window over the conversation history (`maxMessages`, `maxTotalChars`), plus a size limit on audio input.
|
|
14
|
+
- **Serial request queue** — concurrent `sendText` / audio inputs are queued and processed one at a time, preventing race conditions.
|
|
15
|
+
- **Graceful lifecycle** — `disconnect()` aborts all in-flight work; `destroy()` permanently releases every resource.
|
|
16
|
+
- **WebSocket transport** with a full protocol of stream, tool, and speech lifecycle events.
|
|
17
|
+
- **Works without WebSocket** — call `sendText()` directly for text-only or server-side use.
|
|
18
|
+
|
|
19
|
+
## Prerequisites
|
|
20
|
+
|
|
21
|
+
- Node.js 20+
|
|
22
|
+
- pnpm
|
|
23
|
+
- OpenAI API key
|
|
24
|
+
|
|
25
|
+
## Setup
|
|
26
|
+
|
|
27
|
+
1. Install dependencies:
|
|
28
|
+
|
|
29
|
+
pnpm install
|
|
30
|
+
|
|
31
|
+
2. Configure environment variables in `.env`:
|
|
32
|
+
|
|
33
|
+
OPENAI_API_KEY=your_openai_api_key
|
|
34
|
+
VOICE_WS_ENDPOINT=ws://localhost:8080
|
|
35
|
+
|
|
36
|
+
`VOICE_WS_ENDPOINT` is optional for text-only usage.
|
|
37
|
+
|
|
38
|
+
## VoiceAgent usage (as in the demo)
|
|
39
|
+
|
|
40
|
+
Minimal end-to-end example using AI SDK tools, streaming text, and streaming TTS:
|
|
41
|
+
|
|
42
|
+
```ts
|
|
43
|
+
import "dotenv/config";
|
|
44
|
+
import { VoiceAgent } from "./src";
|
|
45
|
+
import { tool } from "ai";
|
|
46
|
+
import { z } from "zod";
|
|
47
|
+
import { openai } from "@ai-sdk/openai";
|
|
48
|
+
|
|
49
|
+
const weatherTool = tool({
|
|
50
|
+
description: "Get the weather in a location",
|
|
51
|
+
inputSchema: z.object({ location: z.string() }),
|
|
52
|
+
execute: async ({ location }) => ({ location, temperature: 72, conditions: "sunny" }),
|
|
53
|
+
});
|
|
54
|
+
|
|
55
|
+
const agent = new VoiceAgent({
|
|
56
|
+
model: openai("gpt-4o"),
|
|
57
|
+
transcriptionModel: openai.transcription("whisper-1"),
|
|
58
|
+
speechModel: openai.speech("gpt-4o-mini-tts"),
|
|
59
|
+
instructions: "You are a helpful voice assistant.",
|
|
60
|
+
voice: "alloy",
|
|
61
|
+
speechInstructions: "Speak in a friendly, natural conversational tone.",
|
|
62
|
+
outputFormat: "mp3",
|
|
63
|
+
streamingSpeech: {
|
|
64
|
+
minChunkSize: 40,
|
|
65
|
+
maxChunkSize: 180,
|
|
66
|
+
parallelGeneration: true,
|
|
67
|
+
maxParallelRequests: 2,
|
|
68
|
+
},
|
|
69
|
+
// Memory management (new in 0.1.0)
|
|
70
|
+
history: {
|
|
71
|
+
maxMessages: 50, // keep last 50 messages
|
|
72
|
+
maxTotalChars: 100_000, // or trim when total chars exceed 100k
|
|
73
|
+
},
|
|
74
|
+
maxAudioInputSize: 5 * 1024 * 1024, // 5 MB limit
|
|
75
|
+
endpoint: process.env.VOICE_WS_ENDPOINT,
|
|
76
|
+
tools: { getWeather: weatherTool },
|
|
77
|
+
});
|
|
78
|
+
|
|
79
|
+
agent.on("text", ({ role, text }) => {
|
|
80
|
+
const prefix = role === "user" ? "👤" : "🤖";
|
|
81
|
+
console.log(prefix, text);
|
|
82
|
+
});
|
|
83
|
+
|
|
84
|
+
agent.on("chunk:text_delta", ({ text }) => process.stdout.write(text));
|
|
85
|
+
agent.on("speech_start", ({ streaming }) => console.log("speech_start", streaming));
|
|
86
|
+
agent.on("audio_chunk", ({ chunkId, format, uint8Array }) => {
|
|
87
|
+
console.log("audio_chunk", chunkId, format, uint8Array.length);
|
|
88
|
+
});
|
|
89
|
+
|
|
90
|
+
await agent.sendText("What's the weather in San Francisco?");
|
|
91
|
+
|
|
92
|
+
if (process.env.VOICE_WS_ENDPOINT) {
|
|
93
|
+
await agent.connect(process.env.VOICE_WS_ENDPOINT);
|
|
94
|
+
}
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### Configuration options
|
|
98
|
+
|
|
99
|
+
The agent accepts:
|
|
100
|
+
|
|
101
|
+
| Option | Required | Default | Description |
|
|
102
|
+
|---|---|---|---|
|
|
103
|
+
| `model` | **yes** | — | AI SDK chat model (e.g. `openai("gpt-4o")`) |
|
|
104
|
+
| `transcriptionModel` | no | — | AI SDK transcription model (e.g. `openai.transcription("whisper-1")`) |
|
|
105
|
+
| `speechModel` | no | — | AI SDK speech model (e.g. `openai.speech("gpt-4o-mini-tts")`) |
|
|
106
|
+
| `instructions` | no | `"You are a helpful voice assistant."` | System prompt |
|
|
107
|
+
| `stopWhen` | no | `stepCountIs(5)` | Stopping condition for multi-step tool loops |
|
|
108
|
+
| `tools` | no | `{}` | AI SDK tools map |
|
|
109
|
+
| `endpoint` | no | — | Default WebSocket URL for `connect()` |
|
|
110
|
+
| `voice` | no | `"alloy"` | TTS voice |
|
|
111
|
+
| `speechInstructions` | no | — | Style instructions passed to the speech model |
|
|
112
|
+
| `outputFormat` | no | `"mp3"` | Audio output format (`mp3`, `opus`, `wav`, …) |
|
|
113
|
+
| `streamingSpeech` | no | see below | Streaming TTS chunk tuning |
|
|
114
|
+
| `history` | no | see below | Conversation memory limits |
|
|
115
|
+
| `maxAudioInputSize` | no | `10485760` (10 MB) | Maximum accepted audio input in bytes |
|
|
116
|
+
|
|
117
|
+
#### `streamingSpeech`
|
|
118
|
+
|
|
119
|
+
| Key | Default | Description |
|
|
120
|
+
|---|---|---|
|
|
121
|
+
| `minChunkSize` | `50` | Min characters before a sentence is sent to TTS |
|
|
122
|
+
| `maxChunkSize` | `200` | Max characters per chunk (force-split at clause boundary) |
|
|
123
|
+
| `parallelGeneration` | `true` | Start TTS for upcoming chunks while the current one plays |
|
|
124
|
+
| `maxParallelRequests` | `3` | Cap on concurrent TTS requests |
|
|
125
|
+
|
|
126
|
+
#### `history`
|
|
127
|
+
|
|
128
|
+
| Key | Default | Description |
|
|
129
|
+
|---|---|---|
|
|
130
|
+
| `maxMessages` | `100` | Max messages kept in history (0 = unlimited). Oldest are trimmed in pairs. |
|
|
131
|
+
| `maxTotalChars` | `0` (unlimited) | Max total characters across all messages. Oldest are trimmed when exceeded. |
|
|
132
|
+
|
|
133
|
+
### Methods
|
|
134
|
+
|
|
135
|
+
| Method | Description |
|
|
136
|
+
|---|---|
|
|
137
|
+
| `sendText(text)` | Process text input. Returns a promise with the full assistant response. Requests are queued serially. |
|
|
138
|
+
| `sendAudio(base64Audio)` | Transcribe base64 audio and process the result. |
|
|
139
|
+
| `sendAudioBuffer(buffer)` | Same as `sendAudio`, but accepts a raw `Buffer` / `Uint8Array` instead of a base64 string. |
|
|
140
|
+
| `transcribeAudio(buffer)` | Transcribe audio to text without generating a response. |
|
|
141
|
+
| `generateAndSendSpeechFull(text)` | Non-streaming TTS fallback (entire text at once). |
|
|
142
|
+
| `interruptSpeech(reason?)` | Cancel in-flight TTS only (LLM stream keeps running). |
|
|
143
|
+
| `interruptCurrentResponse(reason?)` | Cancel **both** the LLM stream and TTS. Used for barge-in. |
|
|
144
|
+
| `connect(url?)` / `handleSocket(ws)` | Establish or attach a WebSocket. Safe to call multiple times. |
|
|
145
|
+
| `disconnect()` | Close the socket and abort all in-flight work. |
|
|
146
|
+
| `destroy()` | Permanently release all resources. The agent cannot be reused. |
|
|
147
|
+
| `clearHistory()` | Clear conversation history. |
|
|
148
|
+
| `getHistory()` / `setHistory(msgs)` | Read or restore conversation history. |
|
|
149
|
+
| `registerTools(tools)` | Merge additional tools into the agent. |
|
|
150
|
+
|
|
151
|
+
### Read-only properties
|
|
152
|
+
|
|
153
|
+
| Property | Type | Description |
|
|
154
|
+
|---|---|---|
|
|
155
|
+
| `connected` | `boolean` | Whether a WebSocket is connected |
|
|
156
|
+
| `processing` | `boolean` | Whether a request is currently being processed |
|
|
157
|
+
| `speaking` | `boolean` | Whether audio is currently being generated / sent |
|
|
158
|
+
| `pendingSpeechChunks` | `number` | Number of queued TTS chunks |
|
|
159
|
+
| `destroyed` | `boolean` | Whether `destroy()` has been called |
|
|
160
|
+
|
|
161
|
+
### Events
|
|
162
|
+
|
|
163
|
+
| Event | Payload | When |
|
|
164
|
+
|---|---|---|
|
|
165
|
+
| `text` | `{ role, text }` | User input received or full assistant response ready |
|
|
166
|
+
| `chunk:text_delta` | `{ id, text }` | Each streaming text token from the LLM |
|
|
167
|
+
| `chunk:reasoning_delta` | `{ id, text }` | Each reasoning token (models that support it) |
|
|
168
|
+
| `chunk:tool_call` | `{ toolName, toolCallId, input }` | Tool invocation detected |
|
|
169
|
+
| `tool_result` | `{ name, toolCallId, result }` | Tool execution finished |
|
|
170
|
+
| `speech_start` | `{ streaming }` | TTS generation begins |
|
|
171
|
+
| `speech_complete` | `{ streaming }` | All TTS chunks sent |
|
|
172
|
+
| `speech_interrupted` | `{ reason }` | Speech was cancelled (barge-in, disconnect, error) |
|
|
173
|
+
| `speech_chunk_queued` | `{ id, text }` | A text chunk entered the TTS queue |
|
|
174
|
+
| `audio_chunk` | `{ chunkId, data, format, text, uint8Array }` | One TTS chunk is ready |
|
|
175
|
+
| `audio` | `{ data, format, uint8Array }` | Full non-streaming TTS audio |
|
|
176
|
+
| `transcription` | `{ text, language }` | Audio transcription result |
|
|
177
|
+
| `audio_received` | `{ size }` | Raw audio input received (before transcription) |
|
|
178
|
+
| `history_trimmed` | `{ removedCount, reason }` | Oldest messages evicted from history |
|
|
179
|
+
| `connected` / `disconnected` | — | WebSocket lifecycle |
|
|
180
|
+
| `warning` | `string` | Non-fatal issues (empty input, etc.) |
|
|
181
|
+
| `error` | `Error` | Errors from LLM, TTS, transcription, or WebSocket |
|
|
182
|
+
|
|
183
|
+
## Run (text-only check)
|
|
184
|
+
|
|
185
|
+
This validates the LLM, tool calling, and streaming speech without requiring a WebSocket connection:
|
|
186
|
+
|
|
187
|
+
pnpm demo
|
|
188
|
+
|
|
189
|
+
Expected logs include `text`, `chunk:text_delta`, tool events, and speech chunk events.
|
|
190
|
+
|
|
191
|
+
## Run (WebSocket check)
|
|
192
|
+
|
|
193
|
+
1. Start the local WS server:
|
|
194
|
+
|
|
195
|
+
pnpm ws:server
|
|
196
|
+
|
|
197
|
+
2. In another terminal, run the demo:
|
|
198
|
+
|
|
199
|
+
pnpm demo
|
|
200
|
+
|
|
201
|
+
The demo will:
|
|
202
|
+
- run `sendText()` first (text-only sanity check), then
|
|
203
|
+
- connect to `VOICE_WS_ENDPOINT` if provided,
|
|
204
|
+
- emit streaming protocol messages (`text_delta`, `tool_call`, `audio_chunk`, `response_complete`, etc.).
|
|
205
|
+
|
|
206
|
+
## Browser voice client (HTML)
|
|
207
|
+
|
|
208
|
+
A simple browser client is available at [example/voice-client.html](example/voice-client.html).
|
|
209
|
+
|
|
210
|
+
What it does:
|
|
211
|
+
- captures microphone speech using the Web Speech API (speech-to-text)
|
|
212
|
+
- sends the transcript to the agent via WebSocket (`type: "transcript"`)
|
|
213
|
+
- receives streaming `audio_chunk` messages and plays them in order
|
|
214
|
+
|
|
215
|
+
How to use:
|
|
216
|
+
1. Start your agent server/WebSocket endpoint.
|
|
217
|
+
2. Open [example/voice-client.html](example/voice-client.html) in a browser (Chrome/Edge recommended).
|
|
218
|
+
3. Connect to `ws://localhost:8080` (or your endpoint), then click **Start Mic**.
|
|
219
|
+
|
|
220
|
+
## Scripts
|
|
221
|
+
|
|
222
|
+
- `pnpm build` – build TypeScript
|
|
223
|
+
- `pnpm dev` – watch TypeScript
|
|
224
|
+
- `pnpm demo` – run demo client
|
|
225
|
+
- `pnpm ws:server` – run local test WebSocket server
|
|
226
|
+
|
|
227
|
+
## Notes
|
|
228
|
+
|
|
229
|
+
- If `VOICE_WS_ENDPOINT` is empty, WebSocket connect is skipped.
|
|
230
|
+
- The sample WS server sends a mock `transcript` message for end-to-end testing.
|
|
231
|
+
- Streaming TTS uses chunk queueing and supports interruption (`interrupt`).
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
import { WebSocket } from "ws";
|
|
2
|
+
import { EventEmitter } from "events";
|
|
3
|
+
import { streamText, LanguageModel, type Tool, type ModelMessage, type TranscriptionModel, type SpeechModel } from "ai";
|
|
4
|
+
/**
|
|
5
|
+
* Configuration for streaming speech behavior
|
|
6
|
+
*/
|
|
7
|
+
interface StreamingSpeechConfig {
|
|
8
|
+
/** Minimum characters before generating speech for a chunk */
|
|
9
|
+
minChunkSize: number;
|
|
10
|
+
/** Maximum characters per chunk (will split at sentence boundary before this) */
|
|
11
|
+
maxChunkSize: number;
|
|
12
|
+
/** Whether to enable parallel TTS generation */
|
|
13
|
+
parallelGeneration: boolean;
|
|
14
|
+
/** Maximum number of parallel TTS requests */
|
|
15
|
+
maxParallelRequests: number;
|
|
16
|
+
}
|
|
17
|
+
/**
|
|
18
|
+
* Configuration for conversation history memory management
|
|
19
|
+
*/
|
|
20
|
+
interface HistoryConfig {
|
|
21
|
+
/** Maximum number of messages to keep in history. When exceeded, oldest messages are trimmed. Set to 0 for unlimited. */
|
|
22
|
+
maxMessages: number;
|
|
23
|
+
/** Maximum total character count across all messages. When exceeded, oldest messages are trimmed. Set to 0 for unlimited. */
|
|
24
|
+
maxTotalChars: number;
|
|
25
|
+
}
|
|
26
|
+
export interface VoiceAgentOptions {
|
|
27
|
+
model: LanguageModel;
|
|
28
|
+
transcriptionModel?: TranscriptionModel;
|
|
29
|
+
speechModel?: SpeechModel;
|
|
30
|
+
instructions?: string;
|
|
31
|
+
stopWhen?: NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
|
|
32
|
+
tools?: Record<string, Tool>;
|
|
33
|
+
endpoint?: string;
|
|
34
|
+
voice?: string;
|
|
35
|
+
speechInstructions?: string;
|
|
36
|
+
outputFormat?: string;
|
|
37
|
+
/** Configuration for streaming speech generation */
|
|
38
|
+
streamingSpeech?: Partial<StreamingSpeechConfig>;
|
|
39
|
+
/** Configuration for conversation history memory limits */
|
|
40
|
+
history?: Partial<HistoryConfig>;
|
|
41
|
+
/** Maximum audio input size in bytes (default: 10 MB) */
|
|
42
|
+
maxAudioInputSize?: number;
|
|
43
|
+
}
|
|
44
|
+
export declare class VoiceAgent extends EventEmitter {
|
|
45
|
+
private socket?;
|
|
46
|
+
private tools;
|
|
47
|
+
private model;
|
|
48
|
+
private transcriptionModel?;
|
|
49
|
+
private speechModel?;
|
|
50
|
+
private instructions;
|
|
51
|
+
private stopWhen;
|
|
52
|
+
private endpoint?;
|
|
53
|
+
private isConnected;
|
|
54
|
+
private conversationHistory;
|
|
55
|
+
private voice;
|
|
56
|
+
private speechInstructions?;
|
|
57
|
+
private outputFormat;
|
|
58
|
+
private isProcessing;
|
|
59
|
+
private isDestroyed;
|
|
60
|
+
private inputQueue;
|
|
61
|
+
private processingQueue;
|
|
62
|
+
private currentStreamAbortController?;
|
|
63
|
+
private historyConfig;
|
|
64
|
+
private maxAudioInputSize;
|
|
65
|
+
private streamingSpeechConfig;
|
|
66
|
+
private currentSpeechAbortController?;
|
|
67
|
+
private speechChunkQueue;
|
|
68
|
+
private nextChunkId;
|
|
69
|
+
private isSpeaking;
|
|
70
|
+
private pendingTextBuffer;
|
|
71
|
+
private speechQueueDonePromise?;
|
|
72
|
+
private speechQueueDoneResolve?;
|
|
73
|
+
constructor(options: VoiceAgentOptions);
|
|
74
|
+
/**
|
|
75
|
+
* Ensure the agent has not been destroyed. Throws if it has.
|
|
76
|
+
*/
|
|
77
|
+
private ensureNotDestroyed;
|
|
78
|
+
private setupListeners;
|
|
79
|
+
/**
|
|
80
|
+
* Clean up all in-flight state when the connection drops.
|
|
81
|
+
*/
|
|
82
|
+
private cleanupOnDisconnect;
|
|
83
|
+
registerTools(tools: Record<string, Tool>): void;
|
|
84
|
+
/**
|
|
85
|
+
* Transcribe audio data to text using the configured transcription model
|
|
86
|
+
*/
|
|
87
|
+
transcribeAudio(audioData: Buffer | Uint8Array): Promise<string>;
|
|
88
|
+
/**
|
|
89
|
+
* Generate speech from text using the configured speech model
|
|
90
|
+
* @param abortSignal Optional signal to cancel the speech generation
|
|
91
|
+
*/
|
|
92
|
+
generateSpeechFromText(text: string, abortSignal?: AbortSignal): Promise<Uint8Array>;
|
|
93
|
+
/**
|
|
94
|
+
* Interrupt ongoing speech generation and playback (barge-in support).
|
|
95
|
+
* This only interrupts TTS — the LLM stream is left running.
|
|
96
|
+
*/
|
|
97
|
+
interruptSpeech(reason?: string): void;
|
|
98
|
+
/**
|
|
99
|
+
* Interrupt both the current LLM stream and ongoing speech.
|
|
100
|
+
* Use this for barge-in scenarios where the entire response should be cancelled.
|
|
101
|
+
*/
|
|
102
|
+
interruptCurrentResponse(reason?: string): void;
|
|
103
|
+
/**
|
|
104
|
+
* Extract complete sentences from text buffer
|
|
105
|
+
* Returns [extractedSentences, remainingBuffer]
|
|
106
|
+
*/
|
|
107
|
+
private extractSentences;
|
|
108
|
+
/**
|
|
109
|
+
* Trim conversation history to stay within configured limits.
|
|
110
|
+
* Removes oldest messages (always in pairs to preserve user/assistant turns).
|
|
111
|
+
*/
|
|
112
|
+
private trimHistory;
|
|
113
|
+
/**
|
|
114
|
+
* Queue a text chunk for speech generation
|
|
115
|
+
*/
|
|
116
|
+
private queueSpeechChunk;
|
|
117
|
+
/**
|
|
118
|
+
* Generate audio for a single chunk
|
|
119
|
+
*/
|
|
120
|
+
private generateChunkAudio;
|
|
121
|
+
/**
|
|
122
|
+
* Process the speech queue and send audio chunks in order
|
|
123
|
+
*/
|
|
124
|
+
private processSpeechQueue;
|
|
125
|
+
/**
|
|
126
|
+
* Process text delta for streaming speech.
|
|
127
|
+
* Call this as text chunks arrive from LLM.
|
|
128
|
+
*/
|
|
129
|
+
private processTextForStreamingSpeech;
|
|
130
|
+
/**
|
|
131
|
+
* Flush any remaining text in the buffer to speech
|
|
132
|
+
* Call this when stream ends
|
|
133
|
+
*/
|
|
134
|
+
private flushStreamingSpeech;
|
|
135
|
+
/**
|
|
136
|
+
* Process incoming audio data: transcribe and generate response
|
|
137
|
+
*/
|
|
138
|
+
private processAudioInput;
|
|
139
|
+
connect(url?: string): Promise<void>;
|
|
140
|
+
/**
|
|
141
|
+
* Attach an existing WebSocket (server-side usage).
|
|
142
|
+
* Use this when a WS server accepts a connection and you want the
|
|
143
|
+
* agent to handle messages on that socket.
|
|
144
|
+
*/
|
|
145
|
+
handleSocket(socket: WebSocket): void;
|
|
146
|
+
/**
|
|
147
|
+
* Send text input for processing (bypasses transcription).
|
|
148
|
+
* Requests are queued and processed serially to prevent race conditions.
|
|
149
|
+
*/
|
|
150
|
+
sendText(text: string): Promise<string>;
|
|
151
|
+
/**
|
|
152
|
+
* Send audio data to be transcribed and processed
|
|
153
|
+
* @param audioData Base64 encoded audio data
|
|
154
|
+
*/
|
|
155
|
+
sendAudio(audioData: string): Promise<void>;
|
|
156
|
+
/**
|
|
157
|
+
* Send raw audio buffer to be transcribed and processed
|
|
158
|
+
*/
|
|
159
|
+
sendAudioBuffer(audioBuffer: Buffer | Uint8Array): Promise<void>;
|
|
160
|
+
/**
|
|
161
|
+
* Enqueue a text input for serial processing.
|
|
162
|
+
* This ensures only one processUserInput runs at a time, preventing
|
|
163
|
+
* race conditions on conversationHistory, fullText accumulation, etc.
|
|
164
|
+
*/
|
|
165
|
+
private enqueueInput;
|
|
166
|
+
/**
|
|
167
|
+
* Drain the input queue, processing one request at a time.
|
|
168
|
+
*/
|
|
169
|
+
private drainInputQueue;
|
|
170
|
+
/**
|
|
171
|
+
* Process user input with streaming text generation.
|
|
172
|
+
* Handles the full pipeline: text -> LLM (streaming) -> TTS -> WebSocket.
|
|
173
|
+
*
|
|
174
|
+
* This method is designed to be called serially via drainInputQueue().
|
|
175
|
+
*/
|
|
176
|
+
private processUserInput;
|
|
177
|
+
/**
|
|
178
|
+
* Generate speech for full text at once (non-streaming fallback)
|
|
179
|
+
* Useful when you want to bypass streaming speech for short responses
|
|
180
|
+
*/
|
|
181
|
+
generateAndSendSpeechFull(text: string): Promise<void>;
|
|
182
|
+
/**
|
|
183
|
+
* Send a message via WebSocket if connected.
|
|
184
|
+
* Gracefully handles send failures (e.g., socket closing mid-send).
|
|
185
|
+
*/
|
|
186
|
+
private sendWebSocketMessage;
|
|
187
|
+
/**
|
|
188
|
+
* Start listening for voice input
|
|
189
|
+
*/
|
|
190
|
+
startListening(): void;
|
|
191
|
+
/**
|
|
192
|
+
* Stop listening for voice input
|
|
193
|
+
*/
|
|
194
|
+
stopListening(): void;
|
|
195
|
+
/**
|
|
196
|
+
* Clear conversation history
|
|
197
|
+
*/
|
|
198
|
+
clearHistory(): void;
|
|
199
|
+
/**
|
|
200
|
+
* Get current conversation history
|
|
201
|
+
*/
|
|
202
|
+
getHistory(): ModelMessage[];
|
|
203
|
+
/**
|
|
204
|
+
* Set conversation history (useful for restoring sessions)
|
|
205
|
+
*/
|
|
206
|
+
setHistory(history: ModelMessage[]): void;
|
|
207
|
+
/**
|
|
208
|
+
* Internal helper to close and clean up the current socket.
|
|
209
|
+
*/
|
|
210
|
+
private disconnectSocket;
|
|
211
|
+
/**
|
|
212
|
+
* Disconnect from WebSocket and stop all in-flight work.
|
|
213
|
+
*/
|
|
214
|
+
disconnect(): void;
|
|
215
|
+
/**
|
|
216
|
+
* Permanently destroy the agent, releasing all resources.
|
|
217
|
+
* After calling this, the agent cannot be reused.
|
|
218
|
+
*/
|
|
219
|
+
destroy(): void;
|
|
220
|
+
/**
|
|
221
|
+
* Check if agent is connected to WebSocket
|
|
222
|
+
*/
|
|
223
|
+
get connected(): boolean;
|
|
224
|
+
/**
|
|
225
|
+
* Check if agent is currently processing a request
|
|
226
|
+
*/
|
|
227
|
+
get processing(): boolean;
|
|
228
|
+
/**
|
|
229
|
+
* Check if agent is currently speaking (generating/playing audio)
|
|
230
|
+
*/
|
|
231
|
+
get speaking(): boolean;
|
|
232
|
+
/**
|
|
233
|
+
* Get the number of pending speech chunks in the queue
|
|
234
|
+
*/
|
|
235
|
+
get pendingSpeechChunks(): number;
|
|
236
|
+
/**
|
|
237
|
+
* Check if agent has been permanently destroyed
|
|
238
|
+
*/
|
|
239
|
+
get destroyed(): boolean;
|
|
240
|
+
}
|
|
241
|
+
export {};
|
|
242
|
+
//# sourceMappingURL=VoiceAgent.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"VoiceAgent.d.ts","sourceRoot":"","sources":["../src/VoiceAgent.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,IAAI,CAAC;AAC/B,OAAO,EAAE,YAAY,EAAE,MAAM,QAAQ,CAAC;AACtC,OAAO,EACL,UAAU,EACV,aAAa,EAEb,KAAK,IAAI,EACT,KAAK,YAAY,EAGjB,KAAK,kBAAkB,EACvB,KAAK,WAAW,EACjB,MAAM,IAAI,CAAC;AAWZ;;GAEG;AACH,UAAU,qBAAqB;IAC7B,8DAA8D;IAC9D,YAAY,EAAE,MAAM,CAAC;IACrB,iFAAiF;IACjF,YAAY,EAAE,MAAM,CAAC;IACrB,gDAAgD;IAChD,kBAAkB,EAAE,OAAO,CAAC;IAC5B,8CAA8C;IAC9C,mBAAmB,EAAE,MAAM,CAAC;CAC7B;AAED;;GAEG;AACH,UAAU,aAAa;IACrB,yHAAyH;IACzH,WAAW,EAAE,MAAM,CAAC;IACpB,6HAA6H;IAC7H,aAAa,EAAE,MAAM,CAAC;CACvB;AAKD,MAAM,WAAW,iBAAiB;IAChC,KAAK,EAAE,aAAa,CAAC;IACrB,kBAAkB,CAAC,EAAE,kBAAkB,CAAC;IACxC,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,EAAE,WAAW,CAAC,UAAU,CAAC,OAAO,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC;IACrE,KAAK,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;IAC7B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,oDAAoD;IACpD,eAAe,CAAC,EAAE,OAAO,CAAC,qBAAqB,CAAC,CAAC;IACjD,2DAA2D;IAC3D,OAAO,CAAC,EAAE,OAAO,CAAC,aAAa,CAAC,CAAC;IACjC,yDAAyD;IACzD,iBAAiB,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED,qBAAa,UAAW,SAAQ,YAAY;IAC1C,OAAO,CAAC,MAAM,CAAC,CAAY;IAC3B,OAAO,CAAC,KAAK,CAA4B;IACzC,OAAO,CAAC,KAAK,CAAgB;IAC7B,OAAO,CAAC,kBAAkB,CAAC,CAAqB;IAChD,OAAO,CAAC,WAAW,CAAC,CAAc;IAClC,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,QAAQ,CAA4D;IAC5E,OAAO,CAAC,QAAQ,CAAC,CAAS;IAC1B,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,mBAAmB,CAAsB;IACjD,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,kBAAkB,CAAC,CAAS;IACpC,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,WAAW,CAAS;IAG5B,OAAO,CAAC,UAAU,CAA2F;IAC7G,OAAO,CAAC,eAAe,CAAS;IAGhC,OAAO,CAAC,4BAA4B,CAAC,CAAkB;IAGvD,OAAO,CAAC,aAAa,CAAgB;IACrC,OAAO,CAAC,iBAAiB,CAAS;IAGlC,OAAO,CAAC,qBAAqB,CAAwB;IACrD,OAAO,CAAC,4BAA4B,CAAC,CAAkB;IACvD,OAAO,CAAC,gBAAgB,CAAqB;IAC7C,OAAO,CAAC,WAAW,CAAK;IACxB,OAAO,CAAC,UAAU,CAAS;IAC3B,OAAO,CAAC,iBAAiB,CAAM;IAG/B,OAAO,CAAC,sBAAsB,CAAC
,CAAgB;IAC/C,OAAO,CAAC,sBAAsB,CAAC,CAAa;gBAEhC,OAAO,EAAE,iBAAiB;IAkCtC;;OAEG;IACH,OAAO,CAAC,kBAAkB;IAM1B,OAAO,CAAC,cAAc;IAuDtB;;OAEG;IACH,OAAO,CAAC,mBAAmB;IA8BpB,aAAa,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC;IAIhD;;OAEG;IACU,eAAe,CAAC,SAAS,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC;IAuC7E;;;OAGG;IACU,sBAAsB,CACjC,IAAI,EAAE,MAAM,EACZ,WAAW,CAAC,EAAE,WAAW,GACxB,OAAO,CAAC,UAAU,CAAC;IAiBtB;;;OAGG;IACI,eAAe,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAgC5D;;;OAGG;IACI,wBAAwB,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAUrE;;;OAGG;IACH,OAAO,CAAC,gBAAgB;IA8CxB;;;OAGG;IACH,OAAO,CAAC,WAAW;IAmCnB;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAsCxB;;OAEG;YACW,kBAAkB;IAwBhC;;OAEG;YACW,kBAAkB;IA+FhC;;;OAGG;IACH,OAAO,CAAC,6BAA6B;IAarC;;;OAGG;IACH,OAAO,CAAC,oBAAoB;IAO5B;;OAEG;YACW,iBAAiB;IAiDlB,OAAO,CAAC,GAAG,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IA8BjD;;;;OAIG;IACI,YAAY,CAAC,MAAM,EAAE,SAAS,GAAG,IAAI;IAc5C;;;OAGG;IACU,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;IAQpD;;;OAGG;IACU,SAAS,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAKxD;;OAEG;IACU,eAAe,CAAC,WAAW,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC;IAM7E;;;;OAIG;IACH,OAAO,CAAC,YAAY;IAOpB;;OAEG;YACW,eAAe;IAmB7B;;;;;OAKG;YACW,gBAAgB;IAuT9B;;;OAGG;IACU,yBAAyB,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IA8BnE;;;OAGG;IACH,OAAO,CAAC,oBAAoB;IA2B5B;;OAEG;IACH,cAAc;IAKd;;OAEG;IACH,aAAa;IAKb;;OAEG;IACH,YAAY;IAKZ;;OAEG;IACH,UAAU,IAAI,YAAY,EAAE;IAI5B;;OAEG;IACH,UAAU,CAAC,OAAO,EAAE,YAAY,EAAE;IAIlC;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAmBxB;;OAEG;IACH,UAAU;IAIV;;;OAGG;IACH,OAAO;IAQP;;OAEG;IACH,IAAI,SAAS,IAAI,OAAO,CAEvB;IAED;;OAEG;IACH,IAAI,UAAU,IAAI,OAAO,CAExB;IAED;;OAEG;IACH,IAAI,QAAQ,IAAI,OAAO,CAEtB;IAED;;OAEG;IACH,IAAI,mBAAmB,IAAI,MAAM,CAEhC;IAED;;OAEG;IACH,IAAI,SAAS,IAAI,OAAO,CAEvB;CACF"}
|