@craftedxp/voice-js 0.2.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CONSUMING.md CHANGED
@@ -42,8 +42,8 @@ For the landing dashboard in this repo:
42
42
  ```jsonc
43
43
  {
44
44
  "dependencies": {
45
- "@craftedxp/voice-js": "file:../sdk/voice-js"
46
- }
45
+ "@craftedxp/voice-js": "file:../sdk/voice-js",
46
+ },
47
47
  }
48
48
  ```
49
49
 
@@ -101,6 +101,7 @@ Browsers require a user gesture to start `AudioContext`. The SDK calls `audioCon
101
101
  ## CSP / mic permission
102
102
 
103
103
  For consumers running on a strict CSP, allow:
104
+
104
105
  - `connect-src wss://your-voxline-server.com`
105
106
  - `worker-src 'self' blob:` (the audio worklet is registered from a Blob URL)
106
107
 
@@ -113,6 +114,7 @@ The SDK doesn't log to the console by default. To see protocol-level events, wir
113
114
  ## Updating
114
115
 
115
116
  When the SDK changes:
117
+
116
118
  - **Tarball path:** re-`npm pack` then `npm install <newTgz>` in the consumer.
117
119
  - **`file:` path:** `npm run build` in `sdk/voice-js/` (refreshes `dist/`); the consumer picks it up on the next bundler refresh.
118
120
  - **Registry path:** bump the version in your `package.json` and `npm install`.
package/DEVELOPING.md CHANGED
@@ -26,6 +26,7 @@ Either way, the consumer's bundler (Webpack / Vite / esbuild / Next) picks up th
26
26
  - `dist/embed.iife.js` — minified IIFE for `<script>` embed; bundles the browser entry inline.
27
27
 
28
28
  Source files map to:
29
+
29
30
  - `src/browser.ts` — entry, factory implementation, public re-exports.
30
31
  - `src/node.ts` — entry, dynamic `ws` loader, factory implementation.
31
32
  - `src/VoiceClient.ts` — browser `BrowserVoiceClient` implementing the `Call` interface.
package/README.md CHANGED
@@ -4,7 +4,7 @@ JS SDK for embedding a voice agent call in any JS environment — browser tabs,
4
4
 
5
5
  Companion to [`@craftedxp/voice-rn`](https://www.npmjs.com/package/@craftedxp/voice-rn) (React Native) and [`@craftedxp/sdk-node`](https://www.npmjs.com/package/@craftedxp/sdk-node) (server-side `sk_` SDK).
6
6
 
7
- > **Internal testing release.** API surface may evolve before a stable release. **0.2.0** is a breaking rename + redesign of the previous `@voxline/web@0.1.0` — the singleton-`VoiceClient`-with-`apiKey` pattern is gone in favour of a `configureVoiceClient({ fetchToken })` factory that mirrors `voice-rn` 0.3.x. See [Migrating from `@voxline/web`](#migrating-from-voxlineweb) below.
7
+ > **Internal testing release.** API surface may evolve before a stable release. **0.3.1** adds Node-consumer ergonomics: `onInterrupt`/`onAgentTurnStart` callbacks on `startCall`, and the `NodeVoiceClientFactory` return type from the Node entry. **0.3.0** added [client tools](#client-tools) — handlers the agent's LLM can call on the consumer's machine. **0.2.0** was a breaking rename + redesign of the previous `@voxline/web@0.1.0` — the singleton-`VoiceClient`-with-`apiKey` pattern is gone in favour of a `configureVoiceClient({ fetchToken })` factory that mirrors `voice-rn` 0.3.x. See [Migrating from `@voxline/web`](#migrating-from-voxlineweb) below.
8
8
 
9
9
  ## Install
10
10
 
@@ -67,9 +67,9 @@ const call = await voice.startCall({
67
67
  onEnd: ({ reason, durationMs }) => log('ended', reason, durationMs),
68
68
  })
69
69
 
70
- call.mute() // gate mic frames (server still sees wire cadence)
70
+ call.mute() // gate mic frames (server still sees wire cadence)
71
71
  call.unmute()
72
- call.end() // close WS + stop mic + fire onEnd
72
+ call.end() // close WS + stop mic + fire onEnd
73
73
  ```
74
74
 
75
75
  ## Quick start (Node / Electron-main)
@@ -84,13 +84,42 @@ const voice = configureVoiceClient({
84
84
  })
85
85
 
86
86
  // Bring your own audio. Example: sox subprocesses for mic + speakers.
87
- const mic = spawn('sox', ['-d', '-r', '16000', '-c', '1', '-b', '16', '-e', 'signed', '-t', 'raw', '-'])
88
- const spk = spawn('sox', ['-t', 'raw', '-r', '16000', '-c', '1', '-b', '16', '-e', 'signed', '-', '-d'])
87
+ const mic = spawn('sox', [
88
+ '-d',
89
+ '-r',
90
+ '16000',
91
+ '-c',
92
+ '1',
93
+ '-b',
94
+ '16',
95
+ '-e',
96
+ 'signed',
97
+ '-t',
98
+ 'raw',
99
+ '-',
100
+ ])
101
+ const spk = spawn('sox', [
102
+ '-t',
103
+ 'raw',
104
+ '-r',
105
+ '16000',
106
+ '-c',
107
+ '1',
108
+ '-b',
109
+ '16',
110
+ '-e',
111
+ 'signed',
112
+ '-',
113
+ '-d',
114
+ ])
89
115
 
90
116
  const call = await voice.startCall({
91
117
  agentId: 'agt_xxx',
92
118
  onAudioChunk: (pcm) => spk.stdin.write(Buffer.from(pcm)),
93
- onEnd: () => { mic.kill(); spk.stdin.end() },
119
+ onEnd: () => {
120
+ mic.kill()
121
+ spk.stdin.end()
122
+ },
94
123
  })
95
124
 
96
125
  mic.stdout.on('data', (chunk) => call.sendAudioChunk(chunk))
@@ -102,30 +131,33 @@ The Node bundle has the same `configureVoiceClient` / `startCall` shape, plus an
102
131
 
103
132
  ### `configureVoiceClient(config)`
104
133
 
105
- | Field | Type | Notes |
106
- |---|---|---|
107
- | `apiBase` | `string` | Full HTTPS URL of the Voxline server. WS scheme derived: `https`→`wss`. Trailing slash optional. |
108
- | `fetchToken` | `(args) => Promise<string>` | Called by the SDK whenever it needs a fresh `ct_`. Mirrors `@craftedxp/voice-rn`'s shape exactly — `{ agentId, userId?, context?, metadata? }`. |
109
- | `defaultMetadata` | `Record<string, string>?` | Applied to every `startCall`. Per-call merges on top. |
110
- | `defaultContext` | `Record<string, unknown>?` | Applied to every `startCall`. Per-call merges on top. |
134
+ | Field | Type | Notes |
135
+ | ----------------- | --------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
136
+ | `apiBase` | `string` | Full HTTPS URL of the Voxline server. WS scheme derived: `https`→`wss`. Trailing slash optional. |
137
+ | `fetchToken` | `(args) => Promise<string>` | Called by the SDK whenever it needs a fresh `ct_`. Mirrors `@craftedxp/voice-rn`'s shape exactly — `{ agentId, userId?, context?, metadata? }`. |
138
+ | `defaultMetadata` | `Record<string, string>?` | Applied to every `startCall`. Per-call merges on top. |
139
+ | `defaultContext` | `Record<string, unknown>?` | Applied to every `startCall`. Per-call merges on top. |
111
140
 
112
141
  Returns a `VoiceClientFactory` with one method:
113
142
 
114
143
  ### `factory.startCall(options)`
115
144
 
116
- | Field | Type | Notes |
117
- |---|---|---|
118
- | `agentId` | `string` | Required. |
119
- | `userId` | `string?` | Round-tripped to fetchToken as `userId`; server uses it for contact memory. |
120
- | `context` | `Record<string, unknown>?` | Per-call structured context. Merged on top of `defaultContext`. Lowered into the agent's system prompt server-side. |
121
- | `metadata` | `Record<string, string>?` | Per-call key/value. Merged on top of `defaultMetadata`. Round-tripped on `call.ended` webhook. NOT lowered into the prompt. |
122
- | `bargeIn` | `boolean?` | Default `true`. Set `false` for alarm-style flows where the user shouldn't accidentally interrupt the script. |
123
- | `token` | `string?` | **Test-only escape hatch** pre-minted `ct_`, bypasses `fetchToken`. Don't use in production. |
124
- | `onStateChange` | `(state) => void` | Fires on every state machine transition. |
125
- | `onTranscript` | `(entries) => void` | Fires on every transcript update. |
126
- | `onVolume` | `({ input, output }) => void` | 0-1 RMS. ~10 Hz cadence. Browser bundle only. |
127
- | `onError` | `(err) => void` | Stable `code` from `CallErrorCode`; matches `voice-rn` codes where overlap. |
128
- | `onEnd` | `({ reason, errorCode?, durationMs }) => void` | Fires once when the call ends. |
145
+ | Field | Type | Notes |
146
+ | ------------------ | ---------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
147
+ | `agentId` | `string` | Required. |
148
+ | `userId` | `string?` | Round-tripped to fetchToken as `userId`; server uses it for contact memory. |
149
+ | `context` | `Record<string, unknown>?` | Per-call structured context. Merged on top of `defaultContext`. Lowered into the agent's system prompt server-side. |
150
+ | `metadata` | `Record<string, string>?` | Per-call key/value. Merged on top of `defaultMetadata`. Round-tripped on `call.ended` webhook. NOT lowered into the prompt. |
151
+ | `bargeIn` | `boolean?` | Default `true`. Set `false` for alarm-style flows where the user shouldn't accidentally interrupt the script. |
152
+ | `clientTools`      | `ClientToolMap?`                               | Per-call client tools the agent's LLM can invoke. See [Client tools](#client-tools) section below. Validated synchronously at `startCall` — bad input throws. |
153
+ | `token` | `string?` | **Test-only escape hatch** — pre-minted `ct_`, bypasses `fetchToken`. Don't use in production. |
154
+ | `onStateChange` | `(state) => void` | Fires on every state machine transition. |
155
+ | `onTranscript` | `(entries) => void` | Fires on every transcript update. |
156
+ | `onInterrupt` | `() => void` | Server signaled barge-in. Browser bundle auto-flushes built-in playback before this fires. Node consumers should drain their custom playback queue here. |
157
+ | `onAgentTurnStart` | `() => void` | New agent turn began. Use when you want a precise turn-start anchor without diffing `onStateChange`. |
158
+ | `onVolume` | `({ input, output }) => void` | 0-1 RMS. ~10 Hz cadence. Browser bundle only. |
159
+ | `onError`          | `(err) => void`                                | Stable `code` from `CallErrorCode`; matches `voice-rn` codes where they overlap.                                                                              |
160
+ | `onEnd` | `({ reason, errorCode?, durationMs }) => void` | Fires once when the call ends. |
129
161
 
130
162
  Resolves to a `Call` handle:
131
163
 
@@ -152,21 +184,75 @@ interface NodeCall extends Call {
152
184
 
153
185
  ```ts
154
186
  type CallState =
155
- | 'idle' | 'connecting' | 'listening'
156
- | 'user_speaking' | 'agent_speaking'
157
- | 'ended' | 'error'
187
+ | 'idle'
188
+ | 'connecting'
189
+ | 'listening'
190
+ | 'user_speaking'
191
+ | 'agent_speaking'
192
+ | 'ended'
193
+ | 'error'
158
194
 
159
195
  type CallErrorCode =
160
- | 'missing_credentials' | 'forbidden'
161
- | 'mic_denied' | 'mic_start_failed' | 'audio_session_failed'
162
- | 'token_expired' | 'token_invalid' | 'unauthorized'
163
- | 'network_unreachable' | 'socket_error'
164
- | 'payment_required' | 'not_found'
165
- | 'silence_timeout' | 'server_error'
196
+ | 'missing_credentials'
197
+ | 'forbidden'
198
+ | 'mic_denied'
199
+ | 'mic_start_failed'
200
+ | 'audio_session_failed'
201
+ | 'token_expired'
202
+ | 'token_invalid'
203
+ | 'unauthorized'
204
+ | 'network_unreachable'
205
+ | 'socket_error'
206
+ | 'payment_required'
207
+ | 'not_found'
208
+ | 'silence_timeout'
209
+ | 'server_error'
166
210
 
167
211
  type CallEndReason = 'agent_ended' | 'user_hangup' | 'timeout' | 'error'
168
212
  ```
169
213
 
214
+ ## Client tools
215
+
216
+ You can declare tools the agent's LLM can call **on the consumer's machine**. The
217
+ tool's handler runs in your app — server side has no access to it. Useful for
218
+ surface-only actions (read DOM state, hit a private API, mutate local storage,
219
+ control the UI).
220
+
221
+ ```ts
222
+ import { configureVoiceClient, type ClientToolMap } from '@craftedxp/voice-js'
223
+
224
+ const tools: ClientToolMap = {
225
+ addTodoItem: {
226
+ description: "Add an item to the user's todo list.",
227
+ parameters: {
228
+ type: 'object',
229
+ properties: { text: { type: 'string' } },
230
+ required: ['text'],
231
+ },
232
+ usage: 'Call when the user asks to add or capture a task.',
233
+ handler: async ({ text }) => {
234
+ await myAppApi.addTodo(String(text))
235
+ return `Added "${text}".`
236
+ },
237
+ },
238
+ }
239
+
240
+ const voice = configureVoiceClient({ apiBase: '...', fetchToken: async () => '...' })
241
+ const call = await voice.startCall({ agentId: 'agt_xxx', clientTools: tools })
242
+ ```
243
+
244
+ The SDK validates `clientTools` at `startCall` (sync, throws on malformed input),
245
+ then sends `client_tools_register` to the server right after `connected`. When
246
+ the agent's LLM invokes a registered tool, your handler runs and the SDK posts
247
+ the result back through the same WebSocket.
248
+
249
+ Handler return values are stringified (object → `JSON.stringify`) before being
250
+ sent back; throws become `{ error: ... }` frames. The server enforces a default
251
+ 10s / max 30s timeout per `timeoutMs` in your declaration.
252
+
253
+ For the full wire protocol, sequencing, and constraints see
254
+ [`docs/integration-echocheck.md`](../../docs/integration-echocheck.md#client-declared-tools).
255
+
170
256
  ## Migrating from `@voxline/web`
171
257
 
172
258
  ```diff
@@ -221,6 +307,25 @@ Three semantic shifts to be aware of:
221
307
 
222
308
  The embed widget (`<script src="embed.js" data-token="ct_...">`) keeps the same HTML API, but the `data-api-key` attribute is no longer accepted — mint server-side and inject `data-token` instead.
223
309
 
310
+ ## Troubleshooting
311
+
312
+ **Agent's last syllable cuts off and plays into the next agent message.** Almost
313
+ always a misfiring barge-in (acoustic echo from a laptop speaker → mic, or a
314
+ false-positive VAD on background noise). Three quick fixes, in order:
315
+
316
+ 1. **Test with headphones.** Eliminates acoustic echo. If the symptom disappears,
317
+ it was echo. Tell production users to wear headphones, or fall back to (3).
318
+ 2. **Check for a phantom user turn between two agent turns** in the transcript
319
+ that contains words the agent just said. That confirms STT is hearing the
320
+ agent's voice through the mic.
321
+ 3. **Pass `bargeIn: false` on `startCall`** for non-conversational flows. Adds
322
+ `?barge=off` to the WS URL and the SDK ignores `interrupt` events
323
+ client-side. Tradeoff: user can't interrupt mid-sentence.
324
+
325
+ For the full diagnostic walkthrough (including the rarer Gemini-Live
326
+ stale-audio-leak case and audio-handling guidance for Node/Electron consumers),
327
+ see [`docs/integration-echocheck.md` → Audio quality](../../docs/integration-echocheck.md#audio-quality--the-cut-off-syllable-trap).
328
+
224
329
  ## Embed widget
225
330
 
226
331
  For drop-in `<script>` consumers (landing pages, no-build embeds):
@@ -239,7 +344,9 @@ Renders a floating call button with a Shadow-DOM transcript panel. Pre-mint the
239
344
 
240
345
  ## Status
241
346
 
242
- - **0.2.0** (current) — first `@craftedxp/voice-js` release. Browser + Node dual bundle, `fetchToken` factory, voice-rn 0.3.x parity. Migration path from `@voxline/web@0.1.0` documented above.
347
+ - **0.3.1** (current) — adds `onInterrupt` / `onAgentTurnStart` callbacks on `StartCallOptions` and a proper `NodeVoiceClientFactory` return type for the Node entry. Backwards-compatible.
348
+ - 0.3.0 — adds client-tools support. New `clientTools` option on `startCall` accepts a `ClientToolMap` (description, parameters, handler, optional usage/timeoutMs/example). Browser and Node bundles both supported. Backwards-compatible — existing consumers see no change.
349
+ - 0.2.0 — first `@craftedxp/voice-js` release. Browser + Node dual bundle, `fetchToken` factory, voice-rn 0.3.x parity. Migration path from `@voxline/web@0.1.0` documented above.
243
350
  - 0.1.0 — `@voxline/web`. Singleton `VoiceClient` class, `apiKey` accepted. Retired in 0.2.0; never published to npm so no deprecation window.
244
351
 
245
352
  See [`CONSUMING.md`](CONSUMING.md) for the full setup walkthrough and [`DEVELOPING.md`](DEVELOPING.md) for SDK-author iteration.
@@ -1,3 +1,18 @@
1
+ interface ClientTool {
2
+ description: string;
3
+ parameters: Record<string, unknown>;
4
+ usage?: string;
5
+ timeoutMs?: number;
6
+ example?: string;
7
+ handler: (args: Record<string, unknown>) => Promise<string | object> | string | object;
8
+ }
9
+ type ClientToolMap = Record<string, ClientTool>;
10
+ interface ClientToolCallFrame {
11
+ toolCallId: string;
12
+ name: string;
13
+ args: Record<string, unknown>;
14
+ }
15
+
1
16
  type CallState = 'idle' | 'connecting' | 'listening' | 'user_speaking' | 'agent_speaking' | 'ended' | 'error';
2
17
  type TranscriptEntry = {
3
18
  id: string;
@@ -51,6 +66,8 @@ interface ProtocolCallbacks {
51
66
  onInterrupt: () => void;
52
67
  onAgentTurnStart: () => void;
53
68
  onCallEnd: (reason: CallEndReason) => void;
69
+ onConnected: () => void;
70
+ onClientToolCall: (frame: ClientToolCallFrame) => void;
54
71
  }
55
72
  declare function handleServerMessage(raw: string, state: ProtocolState, cb: ProtocolCallbacks): void;
56
73
  interface BuildWsUrlArgs {
@@ -131,6 +148,16 @@ interface StartCallOptions {
131
148
  * accidentally interrupt the script. Default true.
132
149
  */
133
150
  bargeIn?: boolean;
151
+ /**
152
+ * Client-side tools the agent's LLM can call mid-conversation. Each
153
+ * tool's handler runs on the consumer's side; result is fed back to
154
+ * the LLM through the existing call WebSocket. Schema and handler
155
+ * colocate. Validated synchronously at startCall — bad input throws.
156
+ *
157
+ * See docs/integration-echocheck.md for the wire protocol and the
158
+ * server-side guarantees.
159
+ */
160
+ clientTools?: ClientToolMap;
134
161
  /**
135
162
  * Test-only escape hatch — pass a pre-minted `ct_` directly and skip
136
163
  * the `fetchToken` call. Don't use this in production code: tokens
@@ -143,6 +170,23 @@ interface StartCallOptions {
143
170
  onEnd?: (end: CallEndEvent) => void;
144
171
  /** Volume-meter event for VU UIs. ~10 Hz cadence (browser bundle only). */
145
172
  onVolume?: (vol: VolumeEvent) => void;
173
+ /**
174
+ * Fires when the server signals barge-in (the user started talking
175
+ * mid-agent-turn). The browser bundle automatically flushes its
176
+ * built-in audio playback before this callback runs; the callback is
177
+ * fired regardless. Node / Electron consumers with custom playback
178
+ * should drain their audio queue here so the agent goes silent
179
+ * immediately.
180
+ */
181
+ onInterrupt?: () => void;
182
+ /**
183
+ * Fires on `agent_turn_start` — the server has begun a new agent
184
+ * turn. The state-machine transition to `agent_speaking` happens at
185
+ * the same moment via `onStateChange`; use this when you want a
186
+ * precise turn anchor (e.g. "agent has been speaking for N ms" UIs)
187
+ * without diffing state.
188
+ */
189
+ onAgentTurnStart?: () => void;
146
190
  }
147
191
  interface Call {
148
192
  /** Current state. Snapshot — subscribe via onStateChange for live updates. */
@@ -274,4 +318,4 @@ type ReconnectingWebSocket = ReturnType<typeof createReconnectingWebSocket>;
274
318
  */
275
319
  declare function configureVoiceClient(config: VoiceClientConfig): VoiceClientFactory;
276
320
 
277
- export { type Call, type CallEndEvent, type CallEndReason, type CallError, type CallErrorCode, type CallState, type CaptureController, type CaptureOptions, type FetchToken, type FetchTokenArgs, type OnAgentSpeakingChange, type OnChunk, type OnError, type OnVolume$1 as OnVolume, type PlaybackController, type PlaybackOptions, type ProtocolCallbacks, type ProtocolState, type RWSEvent, type RWSOptions, type ReconnectingWebSocket, type ServerMessage, type StartCallOptions, type TranscriptEntry, type VoiceClientConfig, type VoiceClientFactory, type VolumeEvent, type WebSocketFactory, type WebSocketLike, buildWsUrl, configureVoiceClient, createAudioCapture, createAudioPlayback, createProtocolState, createReconnectingWebSocket, handleServerMessage };
321
+ export { type Call, type CallEndEvent, type CallEndReason, type CallError, type CallErrorCode, type CallState, type CaptureController, type CaptureOptions, type ClientTool, type ClientToolMap, type FetchToken, type FetchTokenArgs, type OnAgentSpeakingChange, type OnChunk, type OnError, type OnVolume$1 as OnVolume, type PlaybackController, type PlaybackOptions, type ProtocolCallbacks, type ProtocolState, type RWSEvent, type RWSOptions, type ReconnectingWebSocket, type ServerMessage, type StartCallOptions, type TranscriptEntry, type VoiceClientConfig, type VoiceClientFactory, type VolumeEvent, type WebSocketFactory, type WebSocketLike, buildWsUrl, configureVoiceClient, createAudioCapture, createAudioPlayback, createProtocolState, createReconnectingWebSocket, handleServerMessage };
package/dist/browser.d.ts CHANGED
@@ -1,3 +1,18 @@
1
+ interface ClientTool {
2
+ description: string;
3
+ parameters: Record<string, unknown>;
4
+ usage?: string;
5
+ timeoutMs?: number;
6
+ example?: string;
7
+ handler: (args: Record<string, unknown>) => Promise<string | object> | string | object;
8
+ }
9
+ type ClientToolMap = Record<string, ClientTool>;
10
+ interface ClientToolCallFrame {
11
+ toolCallId: string;
12
+ name: string;
13
+ args: Record<string, unknown>;
14
+ }
15
+
1
16
  type CallState = 'idle' | 'connecting' | 'listening' | 'user_speaking' | 'agent_speaking' | 'ended' | 'error';
2
17
  type TranscriptEntry = {
3
18
  id: string;
@@ -51,6 +66,8 @@ interface ProtocolCallbacks {
51
66
  onInterrupt: () => void;
52
67
  onAgentTurnStart: () => void;
53
68
  onCallEnd: (reason: CallEndReason) => void;
69
+ onConnected: () => void;
70
+ onClientToolCall: (frame: ClientToolCallFrame) => void;
54
71
  }
55
72
  declare function handleServerMessage(raw: string, state: ProtocolState, cb: ProtocolCallbacks): void;
56
73
  interface BuildWsUrlArgs {
@@ -131,6 +148,16 @@ interface StartCallOptions {
131
148
  * accidentally interrupt the script. Default true.
132
149
  */
133
150
  bargeIn?: boolean;
151
+ /**
152
+ * Client-side tools the agent's LLM can call mid-conversation. Each
153
+ * tool's handler runs on the consumer's side; result is fed back to
154
+ * the LLM through the existing call WebSocket. Schema and handler
155
+ * colocate. Validated synchronously at startCall — bad input throws.
156
+ *
157
+ * See docs/integration-echocheck.md for the wire protocol and the
158
+ * server-side guarantees.
159
+ */
160
+ clientTools?: ClientToolMap;
134
161
  /**
135
162
  * Test-only escape hatch — pass a pre-minted `ct_` directly and skip
136
163
  * the `fetchToken` call. Don't use this in production code: tokens
@@ -143,6 +170,23 @@ interface StartCallOptions {
143
170
  onEnd?: (end: CallEndEvent) => void;
144
171
  /** Volume-meter event for VU UIs. ~10 Hz cadence (browser bundle only). */
145
172
  onVolume?: (vol: VolumeEvent) => void;
173
+ /**
174
+ * Fires when the server signals barge-in (the user started talking
175
+ * mid-agent-turn). The browser bundle automatically flushes its
176
+ * built-in audio playback before this callback runs; the callback is
177
+ * fired regardless. Node / Electron consumers with custom playback
178
+ * should drain their audio queue here so the agent goes silent
179
+ * immediately.
180
+ */
181
+ onInterrupt?: () => void;
182
+ /**
183
+ * Fires on `agent_turn_start` — the server has begun a new agent
184
+ * turn. The state-machine transition to `agent_speaking` happens at
185
+ * the same moment via `onStateChange`; use this when you want a
186
+ * precise turn anchor (e.g. "agent has been speaking for N ms" UIs)
187
+ * without diffing state.
188
+ */
189
+ onAgentTurnStart?: () => void;
146
190
  }
147
191
  interface Call {
148
192
  /** Current state. Snapshot — subscribe via onStateChange for live updates. */
@@ -274,4 +318,4 @@ type ReconnectingWebSocket = ReturnType<typeof createReconnectingWebSocket>;
274
318
  */
275
319
  declare function configureVoiceClient(config: VoiceClientConfig): VoiceClientFactory;
276
320
 
277
- export { type Call, type CallEndEvent, type CallEndReason, type CallError, type CallErrorCode, type CallState, type CaptureController, type CaptureOptions, type FetchToken, type FetchTokenArgs, type OnAgentSpeakingChange, type OnChunk, type OnError, type OnVolume$1 as OnVolume, type PlaybackController, type PlaybackOptions, type ProtocolCallbacks, type ProtocolState, type RWSEvent, type RWSOptions, type ReconnectingWebSocket, type ServerMessage, type StartCallOptions, type TranscriptEntry, type VoiceClientConfig, type VoiceClientFactory, type VolumeEvent, type WebSocketFactory, type WebSocketLike, buildWsUrl, configureVoiceClient, createAudioCapture, createAudioPlayback, createProtocolState, createReconnectingWebSocket, handleServerMessage };
321
+ export { type Call, type CallEndEvent, type CallEndReason, type CallError, type CallErrorCode, type CallState, type CaptureController, type CaptureOptions, type ClientTool, type ClientToolMap, type FetchToken, type FetchTokenArgs, type OnAgentSpeakingChange, type OnChunk, type OnError, type OnVolume$1 as OnVolume, type PlaybackController, type PlaybackOptions, type ProtocolCallbacks, type ProtocolState, type RWSEvent, type RWSOptions, type ReconnectingWebSocket, type ServerMessage, type StartCallOptions, type TranscriptEntry, type VoiceClientConfig, type VoiceClientFactory, type VolumeEvent, type WebSocketFactory, type WebSocketLike, buildWsUrl, configureVoiceClient, createAudioCapture, createAudioPlayback, createProtocolState, createReconnectingWebSocket, handleServerMessage };
package/dist/browser.js CHANGED
@@ -369,6 +369,7 @@ function handleServerMessage(raw, state, cb) {
369
369
  }
370
370
  switch (msg.type) {
371
371
  case "connected":
372
+ cb.onConnected();
372
373
  setState(state, "listening", cb);
373
374
  return;
374
375
  case "transcript": {
@@ -444,6 +445,14 @@ function handleServerMessage(raw, state, cb) {
444
445
  ];
445
446
  cb.onTranscript(state.transcript);
446
447
  return;
448
+ case "client_tool_call": {
449
+ const toolCallId = String(msg.toolCallId ?? "");
450
+ const name = String(msg.name ?? "");
451
+ const args = msg.args ?? {};
452
+ if (!toolCallId || !name) return;
453
+ cb.onClientToolCall({ toolCallId, name, args });
454
+ return;
455
+ }
447
456
  case "call_end": {
448
457
  const reasonRaw = String(msg.reason ?? "");
449
458
  const reason = mapEndReason(reasonRaw);
@@ -501,6 +510,87 @@ function buildWsUrl(args) {
501
510
  return `${proto}//${base.host}/v1/agents/${encodeURIComponent(args.agentId)}/call?token=${encodeURIComponent(args.token)}${bargeQS}`;
502
511
  }
503
512
 
513
+ // src/clientTools.ts
514
+ var NAME_RE = /^[a-zA-Z_][a-zA-Z0-9_]*$/;
515
+ var MAX_TOOLS = 64;
516
+ var MAX_USAGE = 500;
517
+ var MAX_TIMEOUT_MS = 3e4;
518
+ var validateClientToolMap = (tools) => {
519
+ if (tools === void 0) return;
520
+ if (typeof tools !== "object" || tools === null || Array.isArray(tools)) {
521
+ throw new Error("clientTools must be an object keyed by tool name");
522
+ }
523
+ const entries = Object.entries(tools);
524
+ if (entries.length > MAX_TOOLS) {
525
+ throw new Error(`clientTools may declare at most 64 tools (got ${entries.length})`);
526
+ }
527
+ for (const [name, def] of entries) {
528
+ if (!NAME_RE.test(name)) {
529
+ throw new Error(
530
+ `clientTools["${name}"]: name must be a valid identifier (^[a-zA-Z_][a-zA-Z0-9_]*$)`
531
+ );
532
+ }
533
+ if (!def || typeof def !== "object") {
534
+ throw new Error(`clientTools["${name}"]: must be an object`);
535
+ }
536
+ if (typeof def.description !== "string" || def.description.length === 0) {
537
+ throw new Error(`clientTools["${name}"]: must have a description`);
538
+ }
539
+ if (typeof def.handler !== "function") {
540
+ throw new Error(`clientTools["${name}"]: must have a handler function`);
541
+ }
542
+ if (def.usage !== void 0 && def.usage.length > MAX_USAGE) {
543
+ throw new Error(`clientTools["${name}"]: usage must be \u2264500 chars`);
544
+ }
545
+ if (def.timeoutMs !== void 0 && (!Number.isFinite(def.timeoutMs) || def.timeoutMs <= 0 || def.timeoutMs > MAX_TIMEOUT_MS)) {
546
+ throw new Error(`clientTools["${name}"]: timeoutMs must be in (0, 30000]`);
547
+ }
548
+ }
549
+ };
550
+ var buildRegisterFrame = (tools) => ({
551
+ type: "client_tools_register",
552
+ tools: Object.entries(tools).map(([name, def]) => ({
553
+ name,
554
+ description: def.description,
555
+ parameters: def.parameters,
556
+ ...def.usage !== void 0 ? { usage: def.usage } : {},
557
+ ...def.timeoutMs !== void 0 ? { timeoutMs: def.timeoutMs } : {}
558
+ }))
559
+ });
560
+ var dispatchClientToolCall = (send, tools, frame) => {
561
+ const safeSend = (payload) => {
562
+ try {
563
+ send(payload);
564
+ } catch {
565
+ }
566
+ };
567
+ const tool = tools[frame.name];
568
+ if (!tool) {
569
+ safeSend({
570
+ type: "client_tool_result",
571
+ toolCallId: frame.toolCallId,
572
+ error: `No handler for ${frame.name}`
573
+ });
574
+ return;
575
+ }
576
+ void (async () => {
577
+ try {
578
+ const out = await tool.handler(frame.args);
579
+ safeSend({
580
+ type: "client_tool_result",
581
+ toolCallId: frame.toolCallId,
582
+ result: typeof out === "string" ? out : JSON.stringify(out)
583
+ });
584
+ } catch (err) {
585
+ safeSend({
586
+ type: "client_tool_result",
587
+ toolCallId: frame.toolCallId,
588
+ error: err instanceof Error ? err.message : String(err)
589
+ });
590
+ }
591
+ })();
592
+ };
593
+
504
594
  // src/VoiceClient.ts
505
595
  var BrowserVoiceClient = class {
506
596
  constructor(args) {
@@ -529,6 +619,10 @@ var BrowserVoiceClient = class {
529
619
  // ---------------------------------------------------------------
530
620
  // Internal
531
621
  // ---------------------------------------------------------------
622
+ this.sendClientToolsRegister = () => {
623
+ const frame = buildRegisterFrame(this.args.options.clientTools ?? {});
624
+ this.rws?.send(JSON.stringify(frame));
625
+ };
532
626
  this.setState = (next) => {
533
627
  if (this.proto.state === next) return;
534
628
  this.proto.state = next;
@@ -556,9 +650,18 @@ var BrowserVoiceClient = class {
556
650
  onState: this.setState,
557
651
  onTranscript: (entries) => this.args.options.onTranscript?.(entries),
558
652
  onError: this.emitError,
559
- onInterrupt: () => this.playback?.flush(),
560
- onAgentTurnStart: () => void 0,
561
- onCallEnd: (reason) => this.teardown(reason)
653
+ onInterrupt: () => {
654
+ this.playback?.flush();
655
+ this.args.options.onInterrupt?.();
656
+ },
657
+ onAgentTurnStart: () => this.args.options.onAgentTurnStart?.(),
658
+ onCallEnd: (reason) => this.teardown(reason),
659
+ onConnected: () => this.sendClientToolsRegister(),
660
+ onClientToolCall: (frame) => dispatchClientToolCall(
661
+ (f) => this.rws?.send(JSON.stringify(f)),
662
+ this.args.options.clientTools ?? {},
663
+ frame
664
+ )
562
665
  });
563
666
  } else {
564
667
  this.playback?.enqueue(ev.data);
@@ -623,6 +726,7 @@ var BrowserVoiceClient = class {
623
726
  };
624
727
  this.args = args;
625
728
  this.proto = createProtocolState();
729
+ validateClientToolMap(args.options.clientTools);
626
730
  }
627
731
  // ---------------------------------------------------------------
628
732
  // Call interface