@craftedxp/voice-js 0.2.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CONSUMING.md +4 -2
- package/DEVELOPING.md +1 -0
- package/README.md +142 -35
- package/dist/browser.d.mts +45 -1
- package/dist/browser.d.ts +45 -1
- package/dist/browser.js +107 -3
- package/dist/browser.js.map +1 -1
- package/dist/browser.mjs +107 -3
- package/dist/browser.mjs.map +1 -1
- package/dist/embed.iife.js +3 -3
- package/dist/node.d.mts +57 -2
- package/dist/node.d.ts +57 -2
- package/dist/node.js +104 -3
- package/dist/node.js.map +1 -1
- package/dist/node.mjs +104 -3
- package/dist/node.mjs.map +1 -1
- package/package.json +3 -1
package/CONSUMING.md
CHANGED
|
@@ -42,8 +42,8 @@ For the landing dashboard in this repo:
|
|
|
42
42
|
```jsonc
|
|
43
43
|
{
|
|
44
44
|
"dependencies": {
|
|
45
|
-
"@craftedxp/voice-js": "file:../sdk/voice-js"
|
|
46
|
-
}
|
|
45
|
+
"@craftedxp/voice-js": "file:../sdk/voice-js",
|
|
46
|
+
},
|
|
47
47
|
}
|
|
48
48
|
```
|
|
49
49
|
|
|
@@ -101,6 +101,7 @@ Browsers require a user gesture to start `AudioContext`. The SDK calls `audioCon
|
|
|
101
101
|
## CSP / mic permission
|
|
102
102
|
|
|
103
103
|
For consumers running on a strict CSP, allow:
|
|
104
|
+
|
|
104
105
|
- `connect-src wss://your-voxline-server.com`
|
|
105
106
|
- `worker-src 'self' blob:` (the audio worklet is registered from a Blob URL)
|
|
106
107
|
|
|
@@ -113,6 +114,7 @@ The SDK doesn't log to the console by default. To see protocol-level events, wir
|
|
|
113
114
|
## Updating
|
|
114
115
|
|
|
115
116
|
When the SDK changes:
|
|
117
|
+
|
|
116
118
|
- **Tarball path:** re-`npm pack` then `npm install <newTgz>` in the consumer.
|
|
117
119
|
- **`file:` path:** `npm run build` in `sdk/voice-js/` (refreshes `dist/`); the consumer picks it up on the next bundler refresh.
|
|
118
120
|
- **Registry path:** bump the version in your `package.json` and `npm install`.
|
package/DEVELOPING.md
CHANGED
|
@@ -26,6 +26,7 @@ Either way, the consumer's bundler (Webpack / Vite / esbuild / Next) picks up th
|
|
|
26
26
|
- `dist/embed.iife.js` — minified IIFE for `<script>` embed; bundles the browser entry inline.
|
|
27
27
|
|
|
28
28
|
Source files map to:
|
|
29
|
+
|
|
29
30
|
- `src/browser.ts` — entry, factory implementation, public re-exports.
|
|
30
31
|
- `src/node.ts` — entry, dynamic `ws` loader, factory implementation.
|
|
31
32
|
- `src/VoiceClient.ts` — browser `BrowserVoiceClient` implementing the `Call` interface.
|
package/README.md
CHANGED
|
@@ -4,7 +4,7 @@ JS SDK for embedding a voice agent call in any JS environment — browser tabs,
|
|
|
4
4
|
|
|
5
5
|
Companion to [`@craftedxp/voice-rn`](https://www.npmjs.com/package/@craftedxp/voice-rn) (React Native) and [`@craftedxp/sdk-node`](https://www.npmjs.com/package/@craftedxp/sdk-node) (server-side `sk_` SDK).
|
|
6
6
|
|
|
7
|
-
> **Internal testing release.** API surface may evolve before a stable release. **0.2.0**
|
|
7
|
+
> **Internal testing release.** API surface may evolve before a stable release. **0.3.1** adds Node-consumer ergonomics: `onInterrupt`/`onAgentTurnStart` callbacks on `startCall`, and the `NodeVoiceClientFactory` return type from the Node entry. **0.3.0** added [client tools](#client-tools) — handlers the agent's LLM can call on the consumer's machine. **0.2.0** was a breaking rename + redesign of the previous `@voxline/web@0.1.0` — the singleton-`VoiceClient`-with-`apiKey` pattern is gone in favour of a `configureVoiceClient({ fetchToken })` factory that mirrors `voice-rn` 0.3.x. See [Migrating from `@voxline/web`](#migrating-from-voxlineweb) below.
|
|
8
8
|
|
|
9
9
|
## Install
|
|
10
10
|
|
|
@@ -67,9 +67,9 @@ const call = await voice.startCall({
|
|
|
67
67
|
onEnd: ({ reason, durationMs }) => log('ended', reason, durationMs),
|
|
68
68
|
})
|
|
69
69
|
|
|
70
|
-
call.mute()
|
|
70
|
+
call.mute() // gate mic frames (server still sees wire cadence)
|
|
71
71
|
call.unmute()
|
|
72
|
-
call.end()
|
|
72
|
+
call.end() // close WS + stop mic + fire onEnd
|
|
73
73
|
```
|
|
74
74
|
|
|
75
75
|
## Quick start (Node / Electron-main)
|
|
@@ -84,13 +84,42 @@ const voice = configureVoiceClient({
|
|
|
84
84
|
})
|
|
85
85
|
|
|
86
86
|
// Bring your own audio. Example: sox subprocesses for mic + speakers.
|
|
87
|
-
const mic = spawn('sox', [
|
|
88
|
-
|
|
87
|
+
const mic = spawn('sox', [
|
|
88
|
+
'-d',
|
|
89
|
+
'-r',
|
|
90
|
+
'16000',
|
|
91
|
+
'-c',
|
|
92
|
+
'1',
|
|
93
|
+
'-b',
|
|
94
|
+
'16',
|
|
95
|
+
'-e',
|
|
96
|
+
'signed',
|
|
97
|
+
'-t',
|
|
98
|
+
'raw',
|
|
99
|
+
'-',
|
|
100
|
+
])
|
|
101
|
+
const spk = spawn('sox', [
|
|
102
|
+
'-t',
|
|
103
|
+
'raw',
|
|
104
|
+
'-r',
|
|
105
|
+
'16000',
|
|
106
|
+
'-c',
|
|
107
|
+
'1',
|
|
108
|
+
'-b',
|
|
109
|
+
'16',
|
|
110
|
+
'-e',
|
|
111
|
+
'signed',
|
|
112
|
+
'-',
|
|
113
|
+
'-d',
|
|
114
|
+
])
|
|
89
115
|
|
|
90
116
|
const call = await voice.startCall({
|
|
91
117
|
agentId: 'agt_xxx',
|
|
92
118
|
onAudioChunk: (pcm) => spk.stdin.write(Buffer.from(pcm)),
|
|
93
|
-
onEnd: () => {
|
|
119
|
+
onEnd: () => {
|
|
120
|
+
mic.kill()
|
|
121
|
+
spk.stdin.end()
|
|
122
|
+
},
|
|
94
123
|
})
|
|
95
124
|
|
|
96
125
|
mic.stdout.on('data', (chunk) => call.sendAudioChunk(chunk))
|
|
@@ -102,30 +131,33 @@ The Node bundle has the same `configureVoiceClient` / `startCall` shape, plus an
|
|
|
102
131
|
|
|
103
132
|
### `configureVoiceClient(config)`
|
|
104
133
|
|
|
105
|
-
| Field
|
|
106
|
-
|
|
107
|
-
| `apiBase`
|
|
108
|
-
| `fetchToken`
|
|
109
|
-
| `defaultMetadata` | `Record<string, string>?`
|
|
110
|
-
| `defaultContext`
|
|
134
|
+
| Field | Type | Notes |
|
|
135
|
+
| ----------------- | --------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
|
|
136
|
+
| `apiBase` | `string` | Full HTTPS URL of the Voxline server. WS scheme derived: `https`→`wss`. Trailing slash optional. |
|
|
137
|
+
| `fetchToken` | `(args) => Promise<string>` | Called by the SDK whenever it needs a fresh `ct_`. Mirrors `@craftedxp/voice-rn`'s shape exactly — `{ agentId, userId?, context?, metadata? }`. |
|
|
138
|
+
| `defaultMetadata` | `Record<string, string>?` | Applied to every `startCall`. Per-call merges on top. |
|
|
139
|
+
| `defaultContext` | `Record<string, unknown>?` | Applied to every `startCall`. Per-call merges on top. |
|
|
111
140
|
|
|
112
141
|
Returns a `VoiceClientFactory` with one method:
|
|
113
142
|
|
|
114
143
|
### `factory.startCall(options)`
|
|
115
144
|
|
|
116
|
-
| Field
|
|
117
|
-
|
|
118
|
-
| `agentId`
|
|
119
|
-
| `userId`
|
|
120
|
-
| `context`
|
|
121
|
-
| `metadata`
|
|
122
|
-
| `bargeIn`
|
|
123
|
-
| `
|
|
124
|
-
| `
|
|
125
|
-
| `
|
|
126
|
-
| `
|
|
127
|
-
| `
|
|
128
|
-
| `
|
|
145
|
+
| Field | Type | Notes |
|
|
146
|
+
| ------------------ | ---------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
|
147
|
+
| `agentId` | `string` | Required. |
|
|
148
|
+
| `userId` | `string?` | Round-tripped to fetchToken as `userId`; server uses it for contact memory. |
|
|
149
|
+
| `context` | `Record<string, unknown>?` | Per-call structured context. Merged on top of `defaultContext`. Lowered into the agent's system prompt server-side. |
|
|
150
|
+
| `metadata` | `Record<string, string>?` | Per-call key/value. Merged on top of `defaultMetadata`. Round-tripped on `call.ended` webhook. NOT lowered into the prompt. |
|
|
151
|
+
| `bargeIn` | `boolean?` | Default `true`. Set `false` for alarm-style flows where the user shouldn't accidentally interrupt the script. |
|
|
152
|
+
| `clientTools` | `ClientToolMap?` | Per-call client tools the agent's LLM can invoke. See [Client tools](#client-tools) section below. Validated synchronously at `startCall` — bad input throws. |
|
|
153
|
+
| `token` | `string?` | **Test-only escape hatch** — pre-minted `ct_`, bypasses `fetchToken`. Don't use in production. |
|
|
154
|
+
| `onStateChange` | `(state) => void` | Fires on every state machine transition. |
|
|
155
|
+
| `onTranscript` | `(entries) => void` | Fires on every transcript update. |
|
|
156
|
+
| `onInterrupt` | `() => void` | Server signaled barge-in. Browser bundle auto-flushes built-in playback before this fires. Node consumers should drain their custom playback queue here. |
|
|
157
|
+
| `onAgentTurnStart` | `() => void` | New agent turn began. Use when you want a precise turn-start anchor without diffing `onStateChange`. |
|
|
158
|
+
| `onVolume` | `({ input, output }) => void` | 0-1 RMS. ~10 Hz cadence. Browser bundle only. |
|
|
159
|
+
| `onError` | `(err) => void` | Stable `code` from `CallErrorCode`; matches `voice-rn` codes where overlap. |
|
|
160
|
+
| `onEnd` | `({ reason, errorCode?, durationMs }) => void` | Fires once when the call ends. |
|
|
129
161
|
|
|
130
162
|
Resolves to a `Call` handle:
|
|
131
163
|
|
|
@@ -152,21 +184,75 @@ interface NodeCall extends Call {
|
|
|
152
184
|
|
|
153
185
|
```ts
|
|
154
186
|
type CallState =
|
|
155
|
-
| 'idle'
|
|
156
|
-
| '
|
|
157
|
-
| '
|
|
187
|
+
| 'idle'
|
|
188
|
+
| 'connecting'
|
|
189
|
+
| 'listening'
|
|
190
|
+
| 'user_speaking'
|
|
191
|
+
| 'agent_speaking'
|
|
192
|
+
| 'ended'
|
|
193
|
+
| 'error'
|
|
158
194
|
|
|
159
195
|
type CallErrorCode =
|
|
160
|
-
| 'missing_credentials'
|
|
161
|
-
| '
|
|
162
|
-
| '
|
|
163
|
-
| '
|
|
164
|
-
| '
|
|
165
|
-
| '
|
|
196
|
+
| 'missing_credentials'
|
|
197
|
+
| 'forbidden'
|
|
198
|
+
| 'mic_denied'
|
|
199
|
+
| 'mic_start_failed'
|
|
200
|
+
| 'audio_session_failed'
|
|
201
|
+
| 'token_expired'
|
|
202
|
+
| 'token_invalid'
|
|
203
|
+
| 'unauthorized'
|
|
204
|
+
| 'network_unreachable'
|
|
205
|
+
| 'socket_error'
|
|
206
|
+
| 'payment_required'
|
|
207
|
+
| 'not_found'
|
|
208
|
+
| 'silence_timeout'
|
|
209
|
+
| 'server_error'
|
|
166
210
|
|
|
167
211
|
type CallEndReason = 'agent_ended' | 'user_hangup' | 'timeout' | 'error'
|
|
168
212
|
```
|
|
169
213
|
|
|
214
|
+
## Client tools
|
|
215
|
+
|
|
216
|
+
You can declare tools the agent's LLM can call **on the consumer's machine**. The
|
|
217
|
+
tool's handler runs in your app — the server has no access to it. Useful for
|
|
218
|
+
surface-only actions (read DOM state, hit a private API, mutate local storage,
|
|
219
|
+
control the UI).
|
|
220
|
+
|
|
221
|
+
```ts
|
|
222
|
+
import { configureVoiceClient, type ClientToolMap } from '@craftedxp/voice-js'
|
|
223
|
+
|
|
224
|
+
const tools: ClientToolMap = {
|
|
225
|
+
addTodoItem: {
|
|
226
|
+
description: "Add an item to the user's todo list.",
|
|
227
|
+
parameters: {
|
|
228
|
+
type: 'object',
|
|
229
|
+
properties: { text: { type: 'string' } },
|
|
230
|
+
required: ['text'],
|
|
231
|
+
},
|
|
232
|
+
usage: 'Call when the user asks to add or capture a task.',
|
|
233
|
+
handler: async ({ text }) => {
|
|
234
|
+
await myAppApi.addTodo(String(text))
|
|
235
|
+
return `Added "${text}".`
|
|
236
|
+
},
|
|
237
|
+
},
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
const voice = configureVoiceClient({ apiBase: '...', fetchToken: async () => '...' })
|
|
241
|
+
const call = await voice.startCall({ agentId: 'agt_xxx', clientTools: tools })
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
The SDK validates `clientTools` at `startCall` (sync, throws on malformed input),
|
|
245
|
+
then sends `client_tools_register` to the server right after `connected`. When
|
|
246
|
+
the agent's LLM invokes a registered tool, your handler runs and the SDK posts
|
|
247
|
+
the result back through the same WebSocket.
|
|
248
|
+
|
|
249
|
+
Handler return values are stringified (object → `JSON.stringify`) before being
|
|
250
|
+
sent back; throws become `{ error: ... }` frames. The server enforces a default
|
|
251
|
+
10s timeout per tool call; set `timeoutMs` in your declaration to change it (max 30s).
|
|
252
|
+
|
|
253
|
+
For the full wire protocol, sequencing, and constraints see
|
|
254
|
+
[`docs/integration-echocheck.md`](../../docs/integration-echocheck.md#client-declared-tools).
|
|
255
|
+
|
|
170
256
|
## Migrating from `@voxline/web`
|
|
171
257
|
|
|
172
258
|
```diff
|
|
@@ -221,6 +307,25 @@ Three semantic shifts to be aware of:
|
|
|
221
307
|
|
|
222
308
|
The embed widget (`<script src="embed.js" data-token="ct_...">`) keeps the same HTML API, but the `data-api-key` attribute is no longer accepted — mint server-side and inject `data-token` instead.
|
|
223
309
|
|
|
310
|
+
## Troubleshooting
|
|
311
|
+
|
|
312
|
+
**Agent's last syllable cuts off and plays into the next agent message.** Almost
|
|
313
|
+
always a misfiring barge-in (acoustic echo from a laptop speaker → mic, or a
|
|
314
|
+
false-positive VAD on background noise). Three quick fixes, in order:
|
|
315
|
+
|
|
316
|
+
1. **Test with headphones.** Eliminates acoustic echo. If the symptom disappears,
|
|
317
|
+
it was echo. Tell production users to wear headphones, or fall back to (3).
|
|
318
|
+
2. **Check for a phantom user turn between two agent turns** in the transcript
|
|
319
|
+
that contains words the agent just said. That confirms STT is hearing the
|
|
320
|
+
agent's voice through the mic.
|
|
321
|
+
3. **Pass `bargeIn: false` on `startCall`** for non-conversational flows. Adds
|
|
322
|
+
`?barge=off` to the WS URL and the SDK ignores `interrupt` events
|
|
323
|
+
client-side. Tradeoff: user can't interrupt mid-sentence.
|
|
324
|
+
|
|
325
|
+
For the full diagnostic walkthrough (including the rarer Gemini-Live
|
|
326
|
+
stale-audio-leak case and audio-handling guidance for Node/Electron consumers),
|
|
327
|
+
see [`docs/integration-echocheck.md` → Audio quality](../../docs/integration-echocheck.md#audio-quality--the-cut-off-syllable-trap).
|
|
328
|
+
|
|
224
329
|
## Embed widget
|
|
225
330
|
|
|
226
331
|
For drop-in `<script>` consumers (landing pages, no-build embeds):
|
|
@@ -239,7 +344,9 @@ Renders a floating call button with a Shadow-DOM transcript panel. Pre-mint the
|
|
|
239
344
|
|
|
240
345
|
## Status
|
|
241
346
|
|
|
242
|
-
- **0.
|
|
347
|
+
- **0.3.1** (current) — adds `onInterrupt` / `onAgentTurnStart` callbacks on `StartCallOptions` and a proper `NodeVoiceClientFactory` return type for the Node entry. Backwards-compatible.
|
|
348
|
+
- 0.3.0 — adds client-tools support. New `clientTools` option on `startCall` accepts a `ClientToolMap` (description, parameters, handler, optional usage/timeoutMs/example). Browser and Node bundles both supported. Backwards-compatible — existing consumers see no change.
|
|
349
|
+
- 0.2.0 — first `@craftedxp/voice-js` release. Browser + Node dual bundle, `fetchToken` factory, voice-rn 0.3.x parity. Migration path from `@voxline/web@0.1.0` documented above.
|
|
243
350
|
- 0.1.0 — `@voxline/web`. Singleton `VoiceClient` class, `apiKey` accepted. Retired in 0.2.0; never published to npm so no deprecation window.
|
|
244
351
|
|
|
245
352
|
See [`CONSUMING.md`](CONSUMING.md) for the full setup walkthrough and [`DEVELOPING.md`](DEVELOPING.md) for SDK-author iteration.
|
package/dist/browser.d.mts
CHANGED
|
@@ -1,3 +1,18 @@
|
|
|
1
|
+
/**
 * Declaration of one client-side tool the agent's LLM may call.
 *
 * - `description`: required, non-empty text describing the tool.
 * - `parameters`: argument schema object sent to the server when the tool is
 *   registered (the README example uses a JSON-Schema-style object).
 * - `usage`: optional guidance on when to call the tool; at most 500 characters.
 * - `timeoutMs`: optional per-tool timeout; must be in (0, 30000].
 * - `example`: optional example string.
 * - `handler`: runs in the consumer's app with the call arguments. A string
 *   result is sent back as-is, an object result is JSON-stringified, and a
 *   thrown error is reported back as an error result.
 */
interface ClientTool {
|
|
2
|
+
description: string;
|
|
3
|
+
parameters: Record<string, unknown>;
|
|
4
|
+
usage?: string;
|
|
5
|
+
timeoutMs?: number;
|
|
6
|
+
example?: string;
|
|
7
|
+
handler: (args: Record<string, unknown>) => Promise<string | object> | string | object;
|
|
8
|
+
}
|
|
9
|
+
type ClientToolMap = Record<string, ClientTool>;
|
|
10
|
+
/**
 * A tool invocation received from the server (`client_tool_call` message) and
 * handed to `ProtocolCallbacks.onClientToolCall`.
 *
 * - `toolCallId`: identifier echoed back on the matching `client_tool_result`.
 * - `name`: name of the tool to run, looked up in the call's `clientTools` map.
 * - `args`: arguments passed to the tool's handler (empty object when the
 *   server message carries none).
 */
interface ClientToolCallFrame {
|
|
11
|
+
toolCallId: string;
|
|
12
|
+
name: string;
|
|
13
|
+
args: Record<string, unknown>;
|
|
14
|
+
}
|
|
15
|
+
|
|
1
16
|
type CallState = 'idle' | 'connecting' | 'listening' | 'user_speaking' | 'agent_speaking' | 'ended' | 'error';
|
|
2
17
|
type TranscriptEntry = {
|
|
3
18
|
id: string;
|
|
@@ -51,6 +66,8 @@ interface ProtocolCallbacks {
|
|
|
51
66
|
onInterrupt: () => void;
|
|
52
67
|
onAgentTurnStart: () => void;
|
|
53
68
|
onCallEnd: (reason: CallEndReason) => void;
|
|
69
|
+
onConnected: () => void;
|
|
70
|
+
onClientToolCall: (frame: ClientToolCallFrame) => void;
|
|
54
71
|
}
|
|
55
72
|
declare function handleServerMessage(raw: string, state: ProtocolState, cb: ProtocolCallbacks): void;
|
|
56
73
|
interface BuildWsUrlArgs {
|
|
@@ -131,6 +148,16 @@ interface StartCallOptions {
|
|
|
131
148
|
* accidentally interrupt the script. Default true.
|
|
132
149
|
*/
|
|
133
150
|
bargeIn?: boolean;
|
|
151
|
+
/**
|
|
152
|
+
* Client-side tools the agent's LLM can call mid-conversation. Each
|
|
153
|
+
* tool's handler runs on the consumer's side; result is fed back to
|
|
154
|
+
* the LLM through the existing call WebSocket. Schema and handler
|
|
155
|
+
* colocate. Validated synchronously at startCall — bad input throws.
|
|
156
|
+
*
|
|
157
|
+
* See docs/integration-echocheck.md for the wire protocol and the
|
|
158
|
+
* server-side guarantees.
|
|
159
|
+
*/
|
|
160
|
+
clientTools?: ClientToolMap;
|
|
134
161
|
/**
|
|
135
162
|
* Test-only escape hatch — pass a pre-minted `ct_` directly and skip
|
|
136
163
|
* the `fetchToken` call. Don't use this in production code: tokens
|
|
@@ -143,6 +170,23 @@ interface StartCallOptions {
|
|
|
143
170
|
onEnd?: (end: CallEndEvent) => void;
|
|
144
171
|
/** Volume-meter event for VU UIs. ~10 Hz cadence (browser bundle only). */
|
|
145
172
|
onVolume?: (vol: VolumeEvent) => void;
|
|
173
|
+
/**
|
|
174
|
+
* Fires when the server signals barge-in (the user started talking
|
|
175
|
+
* mid-agent-turn). The browser bundle automatically flushes its
|
|
176
|
+
* built-in audio playback before this callback runs; the callback is
|
|
177
|
+
* fired regardless. Node / Electron consumers with custom playback
|
|
178
|
+
* should drain their audio queue here so the agent goes silent
|
|
179
|
+
* immediately.
|
|
180
|
+
*/
|
|
181
|
+
onInterrupt?: () => void;
|
|
182
|
+
/**
|
|
183
|
+
* Fires on `agent_turn_start` — the server has begun a new agent
|
|
184
|
+
* turn. The state-machine transition to `agent_speaking` happens at
|
|
185
|
+
* the same moment via `onStateChange`; use this when you want a
|
|
186
|
+
* precise turn anchor (e.g. "agent has been speaking for N ms" UIs)
|
|
187
|
+
* without diffing state.
|
|
188
|
+
*/
|
|
189
|
+
onAgentTurnStart?: () => void;
|
|
146
190
|
}
|
|
147
191
|
interface Call {
|
|
148
192
|
/** Current state. Snapshot — subscribe via onStateChange for live updates. */
|
|
@@ -274,4 +318,4 @@ type ReconnectingWebSocket = ReturnType<typeof createReconnectingWebSocket>;
|
|
|
274
318
|
*/
|
|
275
319
|
declare function configureVoiceClient(config: VoiceClientConfig): VoiceClientFactory;
|
|
276
320
|
|
|
277
|
-
export { type Call, type CallEndEvent, type CallEndReason, type CallError, type CallErrorCode, type CallState, type CaptureController, type CaptureOptions, type FetchToken, type FetchTokenArgs, type OnAgentSpeakingChange, type OnChunk, type OnError, type OnVolume$1 as OnVolume, type PlaybackController, type PlaybackOptions, type ProtocolCallbacks, type ProtocolState, type RWSEvent, type RWSOptions, type ReconnectingWebSocket, type ServerMessage, type StartCallOptions, type TranscriptEntry, type VoiceClientConfig, type VoiceClientFactory, type VolumeEvent, type WebSocketFactory, type WebSocketLike, buildWsUrl, configureVoiceClient, createAudioCapture, createAudioPlayback, createProtocolState, createReconnectingWebSocket, handleServerMessage };
|
|
321
|
+
export { type Call, type CallEndEvent, type CallEndReason, type CallError, type CallErrorCode, type CallState, type CaptureController, type CaptureOptions, type ClientTool, type ClientToolMap, type FetchToken, type FetchTokenArgs, type OnAgentSpeakingChange, type OnChunk, type OnError, type OnVolume$1 as OnVolume, type PlaybackController, type PlaybackOptions, type ProtocolCallbacks, type ProtocolState, type RWSEvent, type RWSOptions, type ReconnectingWebSocket, type ServerMessage, type StartCallOptions, type TranscriptEntry, type VoiceClientConfig, type VoiceClientFactory, type VolumeEvent, type WebSocketFactory, type WebSocketLike, buildWsUrl, configureVoiceClient, createAudioCapture, createAudioPlayback, createProtocolState, createReconnectingWebSocket, handleServerMessage };
|
package/dist/browser.d.ts
CHANGED
|
@@ -1,3 +1,18 @@
|
|
|
1
|
+
/**
 * Declaration of one client-side tool the agent's LLM may call.
 *
 * - `description`: required, non-empty text describing the tool.
 * - `parameters`: argument schema object sent to the server when the tool is
 *   registered (the README example uses a JSON-Schema-style object).
 * - `usage`: optional guidance on when to call the tool; at most 500 characters.
 * - `timeoutMs`: optional per-tool timeout; must be in (0, 30000].
 * - `example`: optional example string.
 * - `handler`: runs in the consumer's app with the call arguments. A string
 *   result is sent back as-is, an object result is JSON-stringified, and a
 *   thrown error is reported back as an error result.
 */
interface ClientTool {
|
|
2
|
+
description: string;
|
|
3
|
+
parameters: Record<string, unknown>;
|
|
4
|
+
usage?: string;
|
|
5
|
+
timeoutMs?: number;
|
|
6
|
+
example?: string;
|
|
7
|
+
handler: (args: Record<string, unknown>) => Promise<string | object> | string | object;
|
|
8
|
+
}
|
|
9
|
+
type ClientToolMap = Record<string, ClientTool>;
|
|
10
|
+
/**
 * A tool invocation received from the server (`client_tool_call` message) and
 * handed to `ProtocolCallbacks.onClientToolCall`.
 *
 * - `toolCallId`: identifier echoed back on the matching `client_tool_result`.
 * - `name`: name of the tool to run, looked up in the call's `clientTools` map.
 * - `args`: arguments passed to the tool's handler (empty object when the
 *   server message carries none).
 */
interface ClientToolCallFrame {
|
|
11
|
+
toolCallId: string;
|
|
12
|
+
name: string;
|
|
13
|
+
args: Record<string, unknown>;
|
|
14
|
+
}
|
|
15
|
+
|
|
1
16
|
type CallState = 'idle' | 'connecting' | 'listening' | 'user_speaking' | 'agent_speaking' | 'ended' | 'error';
|
|
2
17
|
type TranscriptEntry = {
|
|
3
18
|
id: string;
|
|
@@ -51,6 +66,8 @@ interface ProtocolCallbacks {
|
|
|
51
66
|
onInterrupt: () => void;
|
|
52
67
|
onAgentTurnStart: () => void;
|
|
53
68
|
onCallEnd: (reason: CallEndReason) => void;
|
|
69
|
+
onConnected: () => void;
|
|
70
|
+
onClientToolCall: (frame: ClientToolCallFrame) => void;
|
|
54
71
|
}
|
|
55
72
|
declare function handleServerMessage(raw: string, state: ProtocolState, cb: ProtocolCallbacks): void;
|
|
56
73
|
interface BuildWsUrlArgs {
|
|
@@ -131,6 +148,16 @@ interface StartCallOptions {
|
|
|
131
148
|
* accidentally interrupt the script. Default true.
|
|
132
149
|
*/
|
|
133
150
|
bargeIn?: boolean;
|
|
151
|
+
/**
|
|
152
|
+
* Client-side tools the agent's LLM can call mid-conversation. Each
|
|
153
|
+
* tool's handler runs on the consumer's side; result is fed back to
|
|
154
|
+
* the LLM through the existing call WebSocket. Schema and handler
|
|
155
|
+
* colocate. Validated synchronously at startCall — bad input throws.
|
|
156
|
+
*
|
|
157
|
+
* See docs/integration-echocheck.md for the wire protocol and the
|
|
158
|
+
* server-side guarantees.
|
|
159
|
+
*/
|
|
160
|
+
clientTools?: ClientToolMap;
|
|
134
161
|
/**
|
|
135
162
|
* Test-only escape hatch — pass a pre-minted `ct_` directly and skip
|
|
136
163
|
* the `fetchToken` call. Don't use this in production code: tokens
|
|
@@ -143,6 +170,23 @@ interface StartCallOptions {
|
|
|
143
170
|
onEnd?: (end: CallEndEvent) => void;
|
|
144
171
|
/** Volume-meter event for VU UIs. ~10 Hz cadence (browser bundle only). */
|
|
145
172
|
onVolume?: (vol: VolumeEvent) => void;
|
|
173
|
+
/**
|
|
174
|
+
* Fires when the server signals barge-in (the user started talking
|
|
175
|
+
* mid-agent-turn). The browser bundle automatically flushes its
|
|
176
|
+
* built-in audio playback before this callback runs; the callback is
|
|
177
|
+
* fired regardless. Node / Electron consumers with custom playback
|
|
178
|
+
* should drain their audio queue here so the agent goes silent
|
|
179
|
+
* immediately.
|
|
180
|
+
*/
|
|
181
|
+
onInterrupt?: () => void;
|
|
182
|
+
/**
|
|
183
|
+
* Fires on `agent_turn_start` — the server has begun a new agent
|
|
184
|
+
* turn. The state-machine transition to `agent_speaking` happens at
|
|
185
|
+
* the same moment via `onStateChange`; use this when you want a
|
|
186
|
+
* precise turn anchor (e.g. "agent has been speaking for N ms" UIs)
|
|
187
|
+
* without diffing state.
|
|
188
|
+
*/
|
|
189
|
+
onAgentTurnStart?: () => void;
|
|
146
190
|
}
|
|
147
191
|
interface Call {
|
|
148
192
|
/** Current state. Snapshot — subscribe via onStateChange for live updates. */
|
|
@@ -274,4 +318,4 @@ type ReconnectingWebSocket = ReturnType<typeof createReconnectingWebSocket>;
|
|
|
274
318
|
*/
|
|
275
319
|
declare function configureVoiceClient(config: VoiceClientConfig): VoiceClientFactory;
|
|
276
320
|
|
|
277
|
-
export { type Call, type CallEndEvent, type CallEndReason, type CallError, type CallErrorCode, type CallState, type CaptureController, type CaptureOptions, type FetchToken, type FetchTokenArgs, type OnAgentSpeakingChange, type OnChunk, type OnError, type OnVolume$1 as OnVolume, type PlaybackController, type PlaybackOptions, type ProtocolCallbacks, type ProtocolState, type RWSEvent, type RWSOptions, type ReconnectingWebSocket, type ServerMessage, type StartCallOptions, type TranscriptEntry, type VoiceClientConfig, type VoiceClientFactory, type VolumeEvent, type WebSocketFactory, type WebSocketLike, buildWsUrl, configureVoiceClient, createAudioCapture, createAudioPlayback, createProtocolState, createReconnectingWebSocket, handleServerMessage };
|
|
321
|
+
export { type Call, type CallEndEvent, type CallEndReason, type CallError, type CallErrorCode, type CallState, type CaptureController, type CaptureOptions, type ClientTool, type ClientToolMap, type FetchToken, type FetchTokenArgs, type OnAgentSpeakingChange, type OnChunk, type OnError, type OnVolume$1 as OnVolume, type PlaybackController, type PlaybackOptions, type ProtocolCallbacks, type ProtocolState, type RWSEvent, type RWSOptions, type ReconnectingWebSocket, type ServerMessage, type StartCallOptions, type TranscriptEntry, type VoiceClientConfig, type VoiceClientFactory, type VolumeEvent, type WebSocketFactory, type WebSocketLike, buildWsUrl, configureVoiceClient, createAudioCapture, createAudioPlayback, createProtocolState, createReconnectingWebSocket, handleServerMessage };
|
package/dist/browser.js
CHANGED
|
@@ -369,6 +369,7 @@ function handleServerMessage(raw, state, cb) {
|
|
|
369
369
|
}
|
|
370
370
|
switch (msg.type) {
|
|
371
371
|
case "connected":
|
|
372
|
+
cb.onConnected();
|
|
372
373
|
setState(state, "listening", cb);
|
|
373
374
|
return;
|
|
374
375
|
case "transcript": {
|
|
@@ -444,6 +445,14 @@ function handleServerMessage(raw, state, cb) {
|
|
|
444
445
|
];
|
|
445
446
|
cb.onTranscript(state.transcript);
|
|
446
447
|
return;
|
|
448
|
+
case "client_tool_call": {
|
|
449
|
+
const toolCallId = String(msg.toolCallId ?? "");
|
|
450
|
+
const name = String(msg.name ?? "");
|
|
451
|
+
const args = msg.args ?? {};
|
|
452
|
+
if (!toolCallId || !name) return;
|
|
453
|
+
cb.onClientToolCall({ toolCallId, name, args });
|
|
454
|
+
return;
|
|
455
|
+
}
|
|
447
456
|
case "call_end": {
|
|
448
457
|
const reasonRaw = String(msg.reason ?? "");
|
|
449
458
|
const reason = mapEndReason(reasonRaw);
|
|
@@ -501,6 +510,87 @@ function buildWsUrl(args) {
|
|
|
501
510
|
return `${proto}//${base.host}/v1/agents/${encodeURIComponent(args.agentId)}/call?token=${encodeURIComponent(args.token)}${bargeQS}`;
|
|
502
511
|
}
|
|
503
512
|
|
|
513
|
+
// src/clientTools.ts
|
|
514
|
+
// Limits enforced on a `clientTools` map before a call starts.
var NAME_RE = /^[a-zA-Z_][a-zA-Z0-9_]*$/; // tool names must be plain identifiers
var MAX_TOOLS = 64; // max tools per call
var MAX_USAGE = 500; // max characters in `usage`
var MAX_TIMEOUT_MS = 3e4; // upper bound for `timeoutMs`

/**
 * Validates the `clientTools` option synchronously and throws on the first
 * problem found. `undefined` is accepted and means "no client tools".
 *
 * Checks, per tool: the key is a valid identifier, the definition is an
 * object, `description` is a non-empty string, `handler` is a function,
 * `usage` (when present) is a string of at most MAX_USAGE characters, and
 * `timeoutMs` (when present) is a finite number in (0, MAX_TIMEOUT_MS].
 *
 * @param {Record<string, object> | undefined} tools Map of tool name to definition.
 * @returns {void}
 * @throws {Error} When the map or any tool definition is malformed.
 */
var validateClientToolMap = (tools) => {
  if (tools === void 0) return;
  if (typeof tools !== "object" || tools === null || Array.isArray(tools)) {
    throw new Error("clientTools must be an object keyed by tool name");
  }
  const entries = Object.entries(tools);
  if (entries.length > MAX_TOOLS) {
    throw new Error(`clientTools may declare at most ${MAX_TOOLS} tools (got ${entries.length})`);
  }
  for (const [name, def] of entries) {
    if (!NAME_RE.test(name)) {
      throw new Error(
        `clientTools["${name}"]: name must be a valid identifier (${NAME_RE.source})`
      );
    }
    if (!def || typeof def !== "object") {
      throw new Error(`clientTools["${name}"]: must be an object`);
    }
    if (typeof def.description !== "string" || def.description.length === 0) {
      throw new Error(`clientTools["${name}"]: must have a description`);
    }
    if (typeof def.handler !== "function") {
      throw new Error(`clientTools["${name}"]: must have a handler function`);
    }
    if (def.usage !== void 0) {
      // Check the type first: `null.length` would crash with a raw TypeError,
      // and a number has no `.length`, so it would slip past the size check.
      if (typeof def.usage !== "string") {
        throw new Error(`clientTools["${name}"]: usage must be a string`);
      }
      if (def.usage.length > MAX_USAGE) {
        throw new Error(`clientTools["${name}"]: usage must be \u2264${MAX_USAGE} chars`);
      }
    }
    if (def.timeoutMs !== void 0 && (!Number.isFinite(def.timeoutMs) || def.timeoutMs <= 0 || def.timeoutMs > MAX_TIMEOUT_MS)) {
      throw new Error(`clientTools["${name}"]: timeoutMs must be in (0, ${MAX_TIMEOUT_MS}]`);
    }
  }
};
|
|
550
|
+
// Builds the `client_tools_register` frame for a clientTools map: one entry per
// tool carrying `name`, `description` and `parameters`, plus `usage` and
// `timeoutMs` only when they are defined on the declaration. The `handler`
// is not part of the frame.
// NOTE(review): `example` is declared on ClientTool but is not forwarded here;
// confirm whether the server expects it.
var buildRegisterFrame = (tools) => ({
|
|
551
|
+
type: "client_tools_register",
|
|
552
|
+
tools: Object.entries(tools).map(([name, def]) => ({
|
|
553
|
+
name,
|
|
554
|
+
description: def.description,
|
|
555
|
+
parameters: def.parameters,
|
|
556
|
+
...def.usage !== void 0 ? { usage: def.usage } : {},
|
|
557
|
+
...def.timeoutMs !== void 0 ? { timeoutMs: def.timeoutMs } : {}
|
|
558
|
+
}))
|
|
559
|
+
});
|
|
560
|
+
/**
 * Runs the handler registered for an incoming tool call and reports the
 * outcome through `send` as a `client_tool_result` frame.
 *
 * - Unknown tool name: replies immediately with `error: "No handler for <name>"`.
 * - Handler resolves: replies with `result`. Strings are sent as-is; anything
 *   else is JSON-stringified.
 * - Handler throws or rejects: replies with `error` set to the error message.
 *
 * The handler runs asynchronously; this function returns before it settles.
 *
 * @param {(payload: object) => void} send Sends one frame back to the server.
 * @param {Record<string, {handler: Function}>} tools The call's clientTools map.
 * @param {{toolCallId: string, name: string, args: object}} frame Incoming call.
 * @returns {void}
 */
var dispatchClientToolCall = (send, tools, frame) => {
  // Best-effort delivery: a throwing `send` is deliberately ignored so it
  // cannot escape the detached async handler path as an unhandled rejection.
  const safeSend = (payload) => {
    try {
      send(payload);
    } catch {
    }
  };
  const reply = (fields) => safeSend({
    type: "client_tool_result",
    toolCallId: frame.toolCallId,
    ...fields
  });

  // Own-property lookup only: the name comes off the wire, and a plain
  // `tools[name]` would resolve inherited keys such as "toString".
  const tool = Object.prototype.hasOwnProperty.call(tools, frame.name) ? tools[frame.name] : void 0;
  if (!tool) {
    reply({ error: `No handler for ${frame.name}` });
    return;
  }

  void (async () => {
    try {
      const out = await tool.handler(frame.args);
      // JSON.stringify(undefined) is `undefined`, which would drop `result`
      // from the serialised frame; fall back to "null" so a handler that
      // returns nothing still produces a well-formed reply.
      const result = typeof out === "string" ? out : (JSON.stringify(out) ?? "null");
      reply({ result });
    } catch (err) {
      reply({ error: err instanceof Error ? err.message : String(err) });
    }
  })();
};
|
|
593
|
+
|
|
504
594
|
// src/VoiceClient.ts
|
|
505
595
|
var BrowserVoiceClient = class {
|
|
506
596
|
constructor(args) {
|
|
@@ -529,6 +619,10 @@ var BrowserVoiceClient = class {
|
|
|
529
619
|
// ---------------------------------------------------------------
|
|
530
620
|
// Internal
|
|
531
621
|
// ---------------------------------------------------------------
|
|
622
|
+
this.sendClientToolsRegister = () => {
|
|
623
|
+
const frame = buildRegisterFrame(this.args.options.clientTools ?? {});
|
|
624
|
+
this.rws?.send(JSON.stringify(frame));
|
|
625
|
+
};
|
|
532
626
|
this.setState = (next) => {
|
|
533
627
|
if (this.proto.state === next) return;
|
|
534
628
|
this.proto.state = next;
|
|
@@ -556,9 +650,18 @@ var BrowserVoiceClient = class {
|
|
|
556
650
|
onState: this.setState,
|
|
557
651
|
onTranscript: (entries) => this.args.options.onTranscript?.(entries),
|
|
558
652
|
onError: this.emitError,
|
|
559
|
-
onInterrupt: () =>
|
|
560
|
-
|
|
561
|
-
|
|
653
|
+
onInterrupt: () => {
|
|
654
|
+
this.playback?.flush();
|
|
655
|
+
this.args.options.onInterrupt?.();
|
|
656
|
+
},
|
|
657
|
+
onAgentTurnStart: () => this.args.options.onAgentTurnStart?.(),
|
|
658
|
+
onCallEnd: (reason) => this.teardown(reason),
|
|
659
|
+
onConnected: () => this.sendClientToolsRegister(),
|
|
660
|
+
onClientToolCall: (frame) => dispatchClientToolCall(
|
|
661
|
+
(f) => this.rws?.send(JSON.stringify(f)),
|
|
662
|
+
this.args.options.clientTools ?? {},
|
|
663
|
+
frame
|
|
664
|
+
)
|
|
562
665
|
});
|
|
563
666
|
} else {
|
|
564
667
|
this.playback?.enqueue(ev.data);
|
|
@@ -623,6 +726,7 @@ var BrowserVoiceClient = class {
|
|
|
623
726
|
};
|
|
624
727
|
this.args = args;
|
|
625
728
|
this.proto = createProtocolState();
|
|
729
|
+
validateClientToolMap(args.options.clientTools);
|
|
626
730
|
}
|
|
627
731
|
// ---------------------------------------------------------------
|
|
628
732
|
// Call interface
|