@absolutejs/voice 0.0.20 → 0.0.22-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +884 -4
- package/dist/angular/index.d.ts +1 -0
- package/dist/angular/index.js +759 -3
- package/dist/angular/voice-controller.service.d.ts +27 -0
- package/dist/angular/voice-stream.service.d.ts +6 -0
- package/dist/audioConditioning.d.ts +3 -0
- package/dist/client/actions.d.ts +48 -0
- package/dist/client/audioPlayer.d.ts +40 -0
- package/dist/client/connection.d.ts +5 -0
- package/dist/client/controller.d.ts +2 -0
- package/dist/client/duplex.d.ts +3 -0
- package/dist/client/htmxBootstrap.js +660 -167
- package/dist/client/index.d.ts +3 -0
- package/dist/client/index.js +991 -6
- package/dist/client/microphone.d.ts +4 -2
- package/dist/correction.d.ts +33 -0
- package/dist/fileStore.d.ts +27 -0
- package/dist/index.d.ts +15 -0
- package/dist/index.js +3721 -298
- package/dist/ops.d.ts +100 -0
- package/dist/presets.d.ts +13 -0
- package/dist/react/index.d.ts +1 -0
- package/dist/react/index.js +728 -3
- package/dist/react/useVoiceController.d.ts +26 -0
- package/dist/react/useVoiceStream.d.ts +7 -0
- package/dist/routing.d.ts +3 -0
- package/dist/runtimeOps.d.ts +23 -0
- package/dist/store.d.ts +2 -2
- package/dist/svelte/index.d.ts +1 -0
- package/dist/svelte/index.js +691 -3
- package/dist/telephony/response.d.ts +7 -0
- package/dist/telephony/twilio.d.ts +116 -0
- package/dist/testing/benchmark.d.ts +93 -2
- package/dist/testing/corrected.d.ts +41 -0
- package/dist/testing/duplex.d.ts +59 -0
- package/dist/testing/fixtures.d.ts +18 -2
- package/dist/testing/index.d.ts +5 -0
- package/dist/testing/index.js +6247 -402
- package/dist/testing/review.d.ts +143 -0
- package/dist/testing/sessionBenchmark.d.ts +92 -2
- package/dist/testing/stt.d.ts +3 -1
- package/dist/testing/telephony.d.ts +70 -0
- package/dist/testing/tts.d.ts +73 -0
- package/dist/turnDetection.d.ts +5 -1
- package/dist/turnProfiles.d.ts +6 -0
- package/dist/types.d.ts +487 -10
- package/dist/vue/index.d.ts +1 -0
- package/dist/vue/index.js +750 -3
- package/dist/vue/useVoiceController.d.ts +30 -0
- package/dist/vue/useVoiceStream.d.ts +11 -0
- package/fixtures/README.md +9 -0
- package/fixtures/manifest.json +59 -1
- package/fixtures/pcm/dialogue-three-clean.pcm +0 -0
- package/fixtures/pcm/dialogue-three-mixed.pcm +0 -0
- package/fixtures/pcm/dialogue-two-clean.pcm +0 -0
- package/fixtures/pcm/dialogue-two-noisy.pcm +0 -0
- package/package.json +135 -1
package/dist/client/index.js
CHANGED
|
@@ -76,24 +76,30 @@ var WS_NORMAL_CLOSURE = 1000;
|
|
|
76
76
|
var DEFAULT_MAX_RECONNECT_ATTEMPTS = 10;
|
|
77
77
|
var DEFAULT_PING_INTERVAL = 30000;
|
|
78
78
|
var RECONNECT_DELAY_MS = 500;
|
|
79
|
+
var DEFAULT_SCENARIO_QUERY_PARAM = "scenarioId";
|
|
79
80
|
var noop = () => {};
|
|
80
81
|
var noopUnsubscribe = () => noop;
|
|
81
82
|
// Connection-shaped object whose actions do nothing: every method is the
// shared `noop`, `subscribe` hands back a no-op unsubscribe function, the
// ready state is always WS_CLOSED and both identifiers are empty strings.
var NOOP_CONNECTION = {
  // Uses the shared `noop` like the other members instead of a one-off arrow.
  start: noop,
  close: noop,
  endTurn: noop,
  getReadyState: () => WS_CLOSED,
  getScenarioId: () => "",
  getSessionId: () => "",
  send: noop,
  sendAudio: noop,
  subscribe: noopUnsubscribe
};
|
|
90
93
|
var createSessionId = () => crypto.randomUUID();
|
|
91
|
-
/**
 * Builds the WebSocket URL for a voice session on the current page's host.
 *
 * The scheme is "wss:" when the page was loaded over "https:" and "ws:"
 * otherwise. Host and port are taken from `window.location`.
 *
 * @param path Path of the voice endpoint. A missing leading slash is added so
 *   the path can never be fused onto the host or port.
 * @param sessionId Value written to the `sessionId` query parameter.
 * @param scenarioId Optional value written to the scenario query parameter
 *   (DEFAULT_SCENARIO_QUERY_PARAM); omitted when falsy.
 * @returns The serialized URL string.
 */
var buildWsUrl = (path, sessionId, scenarioId) => {
  const { hostname, port, protocol } = window.location;
  const wsProtocol = protocol === "https:" ? "wss:" : "ws:";
  const portSuffix = port ? `:${port}` : "";
  const rawPath = String(path);
  // "voice" would otherwise become "ws://host:3000voice" (invalid port) or
  // "ws://hostvoice" (wrong host).
  const normalizedPath = rawPath.startsWith("/") ? rawPath : `/${rawPath}`;
  const url = new URL(`${wsProtocol}//${hostname}${portSuffix}${normalizedPath}`);
  url.searchParams.set("sessionId", sessionId);
  if (scenarioId) {
    url.searchParams.set(DEFAULT_SCENARIO_QUERY_PARAM, scenarioId);
  }
  return url.toString();
};
|
|
99
105
|
var isVoiceServerMessage = (value) => {
|
|
@@ -101,6 +107,7 @@ var isVoiceServerMessage = (value) => {
|
|
|
101
107
|
return false;
|
|
102
108
|
}
|
|
103
109
|
switch (value.type) {
|
|
110
|
+
case "audio":
|
|
104
111
|
case "assistant":
|
|
105
112
|
case "complete":
|
|
106
113
|
case "error":
|
|
@@ -136,6 +143,7 @@ var createVoiceConnection = (path, options = {}) => {
|
|
|
136
143
|
const state = {
|
|
137
144
|
isConnected: false,
|
|
138
145
|
pendingMessages: [],
|
|
146
|
+
scenarioId: options.scenarioId ?? null,
|
|
139
147
|
pingInterval: null,
|
|
140
148
|
reconnectAttempts: 0,
|
|
141
149
|
reconnectTimeout: null,
|
|
@@ -173,13 +181,14 @@ var createVoiceConnection = (path, options = {}) => {
|
|
|
173
181
|
}, RECONNECT_DELAY_MS);
|
|
174
182
|
};
|
|
175
183
|
const connect = () => {
|
|
176
|
-
const ws = new WebSocket(buildWsUrl(path, state.sessionId));
|
|
184
|
+
const ws = new WebSocket(buildWsUrl(path, state.sessionId, state.scenarioId));
|
|
177
185
|
ws.binaryType = "arraybuffer";
|
|
178
186
|
ws.onopen = () => {
|
|
179
187
|
state.isConnected = true;
|
|
180
188
|
state.reconnectAttempts = 0;
|
|
181
189
|
flushPendingMessages();
|
|
182
190
|
listeners.forEach((listener) => listener({
|
|
191
|
+
scenarioId: state.scenarioId ?? undefined,
|
|
183
192
|
sessionId: state.sessionId,
|
|
184
193
|
status: "active",
|
|
185
194
|
type: "session"
|
|
@@ -197,6 +206,7 @@ var createVoiceConnection = (path, options = {}) => {
|
|
|
197
206
|
}
|
|
198
207
|
if (parsed.type === "session") {
|
|
199
208
|
state.sessionId = parsed.sessionId;
|
|
209
|
+
state.scenarioId = parsed.scenarioId ?? state.scenarioId;
|
|
200
210
|
}
|
|
201
211
|
listeners.forEach((listener) => listener(parsed));
|
|
202
212
|
};
|
|
@@ -220,6 +230,19 @@ var createVoiceConnection = (path, options = {}) => {
|
|
|
220
230
|
const send = (message) => {
|
|
221
231
|
sendSerialized(JSON.stringify(message));
|
|
222
232
|
};
|
|
233
|
+
const start = (input = {}) => {
|
|
234
|
+
if (input.sessionId) {
|
|
235
|
+
state.sessionId = input.sessionId;
|
|
236
|
+
}
|
|
237
|
+
if (input.scenarioId) {
|
|
238
|
+
state.scenarioId = input.scenarioId;
|
|
239
|
+
}
|
|
240
|
+
send({
|
|
241
|
+
type: "start",
|
|
242
|
+
sessionId: state.sessionId,
|
|
243
|
+
scenarioId: state.scenarioId ?? undefined
|
|
244
|
+
});
|
|
245
|
+
};
|
|
223
246
|
const sendAudio = (audio) => {
|
|
224
247
|
sendSerialized(audio);
|
|
225
248
|
};
|
|
@@ -243,15 +266,363 @@ var createVoiceConnection = (path, options = {}) => {
|
|
|
243
266
|
};
|
|
244
267
|
connect();
|
|
245
268
|
return {
|
|
269
|
+
start,
|
|
246
270
|
close,
|
|
247
271
|
endTurn,
|
|
248
272
|
getReadyState: () => state.ws?.readyState ?? WS_CLOSED,
|
|
273
|
+
getScenarioId: () => state.scenarioId ?? "",
|
|
249
274
|
getSessionId: () => state.sessionId,
|
|
250
275
|
send,
|
|
251
276
|
sendAudio,
|
|
252
277
|
subscribe
|
|
253
278
|
};
|
|
254
279
|
};
|
|
280
|
+
// src/client/audioPlayer.ts
|
|
281
|
+
var DEFAULT_LOOKAHEAD_MS = 15;
|
|
282
|
+
// Builds a new audio-player snapshot: all counters at zero, both flags false,
// no error, and both latency measurements still undefined.
var createInitialState = () => {
  const snapshot = {
    activeSourceCount: 0,
    error: null,
    isActive: false,
    isPlaying: false,
    lastInterruptLatencyMs: undefined,
    lastPlaybackStopLatencyMs: undefined,
    processedChunkCount: 0,
    queuedChunkCount: 0
  };
  return snapshot;
};
|
|
292
|
+
// Looks up an AudioContext constructor. With a `window` global it returns
// window.AudioContext, falling back to window.webkitAudioContext; without one
// it returns the global AudioContext if defined, otherwise undefined.
var getAudioContextCtor = () => {
  const hasWindow = typeof window !== "undefined";
  if (hasWindow) {
    return window.AudioContext ?? window.webkitAudioContext;
  }
  if (typeof AudioContext !== "undefined") {
    return AudioContext;
  }
  return undefined;
};
|
|
298
|
+
/**
 * Decodes one raw, interleaved, signed 16-bit little-endian PCM chunk into a
 * buffer created by `audioContext.createBuffer`.
 *
 * @param audioContext Object providing `createBuffer(channels, frames, rate)`.
 * @param chunk Object with `chunk` (the PCM bytes, as a Uint8Array or an
 *   ArrayBuffer) and `format` (`container`, `encoding`, `channels`,
 *   `sampleRateHz`).
 * @returns The buffer returned by `createBuffer`, with every channel filled
 *   with samples scaled by 1/32768.
 * @throws Error when the format is not raw/pcm_s16le.
 */
var decodePCM16LEChunk = (audioContext, chunk) => {
  const format = chunk.format;
  if (format.container !== "raw" || format.encoding !== "pcm_s16le") {
    throw new Error(`Unsupported assistant audio format: ${format.container}/${format.encoding}`);
  }
  const payload = chunk.chunk;
  // An ArrayBuffer has no `.buffer`/`.byteOffset`, so wrap it before building the view.
  const bytes = payload instanceof ArrayBuffer ? new Uint8Array(payload) : payload;
  // A missing or fractional channel count would give NaN or a channel index
  // the buffer does not have; fall back to whole channels, minimum one.
  const declaredChannels = Math.floor(Number(format.channels));
  const channels = Number.isFinite(declaredChannels) && declaredChannels >= 1 ? declaredChannels : 1;
  const sampleCount = Math.floor(bytes.byteLength / 2);
  // Always allocate at least one frame, even for an empty payload.
  const frameCount = Math.max(1, Math.floor(sampleCount / channels));
  const audioBuffer = audioContext.createBuffer(channels, frameCount, format.sampleRateHz);
  const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
  for (let channelIndex = 0; channelIndex < channels; channelIndex += 1) {
    const channelData = audioBuffer.getChannelData(channelIndex);
    for (let frameIndex = 0; frameIndex < frameCount; frameIndex += 1) {
      // Samples are interleaved: frame N holds one sample per channel.
      const sampleOffset = (frameIndex * channels + channelIndex) * 2;
      if (sampleOffset + 1 >= bytes.byteLength) {
        // The payload ended before this sample; fill with silence.
        channelData[frameIndex] = 0;
        continue;
      }
      channelData[frameIndex] = view.getInt16(sampleOffset, true) / 32768;
    }
  }
  return audioBuffer;
};
|
|
323
|
+
/**
 * Creates a player that decodes the chunks in `source.assistantAudio`,
 * schedules them back-to-back on an audio context and reports its state to
 * subscribers.
 *
 * @param source Object exposing `assistantAudio` (array of chunks accepted by
 *   decodePCM16LEChunk) and `subscribe(listener)`, which returns an
 *   unsubscribe function.
 * @param options Optional settings read by this function:
 *   - `lookaheadMs`: delay before a chunk may start (default DEFAULT_LOOKAHEAD_MS);
 *   - `createAudioContext`: factory used instead of the detected constructor;
 *   - `autoStart`: start playback when the source reports audio while inactive.
 * @returns The player: `start`, `pause`, `interrupt`, `close`, `subscribe`,
 *   `getSnapshot` and read-only getters mirroring the snapshot fields.
 */
var createVoiceAudioPlayer = (source, options = {}) => {
  // How long interrupt() waits for `onended` events before force-clearing nodes.
  const INTERRUPT_FALLBACK_MS = 250;
  const subscribers = new Set();
  const sourceNodes = new Set();
  const lookaheadSeconds = (options.lookaheadMs ?? DEFAULT_LOOKAHEAD_MS) / 1000;
  let state = createInitialState();
  let audioContext = null;
  let outputNode = null;
  // Context time at which the last scheduled chunk finishes.
  let queueEndTime = 0;
  // Tail of the serialized chain of sync() runs.
  let syncPromise = Promise.resolve();
  let interruptStartedAt = null;
  let interruptPromise = null;
  let resolveInterruptPromise = null;
  let interruptFallbackTimer = null;

  const notify = () => {
    for (const subscriber of subscribers) {
      subscriber();
    }
  };

  // Replaces the snapshot with a merged copy and notifies every subscriber.
  const setState = (next) => {
    state = {
      ...state,
      ...next
    };
    notify();
  };

  const clearError = () => {
    if (state.error !== null) {
      setState({ error: null });
    }
  };

  const clearInterruptTimer = () => {
    if (interruptFallbackTimer !== null) {
      clearTimeout(interruptFallbackTimer);
      interruptFallbackTimer = null;
    }
  };

  // Finishes a pending interrupt: records the latency, marks playback as
  // stopped and settles the promise interrupt() is awaiting.
  const resolveInterrupt = (latencyMs) => {
    clearInterruptTimer();
    interruptStartedAt = null;
    setState({
      activeSourceCount: sourceNodes.size,
      isPlaying: false,
      lastInterruptLatencyMs: latencyMs,
      lastPlaybackStopLatencyMs: state.lastPlaybackStopLatencyMs ?? latencyMs
    });
    resolveInterruptPromise?.();
    resolveInterruptPromise = null;
    interruptPromise = null;
  };

  // Sum of the context's baseLatency and outputLatency, converted from
  // seconds to milliseconds; 0 without a context.
  const estimateOutputStopLatencyMs = (context) => {
    if (!context) {
      return 0;
    }
    return Math.max(0, ((context.baseLatency ?? 0) + (context.outputLatency ?? 0)) * 1000);
  };

  // Sets the output gain, through setValueAtTime when the param provides it.
  const setOutputGain = (context, gainValue) => {
    if (!outputNode) {
      return;
    }
    if (outputNode.gain.setValueAtTime) {
      outputNode.gain.setValueAtTime(gainValue, context?.currentTime ?? 0);
      return;
    }
    outputNode.gain.value = gainValue;
  };
  const restoreOutputGain = (context) => setOutputGain(context, 1);
  const muteOutputGain = (context) => setOutputGain(context, 0);

  // Resolves a pending interrupt once no source node remains.
  const maybeResolveInterrupt = () => {
    if (interruptStartedAt === null || sourceNodes.size > 0) {
      return;
    }
    resolveInterrupt(Date.now() - interruptStartedAt);
  };

  // Creates the audio context (and a gain node, when supported) on first use.
  const ensureAudioContext = async () => {
    if (audioContext) {
      return audioContext;
    }
    if (options.createAudioContext) {
      audioContext = options.createAudioContext();
    } else {
      const AudioContextCtor = getAudioContextCtor();
      if (!AudioContextCtor) {
        throw new Error("Assistant audio playback requires AudioContext support.");
      }
      audioContext = new AudioContextCtor();
    }
    if (audioContext.createGain) {
      outputNode = audioContext.createGain();
      outputNode.connect?.(audioContext.destination);
    }
    queueEndTime = audioContext.currentTime;
    return audioContext;
  };

  // Decodes one chunk and schedules it right after the previously queued one.
  const scheduleChunk = async (chunk) => {
    const context = await ensureAudioContext();
    const buffer = decodePCM16LEChunk(context, chunk);
    const node = context.createBufferSource();
    node.buffer = buffer;
    node.connect(outputNode ?? context.destination);
    node.onended = () => {
      sourceNodes.delete(node);
      node.disconnect?.();
      setState({
        activeSourceCount: sourceNodes.size,
        isPlaying: sourceNodes.size > 0 && state.isActive
      });
      maybeResolveInterrupt();
    };
    // Never start earlier than the lookahead, nor before the queue's end.
    const startAt = Math.max(context.currentTime + lookaheadSeconds, queueEndTime);
    queueEndTime = startAt + buffer.duration;
    sourceNodes.add(node);
    setState({
      activeSourceCount: sourceNodes.size,
      isPlaying: true
    });
    node.start(startAt);
  };

  // Stops every scheduled node. With `forceClear` the nodes are also
  // disconnected and forgotten immediately instead of waiting for `onended`.
  const stopQueuedPlayback = (options2) => {
    // Iterate over a copy: `onended` handlers remove nodes from the set.
    for (const node of [...sourceNodes]) {
      node.stop?.();
    }
    queueEndTime = audioContext ? audioContext.currentTime : 0;
    if (options2?.forceClear) {
      for (const node of sourceNodes) {
        node.disconnect?.();
      }
      sourceNodes.clear();
      maybeResolveInterrupt();
    }
  };

  // Schedules every chunk of the source that has not been processed yet.
  const sync = async () => {
    if (!state.isActive) {
      return;
    }
    const firstPendingIndex = state.processedChunkCount;
    const nextChunks = source.assistantAudio.slice(firstPendingIndex);
    if (nextChunks.length === 0) {
      return;
    }
    // Counts only the chunks that were really scheduled, so a failure midway
    // neither replays earlier chunks on the next sync nor marks unscheduled
    // chunks as processed.
    let scheduledCount = 0;
    try {
      clearError();
      for (const chunk of nextChunks) {
        await scheduleChunk(chunk);
        scheduledCount += 1;
      }
      setState({
        processedChunkCount: firstPendingIndex + scheduledCount,
        queuedChunkCount: state.queuedChunkCount + scheduledCount
      });
    } catch (error) {
      // The failing chunk stays unprocessed and is retried by the next sync.
      setState({
        error: error instanceof Error ? error.message : String(error),
        processedChunkCount: firstPendingIndex + scheduledCount,
        queuedChunkCount: state.queuedChunkCount + scheduledCount
      });
    }
  };

  // Chains sync() runs so they never overlap.
  const queueSync = () => {
    syncPromise = syncPromise.then(() => sync(), () => sync());
    return syncPromise;
  };

  const player = {
    close: async () => {
      unsubscribeSource();
      stopQueuedPlayback({ forceClear: true });
      clearInterruptTimer();
      resolveInterruptPromise?.();
      resolveInterruptPromise = null;
      interruptPromise = null;
      interruptStartedAt = null;
      if (audioContext && audioContext.state !== "closed") {
        await audioContext.close();
      }
      audioContext = null;
      outputNode?.disconnect?.();
      outputNode = null;
      queueEndTime = 0;
      setState({
        activeSourceCount: 0,
        isActive: false,
        isPlaying: false
      });
    },
    get activeSourceCount() {
      return state.activeSourceCount;
    },
    get error() {
      return state.error;
    },
    getSnapshot: () => state,
    get isActive() {
      return state.isActive;
    },
    get isPlaying() {
      return state.isPlaying;
    },
    // Mutes the output, stops all scheduled nodes and resolves once they have
    // ended or the fallback timer has force-cleared them.
    interrupt: async () => {
      const startedAt = Date.now();
      const context = await ensureAudioContext();
      interruptStartedAt = startedAt;
      muteOutputGain(context);
      const playbackStopLatencyMs = Date.now() - startedAt + estimateOutputStopLatencyMs(context);
      setState({
        isActive: false,
        isPlaying: sourceNodes.size > 0,
        lastPlaybackStopLatencyMs: playbackStopLatencyMs
      });
      if (sourceNodes.size === 0) {
        resolveInterrupt(playbackStopLatencyMs);
        return;
      }
      if (!interruptPromise) {
        interruptPromise = new Promise((resolve) => {
          resolveInterruptPromise = resolve;
        });
      }
      clearInterruptTimer();
      // Fallback for nodes whose `onended` does not arrive in time.
      interruptFallbackTimer = setTimeout(() => {
        for (const node of sourceNodes) {
          node.disconnect?.();
        }
        sourceNodes.clear();
        resolveInterrupt(Date.now() - startedAt);
      }, INTERRUPT_FALLBACK_MS);
      stopQueuedPlayback();
      await interruptPromise;
    },
    get lastInterruptLatencyMs() {
      return state.lastInterruptLatencyMs;
    },
    get lastPlaybackStopLatencyMs() {
      return state.lastPlaybackStopLatencyMs;
    },
    // Suspends the audio context (if any) and marks the player inactive.
    pause: async () => {
      if (!audioContext) {
        setState({
          activeSourceCount: 0,
          isActive: false,
          isPlaying: false
        });
        return;
      }
      await audioContext.suspend();
      setState({
        activeSourceCount: sourceNodes.size,
        isActive: false,
        isPlaying: false
      });
    },
    get processedChunkCount() {
      return state.processedChunkCount;
    },
    get queuedChunkCount() {
      return state.queuedChunkCount;
    },
    // Activates the player, restores the gain, resumes a suspended context and
    // schedules pending chunks. Failures are stored in `error` and rethrown.
    start: async () => {
      try {
        clearError();
        const context = await ensureAudioContext();
        restoreOutputGain(context);
        if (context.state === "suspended") {
          await context.resume();
        }
        setState({
          activeSourceCount: sourceNodes.size,
          isActive: true,
          isPlaying: context.state === "running"
        });
        await queueSync();
      } catch (error) {
        setState({
          error: error instanceof Error ? error.message : String(error),
          isActive: false,
          isPlaying: false
        });
        throw error;
      }
    },
    subscribe: (subscriber) => {
      subscribers.add(subscriber);
      return () => {
        subscribers.delete(subscriber);
      };
    }
  };

  // Subscribe only after `player` exists, so a source that invokes the
  // listener synchronously cannot reach `player` before it is initialized.
  const unsubscribeSource = source.subscribe(() => {
    if (options.autoStart && !state.isActive && source.assistantAudio.length > 0) {
      // start() already stores the failure in `state.error`; swallowing the
      // rejection here only prevents an unhandled promise rejection.
      player.start().catch(() => {});
      return;
    }
    if (state.isActive) {
      queueSync();
    }
  });

  return player;
};
|
|
625
|
+
// Thin wrapper: forwards both arguments to decodePCM16LEChunk and returns its result.
var decodeVoiceAudioChunk = (audioContext, chunk) => {
  const decoded = decodePCM16LEChunk(audioContext, chunk);
  return decoded;
};
|
|
255
626
|
// src/client/actions.ts
|
|
256
627
|
var normalizeErrorMessage = (value) => {
|
|
257
628
|
if (typeof value === "string" && value.trim()) {
|
|
@@ -282,6 +653,14 @@ var normalizeErrorMessage = (value) => {
|
|
|
282
653
|
};
|
|
283
654
|
var serverMessageToAction = (message) => {
|
|
284
655
|
switch (message.type) {
|
|
656
|
+
case "audio":
|
|
657
|
+
return {
|
|
658
|
+
chunk: Uint8Array.from(atob(message.chunkBase64), (char) => char.charCodeAt(0)),
|
|
659
|
+
format: message.format,
|
|
660
|
+
receivedAt: message.receivedAt,
|
|
661
|
+
turnId: message.turnId,
|
|
662
|
+
type: "audio"
|
|
663
|
+
};
|
|
285
664
|
case "assistant":
|
|
286
665
|
return {
|
|
287
666
|
text: message.text,
|
|
@@ -310,6 +689,7 @@ var serverMessageToAction = (message) => {
|
|
|
310
689
|
case "session":
|
|
311
690
|
return {
|
|
312
691
|
sessionId: message.sessionId,
|
|
692
|
+
scenarioId: message.scenarioId,
|
|
313
693
|
status: message.status,
|
|
314
694
|
type: "session"
|
|
315
695
|
};
|
|
@@ -324,23 +704,39 @@ var serverMessageToAction = (message) => {
|
|
|
324
704
|
};
|
|
325
705
|
|
|
326
706
|
// src/client/store.ts
|
|
327
|
-
var
|
|
707
|
+
// Builds a new voice-stream store snapshot: empty audio, text and turn lists,
// no error, disconnected, no scenario or session id, empty partial, status "idle".
var createInitialState2 = () => {
  const snapshot = {
    assistantAudio: [],
    assistantTexts: [],
    error: null,
    isConnected: false,
    scenarioId: null,
    partial: "",
    sessionId: null,
    status: "idle",
    turns: []
  };
  return snapshot;
};
|
|
336
718
|
var createVoiceStreamStore = () => {
|
|
337
|
-
let state =
|
|
719
|
+
let state = createInitialState2();
|
|
338
720
|
const subscribers = new Set;
|
|
339
721
|
const notify = () => {
|
|
340
722
|
subscribers.forEach((subscriber) => subscriber());
|
|
341
723
|
};
|
|
342
724
|
const dispatch = (action) => {
|
|
343
725
|
switch (action.type) {
|
|
726
|
+
case "audio":
|
|
727
|
+
state = {
|
|
728
|
+
...state,
|
|
729
|
+
assistantAudio: [
|
|
730
|
+
...state.assistantAudio,
|
|
731
|
+
{
|
|
732
|
+
chunk: action.chunk,
|
|
733
|
+
format: action.format,
|
|
734
|
+
receivedAt: action.receivedAt,
|
|
735
|
+
turnId: action.turnId
|
|
736
|
+
}
|
|
737
|
+
]
|
|
738
|
+
};
|
|
739
|
+
break;
|
|
344
740
|
case "assistant":
|
|
345
741
|
state = {
|
|
346
742
|
...state,
|
|
@@ -389,6 +785,7 @@ var createVoiceStreamStore = () => {
|
|
|
389
785
|
state = {
|
|
390
786
|
...state,
|
|
391
787
|
error: null,
|
|
788
|
+
scenarioId: action.scenarioId ?? state.scenarioId,
|
|
392
789
|
isConnected: action.status === "active",
|
|
393
790
|
sessionId: action.sessionId,
|
|
394
791
|
status: action.status
|
|
@@ -422,6 +819,12 @@ var createVoiceStream = (path, options = {}) => {
|
|
|
422
819
|
const connection = createVoiceConnection(path, options);
|
|
423
820
|
const store = createVoiceStreamStore();
|
|
424
821
|
const subscribers = new Set;
|
|
822
|
+
const start = (input) => Promise.resolve().then(() => {
|
|
823
|
+
if (!input?.sessionId && !input?.scenarioId) {
|
|
824
|
+
return;
|
|
825
|
+
}
|
|
826
|
+
connection.start(input);
|
|
827
|
+
});
|
|
425
828
|
const notify = () => {
|
|
426
829
|
subscribers.forEach((subscriber) => subscriber());
|
|
427
830
|
};
|
|
@@ -454,6 +857,10 @@ var createVoiceStream = (path, options = {}) => {
|
|
|
454
857
|
get isConnected() {
|
|
455
858
|
return store.getSnapshot().isConnected;
|
|
456
859
|
},
|
|
860
|
+
get scenarioId() {
|
|
861
|
+
return store.getSnapshot().scenarioId;
|
|
862
|
+
},
|
|
863
|
+
start,
|
|
457
864
|
get partial() {
|
|
458
865
|
return store.getSnapshot().partial;
|
|
459
866
|
},
|
|
@@ -469,6 +876,9 @@ var createVoiceStream = (path, options = {}) => {
|
|
|
469
876
|
get assistantTexts() {
|
|
470
877
|
return store.getSnapshot().assistantTexts;
|
|
471
878
|
},
|
|
879
|
+
get assistantAudio() {
|
|
880
|
+
return store.getSnapshot().assistantAudio;
|
|
881
|
+
},
|
|
472
882
|
sendAudio(audio) {
|
|
473
883
|
connection.sendAudio(audio);
|
|
474
884
|
},
|
|
@@ -527,6 +937,7 @@ var bindVoiceHTMX = (stream, options) => {
|
|
|
527
937
|
unsubscribe();
|
|
528
938
|
};
|
|
529
939
|
};
|
|
940
|
+
|
|
530
941
|
// src/client/microphone.ts
|
|
531
942
|
// Limits a sample to the closed range [-1, 1].
var clampSample = (value) => {
  const atLeastMinusOne = Math.max(-1, value);
  return Math.min(1, atLeastMinusOne);
};
|
|
532
943
|
var floatTo16BitPCM = (input) => {
|
|
@@ -537,6 +948,22 @@ var floatTo16BitPCM = (input) => {
|
|
|
537
948
|
}
|
|
538
949
|
return new Uint8Array(output.buffer);
|
|
539
950
|
};
|
|
951
|
+
/**
 * Computes a level in [0, 1] for a chunk of signed 16-bit little-endian PCM:
 * the RMS of the samples (each scaled by 1/32768) multiplied by 5.5 and
 * clamped to the unit range.
 *
 * @param audio PCM bytes as a Uint8Array, or any value accepted by
 *   `new Uint8Array(...)` such as an ArrayBuffer.
 * @returns The level, or 0 when fewer than two bytes are available.
 */
var getPcmLevel = (audio) => {
  const bytes = audio instanceof Uint8Array ? audio : new Uint8Array(audio);
  // A trailing odd byte cannot form a sample and is ignored.
  const sampleCount = Math.floor(bytes.byteLength / 2);
  if (sampleCount === 0) {
    return 0;
  }
  // A DataView works at any byteOffset and always reads little-endian; an
  // Int16Array over the same bytes throws RangeError on an odd offset and
  // reads in platform byte order.
  const view = new DataView(bytes.buffer, bytes.byteOffset, sampleCount * 2);
  let sumSquares = 0;
  for (let index = 0; index < sampleCount; index += 1) {
    const normalized = view.getInt16(index * 2, true) / 32768;
    sumSquares += normalized * normalized;
  }
  return Math.min(1, Math.max(0, Math.sqrt(sumSquares / sampleCount) * 5.5));
};
|
|
540
967
|
var downsampleBuffer = (input, sourceRate, targetRate) => {
|
|
541
968
|
if (sourceRate === targetRate) {
|
|
542
969
|
return input;
|
|
@@ -584,7 +1011,9 @@ var createMicrophoneCapture = (options) => {
|
|
|
584
1011
|
processorNode.onaudioprocess = (event) => {
|
|
585
1012
|
const channel = event.inputBuffer.getChannelData(0);
|
|
586
1013
|
const downsampled = downsampleBuffer(channel, audioContext?.sampleRate ?? 48000, options.sampleRateHz ?? 16000);
|
|
587
|
-
|
|
1014
|
+
const pcm = floatTo16BitPCM(downsampled);
|
|
1015
|
+
options.onLevel?.(getPcmLevel(pcm));
|
|
1016
|
+
options.onAudio(pcm);
|
|
588
1017
|
};
|
|
589
1018
|
sourceNode.connect(processorNode);
|
|
590
1019
|
processorNode.connect(audioContext.destination);
|
|
@@ -594,6 +1023,7 @@ var createMicrophoneCapture = (options) => {
|
|
|
594
1023
|
sourceNode?.disconnect();
|
|
595
1024
|
mediaStream?.getTracks().forEach((track) => track.stop());
|
|
596
1025
|
audioContext?.close();
|
|
1026
|
+
options.onLevel?.(0);
|
|
597
1027
|
audioContext = null;
|
|
598
1028
|
mediaStream = null;
|
|
599
1029
|
processorNode = null;
|
|
@@ -601,9 +1031,564 @@ var createMicrophoneCapture = (options) => {
|
|
|
601
1031
|
};
|
|
602
1032
|
return { start, stop };
|
|
603
1033
|
};
|
|
1034
|
+
|
|
1035
|
+
// src/audioConditioning.ts
|
|
1036
|
+
var DEFAULT_TARGET_LEVEL = 0.08;
|
|
1037
|
+
var DEFAULT_MAX_GAIN = 3;
|
|
1038
|
+
var DEFAULT_NOISE_GATE_THRESHOLD = 0.006;
|
|
1039
|
+
var DEFAULT_NOISE_GATE_ATTENUATION = 0.15;
|
|
1040
|
+
/**
 * Exposes PCM bytes as 16-bit signed samples.
 *
 * @param audio An ArrayBuffer, or a typed-array view with `buffer`,
 *   `byteOffset` and `byteLength`.
 * @returns An Int16Array over the same memory when the data starts on an even
 *   byte offset, otherwise an Int16Array over a copy of the bytes. A trailing
 *   odd byte is ignored. Samples are read in platform byte order.
 */
var toInt16Array = (audio) => {
  if (audio instanceof ArrayBuffer) {
    return new Int16Array(audio, 0, Math.floor(audio.byteLength / 2));
  }
  const sampleCount = Math.floor(audio.byteLength / 2);
  if (audio.byteOffset % 2 === 0) {
    return new Int16Array(audio.buffer, audio.byteOffset, sampleCount);
  }
  // Int16Array requires an even byteOffset and throws RangeError otherwise,
  // so copy the bytes of an unaligned view into a fresh buffer first.
  const alignedCopy = audio.buffer.slice(audio.byteOffset, audio.byteOffset + sampleCount * 2);
  return new Int16Array(alignedCopy);
};
|
|
1046
|
+
// Root-mean-square of the samples after scaling each one by 1/32768.
// Returns 0 for an empty collection.
var computeRms = (samples) => {
  const count = samples.length;
  if (count === 0) {
    return 0;
  }
  let total = 0;
  for (const value of samples) {
    const scaled = value / 32768;
    total += scaled * scaled;
  }
  const meanSquare = total / count;
  return Math.sqrt(meanSquare);
};
|
|
1057
|
+
// Expands an audio-conditioning config into a complete one.
// Returns undefined when `config` is falsy or has `enabled: false`; otherwise
// every null/undefined field is filled from the matching DEFAULT_* constant
// and `enabled` is set to true.
var resolveAudioConditioningConfig = (config) => {
  const isDisabled = !config || config.enabled === false;
  if (isDisabled) {
    return undefined;
  }
  const maxGain = config.maxGain ?? DEFAULT_MAX_GAIN;
  const noiseGateAttenuation = config.noiseGateAttenuation ?? DEFAULT_NOISE_GATE_ATTENUATION;
  const noiseGateThreshold = config.noiseGateThreshold ?? DEFAULT_NOISE_GATE_THRESHOLD;
  const targetLevel = config.targetLevel ?? DEFAULT_TARGET_LEVEL;
  return {
    enabled: true,
    maxGain,
    noiseGateAttenuation,
    noiseGateThreshold,
    targetLevel
  };
};
|
|
1069
|
+
// Applies a noise gate and level normalisation to one chunk of 16-bit PCM.
// Returns `audio` untouched when no config is given or the chunk holds no
// complete sample; otherwise returns a new Uint8Array over the scaled samples.
var conditionAudioChunk = (audio, config) => {
  if (!config) {
    return audio;
  }
  const samples = toInt16Array(audio);
  if (samples.length === 0) {
    return audio;
  }
  const level = computeRms(samples);
  // Chunks quieter than the gate threshold are attenuated rather than boosted.
  const isBelowGate = level < config.noiseGateThreshold;
  const gateFactor = isBelowGate ? config.noiseGateAttenuation : 1;
  // The floor keeps the division below finite for silent chunks.
  const referenceLevel = Math.max(level * gateFactor, 0.000001);
  const cappedGain = Math.min(config.maxGain, config.targetLevel / referenceLevel);
  const appliedGain = Math.max(0.25, cappedGain) * gateFactor;
  const conditioned = new Int16Array(samples.length);
  samples.forEach((sample, position) => {
    const scaled = Math.round(sample * appliedGain);
    // Clamp to the signed 16-bit range before storing.
    conditioned[position] = Math.max(-32768, Math.min(32767, scaled));
  });
  return new Uint8Array(conditioned.buffer);
};
|
|
1089
|
+
|
|
1090
|
+
// src/turnProfiles.ts
|
|
1091
|
+
// Timing and threshold presets keyed by turn profile name.
var TURN_PROFILE_DEFAULTS = {
  balanced: {
    qualityProfile: "general",
    silenceMs: 1400,
    speechThreshold: 0.012,
    transcriptStabilityMs: 1000
  },
  fast: {
    qualityProfile: "general",
    silenceMs: 700,
    speechThreshold: 0.015,
    transcriptStabilityMs: 450
  },
  "long-form": {
    qualityProfile: "general",
    silenceMs: 2200,
    speechThreshold: 0.01,
    transcriptStabilityMs: 1500
  }
};

// Per-quality-profile overrides; "general" overrides nothing.
var QUALITY_PROFILE_DEFAULTS = {
  general: {},
  "accent-heavy": {
    silenceMs: 1200,
    speechThreshold: 0.01,
    transcriptStabilityMs: 1200
  },
  "noisy-room": {
    silenceMs: 2000,
    speechThreshold: 0.02,
    transcriptStabilityMs: 1600
  },
  "short-command": {
    silenceMs: 500,
    speechThreshold: 0.016,
    transcriptStabilityMs: 420
  }
};
var DEFAULT_TURN_PROFILE = "fast";
var DEFAULT_QUALITY_PROFILE = "general";

/**
 * Resolves a complete turn-detection config.
 *
 * Each of `silenceMs`, `speechThreshold` and `transcriptStabilityMs` is taken
 * from the first source that defines it: the explicit `config` value, the
 * quality profile's override, then the turn profile's preset.
 *
 * @param config Optional object with `profile`, `qualityProfile` and any of
 *   the three numeric fields.
 * @returns Object with `profile`, `qualityProfile`, `silenceMs`,
 *   `speechThreshold` and `transcriptStabilityMs`. A profile name that is not
 *   an own key of its table is replaced by the default name, so the lookup can
 *   never yield `undefined` or an inherited property such as "constructor".
 */
var resolveTurnDetectionConfig = (config) => {
  const isOwnKey = (table, key) => Object.prototype.hasOwnProperty.call(table, key);
  const requestedProfile = config?.profile ?? DEFAULT_TURN_PROFILE;
  const requestedQuality = config?.qualityProfile ?? DEFAULT_QUALITY_PROFILE;
  const profile = isOwnKey(TURN_PROFILE_DEFAULTS, requestedProfile) ? requestedProfile : DEFAULT_TURN_PROFILE;
  const qualityProfile = isOwnKey(QUALITY_PROFILE_DEFAULTS, requestedQuality)
    ? requestedQuality
    : DEFAULT_QUALITY_PROFILE;
  const preset = TURN_PROFILE_DEFAULTS[profile];
  const quality = QUALITY_PROFILE_DEFAULTS[qualityProfile];
  return {
    profile,
    qualityProfile,
    silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
    speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
    transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
  };
};
|
|
1144
|
+
|
|
1145
|
+
// src/presets.ts
|
|
1146
|
+
/**
 * Raw (unresolved) inputs for each named voice runtime preset.
 *
 * Every preset gets its own freshly built nested objects, so mutating one
 * preset's settings never leaks into another. The `default` preset carries no
 * `audioConditioning` entry.
 */
var PRESET_INPUTS = (() => {
  // Enabled audio-conditioning settings.
  const conditioning = (maxGain, noiseGateAttenuation, noiseGateThreshold, targetLevel) => ({
    enabled: true,
    maxGain,
    noiseGateAttenuation,
    noiseGateThreshold,
    targetLevel
  });
  // All presets capture a single channel at 16000 Hz.
  const monoCapture = () => ({
    channelCount: 1,
    sampleRateHz: 16000
  });
  // Connection settings; every preset has reconnect switched on.
  const reconnecting = (maxReconnectAttempts, pingInterval) => ({
    maxReconnectAttempts,
    pingInterval,
    reconnect: true
  });
  // Turn-detection input, optionally with explicit numeric overrides.
  const turns = (qualityProfile, profile, overrides) => ({
    qualityProfile,
    profile,
    ...overrides
  });

  return {
    chat: {
      audioConditioning: conditioning(2.5, 0, 0.004, 0.08),
      capture: monoCapture(),
      connection: reconnecting(10, 30000),
      sttLifecycle: "continuous",
      turnDetection: turns("short-command", "balanced")
    },
    default: {
      capture: monoCapture(),
      connection: reconnecting(10, 30000),
      sttLifecycle: "continuous",
      turnDetection: turns("general", "fast")
    },
    dictation: {
      audioConditioning: conditioning(2.25, 0.05, 0.003, 0.08),
      capture: monoCapture(),
      connection: reconnecting(12, 30000),
      sttLifecycle: "continuous",
      turnDetection: turns("accent-heavy", "long-form")
    },
    "guided-intake": {
      audioConditioning: conditioning(2.5, 0, 0.004, 0.08),
      capture: monoCapture(),
      connection: reconnecting(12, 30000),
      sttLifecycle: "turn-scoped",
      turnDetection: turns("accent-heavy", "long-form")
    },
    "noisy-room": {
      audioConditioning: conditioning(3, 0.12, 0.006, 0.085),
      capture: monoCapture(),
      connection: reconnecting(14, 45000),
      sttLifecycle: "continuous",
      turnDetection: turns("noisy-room", "long-form", {
        silenceMs: 2100,
        speechThreshold: 0.02,
        transcriptStabilityMs: 1650
      })
    },
    "pstn-balanced": {
      audioConditioning: conditioning(2.8, 0.07, 0.005, 0.08),
      capture: monoCapture(),
      connection: reconnecting(14, 45000),
      sttLifecycle: "continuous",
      turnDetection: turns("noisy-room", "long-form", {
        silenceMs: 660,
        speechThreshold: 0.012,
        transcriptStabilityMs: 300
      })
    },
    "pstn-fast": {
      audioConditioning: conditioning(2.75, 0.06, 0.005, 0.08),
      capture: monoCapture(),
      connection: reconnecting(14, 45000),
      sttLifecycle: "continuous",
      turnDetection: turns("noisy-room", "long-form", {
        silenceMs: 620,
        speechThreshold: 0.012,
        transcriptStabilityMs: 280
      })
    },
    reliability: {
      audioConditioning: conditioning(2.9, 0.08, 0.005, 0.08),
      capture: monoCapture(),
      connection: reconnecting(14, 45000),
      sttLifecycle: "continuous",
      turnDetection: turns("noisy-room", "long-form")
    }
  };
})();
|
|
1334
|
+
/**
 * Resolves a named runtime preset into a fully populated configuration.
 *
 * The raw entry from `PRESET_INPUTS` is passed through
 * `resolveAudioConditioningConfig` and `resolveTurnDetectionConfig`; capture
 * falls back to 1 channel at 16000 Hz and `sttLifecycle` to "continuous" when
 * the preset omits them. The connection settings are shallow-copied so callers
 * cannot mutate the preset table through the result.
 *
 * @param {string|null} [name="default"] Preset name. `undefined` and `null`
 *   both select the "default" preset.
 * @returns {{audioConditioning: *, capture: {channelCount: number,
 *   sampleRateHz: number}, connection: object, name: string,
 *   sttLifecycle: string, turnDetection: *}}
 * @throws {TypeError} When `name` is not an own key of `PRESET_INPUTS`.
 */
var resolveVoiceRuntimePreset = (name = "default") => {
  // A default parameter only covers `undefined`; treat `null` the same way.
  const presetName = name ?? "default";

  // Own-key lookup: inherited names such as "toString" are not presets, and an
  // unknown name should fail with a message that says what went wrong.
  if (!Object.prototype.hasOwnProperty.call(PRESET_INPUTS, presetName)) {
    throw new TypeError(
      `Unknown voice runtime preset "${String(presetName)}". Expected one of: ${Object.keys(PRESET_INPUTS).join(", ")}`
    );
  }

  const preset = PRESET_INPUTS[presetName];
  return {
    audioConditioning: resolveAudioConditioningConfig(preset.audioConditioning),
    capture: {
      channelCount: preset.capture?.channelCount ?? 1,
      sampleRateHz: preset.capture?.sampleRateHz ?? 16000
    },
    connection: {
      ...preset.connection
    },
    name: presetName,
    sttLifecycle: preset.sttLifecycle ?? "continuous",
    turnDetection: resolveTurnDetectionConfig(preset.turnDetection)
  };
};
|
|
1350
|
+
|
|
1351
|
+
// src/client/controller.ts
|
|
1352
|
+
/**
 * Builds the controller's first state snapshot from a voice stream.
 *
 * Array fields are copied so the snapshot does not alias the stream's own
 * arrays. Recording always starts out inactive with no recording error.
 *
 * @param {object} stream Voice stream exposing the fields read below.
 * @returns {object} A new state object.
 */
var createInitialState3 = (stream) => {
  const {
    assistantAudio,
    assistantTexts,
    error,
    isConnected,
    partial,
    sessionId,
    scenarioId,
    status,
    turns
  } = stream;

  return {
    assistantAudio: [...assistantAudio],
    assistantTexts: [...assistantTexts],
    error,
    isConnected,
    isRecording: false,
    partial,
    recordingError: null,
    sessionId,
    scenarioId,
    status,
    turns: [...turns]
  };
};
|
|
1365
|
+
/**
 * Creates a voice controller that couples a voice stream with microphone
 * capture and exposes an immutable, subscribable state snapshot.
 *
 * Connection and capture settings come from the resolved preset
 * (`options.preset`), with `options.connection` / `options.capture` taking
 * precedence. Unless `options.autoStopOnComplete === false`, recording is
 * stopped automatically once the stream status becomes "completed".
 *
 * Recording start is serialized: overlapping `startRecording()` calls share one
 * in-flight start, and a `stopRecording()` / `close()` issued while a start is
 * still pending wins — the capture is stopped again as soon as it finishes
 * starting, so the microphone is never left running without a handle to it.
 *
 * @param {string} path Passed through to `createVoiceStream`.
 * @param {object} [options] `preset`, `connection`, `capture`
 *   (`channelCount`, `sampleRateHz`, `onLevel`) and `autoStopOnComplete`.
 * @returns {object} Controller with live getters, `getSnapshot`, `subscribe`,
 *   `startRecording`, `stopRecording`, `toggleRecording`, `sendAudio`,
 *   `endTurn`, `bindHTMX` and `close`.
 */
var createVoiceController = (path, options = {}) => {
  const preset = resolveVoiceRuntimePreset(options.preset);
  const stream = createVoiceStream(path, {
    ...preset.connection,
    ...options.connection
  });
  let capture = null;
  // Promise of the start currently in flight, or null when none is pending.
  let pendingStart = null;
  let state = createInitialState3(stream);
  const subscribers = new Set();

  const notify = () => {
    for (const subscriber of subscribers) {
      subscriber();
    }
  };

  // Stops and forgets the current capture and abandons any pending start.
  const releaseCapture = () => {
    capture?.stop();
    capture = null;
    pendingStart = null;
    state = {
      ...state,
      isRecording: false
    };
  };

  // Mirrors the stream's fields into a fresh state object and notifies.
  const sync = () => {
    state = {
      ...state,
      assistantAudio: [...stream.assistantAudio],
      assistantTexts: [...stream.assistantTexts],
      error: stream.error,
      isConnected: stream.isConnected,
      partial: stream.partial,
      sessionId: stream.sessionId,
      scenarioId: stream.scenarioId,
      status: stream.status,
      turns: [...stream.turns]
    };
    if (options.autoStopOnComplete !== false && state.status === "completed" && state.isRecording) {
      releaseCapture();
    }
    notify();
  };

  const unsubscribeStream = stream.subscribe(sync);
  sync();

  // Lazily creates the microphone capture; explicit options beat the preset.
  const ensureCapture = () => {
    if (capture) {
      return capture;
    }
    capture = createMicrophoneCapture({
      channelCount: options.capture?.channelCount ?? preset.capture.channelCount,
      onLevel: options.capture?.onLevel,
      onAudio: (audio) => stream.sendAudio(audio),
      sampleRateHz: options.capture?.sampleRateHz ?? preset.capture.sampleRateHz
    });
    return capture;
  };

  const stopRecording = () => {
    releaseCapture();
    notify();
  };

  // Performs one start attempt; only ever run through startRecording().
  const beginRecording = async () => {
    let attempt = null;
    try {
      state = {
        ...state,
        recordingError: null
      };
      notify();
      attempt = ensureCapture();
      await attempt.start();
      if (capture !== attempt) {
        // stopRecording()/close() ran while start() was pending. Honour that
        // request: shut the now-running capture down and leave state alone.
        attempt.stop();
        return;
      }
      state = {
        ...state,
        isRecording: true
      };
      notify();
    } catch (error) {
      // Skip the state update only when a newer capture has taken over, so a
      // stale failure cannot clobber a recording started afterwards.
      if (capture === null || capture === attempt) {
        capture = null;
        state = {
          ...state,
          isRecording: false,
          recordingError: error instanceof Error ? error.message : String(error)
        };
        notify();
      }
      throw error;
    }
  };

  const startRecording = async () => {
    if (state.isRecording) {
      return;
    }
    if (pendingStart === null) {
      // Share one in-flight start so capture.start() is not invoked twice.
      const started = beginRecording().finally(() => {
        if (pendingStart === started) {
          pendingStart = null;
        }
      });
      pendingStart = started;
    }
    await pendingStart;
  };

  const close = () => {
    unsubscribeStream();
    stopRecording();
    stream.close();
  };

  return {
    bindHTMX(bindingOptions) {
      return bindVoiceHTMX(stream, bindingOptions);
    },
    close,
    endTurn: () => stream.endTurn(),
    get error() {
      return state.error;
    },
    getServerSnapshot: () => state,
    getSnapshot: () => state,
    get isConnected() {
      return state.isConnected;
    },
    get isRecording() {
      return state.isRecording;
    },
    get partial() {
      return state.partial;
    },
    get recordingError() {
      return state.recordingError;
    },
    sendAudio: (audio) => stream.sendAudio(audio),
    get sessionId() {
      return state.sessionId;
    },
    get scenarioId() {
      return state.scenarioId;
    },
    startRecording,
    get status() {
      return state.status;
    },
    stopRecording,
    subscribe: (subscriber) => {
      subscribers.add(subscriber);
      return () => {
        subscribers.delete(subscriber);
      };
    },
    toggleRecording: async () => {
      if (state.isRecording) {
        stopRecording();
        return;
      }
      await startRecording();
    },
    get turns() {
      return state.turns;
    },
    get assistantTexts() {
      return state.assistantTexts;
    },
    get assistantAudio() {
      return state.assistantAudio;
    }
  };
};
|
|
1516
|
+
// src/client/duplex.ts
|
|
1517
|
+
// Input level at or above which playback is interrupted when no explicit
// `interruptThreshold` is configured.
var DEFAULT_INTERRUPT_THRESHOLD = 0.08;

/**
 * Decides whether a measured input level should interrupt playback.
 *
 * @param {number} level Measured input level.
 * @param {object} [options] `enabled` (defaults to true) and
 *   `interruptThreshold` (defaults to DEFAULT_INTERRUPT_THRESHOLD).
 * @returns {*} The falsy `enabled` value when barge-in is switched off,
 *   otherwise whether `level` reaches the threshold.
 */
var shouldInterruptForLevel = (level, options = {}) => {
  const isEnabled = options.enabled ?? true;
  if (!isEnabled) {
    return isEnabled;
  }
  const threshold = options.interruptThreshold ?? DEFAULT_INTERRUPT_THRESHOLD;
  return level >= threshold;
};
|
|
1519
|
+
/**
 * Wires barge-in behaviour between a voice controller and an audio player.
 *
 * Playback is interrupted (only while `player.isPlaying` and unless
 * `options.enabled === false`) in three situations:
 *  - the controller's `partial` goes from empty to non-empty, unless
 *    `options.interruptOnPartial === false`;
 *  - `handleLevel` receives a level that `shouldInterruptForLevel` accepts;
 *  - audio is sent through the returned `sendAudio`.
 *
 * @param {object} controller Exposes `partial`, `subscribe` and `sendAudio`.
 * @param {object} player Exposes `isPlaying` and `interrupt()`.
 * @param {object} [options] `enabled`, `interruptOnPartial`,
 *   `interruptThreshold`.
 * @returns {{close: Function, handleLevel: Function, sendAudio: Function}}
 */
var bindVoiceBargeIn = (controller, player, options = {}) => {
  // Last partial seen, used to detect the empty -> non-empty edge.
  let previousPartial = controller.partial;

  const stopPlayback = () => {
    if (player.isPlaying && options.enabled !== false) {
      player.interrupt();
    }
  };

  const onControllerChange = () => {
    const partialJustAppeared =
      options.interruptOnPartial !== false && !previousPartial && controller.partial;
    if (partialJustAppeared) {
      stopPlayback();
    }
    // Always track the latest partial, even when partial interrupts are off.
    previousPartial = controller.partial;
  };

  const unsubscribe = controller.subscribe(onControllerChange);

  const handleLevel = (level) => {
    if (!shouldInterruptForLevel(level, options)) {
      return;
    }
    stopPlayback();
  };

  const sendAudio = (audio) => {
    stopPlayback();
    controller.sendAudio(audio);
  };

  return {
    close: () => {
      unsubscribe();
    },
    handleLevel,
    sendAudio
  };
};
|
|
1552
|
+
/**
 * Creates a voice controller bundled with an audio player and barge-in
 * handling, so user speech can interrupt assistant playback.
 *
 * Microphone level callbacks are routed to the barge-in binding first and then
 * to the caller's own `options.capture.onLevel`. Audio sent through the
 * returned `sendAudio` goes through the barge-in binding as well.
 *
 * The returned object exposes everything the underlying controller does. Its
 * property descriptors are copied rather than spread: spreading would evaluate
 * the controller's getters once and freeze `partial`, `status`, `isRecording`,
 * `turns`, ... at their creation-time values.
 *
 * @param {string} path Passed through to `createVoiceController`.
 * @param {object} [options] Controller options plus `audioPlayer` and
 *   `bargeIn` option objects.
 * @returns {object} The controller surface plus `audioPlayer`,
 *   `interruptAssistant`, and overridden `close` / `sendAudio`.
 */
var createVoiceDuplexController = (path, options = {}) => {
  // Assigned after the controller exists; onLevel may fire only later.
  let bargeInBinding = null;

  const controller = createVoiceController(path, {
    ...options,
    capture: {
      ...options.capture,
      onLevel: (level) => {
        bargeInBinding?.handleLevel(level);
        options.capture?.onLevel?.(level);
      }
    }
  });
  const audioPlayer = createVoiceAudioPlayer(controller, options.audioPlayer);
  bargeInBinding = bindVoiceBargeIn(controller, audioPlayer, options.bargeIn);

  // Tear down in reverse order of construction.
  const close = () => {
    bargeInBinding?.close();
    bargeInBinding = null;
    audioPlayer.close();
    controller.close();
  };

  const overrides = {
    audioPlayer,
    close,
    interruptAssistant: async () => {
      await audioPlayer.interrupt();
    },
    sendAudio: (audio) => {
      bargeInBinding?.sendAudio(audio);
    }
  };

  // Copy descriptors so accessor properties stay live; later keys win, which
  // keeps the same property order as the former object spread.
  return Object.defineProperties({}, {
    ...Object.getOwnPropertyDescriptors(controller),
    ...Object.getOwnPropertyDescriptors(overrides)
  });
};
|
|
604
1584
|
export {
|
|
1585
|
+
decodeVoiceAudioChunk,
|
|
605
1586
|
createVoiceStream,
|
|
1587
|
+
createVoiceDuplexController,
|
|
1588
|
+
createVoiceController,
|
|
606
1589
|
createVoiceConnection,
|
|
1590
|
+
createVoiceAudioPlayer,
|
|
607
1591
|
createMicrophoneCapture,
|
|
608
|
-
bindVoiceHTMX
|
|
1592
|
+
bindVoiceHTMX,
|
|
1593
|
+
bindVoiceBargeIn
|
|
609
1594
|
};
|