@absolutejs/voice 0.0.20 → 0.0.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +387 -4
- package/dist/angular/index.d.ts +1 -0
- package/dist/angular/index.js +669 -3
- package/dist/angular/voice-controller.service.d.ts +21 -0
- package/dist/audioConditioning.d.ts +3 -0
- package/dist/client/actions.d.ts +7 -0
- package/dist/client/connection.d.ts +5 -0
- package/dist/client/controller.d.ts +2 -0
- package/dist/client/htmxBootstrap.js +576 -167
- package/dist/client/index.d.ts +1 -0
- package/dist/client/index.js +486 -3
- package/dist/client/microphone.d.ts +4 -2
- package/dist/correction.d.ts +16 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.js +1314 -283
- package/dist/presets.d.ts +13 -0
- package/dist/react/index.d.ts +1 -0
- package/dist/react/index.js +642 -3
- package/dist/react/useVoiceController.d.ts +20 -0
- package/dist/react/useVoiceStream.d.ts +1 -0
- package/dist/store.d.ts +2 -2
- package/dist/svelte/index.d.ts +1 -0
- package/dist/svelte/index.js +607 -3
- package/dist/testing/benchmark.d.ts +36 -0
- package/dist/testing/index.js +1453 -241
- package/dist/testing/sessionBenchmark.d.ts +67 -2
- package/dist/testing/stt.d.ts +1 -0
- package/dist/turnDetection.d.ts +5 -1
- package/dist/turnProfiles.d.ts +6 -0
- package/dist/types.d.ts +198 -8
- package/dist/vue/index.d.ts +1 -0
- package/dist/vue/index.js +660 -3
- package/dist/vue/useVoiceController.d.ts +19 -0
- package/fixtures/README.md +9 -0
- package/fixtures/manifest.json +59 -1
- package/fixtures/pcm/dialogue-three-clean.pcm +0 -0
- package/fixtures/pcm/dialogue-three-mixed.pcm +0 -0
- package/fixtures/pcm/dialogue-two-clean.pcm +0 -0
- package/fixtures/pcm/dialogue-two-noisy.pcm +0 -0
- package/package.json +21 -1
package/dist/svelte/index.js
CHANGED
|
@@ -127,6 +127,7 @@ var serverMessageToAction = (message) => {
|
|
|
127
127
|
case "session":
|
|
128
128
|
return {
|
|
129
129
|
sessionId: message.sessionId,
|
|
130
|
+
scenarioId: message.scenarioId,
|
|
130
131
|
status: message.status,
|
|
131
132
|
type: "session"
|
|
132
133
|
};
|
|
@@ -147,24 +148,30 @@ var WS_NORMAL_CLOSURE = 1000;
|
|
|
147
148
|
// Reconnect/ping tuning values for the voice connection.
// NOTE(review): only RECONNECT_DELAY_MS is visibly used in this excerpt (as a timer delay);
// the other two are presumably option fallbacks — confirm against createVoiceConnection.
var DEFAULT_MAX_RECONNECT_ATTEMPTS = 10;
var DEFAULT_PING_INTERVAL = 30000;
var RECONNECT_DELAY_MS = 500;
// Query-string key under which buildWsUrl writes the scenario id.
var DEFAULT_SCENARIO_QUERY_PARAM = "scenarioId";
// Does nothing and returns undefined.
var noop = () => undefined;
// Subscribe stand-in: ignores the listener and hands back `noop` as the unsubscribe function.
var noopUnsubscribe = () => noop;
|
|
152
154
|
// Inert connection object: every action is a no-op, the ready state is always
// WS_CLOSED, both ids read as empty strings, and subscribing returns a no-op
// unsubscribe function.
var NOOP_CONNECTION = {
  start: () => undefined,
  close: noop,
  endTurn: noop,
  getReadyState: () => WS_CLOSED,
  getScenarioId: () => "",
  getSessionId: () => "",
  send: noop,
  sendAudio: noop,
  subscribe: noopUnsubscribe
};
|
|
161
165
|
var createSessionId = () => crypto.randomUUID();
|
|
162
|
-
var buildWsUrl = (path, sessionId) => {
|
|
166
|
+
/**
 * Builds the WebSocket URL for a voice session on the current page's host.
 *
 * The scheme is `wss:` when the page is served over `https:`, otherwise `ws:`.
 * `sessionId` is always written to the query string; `scenarioId` is written
 * only when truthy.
 *
 * @param {string} path - Path appended directly after the host (and port).
 * @param {string} sessionId - Value for the `sessionId` query parameter.
 * @param {string} [scenarioId] - Optional value for the scenario query parameter.
 * @returns {string} The serialized URL.
 */
var buildWsUrl = (path, sessionId, scenarioId) => {
  const { location } = window;
  const scheme = location.protocol === "https:" ? "wss:" : "ws:";
  const authority = location.port ? `${location.hostname}:${location.port}` : location.hostname;
  const target = new URL(`${scheme}//${authority}${path}`);
  target.searchParams.set("sessionId", sessionId);
  if (scenarioId) {
    target.searchParams.set(DEFAULT_SCENARIO_QUERY_PARAM, scenarioId);
  }
  return target.href;
};
|
|
170
177
|
var isVoiceServerMessage = (value) => {
|
|
@@ -207,6 +214,7 @@ var createVoiceConnection = (path, options = {}) => {
|
|
|
207
214
|
const state = {
|
|
208
215
|
isConnected: false,
|
|
209
216
|
pendingMessages: [],
|
|
217
|
+
scenarioId: options.scenarioId ?? null,
|
|
210
218
|
pingInterval: null,
|
|
211
219
|
reconnectAttempts: 0,
|
|
212
220
|
reconnectTimeout: null,
|
|
@@ -244,13 +252,14 @@ var createVoiceConnection = (path, options = {}) => {
|
|
|
244
252
|
}, RECONNECT_DELAY_MS);
|
|
245
253
|
};
|
|
246
254
|
const connect = () => {
|
|
247
|
-
const ws = new WebSocket(buildWsUrl(path, state.sessionId));
|
|
255
|
+
const ws = new WebSocket(buildWsUrl(path, state.sessionId, state.scenarioId));
|
|
248
256
|
ws.binaryType = "arraybuffer";
|
|
249
257
|
ws.onopen = () => {
|
|
250
258
|
state.isConnected = true;
|
|
251
259
|
state.reconnectAttempts = 0;
|
|
252
260
|
flushPendingMessages();
|
|
253
261
|
listeners.forEach((listener) => listener({
|
|
262
|
+
scenarioId: state.scenarioId ?? undefined,
|
|
254
263
|
sessionId: state.sessionId,
|
|
255
264
|
status: "active",
|
|
256
265
|
type: "session"
|
|
@@ -268,6 +277,7 @@ var createVoiceConnection = (path, options = {}) => {
|
|
|
268
277
|
}
|
|
269
278
|
if (parsed.type === "session") {
|
|
270
279
|
state.sessionId = parsed.sessionId;
|
|
280
|
+
state.scenarioId = parsed.scenarioId ?? state.scenarioId;
|
|
271
281
|
}
|
|
272
282
|
listeners.forEach((listener) => listener(parsed));
|
|
273
283
|
};
|
|
@@ -291,6 +301,19 @@ var createVoiceConnection = (path, options = {}) => {
|
|
|
291
301
|
const send = (message) => {
|
|
292
302
|
sendSerialized(JSON.stringify(message));
|
|
293
303
|
};
|
|
304
|
+
// Records any truthy sessionId / scenarioId from `input` on the connection state,
// then sends a "start" message carrying the (possibly updated) ids.
const start = (input = {}) => {
  const { sessionId, scenarioId } = input;
  if (sessionId) {
    state.sessionId = sessionId;
  }
  if (scenarioId) {
    state.scenarioId = scenarioId;
  }
  const message = {
    type: "start",
    sessionId: state.sessionId,
    // A null scenario id is sent as undefined, so JSON serialization omits the key.
    scenarioId: state.scenarioId ?? undefined
  };
  send(message);
};
|
|
294
317
|
const sendAudio = (audio) => {
|
|
295
318
|
sendSerialized(audio);
|
|
296
319
|
};
|
|
@@ -314,9 +337,11 @@ var createVoiceConnection = (path, options = {}) => {
|
|
|
314
337
|
};
|
|
315
338
|
connect();
|
|
316
339
|
return {
|
|
340
|
+
start,
|
|
317
341
|
close,
|
|
318
342
|
endTurn,
|
|
319
343
|
getReadyState: () => state.ws?.readyState ?? WS_CLOSED,
|
|
344
|
+
getScenarioId: () => state.scenarioId ?? "",
|
|
320
345
|
getSessionId: () => state.sessionId,
|
|
321
346
|
send,
|
|
322
347
|
sendAudio,
|
|
@@ -329,6 +354,7 @@ var createInitialState = () => ({
|
|
|
329
354
|
assistantTexts: [],
|
|
330
355
|
error: null,
|
|
331
356
|
isConnected: false,
|
|
357
|
+
scenarioId: null,
|
|
332
358
|
partial: "",
|
|
333
359
|
sessionId: null,
|
|
334
360
|
status: "idle",
|
|
@@ -390,6 +416,7 @@ var createVoiceStreamStore = () => {
|
|
|
390
416
|
state = {
|
|
391
417
|
...state,
|
|
392
418
|
error: null,
|
|
419
|
+
scenarioId: action.scenarioId ?? state.scenarioId,
|
|
393
420
|
isConnected: action.status === "active",
|
|
394
421
|
sessionId: action.sessionId,
|
|
395
422
|
status: action.status
|
|
@@ -423,6 +450,12 @@ var createVoiceStream = (path, options = {}) => {
|
|
|
423
450
|
const connection = createVoiceConnection(path, options);
|
|
424
451
|
const store = createVoiceStreamStore();
|
|
425
452
|
const subscribers = new Set;
|
|
453
|
+
// Forwards `input` to connection.start on a later microtask, but only when it
// carries a truthy sessionId or scenarioId; otherwise the returned promise
// simply resolves without touching the connection.
const start = (input) => Promise.resolve().then(() => {
  const hasIdentifier = Boolean(input?.sessionId) || Boolean(input?.scenarioId);
  if (hasIdentifier) {
    connection.start(input);
  }
});
|
|
426
459
|
const notify = () => {
|
|
427
460
|
subscribers.forEach((subscriber) => subscriber());
|
|
428
461
|
};
|
|
@@ -455,6 +488,10 @@ var createVoiceStream = (path, options = {}) => {
|
|
|
455
488
|
get isConnected() {
|
|
456
489
|
return store.getSnapshot().isConnected;
|
|
457
490
|
},
|
|
491
|
+
get scenarioId() {
|
|
492
|
+
return store.getSnapshot().scenarioId;
|
|
493
|
+
},
|
|
494
|
+
start,
|
|
458
495
|
get partial() {
|
|
459
496
|
return store.getSnapshot().partial;
|
|
460
497
|
},
|
|
@@ -484,6 +521,573 @@ var createVoiceStream = (path, options = {}) => {
|
|
|
484
521
|
|
|
485
522
|
// src/svelte/createVoiceStream.ts
|
|
486
523
|
// Svelte entry point: delegates straight to the shared createVoiceStream.
var createVoiceStream2 = (path, options = {}) => {
  return createVoiceStream(path, options);
};
|
|
524
|
+
// src/client/htmx.ts
|
|
525
|
+
// Event name handed to htmx.trigger when no `eventName` option is supplied.
var DEFAULT_EVENT_NAME = "voice-refresh";
// Query-string key used for the session id when no `sessionQueryParam` option is supplied.
var DEFAULT_QUERY_PARAM = "sessionId";
|
|
527
|
+
// Accepts either a CSS selector string or an already-resolved value.
// Strings are looked up with document.querySelector; anything else is returned as-is.
var resolveElement = (input) =>
  typeof input === "string" ? document.querySelector(input) : input;
|
|
533
|
+
/**
 * Builds the route to store in an element's `hx-get` attribute, with the
 * session id written into (or removed from) its query string.
 *
 * @param {{ getAttribute(name: string): string | null }} element - Element whose
 *   current `hx-get` value is used when `route` is nullish.
 * @param {string | null | undefined} route - Explicit route; takes precedence over the attribute.
 * @param {string} queryParam - Query-string key that carries the session id.
 * @param {string | null | undefined} sessionId - Truthy: set as `queryParam`; falsy: `queryParam` is removed.
 * @returns {string} `""` when there is no route to work from; otherwise a
 *   path + query + hash for same-origin routes, or the full URL for routes on
 *   another origin.
 */
var buildRoute = (element, route, queryParam, sessionId) => {
  const baseRoute = route ?? element.getAttribute("hx-get") ?? "";
  if (!baseRoute) {
    return "";
  }
  const { href, origin } = window.location;
  // Resolve against the full page URL, not just the origin, so a document-relative
  // route such as "panel" keeps its directory instead of being rewritten to "/panel".
  const url = new URL(baseRoute, href ?? origin);
  if (sessionId) {
    url.searchParams.set(queryParam, sessionId);
  } else {
    url.searchParams.delete(queryParam);
  }
  if (url.origin !== origin) {
    // Stripping the origin from a cross-origin route would silently retarget the request.
    return url.href;
  }
  return `${url.pathname}${url.search}${url.hash}`;
};
|
|
546
|
+
/**
 * Keeps an element's `hx-get` route in step with a voice stream's session id
 * and pokes htmx on every stream update.
 *
 * On each update (and once immediately) the route is rebuilt with the stream's
 * current session id, written to `hx-get` when non-empty, and then
 * `window.htmx.process` / `window.htmx.trigger` are invoked if present.
 *
 * @param stream - Object exposing `sessionId` and `subscribe(listener)`.
 * @param options - `element` (element or selector), optional `route`,
 *   `eventName` and `sessionQueryParam`.
 * @returns {() => void} Function that unsubscribes from the stream. A no-op is
 *   returned when `window`/`document` are unavailable or the element cannot be resolved.
 */
var bindVoiceHTMX = (stream, options) => {
  const hasDom = typeof window !== "undefined" && typeof document !== "undefined";
  if (!hasDom) {
    return () => {};
  }
  const target = resolveElement(options.element);
  if (!target) {
    return () => {};
  }
  const triggerName = options.eventName ?? DEFAULT_EVENT_NAME;
  const sessionParam = options.sessionQueryParam ?? DEFAULT_QUERY_PARAM;
  const refresh = () => {
    const route = buildRoute(target, options.route, sessionParam, stream.sessionId);
    if (route) {
      target.setAttribute("hx-get", route);
    }
    // htmx is optional; both calls are skipped when it is not loaded on the page.
    const { htmx } = window;
    htmx?.process?.(target);
    htmx?.trigger?.(target, triggerName);
  };
  const detach = stream.subscribe(refresh);
  refresh();
  return () => {
    detach();
  };
};
|
|
571
|
+
|
|
572
|
+
// src/client/microphone.ts
|
|
573
|
+
// Limits a sample to the inclusive range [-1, 1].
var clampSample = (value) => Math.min(1, Math.max(-1, value));
|
|
574
|
+
/**
 * Converts float samples to signed 16-bit PCM.
 *
 * Each sample is clamped to [-1, 1]; negatives are scaled by 32768 and
 * non-negatives by 32767, then stored in an Int16Array.
 *
 * @param {ArrayLike<number>} input - Float samples.
 * @returns {Uint8Array} Byte view over the Int16 sample buffer (two bytes per sample).
 */
var floatTo16BitPCM = (input) => {
  const pcm = new Int16Array(input.length);
  pcm.forEach((_, position) => {
    const clamped = Math.max(-1, Math.min(1, input[position] ?? 0));
    const scale = clamped < 0 ? 32768 : 32767;
    pcm[position] = clamped * scale;
  });
  return new Uint8Array(pcm.buffer);
};
|
|
582
|
+
/**
 * Computes a 0..1 level for a chunk of 16-bit PCM audio.
 *
 * The level is the RMS of the samples (each normalized by 32768), multiplied
 * by 5.5 and clamped to [0, 1]. A trailing odd byte is ignored.
 *
 * @param {Uint8Array | ArrayBuffer} audio - Raw PCM bytes; samples are read in
 *   platform byte order via Int16Array.
 * @returns {number} 0 when fewer than two bytes are supplied.
 */
var getPcmLevel = (audio) => {
  const bytes = audio instanceof Uint8Array ? audio : new Uint8Array(audio);
  const sampleCount = Math.floor(bytes.byteLength / 2);
  if (sampleCount === 0) {
    return 0;
  }
  let samples;
  if (bytes.byteOffset % Int16Array.BYTES_PER_ELEMENT === 0) {
    samples = new Int16Array(bytes.buffer, bytes.byteOffset, sampleCount);
  } else {
    // An Int16Array view must start on a 2-byte boundary; a view at an odd offset
    // would throw RangeError, so read from an aligned copy instead.
    const aligned = new Uint8Array(sampleCount * 2);
    aligned.set(bytes.subarray(0, sampleCount * 2));
    samples = new Int16Array(aligned.buffer);
  }
  let sumSquares = 0;
  for (const sample of samples) {
    const normalized = sample / 32768;
    sumSquares += normalized * normalized;
  }
  return Math.min(1, Math.max(0, Math.sqrt(sumSquares / samples.length) * 5.5));
};
|
|
598
|
+
/**
 * Resamples `input` from `sourceRate` to `targetRate`.
 *
 * Each output sample is the average of the input samples that fall in its
 * window. When the target rate is higher than the source rate some windows
 * contain no input sample; those repeat the most recent input sample instead
 * of emitting 0, which would interleave silence into the signal.
 *
 * @param {Float32Array} input - Source samples.
 * @param {number} sourceRate - Sample rate of `input`.
 * @param {number} targetRate - Desired sample rate.
 * @returns {Float32Array} `input` itself when the rates are equal, otherwise a new array.
 */
var downsampleBuffer = (input, sourceRate, targetRate) => {
  if (sourceRate === targetRate) {
    return input;
  }
  const ratio = sourceRate / targetRate;
  const length = Math.round(input.length / ratio);
  const output = new Float32Array(length);
  let offsetBuffer = 0;
  for (let offsetResult = 0; offsetResult < output.length; offsetResult += 1) {
    const nextOffsetBuffer = Math.round((offsetResult + 1) * ratio);
    let accum = 0;
    let count = 0;
    for (let index = offsetBuffer; index < nextOffsetBuffer && index < input.length; index += 1) {
      accum += input[index] ?? 0;
      count += 1;
    }
    if (count > 0) {
      output[offsetResult] = accum / count;
    } else {
      // Empty window (only reachable when upsampling): hold the last consumed sample.
      const heldIndex = Math.min(offsetBuffer, input.length) - 1;
      output[offsetResult] = heldIndex >= 0 ? (input[heldIndex] ?? 0) : 0;
    }
    offsetBuffer = nextOffsetBuffer;
  }
  return output;
};
|
|
621
|
+
/**
 * Creates a browser microphone capture that streams 16-bit PCM chunks.
 *
 * `start()` requests the microphone, builds an AudioContext graph with a
 * ScriptProcessor node, and for every processed buffer downsamples channel 0
 * to `options.sampleRateHz` (default 16000), converts it to 16-bit PCM and
 * passes it to `options.onAudio`; `options.onLevel` (optional) receives the
 * chunk level. `stop()` tears the graph down, stops the media tracks and
 * reports a level of 0.
 *
 * @param options - `onAudio(pcm)` (required), optional `onLevel(level)`,
 *   `channelCount` (default 1) and `sampleRateHz` (default 16000).
 * @returns {{ start: () => Promise<void>, stop: () => void }}
 * @throws {Error} From `start()` when getUserMedia or AudioContext is
 *   unavailable; errors from getUserMedia or graph setup are rethrown after cleanup.
 */
var createMicrophoneCapture = (options) => {
  let audioContext = null;
  let sourceNode = null;
  let processorNode = null;
  let mediaStream = null;
  // Bumped by every start() and stop(), so a start() that was still waiting on the
  // permission prompt can tell it has been superseded.
  let generation = 0;
  // Tears down everything acquired so far; safe to call at any point.
  const release = () => {
    processorNode?.disconnect();
    sourceNode?.disconnect();
    mediaStream?.getTracks().forEach((track) => track.stop());
    // close() returns a promise; a rejection is not actionable here, so keep it
    // from surfacing as an unhandled rejection.
    audioContext?.close()?.catch?.(() => {});
    audioContext = null;
    mediaStream = null;
    processorNode = null;
    sourceNode = null;
  };
  const start = async () => {
    if (typeof navigator === "undefined" || !navigator.mediaDevices?.getUserMedia) {
      throw new Error("Browser microphone capture requires navigator.mediaDevices.getUserMedia.");
    }
    // Read the last fallback from globalThis: a bare `AudioContext` identifier would
    // throw ReferenceError before the descriptive error below could be raised.
    const AudioContextCtor = (typeof window !== "undefined" ? window.AudioContext ?? window.webkitAudioContext : undefined) ?? globalThis.AudioContext;
    if (!AudioContextCtor) {
      throw new Error("Browser microphone capture requires AudioContext support.");
    }
    if (mediaStream) {
      // Already capturing; building a second graph would leak the first one.
      return;
    }
    generation += 1;
    const token = generation;
    const stream = await navigator.mediaDevices.getUserMedia({
      audio: {
        channelCount: options.channelCount ?? 1
      }
    });
    if (token !== generation) {
      // stop() or a newer start() ran while we were waiting; drop this stream.
      stream.getTracks().forEach((track) => track.stop());
      return;
    }
    mediaStream = stream;
    try {
      audioContext = new AudioContextCtor();
      sourceNode = audioContext.createMediaStreamSource(mediaStream);
      processorNode = audioContext.createScriptProcessor(4096, 1, 1);
      processorNode.onaudioprocess = (event) => {
        const channel = event.inputBuffer.getChannelData(0);
        const downsampled = downsampleBuffer(channel, audioContext?.sampleRate ?? 48000, options.sampleRateHz ?? 16000);
        const pcm = floatTo16BitPCM(downsampled);
        options.onLevel?.(getPcmLevel(pcm));
        options.onAudio(pcm);
      };
      sourceNode.connect(processorNode);
      processorNode.connect(audioContext.destination);
    } catch (error) {
      // Setup failed after the microphone was granted; release it before reporting.
      release();
      throw error;
    }
  };
  const stop = () => {
    generation += 1;
    release();
    options.onLevel?.(0);
  };
  return { start, stop };
};
|
|
665
|
+
|
|
666
|
+
// src/audioConditioning.ts
|
|
667
|
+
// Fallbacks applied by resolveAudioConditioningConfig for fields the caller leaves unset.
// RMS level that the computed gain aims for.
var DEFAULT_TARGET_LEVEL = 0.08;
// Upper bound on the computed gain.
var DEFAULT_MAX_GAIN = 3;
// Chunks whose RMS is below this value are treated as gated.
var DEFAULT_NOISE_GATE_THRESHOLD = 0.006;
// Multiplier applied to gated chunks.
var DEFAULT_NOISE_GATE_ATTENUATION = 0.15;
|
|
671
|
+
/**
 * Exposes raw PCM bytes as 16-bit samples (platform byte order). A trailing
 * odd byte is ignored.
 *
 * @param {ArrayBuffer | ArrayBufferView} audio - Raw PCM bytes.
 * @returns {Int16Array} A view over the same memory when the data starts on a
 *   2-byte boundary, otherwise a view over an aligned copy.
 */
var toInt16Array = (audio) => {
  const sampleCount = Math.floor(audio.byteLength / 2);
  if (audio instanceof ArrayBuffer) {
    return new Int16Array(audio, 0, sampleCount);
  }
  if (audio.byteOffset % Int16Array.BYTES_PER_ELEMENT === 0) {
    return new Int16Array(audio.buffer, audio.byteOffset, sampleCount);
  }
  // An Int16Array view cannot start at an odd byte offset (RangeError), so copy
  // the bytes into a fresh, aligned buffer first.
  const aligned = new Uint8Array(sampleCount * 2);
  aligned.set(new Uint8Array(audio.buffer, audio.byteOffset, sampleCount * 2));
  return new Int16Array(aligned.buffer);
};
|
|
677
|
+
// Root-mean-square of 16-bit samples after normalizing each by 32768.
// Returns 0 for an empty input.
var computeRms = (samples) => {
  const total = samples.length;
  if (total === 0) {
    return 0;
  }
  const energy = samples.reduce((sum, sample) => {
    const unit = sample / 32768;
    return sum + unit * unit;
  }, 0);
  return Math.sqrt(energy / total);
};
|
|
688
|
+
/**
 * Fills in defaults for an audio-conditioning config.
 *
 * @param config - Optional partial config.
 * @returns A fully populated config with `enabled: true`, or `undefined` when
 *   `config` is falsy or has `enabled === false`.
 */
var resolveAudioConditioningConfig = (config) => {
  const isDisabled = !config || config.enabled === false;
  if (isDisabled) {
    return undefined;
  }
  const { maxGain, noiseGateAttenuation, noiseGateThreshold, targetLevel } = config;
  return {
    enabled: true,
    maxGain: maxGain ?? DEFAULT_MAX_GAIN,
    noiseGateAttenuation: noiseGateAttenuation ?? DEFAULT_NOISE_GATE_ATTENUATION,
    noiseGateThreshold: noiseGateThreshold ?? DEFAULT_NOISE_GATE_THRESHOLD,
    targetLevel: targetLevel ?? DEFAULT_TARGET_LEVEL
  };
};
|
|
700
|
+
/**
 * Applies noise gating and gain to one chunk of 16-bit PCM audio.
 *
 * The chunk RMS decides whether the gate applies (RMS below
 * `noiseGateThreshold` uses `noiseGateAttenuation` as gate factor, otherwise 1).
 * Gain is `targetLevel / (rms * gate)`, capped at `maxGain` and floored at 0.25,
 * then multiplied by the gate factor. Scaled samples are rounded and clamped
 * to the int16 range.
 *
 * @param audio - Raw PCM bytes.
 * @param config - Resolved conditioning config; when falsy the input is returned untouched.
 * @returns The original `audio` when no config is given or the chunk holds no
 *   samples; otherwise a new Uint8Array over the conditioned samples.
 */
var conditionAudioChunk = (audio, config) => {
  if (!config) {
    return audio;
  }
  const samples = toInt16Array(audio);
  if (samples.length === 0) {
    return audio;
  }
  const level = computeRms(samples);
  const gate = level < config.noiseGateThreshold ? config.noiseGateAttenuation : 1;
  // The floor avoids dividing by zero on silent chunks.
  const referenceLevel = Math.max(level * gate, 0.000001);
  const cappedGain = Math.min(config.maxGain, config.targetLevel / referenceLevel);
  const scale = Math.max(0.25, cappedGain) * gate;
  const conditioned = Int16Array.from(samples, (sample) => {
    const scaled = Math.round(sample * scale);
    return Math.max(-32768, Math.min(32767, scaled));
  });
  return new Uint8Array(conditioned.buffer);
};
|
|
720
|
+
|
|
721
|
+
// src/turnProfiles.ts
|
|
722
|
+
// Base turn-detection settings per turn profile; used by resolveTurnDetectionConfig
// as the last fallback for each timing/threshold field.
var TURN_PROFILE_DEFAULTS = {
  balanced: { qualityProfile: "general", silenceMs: 1400, speechThreshold: 0.012, transcriptStabilityMs: 1000 },
  fast: { qualityProfile: "general", silenceMs: 700, speechThreshold: 0.015, transcriptStabilityMs: 450 },
  "long-form": { qualityProfile: "general", silenceMs: 2200, speechThreshold: 0.01, transcriptStabilityMs: 1500 }
};
|
|
742
|
+
// Per-quality-profile overrides; resolveTurnDetectionConfig prefers these over the
// turn-profile values. "general" overrides nothing.
var QUALITY_PROFILE_DEFAULTS = {
  general: {},
  "accent-heavy": { silenceMs: 1200, speechThreshold: 0.01, transcriptStabilityMs: 1200 },
  "noisy-room": { silenceMs: 2000, speechThreshold: 0.02, transcriptStabilityMs: 1600 },
  "short-command": { silenceMs: 500, speechThreshold: 0.016, transcriptStabilityMs: 420 }
};
|
|
760
|
+
// Turn profile used by resolveTurnDetectionConfig when the config names none.
var DEFAULT_TURN_PROFILE = "fast";
// Quality profile used by resolveTurnDetectionConfig when the config names none.
var DEFAULT_QUALITY_PROFILE = "general";
|
|
762
|
+
/**
 * Resolves a complete turn-detection config.
 *
 * For `silenceMs`, `speechThreshold` and `transcriptStabilityMs`, the first
 * non-nullish value wins in this order: explicit config value, quality-profile
 * override, turn-profile default.
 *
 * @param config - Optional partial config (may name `profile` / `qualityProfile`).
 * @returns Object with `profile`, `qualityProfile` and the three resolved fields.
 */
var resolveTurnDetectionConfig = (config) => {
  const profile = config?.profile ?? DEFAULT_TURN_PROFILE;
  const qualityProfile = config?.qualityProfile ?? DEFAULT_QUALITY_PROFILE;
  // Looks one field up through the three layers; lower layers are only
  // consulted when the higher one yields null/undefined.
  const pick = (field) =>
    config?.[field] ??
    QUALITY_PROFILE_DEFAULTS[qualityProfile][field] ??
    TURN_PROFILE_DEFAULTS[profile][field];
  return {
    profile,
    qualityProfile,
    silenceMs: pick("silenceMs"),
    speechThreshold: pick("speechThreshold"),
    transcriptStabilityMs: pick("transcriptStabilityMs")
  };
};
|
|
775
|
+
|
|
776
|
+
// src/presets.ts
|
|
777
|
+
// Raw inputs for each named runtime preset, consumed by resolveVoiceRuntimePreset.
// Every preset captures mono 16 kHz audio with reconnect enabled; they differ in
// audio conditioning, reconnect/ping tuning, STT lifecycle and turn detection.
var PRESET_INPUTS = (() => {
  // Factories return fresh objects so no two presets share a nested reference.
  const monoCapture = () => ({ channelCount: 1, sampleRateHz: 16000 });
  const reconnecting = (maxReconnectAttempts, pingInterval) => ({
    maxReconnectAttempts,
    pingInterval,
    reconnect: true
  });
  const conditioning = (maxGain, noiseGateAttenuation, noiseGateThreshold, targetLevel) => ({
    enabled: true,
    maxGain,
    noiseGateAttenuation,
    noiseGateThreshold,
    targetLevel
  });
  return {
    chat: {
      audioConditioning: conditioning(2.5, 0, 0.004, 0.08),
      capture: monoCapture(),
      connection: reconnecting(10, 30000),
      sttLifecycle: "continuous",
      turnDetection: { qualityProfile: "short-command", profile: "balanced" }
    },
    default: {
      capture: monoCapture(),
      connection: reconnecting(10, 30000),
      sttLifecycle: "continuous",
      turnDetection: { qualityProfile: "general", profile: "fast" }
    },
    dictation: {
      audioConditioning: conditioning(2.25, 0.05, 0.003, 0.08),
      capture: monoCapture(),
      connection: reconnecting(12, 30000),
      sttLifecycle: "continuous",
      turnDetection: { qualityProfile: "accent-heavy", profile: "long-form" }
    },
    "guided-intake": {
      audioConditioning: conditioning(2.5, 0, 0.004, 0.08),
      capture: monoCapture(),
      connection: reconnecting(12, 30000),
      sttLifecycle: "turn-scoped",
      turnDetection: { qualityProfile: "accent-heavy", profile: "long-form" }
    },
    "noisy-room": {
      audioConditioning: conditioning(3, 0.12, 0.006, 0.085),
      capture: monoCapture(),
      connection: reconnecting(14, 45000),
      sttLifecycle: "continuous",
      turnDetection: {
        qualityProfile: "noisy-room",
        profile: "long-form",
        silenceMs: 2100,
        speechThreshold: 0.02,
        transcriptStabilityMs: 1650
      }
    },
    reliability: {
      audioConditioning: conditioning(2.9, 0.08, 0.005, 0.08),
      capture: monoCapture(),
      connection: reconnecting(14, 45000),
      sttLifecycle: "continuous",
      turnDetection: { qualityProfile: "noisy-room", profile: "long-form" }
    }
  };
})();
|
|
913
|
+
/**
 * Resolves a named runtime preset into a fully populated configuration.
 *
 * @param {string} [name="default"] - Key of an entry in PRESET_INPUTS.
 * @returns Object with `audioConditioning` (resolved config or undefined),
 *   `capture` (`channelCount` default 1, `sampleRateHz` default 16000),
 *   `connection` (shallow copy of the preset's connection settings), `name`,
 *   `sttLifecycle` (default "continuous") and `turnDetection` (resolved config).
 * @throws {TypeError} When `name` is not one of the preset keys.
 */
var resolveVoiceRuntimePreset = (name = "default") => {
  // Own-property check: a plain lookup would otherwise fail later with an opaque
  // "cannot read properties of undefined", or pick up inherited keys such as "toString".
  if (!Object.prototype.hasOwnProperty.call(PRESET_INPUTS, name)) {
    throw new TypeError(`Unknown voice runtime preset "${String(name)}". Expected one of: ${Object.keys(PRESET_INPUTS).join(", ")}.`);
  }
  const preset = PRESET_INPUTS[name];
  return {
    audioConditioning: resolveAudioConditioningConfig(preset.audioConditioning),
    capture: {
      channelCount: preset.capture?.channelCount ?? 1,
      sampleRateHz: preset.capture?.sampleRateHz ?? 16000
    },
    connection: {
      ...preset.connection
    },
    name,
    sttLifecycle: preset.sttLifecycle ?? "continuous",
    turnDetection: resolveTurnDetectionConfig(preset.turnDetection)
  };
};
|
|
929
|
+
|
|
930
|
+
// src/client/controller.ts
|
|
931
|
+
/**
 * Builds the controller's initial snapshot from the stream's current values.
 * Recording starts off with no recording error; `assistantTexts` and `turns`
 * are shallow-copied so the snapshot does not alias the stream's arrays.
 */
var createInitialState2 = (stream) => {
  const snapshot = {
    assistantTexts: Array.from(stream.assistantTexts),
    error: stream.error,
    isConnected: stream.isConnected,
    isRecording: false,
    partial: stream.partial,
    recordingError: null,
    sessionId: stream.sessionId,
    scenarioId: stream.scenarioId,
    status: stream.status,
    turns: Array.from(stream.turns)
  };
  return snapshot;
};
|
|
943
|
+
/**
 * Creates a voice controller that couples a voice stream with microphone capture.
 *
 * The controller mirrors the stream's state into its own snapshot (adding
 * `isRecording` and `recordingError`), notifies subscribers on every change,
 * and forwards captured audio to the stream.
 *
 * @param {string} path - Path handed to createVoiceStream.
 * @param options - Optional `preset` name, `connection` overrides (merged over
 *   the preset's connection settings), `capture` overrides (`channelCount`,
 *   `sampleRateHz`, `onLevel`) and `autoStopOnComplete` (recording stops when
 *   the stream status becomes "completed" unless this is `false`).
 * @returns Controller exposing state getters, `getSnapshot`/`getServerSnapshot`,
 *   `subscribe`, `startRecording`, `stopRecording`, `toggleRecording`,
 *   `sendAudio`, `endTurn`, `bindHTMX` and `close`.
 */
var createVoiceController = (path, options = {}) => {
  const preset = resolveVoiceRuntimePreset(options.preset);
  const stream = createVoiceStream(path, {
    ...preset.connection,
    ...options.connection
  });
  let capture = null;
  // Promise of the start attempt currently in flight, if any; lets concurrent
  // startRecording() calls share one microphone request.
  let pendingStart = null;
  let state = createInitialState2(stream);
  const subscribers = new Set();
  const notify = () => {
    for (const subscriber of subscribers) {
      subscriber();
    }
  };
  // Copies the stream's current values into the controller snapshot.
  const sync = () => {
    state = {
      ...state,
      assistantTexts: [...stream.assistantTexts],
      error: stream.error,
      isConnected: stream.isConnected,
      partial: stream.partial,
      sessionId: stream.sessionId,
      scenarioId: stream.scenarioId,
      status: stream.status,
      turns: [...stream.turns]
    };
    if (options.autoStopOnComplete !== false && state.status === "completed" && state.isRecording) {
      capture?.stop();
      capture = null;
      state = {
        ...state,
        isRecording: false
      };
    }
    notify();
  };
  const unsubscribeStream = stream.subscribe(sync);
  sync();
  // Lazily creates the microphone capture; explicit capture options win over the preset.
  const ensureCapture = () => {
    if (capture) {
      return capture;
    }
    capture = createMicrophoneCapture({
      channelCount: options.capture?.channelCount ?? preset.capture.channelCount,
      onLevel: options.capture?.onLevel,
      onAudio: (audio) => stream.sendAudio(audio),
      sampleRateHz: options.capture?.sampleRateHz ?? preset.capture.sampleRateHz
    });
    return capture;
  };
  const stopRecording = () => {
    // Forget any in-flight start so a later startRecording() begins a fresh attempt.
    pendingStart = null;
    capture?.stop();
    capture = null;
    state = {
      ...state,
      isRecording: false
    };
    notify();
  };
  // Runs one start attempt against the capture instance current at call time.
  const beginRecording = async () => {
    const active = ensureCapture();
    try {
      state = {
        ...state,
        recordingError: null
      };
      notify();
      await active.start();
      if (capture !== active) {
        // stopRecording()/close() ran while the microphone was being acquired:
        // release what was just started and leave isRecording false.
        active.stop();
        return;
      }
      state = {
        ...state,
        isRecording: true
      };
      notify();
    } catch (error) {
      // Stop before dropping the reference so a partially started capture
      // cannot keep the microphone open.
      active.stop();
      if (capture === active) {
        capture = null;
      }
      state = {
        ...state,
        isRecording: false,
        recordingError: error instanceof Error ? error.message : String(error)
      };
      notify();
      throw error;
    }
  };
  const startRecording = async () => {
    if (state.isRecording) {
      return;
    }
    if (pendingStart) {
      // Another call is already acquiring the microphone; share its outcome.
      await pendingStart;
      return;
    }
    const attempt = beginRecording();
    pendingStart = attempt;
    try {
      await attempt;
    } finally {
      if (pendingStart === attempt) {
        pendingStart = null;
      }
    }
  };
  const close = () => {
    unsubscribeStream();
    stopRecording();
    stream.close();
  };
  return {
    bindHTMX(bindingOptions) {
      return bindVoiceHTMX(stream, bindingOptions);
    },
    close,
    endTurn: () => stream.endTurn(),
    get error() {
      return state.error;
    },
    getServerSnapshot: () => state,
    getSnapshot: () => state,
    get isConnected() {
      return state.isConnected;
    },
    get isRecording() {
      return state.isRecording;
    },
    get partial() {
      return state.partial;
    },
    get recordingError() {
      return state.recordingError;
    },
    sendAudio: (audio) => stream.sendAudio(audio),
    get sessionId() {
      return state.sessionId;
    },
    get scenarioId() {
      return state.scenarioId;
    },
    startRecording,
    get status() {
      return state.status;
    },
    stopRecording,
    subscribe: (subscriber) => {
      subscribers.add(subscriber);
      return () => {
        subscribers.delete(subscriber);
      };
    },
    toggleRecording: async () => {
      if (state.isRecording) {
        stopRecording();
        return;
      }
      await startRecording();
    },
    get turns() {
      return state.turns;
    },
    get assistantTexts() {
      return state.assistantTexts;
    }
  };
};
|
|
487
1090
|
export {
|
|
488
|
-
createVoiceStream2 as createVoiceStream
|
|
1091
|
+
createVoiceStream2 as createVoiceStream,
|
|
1092
|
+
createVoiceController
|
|
489
1093
|
};
|