@lokutor/sdk 1.1.9 → 1.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +31 -51
- package/dist/index.d.ts +31 -51
- package/dist/index.js +80 -150
- package/dist/index.mjs +80 -149
- package/package.json +1 -1
package/dist/index.d.mts
CHANGED
|
@@ -119,13 +119,11 @@ declare class VoiceAgentClient {
|
|
|
119
119
|
private messages;
|
|
120
120
|
private visemeListeners;
|
|
121
121
|
private wantVisemes;
|
|
122
|
-
private serverUrl;
|
|
123
122
|
constructor(config: LokutorConfig & {
|
|
124
123
|
prompt: string;
|
|
125
124
|
voice?: VoiceStyle;
|
|
126
125
|
language?: Language;
|
|
127
126
|
visemes?: boolean;
|
|
128
|
-
serverUrl?: string;
|
|
129
127
|
onVisemes?: (visemes: Viseme[]) => void;
|
|
130
128
|
});
|
|
131
129
|
/**
|
|
@@ -328,7 +326,7 @@ declare class BrowserAudioManager {
|
|
|
328
326
|
private mediaStream;
|
|
329
327
|
private nextPlaybackTime;
|
|
330
328
|
private activeSources;
|
|
331
|
-
private
|
|
329
|
+
private playbackQueue;
|
|
332
330
|
private inputSampleRate;
|
|
333
331
|
private outputSampleRate;
|
|
334
332
|
private autoGainControl;
|
|
@@ -338,7 +336,6 @@ declare class BrowserAudioManager {
|
|
|
338
336
|
private onInputError?;
|
|
339
337
|
private isMuted;
|
|
340
338
|
private isListening;
|
|
341
|
-
private resampler;
|
|
342
339
|
constructor(config?: BrowserAudioConfig);
|
|
343
340
|
/**
|
|
344
341
|
* Initialize the AudioContext and analyser
|
|
@@ -348,77 +345,60 @@ declare class BrowserAudioManager {
|
|
|
348
345
|
* Start capturing audio from the microphone
|
|
349
346
|
*/
|
|
350
347
|
startMicrophone(onAudioInput: (pcm16Data: Uint8Array) => void): Promise<void>;
|
|
348
|
+
/**
|
|
349
|
+
* Internal method to process microphone audio data
|
|
350
|
+
*/
|
|
351
351
|
private _processAudioInput;
|
|
352
|
+
/**
|
|
353
|
+
* Stop capturing microphone input
|
|
354
|
+
*/
|
|
352
355
|
stopMicrophone(): void;
|
|
353
356
|
/**
|
|
354
357
|
* Play back audio received from the server
|
|
358
|
+
* @param pcm16Data Int16 PCM audio data at SPEAKER_SAMPLE_RATE
|
|
355
359
|
*/
|
|
356
360
|
playAudio(pcm16Data: Uint8Array): void;
|
|
357
|
-
private _schedulePlayback;
|
|
358
361
|
/**
|
|
359
|
-
*
|
|
360
|
-
* Total stream time (in ms) = (audioContext.currentTime - audioClockOffset) * 1000
|
|
362
|
+
* Internal method to schedule and play audio with sample-accurate timing
|
|
361
363
|
*/
|
|
362
|
-
|
|
364
|
+
private _schedulePlayback;
|
|
363
365
|
/**
|
|
364
|
-
*
|
|
366
|
+
* Stop all currently playing audio and clear the queue
|
|
365
367
|
*/
|
|
366
|
-
resetAudioClock(): void;
|
|
367
368
|
stopPlayback(): void;
|
|
369
|
+
/**
|
|
370
|
+
* Toggle mute state
|
|
371
|
+
*/
|
|
368
372
|
setMuted(muted: boolean): void;
|
|
369
|
-
isMicMuted(): boolean;
|
|
370
|
-
getAmplitude(): number;
|
|
371
|
-
getFrequencyData(): Uint8Array;
|
|
372
|
-
getWaveformData(): Uint8Array;
|
|
373
|
-
cleanup(): void;
|
|
374
|
-
getAudioContext(): AudioContext | null;
|
|
375
|
-
}
|
|
376
|
-
|
|
377
|
-
/**
|
|
378
|
-
* High-level AI Voice Agent for browser-based conversations.
|
|
379
|
-
*
|
|
380
|
-
* This class orchestrates microphone input, AI processing, and
|
|
381
|
-
* speaker output, providing a simple interface for building
|
|
382
|
-
* voice assistants with lip-sync support.
|
|
383
|
-
*/
|
|
384
|
-
declare class VoiceAgent {
|
|
385
|
-
private client;
|
|
386
|
-
private audioManager;
|
|
387
|
-
private options;
|
|
388
|
-
private isConnected;
|
|
389
|
-
private visemeQueue;
|
|
390
|
-
constructor(options: VoiceAgentOptions & {
|
|
391
|
-
apiKey: string;
|
|
392
|
-
});
|
|
393
373
|
/**
|
|
394
|
-
*
|
|
395
|
-
* This must be called in response to a user guesture (like a click)
|
|
396
|
-
* to satisfy browser AudioContext requirements.
|
|
374
|
+
* Get current mute state
|
|
397
375
|
*/
|
|
398
|
-
|
|
376
|
+
isMicMuted(): boolean;
|
|
399
377
|
/**
|
|
400
|
-
* Get
|
|
401
|
-
*
|
|
402
|
-
* @returns value between 0 and 1
|
|
378
|
+
* Get current amplitude from analyser (for visualization)
|
|
379
|
+
* Returns value between 0 and 1
|
|
403
380
|
*/
|
|
404
381
|
getAmplitude(): number;
|
|
405
382
|
/**
|
|
406
|
-
*
|
|
383
|
+
* Get frequency data from analyser for visualization
|
|
407
384
|
*/
|
|
408
|
-
|
|
385
|
+
getFrequencyData(): Uint8Array;
|
|
409
386
|
/**
|
|
410
|
-
*
|
|
411
|
-
* at the current playback frame. Use this in a requestAnimationFrame loop.
|
|
387
|
+
* Get time-domain data from analyser for waveform visualization
|
|
412
388
|
*/
|
|
413
|
-
|
|
389
|
+
getWaveformData(): Uint8Array;
|
|
414
390
|
/**
|
|
415
|
-
*
|
|
391
|
+
* Cleanup and close AudioContext
|
|
416
392
|
*/
|
|
417
|
-
|
|
393
|
+
cleanup(): void;
|
|
418
394
|
/**
|
|
419
|
-
*
|
|
395
|
+
* Get current audio context state
|
|
420
396
|
*/
|
|
421
|
-
|
|
397
|
+
getState(): 'running' | 'suspended' | 'closed' | 'interrupted' | null;
|
|
398
|
+
/**
|
|
399
|
+
* Check if microphone is currently listening
|
|
400
|
+
*/
|
|
401
|
+
isRecording(): boolean;
|
|
422
402
|
}
|
|
423
403
|
|
|
424
|
-
export { AUDIO_CONFIG, type AnalyserConfig, type BrowserAudioConfig, BrowserAudioManager, type BrowserAudioOptions, DEFAULT_URLS, Language, type LokutorConfig, StreamResampler, type SynthesizeOptions, TTSClient, type Viseme,
|
|
404
|
+
export { AUDIO_CONFIG, type AnalyserConfig, type BrowserAudioConfig, BrowserAudioManager, type BrowserAudioOptions, DEFAULT_URLS, Language, type LokutorConfig, StreamResampler, type SynthesizeOptions, TTSClient, type Viseme, VoiceAgentClient, type VoiceAgentOptions, VoiceStyle, applyLowPassFilter, bytesToPcm16, calculateRMS, float32ToPcm16, normalizeAudio, pcm16ToBytes, pcm16ToFloat32, resample, resampleWithAntiAliasing, simpleConversation, simpleTTS };
|
package/dist/index.d.ts
CHANGED
|
@@ -119,13 +119,11 @@ declare class VoiceAgentClient {
|
|
|
119
119
|
private messages;
|
|
120
120
|
private visemeListeners;
|
|
121
121
|
private wantVisemes;
|
|
122
|
-
private serverUrl;
|
|
123
122
|
constructor(config: LokutorConfig & {
|
|
124
123
|
prompt: string;
|
|
125
124
|
voice?: VoiceStyle;
|
|
126
125
|
language?: Language;
|
|
127
126
|
visemes?: boolean;
|
|
128
|
-
serverUrl?: string;
|
|
129
127
|
onVisemes?: (visemes: Viseme[]) => void;
|
|
130
128
|
});
|
|
131
129
|
/**
|
|
@@ -328,7 +326,7 @@ declare class BrowserAudioManager {
|
|
|
328
326
|
private mediaStream;
|
|
329
327
|
private nextPlaybackTime;
|
|
330
328
|
private activeSources;
|
|
331
|
-
private
|
|
329
|
+
private playbackQueue;
|
|
332
330
|
private inputSampleRate;
|
|
333
331
|
private outputSampleRate;
|
|
334
332
|
private autoGainControl;
|
|
@@ -338,7 +336,6 @@ declare class BrowserAudioManager {
|
|
|
338
336
|
private onInputError?;
|
|
339
337
|
private isMuted;
|
|
340
338
|
private isListening;
|
|
341
|
-
private resampler;
|
|
342
339
|
constructor(config?: BrowserAudioConfig);
|
|
343
340
|
/**
|
|
344
341
|
* Initialize the AudioContext and analyser
|
|
@@ -348,77 +345,60 @@ declare class BrowserAudioManager {
|
|
|
348
345
|
* Start capturing audio from the microphone
|
|
349
346
|
*/
|
|
350
347
|
startMicrophone(onAudioInput: (pcm16Data: Uint8Array) => void): Promise<void>;
|
|
348
|
+
/**
|
|
349
|
+
* Internal method to process microphone audio data
|
|
350
|
+
*/
|
|
351
351
|
private _processAudioInput;
|
|
352
|
+
/**
|
|
353
|
+
* Stop capturing microphone input
|
|
354
|
+
*/
|
|
352
355
|
stopMicrophone(): void;
|
|
353
356
|
/**
|
|
354
357
|
* Play back audio received from the server
|
|
358
|
+
* @param pcm16Data Int16 PCM audio data at SPEAKER_SAMPLE_RATE
|
|
355
359
|
*/
|
|
356
360
|
playAudio(pcm16Data: Uint8Array): void;
|
|
357
|
-
private _schedulePlayback;
|
|
358
361
|
/**
|
|
359
|
-
*
|
|
360
|
-
* Total stream time (in ms) = (audioContext.currentTime - audioClockOffset) * 1000
|
|
362
|
+
* Internal method to schedule and play audio with sample-accurate timing
|
|
361
363
|
*/
|
|
362
|
-
|
|
364
|
+
private _schedulePlayback;
|
|
363
365
|
/**
|
|
364
|
-
*
|
|
366
|
+
* Stop all currently playing audio and clear the queue
|
|
365
367
|
*/
|
|
366
|
-
resetAudioClock(): void;
|
|
367
368
|
stopPlayback(): void;
|
|
369
|
+
/**
|
|
370
|
+
* Toggle mute state
|
|
371
|
+
*/
|
|
368
372
|
setMuted(muted: boolean): void;
|
|
369
|
-
isMicMuted(): boolean;
|
|
370
|
-
getAmplitude(): number;
|
|
371
|
-
getFrequencyData(): Uint8Array;
|
|
372
|
-
getWaveformData(): Uint8Array;
|
|
373
|
-
cleanup(): void;
|
|
374
|
-
getAudioContext(): AudioContext | null;
|
|
375
|
-
}
|
|
376
|
-
|
|
377
|
-
/**
|
|
378
|
-
* High-level AI Voice Agent for browser-based conversations.
|
|
379
|
-
*
|
|
380
|
-
* This class orchestrates microphone input, AI processing, and
|
|
381
|
-
* speaker output, providing a simple interface for building
|
|
382
|
-
* voice assistants with lip-sync support.
|
|
383
|
-
*/
|
|
384
|
-
declare class VoiceAgent {
|
|
385
|
-
private client;
|
|
386
|
-
private audioManager;
|
|
387
|
-
private options;
|
|
388
|
-
private isConnected;
|
|
389
|
-
private visemeQueue;
|
|
390
|
-
constructor(options: VoiceAgentOptions & {
|
|
391
|
-
apiKey: string;
|
|
392
|
-
});
|
|
393
373
|
/**
|
|
394
|
-
*
|
|
395
|
-
* This must be called in response to a user guesture (like a click)
|
|
396
|
-
* to satisfy browser AudioContext requirements.
|
|
374
|
+
* Get current mute state
|
|
397
375
|
*/
|
|
398
|
-
|
|
376
|
+
isMicMuted(): boolean;
|
|
399
377
|
/**
|
|
400
|
-
* Get
|
|
401
|
-
*
|
|
402
|
-
* @returns value between 0 and 1
|
|
378
|
+
* Get current amplitude from analyser (for visualization)
|
|
379
|
+
* Returns value between 0 and 1
|
|
403
380
|
*/
|
|
404
381
|
getAmplitude(): number;
|
|
405
382
|
/**
|
|
406
|
-
*
|
|
383
|
+
* Get frequency data from analyser for visualization
|
|
407
384
|
*/
|
|
408
|
-
|
|
385
|
+
getFrequencyData(): Uint8Array;
|
|
409
386
|
/**
|
|
410
|
-
*
|
|
411
|
-
* at the current playback frame. Use this in a requestAnimationFrame loop.
|
|
387
|
+
* Get time-domain data from analyser for waveform visualization
|
|
412
388
|
*/
|
|
413
|
-
|
|
389
|
+
getWaveformData(): Uint8Array;
|
|
414
390
|
/**
|
|
415
|
-
*
|
|
391
|
+
* Cleanup and close AudioContext
|
|
416
392
|
*/
|
|
417
|
-
|
|
393
|
+
cleanup(): void;
|
|
418
394
|
/**
|
|
419
|
-
*
|
|
395
|
+
* Get current audio context state
|
|
420
396
|
*/
|
|
421
|
-
|
|
397
|
+
getState(): 'running' | 'suspended' | 'closed' | 'interrupted' | null;
|
|
398
|
+
/**
|
|
399
|
+
* Check if microphone is currently listening
|
|
400
|
+
*/
|
|
401
|
+
isRecording(): boolean;
|
|
422
402
|
}
|
|
423
403
|
|
|
424
|
-
export { AUDIO_CONFIG, type AnalyserConfig, type BrowserAudioConfig, BrowserAudioManager, type BrowserAudioOptions, DEFAULT_URLS, Language, type LokutorConfig, StreamResampler, type SynthesizeOptions, TTSClient, type Viseme,
|
|
404
|
+
export { AUDIO_CONFIG, type AnalyserConfig, type BrowserAudioConfig, BrowserAudioManager, type BrowserAudioOptions, DEFAULT_URLS, Language, type LokutorConfig, StreamResampler, type SynthesizeOptions, TTSClient, type Viseme, VoiceAgentClient, type VoiceAgentOptions, VoiceStyle, applyLowPassFilter, bytesToPcm16, calculateRMS, float32ToPcm16, normalizeAudio, pcm16ToBytes, pcm16ToFloat32, resample, resampleWithAntiAliasing, simpleConversation, simpleTTS };
|
package/dist/index.js
CHANGED
|
@@ -26,7 +26,6 @@ __export(index_exports, {
|
|
|
26
26
|
Language: () => Language,
|
|
27
27
|
StreamResampler: () => StreamResampler,
|
|
28
28
|
TTSClient: () => TTSClient,
|
|
29
|
-
VoiceAgent: () => VoiceAgent,
|
|
30
29
|
VoiceAgentClient: () => VoiceAgentClient,
|
|
31
30
|
VoiceStyle: () => VoiceStyle,
|
|
32
31
|
applyLowPassFilter: () => applyLowPassFilter,
|
|
@@ -105,13 +104,11 @@ var VoiceAgentClient = class {
|
|
|
105
104
|
messages = [];
|
|
106
105
|
visemeListeners = [];
|
|
107
106
|
wantVisemes = false;
|
|
108
|
-
serverUrl = null;
|
|
109
107
|
constructor(config) {
|
|
110
108
|
this.apiKey = config.apiKey;
|
|
111
109
|
this.prompt = config.prompt;
|
|
112
110
|
this.voice = config.voice || "F1" /* F1 */;
|
|
113
111
|
this.language = config.language || "en" /* ENGLISH */;
|
|
114
|
-
this.serverUrl = config.serverUrl || null;
|
|
115
112
|
this.onTranscription = config.onTranscription;
|
|
116
113
|
this.onResponse = config.onResponse;
|
|
117
114
|
this.onAudioCallback = config.onAudio;
|
|
@@ -126,12 +123,12 @@ var VoiceAgentClient = class {
|
|
|
126
123
|
async connect() {
|
|
127
124
|
return new Promise((resolve, reject) => {
|
|
128
125
|
try {
|
|
129
|
-
let url =
|
|
126
|
+
let url = DEFAULT_URLS.VOICE_AGENT;
|
|
130
127
|
if (this.apiKey) {
|
|
131
128
|
const separator = url.includes("?") ? "&" : "?";
|
|
132
129
|
url += `${separator}api_key=${this.apiKey}`;
|
|
133
130
|
}
|
|
134
|
-
console.log(`\u{1F517} Connecting to ${
|
|
131
|
+
console.log(`\u{1F517} Connecting to ${DEFAULT_URLS.VOICE_AGENT}...`);
|
|
135
132
|
this.ws = new WebSocket(url);
|
|
136
133
|
this.ws.binaryType = "arraybuffer";
|
|
137
134
|
this.ws.onopen = () => {
|
|
@@ -504,8 +501,7 @@ var BrowserAudioManager = class {
|
|
|
504
501
|
// Playback scheduling
|
|
505
502
|
nextPlaybackTime = 0;
|
|
506
503
|
activeSources = [];
|
|
507
|
-
|
|
508
|
-
audioClockOffset = null;
|
|
504
|
+
playbackQueue = [];
|
|
509
505
|
// Configuration
|
|
510
506
|
inputSampleRate;
|
|
511
507
|
outputSampleRate;
|
|
@@ -518,7 +514,6 @@ var BrowserAudioManager = class {
|
|
|
518
514
|
// Audio processing state
|
|
519
515
|
isMuted = false;
|
|
520
516
|
isListening = false;
|
|
521
|
-
resampler = null;
|
|
522
517
|
constructor(config = {}) {
|
|
523
518
|
this.inputSampleRate = config.inputSampleRate ?? AUDIO_CONFIG.SAMPLE_RATE;
|
|
524
519
|
this.outputSampleRate = config.outputSampleRate ?? AUDIO_CONFIG.SPEAKER_SAMPLE_RATE;
|
|
@@ -542,6 +537,7 @@ var BrowserAudioManager = class {
|
|
|
542
537
|
}
|
|
543
538
|
if (this.audioContext.state === "suspended") {
|
|
544
539
|
await this.audioContext.resume();
|
|
540
|
+
console.log("\u{1F442} AudioContext resumed");
|
|
545
541
|
}
|
|
546
542
|
if (analyserConfig?.enabled !== false) {
|
|
547
543
|
this.analyserNode = this.audioContext.createAnalyser();
|
|
@@ -555,7 +551,6 @@ var BrowserAudioManager = class {
|
|
|
555
551
|
if (!this.audioContext) {
|
|
556
552
|
await this.init();
|
|
557
553
|
}
|
|
558
|
-
this.resampler = new StreamResampler(this.audioContext.sampleRate, this.inputSampleRate);
|
|
559
554
|
try {
|
|
560
555
|
this.onAudioInput = onAudioInput;
|
|
561
556
|
this.isListening = true;
|
|
@@ -571,7 +566,9 @@ var BrowserAudioManager = class {
|
|
|
571
566
|
this.scriptProcessor = this.audioContext.createScriptProcessor(
|
|
572
567
|
bufferSize,
|
|
573
568
|
1,
|
|
569
|
+
// input channels
|
|
574
570
|
1
|
|
571
|
+
// output channels
|
|
575
572
|
);
|
|
576
573
|
this.mediaStreamAudioSourceNode.connect(this.scriptProcessor);
|
|
577
574
|
this.scriptProcessor.connect(this.audioContext.destination);
|
|
@@ -588,19 +585,40 @@ var BrowserAudioManager = class {
|
|
|
588
585
|
throw err;
|
|
589
586
|
}
|
|
590
587
|
}
|
|
588
|
+
/**
|
|
589
|
+
* Internal method to process microphone audio data
|
|
590
|
+
*/
|
|
591
591
|
_processAudioInput(event) {
|
|
592
|
-
if (!this.onAudioInput || !this.audioContext || !this.isListening
|
|
593
|
-
|
|
594
|
-
event.
|
|
595
|
-
const
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
592
|
+
if (!this.onAudioInput || !this.audioContext || !this.isListening) return;
|
|
593
|
+
if (this.isMuted) return;
|
|
594
|
+
const inputBuffer = event.inputBuffer;
|
|
595
|
+
const inputData = inputBuffer.getChannelData(0);
|
|
596
|
+
const outputBuffer = event.outputBuffer;
|
|
597
|
+
for (let i = 0; i < outputBuffer.getChannelData(0).length; i++) {
|
|
598
|
+
outputBuffer.getChannelData(0)[i] = 0;
|
|
599
|
+
}
|
|
600
|
+
const hardwareRate = this.audioContext.sampleRate;
|
|
601
|
+
let processedData = new Float32Array(inputData);
|
|
602
|
+
if (hardwareRate !== this.inputSampleRate) {
|
|
603
|
+
processedData = resampleWithAntiAliasing(
|
|
604
|
+
processedData,
|
|
605
|
+
hardwareRate,
|
|
606
|
+
this.inputSampleRate
|
|
607
|
+
);
|
|
599
608
|
}
|
|
609
|
+
const int16Data = float32ToPcm16(processedData);
|
|
610
|
+
const uint8Data = new Uint8Array(
|
|
611
|
+
int16Data.buffer,
|
|
612
|
+
int16Data.byteOffset,
|
|
613
|
+
int16Data.byteLength
|
|
614
|
+
);
|
|
615
|
+
this.onAudioInput(uint8Data);
|
|
600
616
|
}
|
|
617
|
+
/**
|
|
618
|
+
* Stop capturing microphone input
|
|
619
|
+
*/
|
|
601
620
|
stopMicrophone() {
|
|
602
621
|
this.isListening = false;
|
|
603
|
-
this.resampler = null;
|
|
604
622
|
if (this.mediaStream) {
|
|
605
623
|
this.mediaStream.getTracks().forEach((track) => track.stop());
|
|
606
624
|
this.mediaStream = null;
|
|
@@ -613,12 +631,17 @@ var BrowserAudioManager = class {
|
|
|
613
631
|
this.mediaStreamAudioSourceNode.disconnect();
|
|
614
632
|
this.mediaStreamAudioSourceNode = null;
|
|
615
633
|
}
|
|
634
|
+
console.log("\u{1F3A4} Microphone stopped");
|
|
616
635
|
}
|
|
617
636
|
/**
|
|
618
637
|
* Play back audio received from the server
|
|
638
|
+
* @param pcm16Data Int16 PCM audio data at SPEAKER_SAMPLE_RATE
|
|
619
639
|
*/
|
|
620
640
|
playAudio(pcm16Data) {
|
|
621
|
-
if (!this.audioContext)
|
|
641
|
+
if (!this.audioContext) {
|
|
642
|
+
console.warn("AudioContext not initialized");
|
|
643
|
+
return;
|
|
644
|
+
}
|
|
622
645
|
const int16Array = new Int16Array(
|
|
623
646
|
pcm16Data.buffer,
|
|
624
647
|
pcm16Data.byteOffset,
|
|
@@ -633,17 +656,18 @@ var BrowserAudioManager = class {
|
|
|
633
656
|
audioBuffer.getChannelData(0).set(float32Data);
|
|
634
657
|
this._schedulePlayback(audioBuffer);
|
|
635
658
|
}
|
|
659
|
+
/**
|
|
660
|
+
* Internal method to schedule and play audio with sample-accurate timing
|
|
661
|
+
*/
|
|
636
662
|
_schedulePlayback(audioBuffer) {
|
|
637
663
|
if (!this.audioContext) return;
|
|
638
664
|
const currentTime = this.audioContext.currentTime;
|
|
639
665
|
const duration = audioBuffer.length / this.outputSampleRate;
|
|
640
666
|
const startTime = Math.max(
|
|
641
667
|
currentTime + 0.01,
|
|
668
|
+
// Minimum 10ms delay
|
|
642
669
|
this.nextPlaybackTime
|
|
643
670
|
);
|
|
644
|
-
if (this.audioClockOffset === null) {
|
|
645
|
-
this.audioClockOffset = startTime;
|
|
646
|
-
}
|
|
647
671
|
this.nextPlaybackTime = startTime + duration;
|
|
648
672
|
const source = this.audioContext.createBufferSource();
|
|
649
673
|
source.buffer = audioBuffer;
|
|
@@ -661,18 +685,8 @@ var BrowserAudioManager = class {
|
|
|
661
685
|
};
|
|
662
686
|
}
|
|
663
687
|
/**
|
|
664
|
-
*
|
|
665
|
-
* Total stream time (in ms) = (audioContext.currentTime - audioClockOffset) * 1000
|
|
666
|
-
*/
|
|
667
|
-
getAudioClockOffset() {
|
|
668
|
-
return this.audioClockOffset;
|
|
669
|
-
}
|
|
670
|
-
/**
|
|
671
|
-
* Reset the audio clock offset (call when a response is interrupted or finished)
|
|
688
|
+
* Stop all currently playing audio and clear the queue
|
|
672
689
|
*/
|
|
673
|
-
resetAudioClock() {
|
|
674
|
-
this.audioClockOffset = null;
|
|
675
|
-
}
|
|
676
690
|
stopPlayback() {
|
|
677
691
|
this.activeSources.forEach((source) => {
|
|
678
692
|
try {
|
|
@@ -681,15 +695,26 @@ var BrowserAudioManager = class {
|
|
|
681
695
|
}
|
|
682
696
|
});
|
|
683
697
|
this.activeSources = [];
|
|
684
|
-
this.
|
|
685
|
-
this.
|
|
698
|
+
this.playbackQueue = [];
|
|
699
|
+
this.nextPlaybackTime = this.audioContext?.currentTime ?? 0;
|
|
700
|
+
console.log("\u{1F507} Playback stopped");
|
|
686
701
|
}
|
|
702
|
+
/**
|
|
703
|
+
* Toggle mute state
|
|
704
|
+
*/
|
|
687
705
|
setMuted(muted) {
|
|
688
706
|
this.isMuted = muted;
|
|
689
707
|
}
|
|
708
|
+
/**
|
|
709
|
+
* Get current mute state
|
|
710
|
+
*/
|
|
690
711
|
isMicMuted() {
|
|
691
712
|
return this.isMuted;
|
|
692
713
|
}
|
|
714
|
+
/**
|
|
715
|
+
* Get current amplitude from analyser (for visualization)
|
|
716
|
+
* Returns value between 0 and 1
|
|
717
|
+
*/
|
|
693
718
|
getAmplitude() {
|
|
694
719
|
if (!this.analyserNode) return 0;
|
|
695
720
|
const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
|
|
@@ -697,18 +722,31 @@ var BrowserAudioManager = class {
|
|
|
697
722
|
const rms = calculateRMS(dataArray);
|
|
698
723
|
return Math.min(rms * 10, 1);
|
|
699
724
|
}
|
|
725
|
+
/**
|
|
726
|
+
* Get frequency data from analyser for visualization
|
|
727
|
+
*/
|
|
700
728
|
getFrequencyData() {
|
|
701
|
-
if (!this.analyserNode)
|
|
729
|
+
if (!this.analyserNode) {
|
|
730
|
+
return new Uint8Array(0);
|
|
731
|
+
}
|
|
702
732
|
const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
|
|
703
733
|
this.analyserNode.getByteFrequencyData(dataArray);
|
|
704
734
|
return dataArray;
|
|
705
735
|
}
|
|
736
|
+
/**
|
|
737
|
+
* Get time-domain data from analyser for waveform visualization
|
|
738
|
+
*/
|
|
706
739
|
getWaveformData() {
|
|
707
|
-
if (!this.analyserNode)
|
|
740
|
+
if (!this.analyserNode) {
|
|
741
|
+
return new Uint8Array(0);
|
|
742
|
+
}
|
|
708
743
|
const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
|
|
709
744
|
this.analyserNode.getByteTimeDomainData(dataArray);
|
|
710
745
|
return dataArray;
|
|
711
746
|
}
|
|
747
|
+
/**
|
|
748
|
+
* Cleanup and close AudioContext
|
|
749
|
+
*/
|
|
712
750
|
cleanup() {
|
|
713
751
|
this.stopMicrophone();
|
|
714
752
|
this.stopPlayback();
|
|
@@ -717,124 +755,17 @@ var BrowserAudioManager = class {
|
|
|
717
755
|
this.analyserNode = null;
|
|
718
756
|
}
|
|
719
757
|
}
|
|
720
|
-
getAudioContext() {
|
|
721
|
-
return this.audioContext;
|
|
722
|
-
}
|
|
723
|
-
};
|
|
724
|
-
|
|
725
|
-
// src/voice-agent.ts
|
|
726
|
-
var VoiceAgent = class {
|
|
727
|
-
client;
|
|
728
|
-
audioManager;
|
|
729
|
-
options;
|
|
730
|
-
isConnected = false;
|
|
731
|
-
visemeQueue = [];
|
|
732
|
-
constructor(options) {
|
|
733
|
-
this.options = options;
|
|
734
|
-
this.client = new VoiceAgentClient({
|
|
735
|
-
apiKey: options.apiKey,
|
|
736
|
-
prompt: options.prompt || "You are a helpful and friendly AI assistant.",
|
|
737
|
-
voice: options.voice || "F1" /* F1 */,
|
|
738
|
-
language: options.language || "en" /* ENGLISH */,
|
|
739
|
-
visemes: options.visemes ?? true,
|
|
740
|
-
serverUrl: options.serverUrl,
|
|
741
|
-
onTranscription: (text) => {
|
|
742
|
-
if (options.onTranscription) options.onTranscription(text, true);
|
|
743
|
-
},
|
|
744
|
-
onResponse: (text) => {
|
|
745
|
-
if (options.onTranscription) options.onTranscription(text, false);
|
|
746
|
-
},
|
|
747
|
-
onAudio: (data) => {
|
|
748
|
-
this.audioManager.playAudio(data);
|
|
749
|
-
},
|
|
750
|
-
onVisemes: (visemes) => {
|
|
751
|
-
this.visemeQueue.push(...visemes);
|
|
752
|
-
if (options.onVisemes) options.onVisemes(visemes);
|
|
753
|
-
},
|
|
754
|
-
onStatus: (status) => {
|
|
755
|
-
if (options.onStatusChange) options.onStatusChange(status);
|
|
756
|
-
if (status === "interrupted" || status === "thinking") {
|
|
757
|
-
this.audioManager.stopPlayback();
|
|
758
|
-
this.visemeQueue = [];
|
|
759
|
-
}
|
|
760
|
-
},
|
|
761
|
-
onError: (err) => {
|
|
762
|
-
if (options.onError) options.onError(err);
|
|
763
|
-
}
|
|
764
|
-
});
|
|
765
|
-
this.audioManager = new BrowserAudioManager({
|
|
766
|
-
autoGainControl: true,
|
|
767
|
-
echoCancellation: true,
|
|
768
|
-
noiseSuppression: true
|
|
769
|
-
});
|
|
770
|
-
}
|
|
771
758
|
/**
|
|
772
|
-
*
|
|
773
|
-
* This must be called in response to a user guesture (like a click)
|
|
774
|
-
* to satisfy browser AudioContext requirements.
|
|
759
|
+
* Get current audio context state
|
|
775
760
|
*/
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
await this.audioManager.init();
|
|
779
|
-
const connected = await this.client.connect();
|
|
780
|
-
if (!connected) return false;
|
|
781
|
-
this.isConnected = true;
|
|
782
|
-
await this.audioManager.startMicrophone((pcm16Data) => {
|
|
783
|
-
if (this.isConnected) {
|
|
784
|
-
this.client.sendAudio(pcm16Data);
|
|
785
|
-
}
|
|
786
|
-
});
|
|
787
|
-
return true;
|
|
788
|
-
} catch (err) {
|
|
789
|
-
if (this.options.onError) this.options.onError(err);
|
|
790
|
-
return false;
|
|
791
|
-
}
|
|
761
|
+
getState() {
|
|
762
|
+
return this.audioContext?.state ?? null;
|
|
792
763
|
}
|
|
793
764
|
/**
|
|
794
|
-
*
|
|
795
|
-
* Useful for voice activity visualization.
|
|
796
|
-
* @returns value between 0 and 1
|
|
765
|
+
* Check if microphone is currently listening
|
|
797
766
|
*/
|
|
798
|
-
|
|
799
|
-
return this.
|
|
800
|
-
}
|
|
801
|
-
/**
|
|
802
|
-
* Mute or unmute the microphone.
|
|
803
|
-
*/
|
|
804
|
-
toggleMute() {
|
|
805
|
-
const currentState = this.audioManager.isMicMuted();
|
|
806
|
-
this.audioManager.setMuted(!currentState);
|
|
807
|
-
return !currentState;
|
|
808
|
-
}
|
|
809
|
-
/**
|
|
810
|
-
* High-precision method to get visemes that should be active
|
|
811
|
-
* at the current playback frame. Use this in a requestAnimationFrame loop.
|
|
812
|
-
*/
|
|
813
|
-
getFrameVisemes() {
|
|
814
|
-
const offset = this.audioManager.getAudioClockOffset();
|
|
815
|
-
const audioCtx = this.audioManager.getAudioContext();
|
|
816
|
-
if (offset === null || !audioCtx) return [];
|
|
817
|
-
const streamTime = (audioCtx.currentTime - offset) * 1e3;
|
|
818
|
-
const currentBatch = [];
|
|
819
|
-
while (this.visemeQueue.length > 0 && this.visemeQueue[0].t * 1e3 <= streamTime) {
|
|
820
|
-
currentBatch.push(this.visemeQueue.shift());
|
|
821
|
-
}
|
|
822
|
-
return currentBatch;
|
|
823
|
-
}
|
|
824
|
-
/**
|
|
825
|
-
* Change the system prompt mid-conversation.
|
|
826
|
-
*/
|
|
827
|
-
updatePrompt(newPrompt) {
|
|
828
|
-
this.client.updatePrompt(newPrompt);
|
|
829
|
-
}
|
|
830
|
-
/**
|
|
831
|
-
* Disconnect and release audio resources.
|
|
832
|
-
*/
|
|
833
|
-
disconnect() {
|
|
834
|
-
this.isConnected = false;
|
|
835
|
-
this.client.disconnect();
|
|
836
|
-
this.audioManager.cleanup();
|
|
837
|
-
this.visemeQueue = [];
|
|
767
|
+
isRecording() {
|
|
768
|
+
return this.isListening;
|
|
838
769
|
}
|
|
839
770
|
};
|
|
840
771
|
// Annotate the CommonJS export names for ESM import in node:
|
|
@@ -845,7 +776,6 @@ var VoiceAgent = class {
|
|
|
845
776
|
Language,
|
|
846
777
|
StreamResampler,
|
|
847
778
|
TTSClient,
|
|
848
|
-
VoiceAgent,
|
|
849
779
|
VoiceAgentClient,
|
|
850
780
|
VoiceStyle,
|
|
851
781
|
applyLowPassFilter,
|
package/dist/index.mjs
CHANGED
|
@@ -60,13 +60,11 @@ var VoiceAgentClient = class {
|
|
|
60
60
|
messages = [];
|
|
61
61
|
visemeListeners = [];
|
|
62
62
|
wantVisemes = false;
|
|
63
|
-
serverUrl = null;
|
|
64
63
|
constructor(config) {
|
|
65
64
|
this.apiKey = config.apiKey;
|
|
66
65
|
this.prompt = config.prompt;
|
|
67
66
|
this.voice = config.voice || "F1" /* F1 */;
|
|
68
67
|
this.language = config.language || "en" /* ENGLISH */;
|
|
69
|
-
this.serverUrl = config.serverUrl || null;
|
|
70
68
|
this.onTranscription = config.onTranscription;
|
|
71
69
|
this.onResponse = config.onResponse;
|
|
72
70
|
this.onAudioCallback = config.onAudio;
|
|
@@ -81,12 +79,12 @@ var VoiceAgentClient = class {
|
|
|
81
79
|
async connect() {
|
|
82
80
|
return new Promise((resolve, reject) => {
|
|
83
81
|
try {
|
|
84
|
-
let url =
|
|
82
|
+
let url = DEFAULT_URLS.VOICE_AGENT;
|
|
85
83
|
if (this.apiKey) {
|
|
86
84
|
const separator = url.includes("?") ? "&" : "?";
|
|
87
85
|
url += `${separator}api_key=${this.apiKey}`;
|
|
88
86
|
}
|
|
89
|
-
console.log(`\u{1F517} Connecting to ${
|
|
87
|
+
console.log(`\u{1F517} Connecting to ${DEFAULT_URLS.VOICE_AGENT}...`);
|
|
90
88
|
this.ws = new WebSocket(url);
|
|
91
89
|
this.ws.binaryType = "arraybuffer";
|
|
92
90
|
this.ws.onopen = () => {
|
|
@@ -459,8 +457,7 @@ var BrowserAudioManager = class {
|
|
|
459
457
|
// Playback scheduling
|
|
460
458
|
nextPlaybackTime = 0;
|
|
461
459
|
activeSources = [];
|
|
462
|
-
|
|
463
|
-
audioClockOffset = null;
|
|
460
|
+
playbackQueue = [];
|
|
464
461
|
// Configuration
|
|
465
462
|
inputSampleRate;
|
|
466
463
|
outputSampleRate;
|
|
@@ -473,7 +470,6 @@ var BrowserAudioManager = class {
|
|
|
473
470
|
// Audio processing state
|
|
474
471
|
isMuted = false;
|
|
475
472
|
isListening = false;
|
|
476
|
-
resampler = null;
|
|
477
473
|
constructor(config = {}) {
|
|
478
474
|
this.inputSampleRate = config.inputSampleRate ?? AUDIO_CONFIG.SAMPLE_RATE;
|
|
479
475
|
this.outputSampleRate = config.outputSampleRate ?? AUDIO_CONFIG.SPEAKER_SAMPLE_RATE;
|
|
@@ -497,6 +493,7 @@ var BrowserAudioManager = class {
|
|
|
497
493
|
}
|
|
498
494
|
if (this.audioContext.state === "suspended") {
|
|
499
495
|
await this.audioContext.resume();
|
|
496
|
+
console.log("\u{1F442} AudioContext resumed");
|
|
500
497
|
}
|
|
501
498
|
if (analyserConfig?.enabled !== false) {
|
|
502
499
|
this.analyserNode = this.audioContext.createAnalyser();
|
|
@@ -510,7 +507,6 @@ var BrowserAudioManager = class {
|
|
|
510
507
|
if (!this.audioContext) {
|
|
511
508
|
await this.init();
|
|
512
509
|
}
|
|
513
|
-
this.resampler = new StreamResampler(this.audioContext.sampleRate, this.inputSampleRate);
|
|
514
510
|
try {
|
|
515
511
|
this.onAudioInput = onAudioInput;
|
|
516
512
|
this.isListening = true;
|
|
@@ -526,7 +522,9 @@ var BrowserAudioManager = class {
|
|
|
526
522
|
this.scriptProcessor = this.audioContext.createScriptProcessor(
|
|
527
523
|
bufferSize,
|
|
528
524
|
1,
|
|
525
|
+
// input channels
|
|
529
526
|
1
|
|
527
|
+
// output channels
|
|
530
528
|
);
|
|
531
529
|
this.mediaStreamAudioSourceNode.connect(this.scriptProcessor);
|
|
532
530
|
this.scriptProcessor.connect(this.audioContext.destination);
|
|
@@ -543,19 +541,40 @@ var BrowserAudioManager = class {
|
|
|
543
541
|
throw err;
|
|
544
542
|
}
|
|
545
543
|
}
|
|
544
|
+
/**
|
|
545
|
+
* Internal method to process microphone audio data
|
|
546
|
+
*/
|
|
546
547
|
_processAudioInput(event) {
|
|
547
|
-
if (!this.onAudioInput || !this.audioContext || !this.isListening
|
|
548
|
-
|
|
549
|
-
event.
|
|
550
|
-
const
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
548
|
+
if (!this.onAudioInput || !this.audioContext || !this.isListening) return;
|
|
549
|
+
if (this.isMuted) return;
|
|
550
|
+
const inputBuffer = event.inputBuffer;
|
|
551
|
+
const inputData = inputBuffer.getChannelData(0);
|
|
552
|
+
const outputBuffer = event.outputBuffer;
|
|
553
|
+
for (let i = 0; i < outputBuffer.getChannelData(0).length; i++) {
|
|
554
|
+
outputBuffer.getChannelData(0)[i] = 0;
|
|
555
|
+
}
|
|
556
|
+
const hardwareRate = this.audioContext.sampleRate;
|
|
557
|
+
let processedData = new Float32Array(inputData);
|
|
558
|
+
if (hardwareRate !== this.inputSampleRate) {
|
|
559
|
+
processedData = resampleWithAntiAliasing(
|
|
560
|
+
processedData,
|
|
561
|
+
hardwareRate,
|
|
562
|
+
this.inputSampleRate
|
|
563
|
+
);
|
|
554
564
|
}
|
|
565
|
+
const int16Data = float32ToPcm16(processedData);
|
|
566
|
+
const uint8Data = new Uint8Array(
|
|
567
|
+
int16Data.buffer,
|
|
568
|
+
int16Data.byteOffset,
|
|
569
|
+
int16Data.byteLength
|
|
570
|
+
);
|
|
571
|
+
this.onAudioInput(uint8Data);
|
|
555
572
|
}
|
|
573
|
+
/**
|
|
574
|
+
* Stop capturing microphone input
|
|
575
|
+
*/
|
|
556
576
|
stopMicrophone() {
|
|
557
577
|
this.isListening = false;
|
|
558
|
-
this.resampler = null;
|
|
559
578
|
if (this.mediaStream) {
|
|
560
579
|
this.mediaStream.getTracks().forEach((track) => track.stop());
|
|
561
580
|
this.mediaStream = null;
|
|
@@ -568,12 +587,17 @@ var BrowserAudioManager = class {
|
|
|
568
587
|
this.mediaStreamAudioSourceNode.disconnect();
|
|
569
588
|
this.mediaStreamAudioSourceNode = null;
|
|
570
589
|
}
|
|
590
|
+
console.log("\u{1F3A4} Microphone stopped");
|
|
571
591
|
}
|
|
572
592
|
/**
|
|
573
593
|
* Play back audio received from the server
|
|
594
|
+
* @param pcm16Data Int16 PCM audio data at SPEAKER_SAMPLE_RATE
|
|
574
595
|
*/
|
|
575
596
|
playAudio(pcm16Data) {
|
|
576
|
-
if (!this.audioContext)
|
|
597
|
+
if (!this.audioContext) {
|
|
598
|
+
console.warn("AudioContext not initialized");
|
|
599
|
+
return;
|
|
600
|
+
}
|
|
577
601
|
const int16Array = new Int16Array(
|
|
578
602
|
pcm16Data.buffer,
|
|
579
603
|
pcm16Data.byteOffset,
|
|
@@ -588,17 +612,18 @@ var BrowserAudioManager = class {
|
|
|
588
612
|
audioBuffer.getChannelData(0).set(float32Data);
|
|
589
613
|
this._schedulePlayback(audioBuffer);
|
|
590
614
|
}
|
|
615
|
+
/**
|
|
616
|
+
* Internal method to schedule and play audio with sample-accurate timing
|
|
617
|
+
*/
|
|
591
618
|
_schedulePlayback(audioBuffer) {
|
|
592
619
|
if (!this.audioContext) return;
|
|
593
620
|
const currentTime = this.audioContext.currentTime;
|
|
594
621
|
const duration = audioBuffer.length / this.outputSampleRate;
|
|
595
622
|
const startTime = Math.max(
|
|
596
623
|
currentTime + 0.01,
|
|
624
|
+
// Minimum 10ms delay
|
|
597
625
|
this.nextPlaybackTime
|
|
598
626
|
);
|
|
599
|
-
if (this.audioClockOffset === null) {
|
|
600
|
-
this.audioClockOffset = startTime;
|
|
601
|
-
}
|
|
602
627
|
this.nextPlaybackTime = startTime + duration;
|
|
603
628
|
const source = this.audioContext.createBufferSource();
|
|
604
629
|
source.buffer = audioBuffer;
|
|
@@ -616,18 +641,8 @@ var BrowserAudioManager = class {
|
|
|
616
641
|
};
|
|
617
642
|
}
|
|
618
643
|
/**
|
|
619
|
-
*
|
|
620
|
-
* Total stream time (in ms) = (audioContext.currentTime - audioClockOffset) * 1000
|
|
621
|
-
*/
|
|
622
|
-
getAudioClockOffset() {
|
|
623
|
-
return this.audioClockOffset;
|
|
624
|
-
}
|
|
625
|
-
/**
|
|
626
|
-
* Reset the audio clock offset (call when a response is interrupted or finished)
|
|
644
|
+
* Stop all currently playing audio and clear the queue
|
|
627
645
|
*/
|
|
628
|
-
resetAudioClock() {
|
|
629
|
-
this.audioClockOffset = null;
|
|
630
|
-
}
|
|
631
646
|
stopPlayback() {
|
|
632
647
|
this.activeSources.forEach((source) => {
|
|
633
648
|
try {
|
|
@@ -636,15 +651,26 @@ var BrowserAudioManager = class {
|
|
|
636
651
|
}
|
|
637
652
|
});
|
|
638
653
|
this.activeSources = [];
|
|
639
|
-
this.
|
|
640
|
-
this.
|
|
654
|
+
this.playbackQueue = [];
|
|
655
|
+
this.nextPlaybackTime = this.audioContext?.currentTime ?? 0;
|
|
656
|
+
console.log("\u{1F507} Playback stopped");
|
|
641
657
|
}
|
|
658
|
+
/**
|
|
659
|
+
* Toggle mute state
|
|
660
|
+
*/
|
|
642
661
|
setMuted(muted) {
|
|
643
662
|
this.isMuted = muted;
|
|
644
663
|
}
|
|
664
|
+
/**
|
|
665
|
+
* Get current mute state
|
|
666
|
+
*/
|
|
645
667
|
isMicMuted() {
|
|
646
668
|
return this.isMuted;
|
|
647
669
|
}
|
|
670
|
+
/**
|
|
671
|
+
* Get current amplitude from analyser (for visualization)
|
|
672
|
+
* Returns value between 0 and 1
|
|
673
|
+
*/
|
|
648
674
|
getAmplitude() {
|
|
649
675
|
if (!this.analyserNode) return 0;
|
|
650
676
|
const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
|
|
@@ -652,18 +678,31 @@ var BrowserAudioManager = class {
|
|
|
652
678
|
const rms = calculateRMS(dataArray);
|
|
653
679
|
return Math.min(rms * 10, 1);
|
|
654
680
|
}
|
|
681
|
+
/**
|
|
682
|
+
* Get frequency data from analyser for visualization
|
|
683
|
+
*/
|
|
655
684
|
getFrequencyData() {
|
|
656
|
-
if (!this.analyserNode)
|
|
685
|
+
if (!this.analyserNode) {
|
|
686
|
+
return new Uint8Array(0);
|
|
687
|
+
}
|
|
657
688
|
const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
|
|
658
689
|
this.analyserNode.getByteFrequencyData(dataArray);
|
|
659
690
|
return dataArray;
|
|
660
691
|
}
|
|
692
|
+
/**
|
|
693
|
+
* Get time-domain data from analyser for waveform visualization
|
|
694
|
+
*/
|
|
661
695
|
getWaveformData() {
|
|
662
|
-
if (!this.analyserNode)
|
|
696
|
+
if (!this.analyserNode) {
|
|
697
|
+
return new Uint8Array(0);
|
|
698
|
+
}
|
|
663
699
|
const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
|
|
664
700
|
this.analyserNode.getByteTimeDomainData(dataArray);
|
|
665
701
|
return dataArray;
|
|
666
702
|
}
|
|
703
|
+
/**
|
|
704
|
+
* Cleanup and close AudioContext
|
|
705
|
+
*/
|
|
667
706
|
cleanup() {
|
|
668
707
|
this.stopMicrophone();
|
|
669
708
|
this.stopPlayback();
|
|
@@ -672,124 +711,17 @@ var BrowserAudioManager = class {
|
|
|
672
711
|
this.analyserNode = null;
|
|
673
712
|
}
|
|
674
713
|
}
|
|
675
|
-
getAudioContext() {
|
|
676
|
-
return this.audioContext;
|
|
677
|
-
}
|
|
678
|
-
};
|
|
679
|
-
|
|
680
|
-
// src/voice-agent.ts
|
|
681
|
-
var VoiceAgent = class {
|
|
682
|
-
client;
|
|
683
|
-
audioManager;
|
|
684
|
-
options;
|
|
685
|
-
isConnected = false;
|
|
686
|
-
visemeQueue = [];
|
|
687
|
-
constructor(options) {
|
|
688
|
-
this.options = options;
|
|
689
|
-
this.client = new VoiceAgentClient({
|
|
690
|
-
apiKey: options.apiKey,
|
|
691
|
-
prompt: options.prompt || "You are a helpful and friendly AI assistant.",
|
|
692
|
-
voice: options.voice || "F1" /* F1 */,
|
|
693
|
-
language: options.language || "en" /* ENGLISH */,
|
|
694
|
-
visemes: options.visemes ?? true,
|
|
695
|
-
serverUrl: options.serverUrl,
|
|
696
|
-
onTranscription: (text) => {
|
|
697
|
-
if (options.onTranscription) options.onTranscription(text, true);
|
|
698
|
-
},
|
|
699
|
-
onResponse: (text) => {
|
|
700
|
-
if (options.onTranscription) options.onTranscription(text, false);
|
|
701
|
-
},
|
|
702
|
-
onAudio: (data) => {
|
|
703
|
-
this.audioManager.playAudio(data);
|
|
704
|
-
},
|
|
705
|
-
onVisemes: (visemes) => {
|
|
706
|
-
this.visemeQueue.push(...visemes);
|
|
707
|
-
if (options.onVisemes) options.onVisemes(visemes);
|
|
708
|
-
},
|
|
709
|
-
onStatus: (status) => {
|
|
710
|
-
if (options.onStatusChange) options.onStatusChange(status);
|
|
711
|
-
if (status === "interrupted" || status === "thinking") {
|
|
712
|
-
this.audioManager.stopPlayback();
|
|
713
|
-
this.visemeQueue = [];
|
|
714
|
-
}
|
|
715
|
-
},
|
|
716
|
-
onError: (err) => {
|
|
717
|
-
if (options.onError) options.onError(err);
|
|
718
|
-
}
|
|
719
|
-
});
|
|
720
|
-
this.audioManager = new BrowserAudioManager({
|
|
721
|
-
autoGainControl: true,
|
|
722
|
-
echoCancellation: true,
|
|
723
|
-
noiseSuppression: true
|
|
724
|
-
});
|
|
725
|
-
}
|
|
726
714
|
/**
|
|
727
|
-
*
|
|
728
|
-
* This must be called in response to a user guesture (like a click)
|
|
729
|
-
* to satisfy browser AudioContext requirements.
|
|
715
|
+
* Get current audio context state
|
|
730
716
|
*/
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
await this.audioManager.init();
|
|
734
|
-
const connected = await this.client.connect();
|
|
735
|
-
if (!connected) return false;
|
|
736
|
-
this.isConnected = true;
|
|
737
|
-
await this.audioManager.startMicrophone((pcm16Data) => {
|
|
738
|
-
if (this.isConnected) {
|
|
739
|
-
this.client.sendAudio(pcm16Data);
|
|
740
|
-
}
|
|
741
|
-
});
|
|
742
|
-
return true;
|
|
743
|
-
} catch (err) {
|
|
744
|
-
if (this.options.onError) this.options.onError(err);
|
|
745
|
-
return false;
|
|
746
|
-
}
|
|
717
|
+
getState() {
|
|
718
|
+
return this.audioContext?.state ?? null;
|
|
747
719
|
}
|
|
748
720
|
/**
|
|
749
|
-
*
|
|
750
|
-
* Useful for voice activity visualization.
|
|
751
|
-
* @returns value between 0 and 1
|
|
721
|
+
* Check if microphone is currently listening
|
|
752
722
|
*/
|
|
753
|
-
|
|
754
|
-
return this.
|
|
755
|
-
}
|
|
756
|
-
/**
|
|
757
|
-
* Mute or unmute the microphone.
|
|
758
|
-
*/
|
|
759
|
-
toggleMute() {
|
|
760
|
-
const currentState = this.audioManager.isMicMuted();
|
|
761
|
-
this.audioManager.setMuted(!currentState);
|
|
762
|
-
return !currentState;
|
|
763
|
-
}
|
|
764
|
-
/**
|
|
765
|
-
* High-precision method to get visemes that should be active
|
|
766
|
-
* at the current playback frame. Use this in a requestAnimationFrame loop.
|
|
767
|
-
*/
|
|
768
|
-
getFrameVisemes() {
|
|
769
|
-
const offset = this.audioManager.getAudioClockOffset();
|
|
770
|
-
const audioCtx = this.audioManager.getAudioContext();
|
|
771
|
-
if (offset === null || !audioCtx) return [];
|
|
772
|
-
const streamTime = (audioCtx.currentTime - offset) * 1e3;
|
|
773
|
-
const currentBatch = [];
|
|
774
|
-
while (this.visemeQueue.length > 0 && this.visemeQueue[0].t * 1e3 <= streamTime) {
|
|
775
|
-
currentBatch.push(this.visemeQueue.shift());
|
|
776
|
-
}
|
|
777
|
-
return currentBatch;
|
|
778
|
-
}
|
|
779
|
-
/**
|
|
780
|
-
* Change the system prompt mid-conversation.
|
|
781
|
-
*/
|
|
782
|
-
updatePrompt(newPrompt) {
|
|
783
|
-
this.client.updatePrompt(newPrompt);
|
|
784
|
-
}
|
|
785
|
-
/**
|
|
786
|
-
* Disconnect and release audio resources.
|
|
787
|
-
*/
|
|
788
|
-
disconnect() {
|
|
789
|
-
this.isConnected = false;
|
|
790
|
-
this.client.disconnect();
|
|
791
|
-
this.audioManager.cleanup();
|
|
792
|
-
this.visemeQueue = [];
|
|
723
|
+
isRecording() {
|
|
724
|
+
return this.isListening;
|
|
793
725
|
}
|
|
794
726
|
};
|
|
795
727
|
export {
|
|
@@ -799,7 +731,6 @@ export {
|
|
|
799
731
|
Language,
|
|
800
732
|
StreamResampler,
|
|
801
733
|
TTSClient,
|
|
802
|
-
VoiceAgent,
|
|
803
734
|
VoiceAgentClient,
|
|
804
735
|
VoiceStyle,
|
|
805
736
|
applyLowPassFilter,
|