@lokutor/sdk 1.1.8 → 1.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +51 -31
- package/dist/index.d.ts +51 -31
- package/dist/index.js +150 -80
- package/dist/index.mjs +149 -80
- package/package.json +1 -1
package/dist/index.d.mts
CHANGED
|
@@ -119,11 +119,13 @@ declare class VoiceAgentClient {
|
|
|
119
119
|
private messages;
|
|
120
120
|
private visemeListeners;
|
|
121
121
|
private wantVisemes;
|
|
122
|
+
private serverUrl;
|
|
122
123
|
constructor(config: LokutorConfig & {
|
|
123
124
|
prompt: string;
|
|
124
125
|
voice?: VoiceStyle;
|
|
125
126
|
language?: Language;
|
|
126
127
|
visemes?: boolean;
|
|
128
|
+
serverUrl?: string;
|
|
127
129
|
onVisemes?: (visemes: Viseme[]) => void;
|
|
128
130
|
});
|
|
129
131
|
/**
|
|
@@ -326,7 +328,7 @@ declare class BrowserAudioManager {
|
|
|
326
328
|
private mediaStream;
|
|
327
329
|
private nextPlaybackTime;
|
|
328
330
|
private activeSources;
|
|
329
|
-
private
|
|
331
|
+
private audioClockOffset;
|
|
330
332
|
private inputSampleRate;
|
|
331
333
|
private outputSampleRate;
|
|
332
334
|
private autoGainControl;
|
|
@@ -336,6 +338,7 @@ declare class BrowserAudioManager {
|
|
|
336
338
|
private onInputError?;
|
|
337
339
|
private isMuted;
|
|
338
340
|
private isListening;
|
|
341
|
+
private resampler;
|
|
339
342
|
constructor(config?: BrowserAudioConfig);
|
|
340
343
|
/**
|
|
341
344
|
* Initialize the AudioContext and analyser
|
|
@@ -345,60 +348,77 @@ declare class BrowserAudioManager {
|
|
|
345
348
|
* Start capturing audio from the microphone
|
|
346
349
|
*/
|
|
347
350
|
startMicrophone(onAudioInput: (pcm16Data: Uint8Array) => void): Promise<void>;
|
|
348
|
-
/**
|
|
349
|
-
* Internal method to process microphone audio data
|
|
350
|
-
*/
|
|
351
351
|
private _processAudioInput;
|
|
352
|
-
/**
|
|
353
|
-
* Stop capturing microphone input
|
|
354
|
-
*/
|
|
355
352
|
stopMicrophone(): void;
|
|
356
353
|
/**
|
|
357
354
|
* Play back audio received from the server
|
|
358
|
-
* @param pcm16Data Int16 PCM audio data at SPEAKER_SAMPLE_RATE
|
|
359
355
|
*/
|
|
360
356
|
playAudio(pcm16Data: Uint8Array): void;
|
|
361
|
-
/**
|
|
362
|
-
* Internal method to schedule and play audio with sample-accurate timing
|
|
363
|
-
*/
|
|
364
357
|
private _schedulePlayback;
|
|
365
358
|
/**
|
|
366
|
-
*
|
|
359
|
+
* Get the current high-precision audio clock offset for viseme synchronization.
|
|
360
|
+
* Total stream time (in ms) = (audioContext.currentTime - audioClockOffset) * 1000
|
|
367
361
|
*/
|
|
368
|
-
|
|
362
|
+
getAudioClockOffset(): number | null;
|
|
369
363
|
/**
|
|
370
|
-
*
|
|
364
|
+
* Reset the audio clock offset (call when a response is interrupted or finished)
|
|
371
365
|
*/
|
|
366
|
+
resetAudioClock(): void;
|
|
367
|
+
stopPlayback(): void;
|
|
372
368
|
setMuted(muted: boolean): void;
|
|
373
|
-
/**
|
|
374
|
-
* Get current mute state
|
|
375
|
-
*/
|
|
376
369
|
isMicMuted(): boolean;
|
|
370
|
+
getAmplitude(): number;
|
|
371
|
+
getFrequencyData(): Uint8Array;
|
|
372
|
+
getWaveformData(): Uint8Array;
|
|
373
|
+
cleanup(): void;
|
|
374
|
+
getAudioContext(): AudioContext | null;
|
|
375
|
+
}
|
|
376
|
+
|
|
377
|
+
/**
|
|
378
|
+
* High-level AI Voice Agent for browser-based conversations.
|
|
379
|
+
*
|
|
380
|
+
* This class orchestrates microphone input, AI processing, and
|
|
381
|
+
* speaker output, providing a simple interface for building
|
|
382
|
+
* voice assistants with lip-sync support.
|
|
383
|
+
*/
|
|
384
|
+
declare class VoiceAgent {
|
|
385
|
+
private client;
|
|
386
|
+
private audioManager;
|
|
387
|
+
private options;
|
|
388
|
+
private isConnected;
|
|
389
|
+
private visemeQueue;
|
|
390
|
+
constructor(options: VoiceAgentOptions & {
|
|
391
|
+
apiKey: string;
|
|
392
|
+
});
|
|
377
393
|
/**
|
|
378
|
-
*
|
|
379
|
-
*
|
|
394
|
+
* Initialize hardware and connect to the AI server.
|
|
395
|
+
* This must be called in response to a user gesture (like a click)
|
|
396
|
+
* to satisfy browser AudioContext requirements.
|
|
380
397
|
*/
|
|
381
|
-
|
|
398
|
+
connect(): Promise<boolean>;
|
|
382
399
|
/**
|
|
383
|
-
* Get
|
|
400
|
+
* Get the current amplitude/volume of the microphone or output audio.
|
|
401
|
+
* Useful for voice activity visualization.
|
|
402
|
+
* @returns value between 0 and 1
|
|
384
403
|
*/
|
|
385
|
-
|
|
404
|
+
getAmplitude(): number;
|
|
386
405
|
/**
|
|
387
|
-
*
|
|
406
|
+
* Mute or unmute the microphone.
|
|
388
407
|
*/
|
|
389
|
-
|
|
408
|
+
toggleMute(): boolean;
|
|
390
409
|
/**
|
|
391
|
-
*
|
|
410
|
+
* High-precision method to get visemes that should be active
|
|
411
|
+
* at the current playback frame. Use this in a requestAnimationFrame loop.
|
|
392
412
|
*/
|
|
393
|
-
|
|
413
|
+
getFrameVisemes(): Viseme[];
|
|
394
414
|
/**
|
|
395
|
-
*
|
|
415
|
+
* Change the system prompt mid-conversation.
|
|
396
416
|
*/
|
|
397
|
-
|
|
417
|
+
updatePrompt(newPrompt: string): void;
|
|
398
418
|
/**
|
|
399
|
-
*
|
|
419
|
+
* Disconnect and release audio resources.
|
|
400
420
|
*/
|
|
401
|
-
|
|
421
|
+
disconnect(): void;
|
|
402
422
|
}
|
|
403
423
|
|
|
404
|
-
export { AUDIO_CONFIG, type AnalyserConfig, type BrowserAudioConfig, BrowserAudioManager, type BrowserAudioOptions, DEFAULT_URLS, Language, type LokutorConfig, StreamResampler, type SynthesizeOptions, TTSClient, type Viseme, VoiceAgentClient, type VoiceAgentOptions, VoiceStyle, applyLowPassFilter, bytesToPcm16, calculateRMS, float32ToPcm16, normalizeAudio, pcm16ToBytes, pcm16ToFloat32, resample, resampleWithAntiAliasing, simpleConversation, simpleTTS };
|
|
424
|
+
export { AUDIO_CONFIG, type AnalyserConfig, type BrowserAudioConfig, BrowserAudioManager, type BrowserAudioOptions, DEFAULT_URLS, Language, type LokutorConfig, StreamResampler, type SynthesizeOptions, TTSClient, type Viseme, VoiceAgent, VoiceAgentClient, type VoiceAgentOptions, VoiceStyle, applyLowPassFilter, bytesToPcm16, calculateRMS, float32ToPcm16, normalizeAudio, pcm16ToBytes, pcm16ToFloat32, resample, resampleWithAntiAliasing, simpleConversation, simpleTTS };
|
package/dist/index.d.ts
CHANGED
|
@@ -119,11 +119,13 @@ declare class VoiceAgentClient {
|
|
|
119
119
|
private messages;
|
|
120
120
|
private visemeListeners;
|
|
121
121
|
private wantVisemes;
|
|
122
|
+
private serverUrl;
|
|
122
123
|
constructor(config: LokutorConfig & {
|
|
123
124
|
prompt: string;
|
|
124
125
|
voice?: VoiceStyle;
|
|
125
126
|
language?: Language;
|
|
126
127
|
visemes?: boolean;
|
|
128
|
+
serverUrl?: string;
|
|
127
129
|
onVisemes?: (visemes: Viseme[]) => void;
|
|
128
130
|
});
|
|
129
131
|
/**
|
|
@@ -326,7 +328,7 @@ declare class BrowserAudioManager {
|
|
|
326
328
|
private mediaStream;
|
|
327
329
|
private nextPlaybackTime;
|
|
328
330
|
private activeSources;
|
|
329
|
-
private
|
|
331
|
+
private audioClockOffset;
|
|
330
332
|
private inputSampleRate;
|
|
331
333
|
private outputSampleRate;
|
|
332
334
|
private autoGainControl;
|
|
@@ -336,6 +338,7 @@ declare class BrowserAudioManager {
|
|
|
336
338
|
private onInputError?;
|
|
337
339
|
private isMuted;
|
|
338
340
|
private isListening;
|
|
341
|
+
private resampler;
|
|
339
342
|
constructor(config?: BrowserAudioConfig);
|
|
340
343
|
/**
|
|
341
344
|
* Initialize the AudioContext and analyser
|
|
@@ -345,60 +348,77 @@ declare class BrowserAudioManager {
|
|
|
345
348
|
* Start capturing audio from the microphone
|
|
346
349
|
*/
|
|
347
350
|
startMicrophone(onAudioInput: (pcm16Data: Uint8Array) => void): Promise<void>;
|
|
348
|
-
/**
|
|
349
|
-
* Internal method to process microphone audio data
|
|
350
|
-
*/
|
|
351
351
|
private _processAudioInput;
|
|
352
|
-
/**
|
|
353
|
-
* Stop capturing microphone input
|
|
354
|
-
*/
|
|
355
352
|
stopMicrophone(): void;
|
|
356
353
|
/**
|
|
357
354
|
* Play back audio received from the server
|
|
358
|
-
* @param pcm16Data Int16 PCM audio data at SPEAKER_SAMPLE_RATE
|
|
359
355
|
*/
|
|
360
356
|
playAudio(pcm16Data: Uint8Array): void;
|
|
361
|
-
/**
|
|
362
|
-
* Internal method to schedule and play audio with sample-accurate timing
|
|
363
|
-
*/
|
|
364
357
|
private _schedulePlayback;
|
|
365
358
|
/**
|
|
366
|
-
*
|
|
359
|
+
* Get the current high-precision audio clock offset for viseme synchronization.
|
|
360
|
+
* Total stream time (in ms) = (audioContext.currentTime - audioClockOffset) * 1000
|
|
367
361
|
*/
|
|
368
|
-
|
|
362
|
+
getAudioClockOffset(): number | null;
|
|
369
363
|
/**
|
|
370
|
-
*
|
|
364
|
+
* Reset the audio clock offset (call when a response is interrupted or finished)
|
|
371
365
|
*/
|
|
366
|
+
resetAudioClock(): void;
|
|
367
|
+
stopPlayback(): void;
|
|
372
368
|
setMuted(muted: boolean): void;
|
|
373
|
-
/**
|
|
374
|
-
* Get current mute state
|
|
375
|
-
*/
|
|
376
369
|
isMicMuted(): boolean;
|
|
370
|
+
getAmplitude(): number;
|
|
371
|
+
getFrequencyData(): Uint8Array;
|
|
372
|
+
getWaveformData(): Uint8Array;
|
|
373
|
+
cleanup(): void;
|
|
374
|
+
getAudioContext(): AudioContext | null;
|
|
375
|
+
}
|
|
376
|
+
|
|
377
|
+
/**
|
|
378
|
+
* High-level AI Voice Agent for browser-based conversations.
|
|
379
|
+
*
|
|
380
|
+
* This class orchestrates microphone input, AI processing, and
|
|
381
|
+
* speaker output, providing a simple interface for building
|
|
382
|
+
* voice assistants with lip-sync support.
|
|
383
|
+
*/
|
|
384
|
+
declare class VoiceAgent {
|
|
385
|
+
private client;
|
|
386
|
+
private audioManager;
|
|
387
|
+
private options;
|
|
388
|
+
private isConnected;
|
|
389
|
+
private visemeQueue;
|
|
390
|
+
constructor(options: VoiceAgentOptions & {
|
|
391
|
+
apiKey: string;
|
|
392
|
+
});
|
|
377
393
|
/**
|
|
378
|
-
*
|
|
379
|
-
*
|
|
394
|
+
* Initialize hardware and connect to the AI server.
|
|
395
|
+
* This must be called in response to a user gesture (like a click)
|
|
396
|
+
* to satisfy browser AudioContext requirements.
|
|
380
397
|
*/
|
|
381
|
-
|
|
398
|
+
connect(): Promise<boolean>;
|
|
382
399
|
/**
|
|
383
|
-
* Get
|
|
400
|
+
* Get the current amplitude/volume of the microphone or output audio.
|
|
401
|
+
* Useful for voice activity visualization.
|
|
402
|
+
* @returns value between 0 and 1
|
|
384
403
|
*/
|
|
385
|
-
|
|
404
|
+
getAmplitude(): number;
|
|
386
405
|
/**
|
|
387
|
-
*
|
|
406
|
+
* Mute or unmute the microphone.
|
|
388
407
|
*/
|
|
389
|
-
|
|
408
|
+
toggleMute(): boolean;
|
|
390
409
|
/**
|
|
391
|
-
*
|
|
410
|
+
* High-precision method to get visemes that should be active
|
|
411
|
+
* at the current playback frame. Use this in a requestAnimationFrame loop.
|
|
392
412
|
*/
|
|
393
|
-
|
|
413
|
+
getFrameVisemes(): Viseme[];
|
|
394
414
|
/**
|
|
395
|
-
*
|
|
415
|
+
* Change the system prompt mid-conversation.
|
|
396
416
|
*/
|
|
397
|
-
|
|
417
|
+
updatePrompt(newPrompt: string): void;
|
|
398
418
|
/**
|
|
399
|
-
*
|
|
419
|
+
* Disconnect and release audio resources.
|
|
400
420
|
*/
|
|
401
|
-
|
|
421
|
+
disconnect(): void;
|
|
402
422
|
}
|
|
403
423
|
|
|
404
|
-
export { AUDIO_CONFIG, type AnalyserConfig, type BrowserAudioConfig, BrowserAudioManager, type BrowserAudioOptions, DEFAULT_URLS, Language, type LokutorConfig, StreamResampler, type SynthesizeOptions, TTSClient, type Viseme, VoiceAgentClient, type VoiceAgentOptions, VoiceStyle, applyLowPassFilter, bytesToPcm16, calculateRMS, float32ToPcm16, normalizeAudio, pcm16ToBytes, pcm16ToFloat32, resample, resampleWithAntiAliasing, simpleConversation, simpleTTS };
|
|
424
|
+
export { AUDIO_CONFIG, type AnalyserConfig, type BrowserAudioConfig, BrowserAudioManager, type BrowserAudioOptions, DEFAULT_URLS, Language, type LokutorConfig, StreamResampler, type SynthesizeOptions, TTSClient, type Viseme, VoiceAgent, VoiceAgentClient, type VoiceAgentOptions, VoiceStyle, applyLowPassFilter, bytesToPcm16, calculateRMS, float32ToPcm16, normalizeAudio, pcm16ToBytes, pcm16ToFloat32, resample, resampleWithAntiAliasing, simpleConversation, simpleTTS };
|
package/dist/index.js
CHANGED
|
@@ -26,6 +26,7 @@ __export(index_exports, {
|
|
|
26
26
|
Language: () => Language,
|
|
27
27
|
StreamResampler: () => StreamResampler,
|
|
28
28
|
TTSClient: () => TTSClient,
|
|
29
|
+
VoiceAgent: () => VoiceAgent,
|
|
29
30
|
VoiceAgentClient: () => VoiceAgentClient,
|
|
30
31
|
VoiceStyle: () => VoiceStyle,
|
|
31
32
|
applyLowPassFilter: () => applyLowPassFilter,
|
|
@@ -104,11 +105,13 @@ var VoiceAgentClient = class {
|
|
|
104
105
|
messages = [];
|
|
105
106
|
visemeListeners = [];
|
|
106
107
|
wantVisemes = false;
|
|
108
|
+
serverUrl = null;
|
|
107
109
|
constructor(config) {
|
|
108
110
|
this.apiKey = config.apiKey;
|
|
109
111
|
this.prompt = config.prompt;
|
|
110
112
|
this.voice = config.voice || "F1" /* F1 */;
|
|
111
113
|
this.language = config.language || "en" /* ENGLISH */;
|
|
114
|
+
this.serverUrl = config.serverUrl || null;
|
|
112
115
|
this.onTranscription = config.onTranscription;
|
|
113
116
|
this.onResponse = config.onResponse;
|
|
114
117
|
this.onAudioCallback = config.onAudio;
|
|
@@ -123,12 +126,12 @@ var VoiceAgentClient = class {
|
|
|
123
126
|
async connect() {
|
|
124
127
|
return new Promise((resolve, reject) => {
|
|
125
128
|
try {
|
|
126
|
-
let url = DEFAULT_URLS.VOICE_AGENT;
|
|
129
|
+
let url = this.serverUrl || DEFAULT_URLS.VOICE_AGENT;
|
|
127
130
|
if (this.apiKey) {
|
|
128
131
|
const separator = url.includes("?") ? "&" : "?";
|
|
129
132
|
url += `${separator}api_key=${this.apiKey}`;
|
|
130
133
|
}
|
|
131
|
-
console.log(`\u{1F517} Connecting to ${
|
|
134
|
+
console.log(`\u{1F517} Connecting to ${url}...`);
|
|
132
135
|
this.ws = new WebSocket(url);
|
|
133
136
|
this.ws.binaryType = "arraybuffer";
|
|
134
137
|
this.ws.onopen = () => {
|
|
@@ -501,7 +504,8 @@ var BrowserAudioManager = class {
|
|
|
501
504
|
// Playback scheduling
|
|
502
505
|
nextPlaybackTime = 0;
|
|
503
506
|
activeSources = [];
|
|
504
|
-
|
|
507
|
+
// High-precision clock anchor for viseme sync
|
|
508
|
+
audioClockOffset = null;
|
|
505
509
|
// Configuration
|
|
506
510
|
inputSampleRate;
|
|
507
511
|
outputSampleRate;
|
|
@@ -514,6 +518,7 @@ var BrowserAudioManager = class {
|
|
|
514
518
|
// Audio processing state
|
|
515
519
|
isMuted = false;
|
|
516
520
|
isListening = false;
|
|
521
|
+
resampler = null;
|
|
517
522
|
constructor(config = {}) {
|
|
518
523
|
this.inputSampleRate = config.inputSampleRate ?? AUDIO_CONFIG.SAMPLE_RATE;
|
|
519
524
|
this.outputSampleRate = config.outputSampleRate ?? AUDIO_CONFIG.SPEAKER_SAMPLE_RATE;
|
|
@@ -537,7 +542,6 @@ var BrowserAudioManager = class {
|
|
|
537
542
|
}
|
|
538
543
|
if (this.audioContext.state === "suspended") {
|
|
539
544
|
await this.audioContext.resume();
|
|
540
|
-
console.log("\u{1F442} AudioContext resumed");
|
|
541
545
|
}
|
|
542
546
|
if (analyserConfig?.enabled !== false) {
|
|
543
547
|
this.analyserNode = this.audioContext.createAnalyser();
|
|
@@ -551,6 +555,7 @@ var BrowserAudioManager = class {
|
|
|
551
555
|
if (!this.audioContext) {
|
|
552
556
|
await this.init();
|
|
553
557
|
}
|
|
558
|
+
this.resampler = new StreamResampler(this.audioContext.sampleRate, this.inputSampleRate);
|
|
554
559
|
try {
|
|
555
560
|
this.onAudioInput = onAudioInput;
|
|
556
561
|
this.isListening = true;
|
|
@@ -566,9 +571,7 @@ var BrowserAudioManager = class {
|
|
|
566
571
|
this.scriptProcessor = this.audioContext.createScriptProcessor(
|
|
567
572
|
bufferSize,
|
|
568
573
|
1,
|
|
569
|
-
// input channels
|
|
570
574
|
1
|
|
571
|
-
// output channels
|
|
572
575
|
);
|
|
573
576
|
this.mediaStreamAudioSourceNode.connect(this.scriptProcessor);
|
|
574
577
|
this.scriptProcessor.connect(this.audioContext.destination);
|
|
@@ -585,40 +588,19 @@ var BrowserAudioManager = class {
|
|
|
585
588
|
throw err;
|
|
586
589
|
}
|
|
587
590
|
}
|
|
588
|
-
/**
|
|
589
|
-
* Internal method to process microphone audio data
|
|
590
|
-
*/
|
|
591
591
|
_processAudioInput(event) {
|
|
592
|
-
if (!this.onAudioInput || !this.audioContext || !this.isListening) return;
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
const
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
}
|
|
600
|
-
const hardwareRate = this.audioContext.sampleRate;
|
|
601
|
-
let processedData = new Float32Array(inputData);
|
|
602
|
-
if (hardwareRate !== this.inputSampleRate) {
|
|
603
|
-
processedData = resampleWithAntiAliasing(
|
|
604
|
-
processedData,
|
|
605
|
-
hardwareRate,
|
|
606
|
-
this.inputSampleRate
|
|
607
|
-
);
|
|
592
|
+
if (!this.onAudioInput || !this.audioContext || !this.isListening || this.isMuted) return;
|
|
593
|
+
const inputData = event.inputBuffer.getChannelData(0);
|
|
594
|
+
event.outputBuffer.getChannelData(0).fill(0);
|
|
595
|
+
const resampled = this.resampler ? this.resampler.process(inputData) : inputData;
|
|
596
|
+
if (resampled && resampled.length > 0) {
|
|
597
|
+
const int16Data = float32ToPcm16(resampled);
|
|
598
|
+
this.onAudioInput(new Uint8Array(int16Data.buffer, int16Data.byteOffset, int16Data.byteLength));
|
|
608
599
|
}
|
|
609
|
-
const int16Data = float32ToPcm16(processedData);
|
|
610
|
-
const uint8Data = new Uint8Array(
|
|
611
|
-
int16Data.buffer,
|
|
612
|
-
int16Data.byteOffset,
|
|
613
|
-
int16Data.byteLength
|
|
614
|
-
);
|
|
615
|
-
this.onAudioInput(uint8Data);
|
|
616
600
|
}
|
|
617
|
-
/**
|
|
618
|
-
* Stop capturing microphone input
|
|
619
|
-
*/
|
|
620
601
|
stopMicrophone() {
|
|
621
602
|
this.isListening = false;
|
|
603
|
+
this.resampler = null;
|
|
622
604
|
if (this.mediaStream) {
|
|
623
605
|
this.mediaStream.getTracks().forEach((track) => track.stop());
|
|
624
606
|
this.mediaStream = null;
|
|
@@ -631,17 +613,12 @@ var BrowserAudioManager = class {
|
|
|
631
613
|
this.mediaStreamAudioSourceNode.disconnect();
|
|
632
614
|
this.mediaStreamAudioSourceNode = null;
|
|
633
615
|
}
|
|
634
|
-
console.log("\u{1F3A4} Microphone stopped");
|
|
635
616
|
}
|
|
636
617
|
/**
|
|
637
618
|
* Play back audio received from the server
|
|
638
|
-
* @param pcm16Data Int16 PCM audio data at SPEAKER_SAMPLE_RATE
|
|
639
619
|
*/
|
|
640
620
|
playAudio(pcm16Data) {
|
|
641
|
-
if (!this.audioContext)
|
|
642
|
-
console.warn("AudioContext not initialized");
|
|
643
|
-
return;
|
|
644
|
-
}
|
|
621
|
+
if (!this.audioContext) return;
|
|
645
622
|
const int16Array = new Int16Array(
|
|
646
623
|
pcm16Data.buffer,
|
|
647
624
|
pcm16Data.byteOffset,
|
|
@@ -656,18 +633,17 @@ var BrowserAudioManager = class {
|
|
|
656
633
|
audioBuffer.getChannelData(0).set(float32Data);
|
|
657
634
|
this._schedulePlayback(audioBuffer);
|
|
658
635
|
}
|
|
659
|
-
/**
|
|
660
|
-
* Internal method to schedule and play audio with sample-accurate timing
|
|
661
|
-
*/
|
|
662
636
|
_schedulePlayback(audioBuffer) {
|
|
663
637
|
if (!this.audioContext) return;
|
|
664
638
|
const currentTime = this.audioContext.currentTime;
|
|
665
639
|
const duration = audioBuffer.length / this.outputSampleRate;
|
|
666
640
|
const startTime = Math.max(
|
|
667
641
|
currentTime + 0.01,
|
|
668
|
-
// Minimum 10ms delay
|
|
669
642
|
this.nextPlaybackTime
|
|
670
643
|
);
|
|
644
|
+
if (this.audioClockOffset === null) {
|
|
645
|
+
this.audioClockOffset = startTime;
|
|
646
|
+
}
|
|
671
647
|
this.nextPlaybackTime = startTime + duration;
|
|
672
648
|
const source = this.audioContext.createBufferSource();
|
|
673
649
|
source.buffer = audioBuffer;
|
|
@@ -685,8 +661,18 @@ var BrowserAudioManager = class {
|
|
|
685
661
|
};
|
|
686
662
|
}
|
|
687
663
|
/**
|
|
688
|
-
*
|
|
664
|
+
* Get the current high-precision audio clock offset for viseme synchronization.
|
|
665
|
+
* Total stream time (in ms) = (audioContext.currentTime - audioClockOffset) * 1000
|
|
666
|
+
*/
|
|
667
|
+
getAudioClockOffset() {
|
|
668
|
+
return this.audioClockOffset;
|
|
669
|
+
}
|
|
670
|
+
/**
|
|
671
|
+
* Reset the audio clock offset (call when a response is interrupted or finished)
|
|
689
672
|
*/
|
|
673
|
+
resetAudioClock() {
|
|
674
|
+
this.audioClockOffset = null;
|
|
675
|
+
}
|
|
690
676
|
stopPlayback() {
|
|
691
677
|
this.activeSources.forEach((source) => {
|
|
692
678
|
try {
|
|
@@ -695,26 +681,15 @@ var BrowserAudioManager = class {
|
|
|
695
681
|
}
|
|
696
682
|
});
|
|
697
683
|
this.activeSources = [];
|
|
698
|
-
this.
|
|
699
|
-
this.
|
|
700
|
-
console.log("\u{1F507} Playback stopped");
|
|
684
|
+
this.nextPlaybackTime = 0;
|
|
685
|
+
this.resetAudioClock();
|
|
701
686
|
}
|
|
702
|
-
/**
|
|
703
|
-
* Toggle mute state
|
|
704
|
-
*/
|
|
705
687
|
setMuted(muted) {
|
|
706
688
|
this.isMuted = muted;
|
|
707
689
|
}
|
|
708
|
-
/**
|
|
709
|
-
* Get current mute state
|
|
710
|
-
*/
|
|
711
690
|
isMicMuted() {
|
|
712
691
|
return this.isMuted;
|
|
713
692
|
}
|
|
714
|
-
/**
|
|
715
|
-
* Get current amplitude from analyser (for visualization)
|
|
716
|
-
* Returns value between 0 and 1
|
|
717
|
-
*/
|
|
718
693
|
getAmplitude() {
|
|
719
694
|
if (!this.analyserNode) return 0;
|
|
720
695
|
const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
|
|
@@ -722,31 +697,18 @@ var BrowserAudioManager = class {
|
|
|
722
697
|
const rms = calculateRMS(dataArray);
|
|
723
698
|
return Math.min(rms * 10, 1);
|
|
724
699
|
}
|
|
725
|
-
/**
|
|
726
|
-
* Get frequency data from analyser for visualization
|
|
727
|
-
*/
|
|
728
700
|
getFrequencyData() {
|
|
729
|
-
if (!this.analyserNode)
|
|
730
|
-
return new Uint8Array(0);
|
|
731
|
-
}
|
|
701
|
+
if (!this.analyserNode) return new Uint8Array(0);
|
|
732
702
|
const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
|
|
733
703
|
this.analyserNode.getByteFrequencyData(dataArray);
|
|
734
704
|
return dataArray;
|
|
735
705
|
}
|
|
736
|
-
/**
|
|
737
|
-
* Get time-domain data from analyser for waveform visualization
|
|
738
|
-
*/
|
|
739
706
|
getWaveformData() {
|
|
740
|
-
if (!this.analyserNode)
|
|
741
|
-
return new Uint8Array(0);
|
|
742
|
-
}
|
|
707
|
+
if (!this.analyserNode) return new Uint8Array(0);
|
|
743
708
|
const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
|
|
744
709
|
this.analyserNode.getByteTimeDomainData(dataArray);
|
|
745
710
|
return dataArray;
|
|
746
711
|
}
|
|
747
|
-
/**
|
|
748
|
-
* Cleanup and close AudioContext
|
|
749
|
-
*/
|
|
750
712
|
cleanup() {
|
|
751
713
|
this.stopMicrophone();
|
|
752
714
|
this.stopPlayback();
|
|
@@ -755,17 +717,124 @@ var BrowserAudioManager = class {
|
|
|
755
717
|
this.analyserNode = null;
|
|
756
718
|
}
|
|
757
719
|
}
|
|
720
|
+
getAudioContext() {
|
|
721
|
+
return this.audioContext;
|
|
722
|
+
}
|
|
723
|
+
};
|
|
724
|
+
|
|
725
|
+
// src/voice-agent.ts
|
|
726
|
+
var VoiceAgent = class {
|
|
727
|
+
client;
|
|
728
|
+
audioManager;
|
|
729
|
+
options;
|
|
730
|
+
isConnected = false;
|
|
731
|
+
visemeQueue = [];
|
|
732
|
+
constructor(options) {
|
|
733
|
+
this.options = options;
|
|
734
|
+
this.client = new VoiceAgentClient({
|
|
735
|
+
apiKey: options.apiKey,
|
|
736
|
+
prompt: options.prompt || "You are a helpful and friendly AI assistant.",
|
|
737
|
+
voice: options.voice || "F1" /* F1 */,
|
|
738
|
+
language: options.language || "en" /* ENGLISH */,
|
|
739
|
+
visemes: options.visemes ?? true,
|
|
740
|
+
serverUrl: options.serverUrl,
|
|
741
|
+
onTranscription: (text) => {
|
|
742
|
+
if (options.onTranscription) options.onTranscription(text, true);
|
|
743
|
+
},
|
|
744
|
+
onResponse: (text) => {
|
|
745
|
+
if (options.onTranscription) options.onTranscription(text, false);
|
|
746
|
+
},
|
|
747
|
+
onAudio: (data) => {
|
|
748
|
+
this.audioManager.playAudio(data);
|
|
749
|
+
},
|
|
750
|
+
onVisemes: (visemes) => {
|
|
751
|
+
this.visemeQueue.push(...visemes);
|
|
752
|
+
if (options.onVisemes) options.onVisemes(visemes);
|
|
753
|
+
},
|
|
754
|
+
onStatus: (status) => {
|
|
755
|
+
if (options.onStatusChange) options.onStatusChange(status);
|
|
756
|
+
if (status === "interrupted" || status === "thinking") {
|
|
757
|
+
this.audioManager.stopPlayback();
|
|
758
|
+
this.visemeQueue = [];
|
|
759
|
+
}
|
|
760
|
+
},
|
|
761
|
+
onError: (err) => {
|
|
762
|
+
if (options.onError) options.onError(err);
|
|
763
|
+
}
|
|
764
|
+
});
|
|
765
|
+
this.audioManager = new BrowserAudioManager({
|
|
766
|
+
autoGainControl: true,
|
|
767
|
+
echoCancellation: true,
|
|
768
|
+
noiseSuppression: true
|
|
769
|
+
});
|
|
770
|
+
}
|
|
758
771
|
/**
|
|
759
|
-
*
|
|
772
|
+
* Initialize hardware and connect to the AI server.
|
|
773
|
+
* This must be called in response to a user gesture (like a click)
|
|
774
|
+
* to satisfy browser AudioContext requirements.
|
|
760
775
|
*/
|
|
761
|
-
|
|
762
|
-
|
|
776
|
+
async connect() {
|
|
777
|
+
try {
|
|
778
|
+
await this.audioManager.init();
|
|
779
|
+
const connected = await this.client.connect();
|
|
780
|
+
if (!connected) return false;
|
|
781
|
+
this.isConnected = true;
|
|
782
|
+
await this.audioManager.startMicrophone((pcm16Data) => {
|
|
783
|
+
if (this.isConnected) {
|
|
784
|
+
this.client.sendAudio(pcm16Data);
|
|
785
|
+
}
|
|
786
|
+
});
|
|
787
|
+
return true;
|
|
788
|
+
} catch (err) {
|
|
789
|
+
if (this.options.onError) this.options.onError(err);
|
|
790
|
+
return false;
|
|
791
|
+
}
|
|
763
792
|
}
|
|
764
793
|
/**
|
|
765
|
-
*
|
|
794
|
+
* Get the current amplitude/volume of the microphone or output audio.
|
|
795
|
+
* Useful for voice activity visualization.
|
|
796
|
+
* @returns value between 0 and 1
|
|
766
797
|
*/
|
|
767
|
-
|
|
768
|
-
return this.
|
|
798
|
+
getAmplitude() {
|
|
799
|
+
return this.audioManager.getAmplitude();
|
|
800
|
+
}
|
|
801
|
+
/**
|
|
802
|
+
* Mute or unmute the microphone.
|
|
803
|
+
*/
|
|
804
|
+
toggleMute() {
|
|
805
|
+
const currentState = this.audioManager.isMicMuted();
|
|
806
|
+
this.audioManager.setMuted(!currentState);
|
|
807
|
+
return !currentState;
|
|
808
|
+
}
|
|
809
|
+
/**
|
|
810
|
+
* High-precision method to get visemes that should be active
|
|
811
|
+
* at the current playback frame. Use this in a requestAnimationFrame loop.
|
|
812
|
+
*/
|
|
813
|
+
getFrameVisemes() {
|
|
814
|
+
const offset = this.audioManager.getAudioClockOffset();
|
|
815
|
+
const audioCtx = this.audioManager.getAudioContext();
|
|
816
|
+
if (offset === null || !audioCtx) return [];
|
|
817
|
+
const streamTime = (audioCtx.currentTime - offset) * 1e3;
|
|
818
|
+
const currentBatch = [];
|
|
819
|
+
while (this.visemeQueue.length > 0 && this.visemeQueue[0].t * 1e3 <= streamTime) {
|
|
820
|
+
currentBatch.push(this.visemeQueue.shift());
|
|
821
|
+
}
|
|
822
|
+
return currentBatch;
|
|
823
|
+
}
|
|
824
|
+
/**
|
|
825
|
+
* Change the system prompt mid-conversation.
|
|
826
|
+
*/
|
|
827
|
+
updatePrompt(newPrompt) {
|
|
828
|
+
this.client.updatePrompt(newPrompt);
|
|
829
|
+
}
|
|
830
|
+
/**
|
|
831
|
+
* Disconnect and release audio resources.
|
|
832
|
+
*/
|
|
833
|
+
disconnect() {
|
|
834
|
+
this.isConnected = false;
|
|
835
|
+
this.client.disconnect();
|
|
836
|
+
this.audioManager.cleanup();
|
|
837
|
+
this.visemeQueue = [];
|
|
769
838
|
}
|
|
770
839
|
};
|
|
771
840
|
// Annotate the CommonJS export names for ESM import in node:
|
|
@@ -776,6 +845,7 @@ var BrowserAudioManager = class {
|
|
|
776
845
|
Language,
|
|
777
846
|
StreamResampler,
|
|
778
847
|
TTSClient,
|
|
848
|
+
VoiceAgent,
|
|
779
849
|
VoiceAgentClient,
|
|
780
850
|
VoiceStyle,
|
|
781
851
|
applyLowPassFilter,
|
package/dist/index.mjs
CHANGED
|
@@ -60,11 +60,13 @@ var VoiceAgentClient = class {
|
|
|
60
60
|
messages = [];
|
|
61
61
|
visemeListeners = [];
|
|
62
62
|
wantVisemes = false;
|
|
63
|
+
serverUrl = null;
|
|
63
64
|
constructor(config) {
|
|
64
65
|
this.apiKey = config.apiKey;
|
|
65
66
|
this.prompt = config.prompt;
|
|
66
67
|
this.voice = config.voice || "F1" /* F1 */;
|
|
67
68
|
this.language = config.language || "en" /* ENGLISH */;
|
|
69
|
+
this.serverUrl = config.serverUrl || null;
|
|
68
70
|
this.onTranscription = config.onTranscription;
|
|
69
71
|
this.onResponse = config.onResponse;
|
|
70
72
|
this.onAudioCallback = config.onAudio;
|
|
@@ -79,12 +81,12 @@ var VoiceAgentClient = class {
|
|
|
79
81
|
async connect() {
|
|
80
82
|
return new Promise((resolve, reject) => {
|
|
81
83
|
try {
|
|
82
|
-
let url = DEFAULT_URLS.VOICE_AGENT;
|
|
84
|
+
let url = this.serverUrl || DEFAULT_URLS.VOICE_AGENT;
|
|
83
85
|
if (this.apiKey) {
|
|
84
86
|
const separator = url.includes("?") ? "&" : "?";
|
|
85
87
|
url += `${separator}api_key=${this.apiKey}`;
|
|
86
88
|
}
|
|
87
|
-
console.log(`\u{1F517} Connecting to ${
|
|
89
|
+
console.log(`\u{1F517} Connecting to ${url}...`);
|
|
88
90
|
this.ws = new WebSocket(url);
|
|
89
91
|
this.ws.binaryType = "arraybuffer";
|
|
90
92
|
this.ws.onopen = () => {
|
|
@@ -457,7 +459,8 @@ var BrowserAudioManager = class {
|
|
|
457
459
|
// Playback scheduling
|
|
458
460
|
nextPlaybackTime = 0;
|
|
459
461
|
activeSources = [];
|
|
460
|
-
|
|
462
|
+
// High-precision clock anchor for viseme sync
|
|
463
|
+
audioClockOffset = null;
|
|
461
464
|
// Configuration
|
|
462
465
|
inputSampleRate;
|
|
463
466
|
outputSampleRate;
|
|
@@ -470,6 +473,7 @@ var BrowserAudioManager = class {
|
|
|
470
473
|
// Audio processing state
|
|
471
474
|
isMuted = false;
|
|
472
475
|
isListening = false;
|
|
476
|
+
resampler = null;
|
|
473
477
|
constructor(config = {}) {
|
|
474
478
|
this.inputSampleRate = config.inputSampleRate ?? AUDIO_CONFIG.SAMPLE_RATE;
|
|
475
479
|
this.outputSampleRate = config.outputSampleRate ?? AUDIO_CONFIG.SPEAKER_SAMPLE_RATE;
|
|
@@ -493,7 +497,6 @@ var BrowserAudioManager = class {
|
|
|
493
497
|
}
|
|
494
498
|
if (this.audioContext.state === "suspended") {
|
|
495
499
|
await this.audioContext.resume();
|
|
496
|
-
console.log("\u{1F442} AudioContext resumed");
|
|
497
500
|
}
|
|
498
501
|
if (analyserConfig?.enabled !== false) {
|
|
499
502
|
this.analyserNode = this.audioContext.createAnalyser();
|
|
@@ -507,6 +510,7 @@ var BrowserAudioManager = class {
|
|
|
507
510
|
if (!this.audioContext) {
|
|
508
511
|
await this.init();
|
|
509
512
|
}
|
|
513
|
+
this.resampler = new StreamResampler(this.audioContext.sampleRate, this.inputSampleRate);
|
|
510
514
|
try {
|
|
511
515
|
this.onAudioInput = onAudioInput;
|
|
512
516
|
this.isListening = true;
|
|
@@ -522,9 +526,7 @@ var BrowserAudioManager = class {
|
|
|
522
526
|
this.scriptProcessor = this.audioContext.createScriptProcessor(
|
|
523
527
|
bufferSize,
|
|
524
528
|
1,
|
|
525
|
-
// input channels
|
|
526
529
|
1
|
|
527
|
-
// output channels
|
|
528
530
|
);
|
|
529
531
|
this.mediaStreamAudioSourceNode.connect(this.scriptProcessor);
|
|
530
532
|
this.scriptProcessor.connect(this.audioContext.destination);
|
|
@@ -541,40 +543,19 @@ var BrowserAudioManager = class {
|
|
|
541
543
|
throw err;
|
|
542
544
|
}
|
|
543
545
|
}
|
|
544
|
-
/**
|
|
545
|
-
* Internal method to process microphone audio data
|
|
546
|
-
*/
|
|
547
546
|
_processAudioInput(event) {
|
|
548
|
-
if (!this.onAudioInput || !this.audioContext || !this.isListening) return;
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
const
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
}
|
|
556
|
-
const hardwareRate = this.audioContext.sampleRate;
|
|
557
|
-
let processedData = new Float32Array(inputData);
|
|
558
|
-
if (hardwareRate !== this.inputSampleRate) {
|
|
559
|
-
processedData = resampleWithAntiAliasing(
|
|
560
|
-
processedData,
|
|
561
|
-
hardwareRate,
|
|
562
|
-
this.inputSampleRate
|
|
563
|
-
);
|
|
547
|
+
if (!this.onAudioInput || !this.audioContext || !this.isListening || this.isMuted) return;
|
|
548
|
+
const inputData = event.inputBuffer.getChannelData(0);
|
|
549
|
+
event.outputBuffer.getChannelData(0).fill(0);
|
|
550
|
+
const resampled = this.resampler ? this.resampler.process(inputData) : inputData;
|
|
551
|
+
if (resampled && resampled.length > 0) {
|
|
552
|
+
const int16Data = float32ToPcm16(resampled);
|
|
553
|
+
this.onAudioInput(new Uint8Array(int16Data.buffer, int16Data.byteOffset, int16Data.byteLength));
|
|
564
554
|
}
|
|
565
|
-
const int16Data = float32ToPcm16(processedData);
|
|
566
|
-
const uint8Data = new Uint8Array(
|
|
567
|
-
int16Data.buffer,
|
|
568
|
-
int16Data.byteOffset,
|
|
569
|
-
int16Data.byteLength
|
|
570
|
-
);
|
|
571
|
-
this.onAudioInput(uint8Data);
|
|
572
555
|
}
|
|
573
|
-
/**
|
|
574
|
-
* Stop capturing microphone input
|
|
575
|
-
*/
|
|
576
556
|
stopMicrophone() {
|
|
577
557
|
this.isListening = false;
|
|
558
|
+
this.resampler = null;
|
|
578
559
|
if (this.mediaStream) {
|
|
579
560
|
this.mediaStream.getTracks().forEach((track) => track.stop());
|
|
580
561
|
this.mediaStream = null;
|
|
@@ -587,17 +568,12 @@ var BrowserAudioManager = class {
|
|
|
587
568
|
this.mediaStreamAudioSourceNode.disconnect();
|
|
588
569
|
this.mediaStreamAudioSourceNode = null;
|
|
589
570
|
}
|
|
590
|
-
console.log("\u{1F3A4} Microphone stopped");
|
|
591
571
|
}
|
|
592
572
|
/**
|
|
593
573
|
* Play back audio received from the server
|
|
594
|
-
* @param pcm16Data Int16 PCM audio data at SPEAKER_SAMPLE_RATE
|
|
595
574
|
*/
|
|
596
575
|
playAudio(pcm16Data) {
|
|
597
|
-
if (!this.audioContext)
|
|
598
|
-
console.warn("AudioContext not initialized");
|
|
599
|
-
return;
|
|
600
|
-
}
|
|
576
|
+
if (!this.audioContext) return;
|
|
601
577
|
const int16Array = new Int16Array(
|
|
602
578
|
pcm16Data.buffer,
|
|
603
579
|
pcm16Data.byteOffset,
|
|
@@ -612,18 +588,17 @@ var BrowserAudioManager = class {
|
|
|
612
588
|
audioBuffer.getChannelData(0).set(float32Data);
|
|
613
589
|
this._schedulePlayback(audioBuffer);
|
|
614
590
|
}
|
|
615
|
-
/**
|
|
616
|
-
* Internal method to schedule and play audio with sample-accurate timing
|
|
617
|
-
*/
|
|
618
591
|
_schedulePlayback(audioBuffer) {
|
|
619
592
|
if (!this.audioContext) return;
|
|
620
593
|
const currentTime = this.audioContext.currentTime;
|
|
621
594
|
const duration = audioBuffer.length / this.outputSampleRate;
|
|
622
595
|
const startTime = Math.max(
|
|
623
596
|
currentTime + 0.01,
|
|
624
|
-
// Minimum 10ms delay
|
|
625
597
|
this.nextPlaybackTime
|
|
626
598
|
);
|
|
599
|
+
if (this.audioClockOffset === null) {
|
|
600
|
+
this.audioClockOffset = startTime;
|
|
601
|
+
}
|
|
627
602
|
this.nextPlaybackTime = startTime + duration;
|
|
628
603
|
const source = this.audioContext.createBufferSource();
|
|
629
604
|
source.buffer = audioBuffer;
|
|
@@ -641,8 +616,18 @@ var BrowserAudioManager = class {
|
|
|
641
616
|
};
|
|
642
617
|
}
|
|
643
618
|
/**
|
|
644
|
-
*
|
|
619
|
+
* Get the current high-precision audio clock offset for viseme synchronization.
|
|
620
|
+
* Total stream time (in ms) = (audioContext.currentTime - audioClockOffset) * 1000
|
|
621
|
+
*/
|
|
622
|
+
getAudioClockOffset() {
|
|
623
|
+
return this.audioClockOffset;
|
|
624
|
+
}
|
|
625
|
+
/**
|
|
626
|
+
* Reset the audio clock offset (call when a response is interrupted or finished)
|
|
645
627
|
*/
|
|
628
|
+
resetAudioClock() {
|
|
629
|
+
this.audioClockOffset = null;
|
|
630
|
+
}
|
|
646
631
|
stopPlayback() {
|
|
647
632
|
this.activeSources.forEach((source) => {
|
|
648
633
|
try {
|
|
@@ -651,26 +636,15 @@ var BrowserAudioManager = class {
|
|
|
651
636
|
}
|
|
652
637
|
});
|
|
653
638
|
this.activeSources = [];
|
|
654
|
-
this.
|
|
655
|
-
this.
|
|
656
|
-
console.log("\u{1F507} Playback stopped");
|
|
639
|
+
this.nextPlaybackTime = 0;
|
|
640
|
+
this.resetAudioClock();
|
|
657
641
|
}
|
|
658
|
-
/**
|
|
659
|
-
* Toggle mute state
|
|
660
|
-
*/
|
|
661
642
|
setMuted(muted) {
|
|
662
643
|
this.isMuted = muted;
|
|
663
644
|
}
|
|
664
|
-
/**
|
|
665
|
-
* Get current mute state
|
|
666
|
-
*/
|
|
667
645
|
isMicMuted() {
|
|
668
646
|
return this.isMuted;
|
|
669
647
|
}
|
|
670
|
-
/**
|
|
671
|
-
* Get current amplitude from analyser (for visualization)
|
|
672
|
-
* Returns value between 0 and 1
|
|
673
|
-
*/
|
|
674
648
|
getAmplitude() {
|
|
675
649
|
if (!this.analyserNode) return 0;
|
|
676
650
|
const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
|
|
@@ -678,31 +652,18 @@ var BrowserAudioManager = class {
|
|
|
678
652
|
const rms = calculateRMS(dataArray);
|
|
679
653
|
return Math.min(rms * 10, 1);
|
|
680
654
|
}
|
|
681
|
-
/**
|
|
682
|
-
* Get frequency data from analyser for visualization
|
|
683
|
-
*/
|
|
684
655
|
getFrequencyData() {
|
|
685
|
-
if (!this.analyserNode)
|
|
686
|
-
return new Uint8Array(0);
|
|
687
|
-
}
|
|
656
|
+
if (!this.analyserNode) return new Uint8Array(0);
|
|
688
657
|
const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
|
|
689
658
|
this.analyserNode.getByteFrequencyData(dataArray);
|
|
690
659
|
return dataArray;
|
|
691
660
|
}
|
|
692
|
-
/**
|
|
693
|
-
* Get time-domain data from analyser for waveform visualization
|
|
694
|
-
*/
|
|
695
661
|
getWaveformData() {
|
|
696
|
-
if (!this.analyserNode)
|
|
697
|
-
return new Uint8Array(0);
|
|
698
|
-
}
|
|
662
|
+
if (!this.analyserNode) return new Uint8Array(0);
|
|
699
663
|
const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
|
|
700
664
|
this.analyserNode.getByteTimeDomainData(dataArray);
|
|
701
665
|
return dataArray;
|
|
702
666
|
}
|
|
703
|
-
/**
|
|
704
|
-
* Cleanup and close AudioContext
|
|
705
|
-
*/
|
|
706
667
|
cleanup() {
|
|
707
668
|
this.stopMicrophone();
|
|
708
669
|
this.stopPlayback();
|
|
@@ -711,17 +672,124 @@ var BrowserAudioManager = class {
|
|
|
711
672
|
this.analyserNode = null;
|
|
712
673
|
}
|
|
713
674
|
}
|
|
675
|
+
getAudioContext() {
|
|
676
|
+
return this.audioContext;
|
|
677
|
+
}
|
|
678
|
+
};
|
|
679
|
+
|
|
680
|
+
// src/voice-agent.ts
|
|
681
|
+
var VoiceAgent = class {
|
|
682
|
+
client;
|
|
683
|
+
audioManager;
|
|
684
|
+
options;
|
|
685
|
+
isConnected = false;
|
|
686
|
+
visemeQueue = [];
|
|
687
|
+
constructor(options) {
|
|
688
|
+
this.options = options;
|
|
689
|
+
this.client = new VoiceAgentClient({
|
|
690
|
+
apiKey: options.apiKey,
|
|
691
|
+
prompt: options.prompt || "You are a helpful and friendly AI assistant.",
|
|
692
|
+
voice: options.voice || "F1" /* F1 */,
|
|
693
|
+
language: options.language || "en" /* ENGLISH */,
|
|
694
|
+
visemes: options.visemes ?? true,
|
|
695
|
+
serverUrl: options.serverUrl,
|
|
696
|
+
onTranscription: (text) => {
|
|
697
|
+
if (options.onTranscription) options.onTranscription(text, true);
|
|
698
|
+
},
|
|
699
|
+
onResponse: (text) => {
|
|
700
|
+
if (options.onTranscription) options.onTranscription(text, false);
|
|
701
|
+
},
|
|
702
|
+
onAudio: (data) => {
|
|
703
|
+
this.audioManager.playAudio(data);
|
|
704
|
+
},
|
|
705
|
+
onVisemes: (visemes) => {
|
|
706
|
+
this.visemeQueue.push(...visemes);
|
|
707
|
+
if (options.onVisemes) options.onVisemes(visemes);
|
|
708
|
+
},
|
|
709
|
+
onStatus: (status) => {
|
|
710
|
+
if (options.onStatusChange) options.onStatusChange(status);
|
|
711
|
+
if (status === "interrupted" || status === "thinking") {
|
|
712
|
+
this.audioManager.stopPlayback();
|
|
713
|
+
this.visemeQueue = [];
|
|
714
|
+
}
|
|
715
|
+
},
|
|
716
|
+
onError: (err) => {
|
|
717
|
+
if (options.onError) options.onError(err);
|
|
718
|
+
}
|
|
719
|
+
});
|
|
720
|
+
this.audioManager = new BrowserAudioManager({
|
|
721
|
+
autoGainControl: true,
|
|
722
|
+
echoCancellation: true,
|
|
723
|
+
noiseSuppression: true
|
|
724
|
+
});
|
|
725
|
+
}
|
|
714
726
|
/**
|
|
715
|
-
*
|
|
727
|
+
* Initialize hardware and connect to the AI server.
|
|
728
|
+
* This must be called in response to a user gesture (like a click)
|
|
729
|
+
* to satisfy browser AudioContext requirements.
|
|
716
730
|
*/
|
|
717
|
-
|
|
718
|
-
|
|
731
|
+
async connect() {
|
|
732
|
+
try {
|
|
733
|
+
await this.audioManager.init();
|
|
734
|
+
const connected = await this.client.connect();
|
|
735
|
+
if (!connected) return false;
|
|
736
|
+
this.isConnected = true;
|
|
737
|
+
await this.audioManager.startMicrophone((pcm16Data) => {
|
|
738
|
+
if (this.isConnected) {
|
|
739
|
+
this.client.sendAudio(pcm16Data);
|
|
740
|
+
}
|
|
741
|
+
});
|
|
742
|
+
return true;
|
|
743
|
+
} catch (err) {
|
|
744
|
+
if (this.options.onError) this.options.onError(err);
|
|
745
|
+
return false;
|
|
746
|
+
}
|
|
719
747
|
}
|
|
720
748
|
/**
|
|
721
|
-
*
|
|
749
|
+
* Get the current amplitude/volume of the microphone or output audio.
|
|
750
|
+
* Useful for voice activity visualization.
|
|
751
|
+
* @returns value between 0 and 1
|
|
722
752
|
*/
|
|
723
|
-
|
|
724
|
-
return this.
|
|
753
|
+
getAmplitude() {
|
|
754
|
+
return this.audioManager.getAmplitude();
|
|
755
|
+
}
|
|
756
|
+
/**
|
|
757
|
+
* Mute or unmute the microphone.
|
|
758
|
+
*/
|
|
759
|
+
toggleMute() {
|
|
760
|
+
const currentState = this.audioManager.isMicMuted();
|
|
761
|
+
this.audioManager.setMuted(!currentState);
|
|
762
|
+
return !currentState;
|
|
763
|
+
}
|
|
764
|
+
/**
|
|
765
|
+
* High-precision method to get visemes that should be active
|
|
766
|
+
* at the current playback frame. Use this in a requestAnimationFrame loop.
|
|
767
|
+
*/
|
|
768
|
+
getFrameVisemes() {
|
|
769
|
+
const offset = this.audioManager.getAudioClockOffset();
|
|
770
|
+
const audioCtx = this.audioManager.getAudioContext();
|
|
771
|
+
if (offset === null || !audioCtx) return [];
|
|
772
|
+
const streamTime = (audioCtx.currentTime - offset) * 1e3;
|
|
773
|
+
const currentBatch = [];
|
|
774
|
+
while (this.visemeQueue.length > 0 && this.visemeQueue[0].t * 1e3 <= streamTime) {
|
|
775
|
+
currentBatch.push(this.visemeQueue.shift());
|
|
776
|
+
}
|
|
777
|
+
return currentBatch;
|
|
778
|
+
}
|
|
779
|
+
/**
|
|
780
|
+
* Change the system prompt mid-conversation.
|
|
781
|
+
*/
|
|
782
|
+
updatePrompt(newPrompt) {
|
|
783
|
+
this.client.updatePrompt(newPrompt);
|
|
784
|
+
}
|
|
785
|
+
/**
|
|
786
|
+
* Disconnect and release audio resources.
|
|
787
|
+
*/
|
|
788
|
+
disconnect() {
|
|
789
|
+
this.isConnected = false;
|
|
790
|
+
this.client.disconnect();
|
|
791
|
+
this.audioManager.cleanup();
|
|
792
|
+
this.visemeQueue = [];
|
|
725
793
|
}
|
|
726
794
|
};
|
|
727
795
|
export {
|
|
@@ -731,6 +799,7 @@ export {
|
|
|
731
799
|
Language,
|
|
732
800
|
StreamResampler,
|
|
733
801
|
TTSClient,
|
|
802
|
+
VoiceAgent,
|
|
734
803
|
VoiceAgentClient,
|
|
735
804
|
VoiceStyle,
|
|
736
805
|
applyLowPassFilter,
|