@lokutor/sdk 1.1.7 → 1.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +57 -34
- package/dist/index.d.ts +57 -34
- package/dist/index.js +156 -83
- package/dist/index.mjs +155 -83
- package/package.json +1 -1
package/dist/index.d.mts
CHANGED
|
@@ -90,11 +90,12 @@ interface VoiceAgentOptions {
|
|
|
90
90
|
}
|
|
91
91
|
/**
|
|
92
92
|
* Viseme data for lip-sync animation
|
|
93
|
+
* Format: {"v": index, "c": character, "t": timestamp}
|
|
93
94
|
*/
|
|
94
95
|
interface Viseme {
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
96
|
+
v: number;
|
|
97
|
+
c: string;
|
|
98
|
+
t: number;
|
|
98
99
|
}
|
|
99
100
|
|
|
100
101
|
/**
|
|
@@ -117,10 +118,14 @@ declare class VoiceAgentClient {
|
|
|
117
118
|
private isConnected;
|
|
118
119
|
private messages;
|
|
119
120
|
private visemeListeners;
|
|
121
|
+
private wantVisemes;
|
|
122
|
+
private serverUrl;
|
|
120
123
|
constructor(config: LokutorConfig & {
|
|
121
124
|
prompt: string;
|
|
122
125
|
voice?: VoiceStyle;
|
|
123
126
|
language?: Language;
|
|
127
|
+
visemes?: boolean;
|
|
128
|
+
serverUrl?: string;
|
|
124
129
|
onVisemes?: (visemes: Viseme[]) => void;
|
|
125
130
|
});
|
|
126
131
|
/**
|
|
@@ -323,7 +328,7 @@ declare class BrowserAudioManager {
|
|
|
323
328
|
private mediaStream;
|
|
324
329
|
private nextPlaybackTime;
|
|
325
330
|
private activeSources;
|
|
326
|
-
private
|
|
331
|
+
private audioClockOffset;
|
|
327
332
|
private inputSampleRate;
|
|
328
333
|
private outputSampleRate;
|
|
329
334
|
private autoGainControl;
|
|
@@ -333,6 +338,7 @@ declare class BrowserAudioManager {
|
|
|
333
338
|
private onInputError?;
|
|
334
339
|
private isMuted;
|
|
335
340
|
private isListening;
|
|
341
|
+
private resampler;
|
|
336
342
|
constructor(config?: BrowserAudioConfig);
|
|
337
343
|
/**
|
|
338
344
|
* Initialize the AudioContext and analyser
|
|
@@ -342,60 +348,77 @@ declare class BrowserAudioManager {
|
|
|
342
348
|
* Start capturing audio from the microphone
|
|
343
349
|
*/
|
|
344
350
|
startMicrophone(onAudioInput: (pcm16Data: Uint8Array) => void): Promise<void>;
|
|
345
|
-
/**
|
|
346
|
-
* Internal method to process microphone audio data
|
|
347
|
-
*/
|
|
348
351
|
private _processAudioInput;
|
|
349
|
-
/**
|
|
350
|
-
* Stop capturing microphone input
|
|
351
|
-
*/
|
|
352
352
|
stopMicrophone(): void;
|
|
353
353
|
/**
|
|
354
354
|
* Play back audio received from the server
|
|
355
|
-
* @param pcm16Data Int16 PCM audio data at SPEAKER_SAMPLE_RATE
|
|
356
355
|
*/
|
|
357
356
|
playAudio(pcm16Data: Uint8Array): void;
|
|
358
|
-
/**
|
|
359
|
-
* Internal method to schedule and play audio with sample-accurate timing
|
|
360
|
-
*/
|
|
361
357
|
private _schedulePlayback;
|
|
362
358
|
/**
|
|
363
|
-
*
|
|
359
|
+
* Get the current high-precision audio clock offset for viseme synchronization.
|
|
360
|
+
* Total stream time (in ms) = (audioContext.currentTime - audioClockOffset) * 1000
|
|
364
361
|
*/
|
|
365
|
-
|
|
362
|
+
getAudioClockOffset(): number | null;
|
|
366
363
|
/**
|
|
367
|
-
*
|
|
364
|
+
* Reset the audio clock offset (call when a response is interrupted or finished)
|
|
368
365
|
*/
|
|
366
|
+
resetAudioClock(): void;
|
|
367
|
+
stopPlayback(): void;
|
|
369
368
|
setMuted(muted: boolean): void;
|
|
370
|
-
/**
|
|
371
|
-
* Get current mute state
|
|
372
|
-
*/
|
|
373
369
|
isMicMuted(): boolean;
|
|
370
|
+
getAmplitude(): number;
|
|
371
|
+
getFrequencyData(): Uint8Array;
|
|
372
|
+
getWaveformData(): Uint8Array;
|
|
373
|
+
cleanup(): void;
|
|
374
|
+
getAudioContext(): AudioContext | null;
|
|
375
|
+
}
|
|
376
|
+
|
|
377
|
+
/**
|
|
378
|
+
* High-level AI Voice Agent for browser-based conversations.
|
|
379
|
+
*
|
|
380
|
+
* This class orchestrates microphone input, AI processing, and
|
|
381
|
+
* speaker output, providing a simple interface for building
|
|
382
|
+
* voice assistants with lip-sync support.
|
|
383
|
+
*/
|
|
384
|
+
declare class VoiceAgent {
|
|
385
|
+
private client;
|
|
386
|
+
private audioManager;
|
|
387
|
+
private options;
|
|
388
|
+
private isConnected;
|
|
389
|
+
private visemeQueue;
|
|
390
|
+
constructor(options: VoiceAgentOptions & {
|
|
391
|
+
apiKey: string;
|
|
392
|
+
});
|
|
374
393
|
/**
|
|
375
|
-
*
|
|
376
|
-
*
|
|
394
|
+
* Initialize hardware and connect to the AI server.
|
|
395
|
+
* This must be called in response to a user gesture (like a click)
|
|
396
|
+
* to satisfy browser AudioContext requirements.
|
|
377
397
|
*/
|
|
378
|
-
|
|
398
|
+
connect(): Promise<boolean>;
|
|
379
399
|
/**
|
|
380
|
-
* Get
|
|
400
|
+
* Get the current amplitude/volume of the microphone or output audio.
|
|
401
|
+
* Useful for voice activity visualization.
|
|
402
|
+
* @returns value between 0 and 1
|
|
381
403
|
*/
|
|
382
|
-
|
|
404
|
+
getAmplitude(): number;
|
|
383
405
|
/**
|
|
384
|
-
*
|
|
406
|
+
* Mute or unmute the microphone.
|
|
385
407
|
*/
|
|
386
|
-
|
|
408
|
+
toggleMute(): boolean;
|
|
387
409
|
/**
|
|
388
|
-
*
|
|
410
|
+
* High-precision method to get visemes that should be active
|
|
411
|
+
* at the current playback frame. Use this in a requestAnimationFrame loop.
|
|
389
412
|
*/
|
|
390
|
-
|
|
413
|
+
getFrameVisemes(): Viseme[];
|
|
391
414
|
/**
|
|
392
|
-
*
|
|
415
|
+
* Change the system prompt mid-conversation.
|
|
393
416
|
*/
|
|
394
|
-
|
|
417
|
+
updatePrompt(newPrompt: string): void;
|
|
395
418
|
/**
|
|
396
|
-
*
|
|
419
|
+
* Disconnect and release audio resources.
|
|
397
420
|
*/
|
|
398
|
-
|
|
421
|
+
disconnect(): void;
|
|
399
422
|
}
|
|
400
423
|
|
|
401
|
-
export { AUDIO_CONFIG, type AnalyserConfig, type BrowserAudioConfig, BrowserAudioManager, type BrowserAudioOptions, DEFAULT_URLS, Language, type LokutorConfig, StreamResampler, type SynthesizeOptions, TTSClient, type Viseme, VoiceAgentClient, type VoiceAgentOptions, VoiceStyle, applyLowPassFilter, bytesToPcm16, calculateRMS, float32ToPcm16, normalizeAudio, pcm16ToBytes, pcm16ToFloat32, resample, resampleWithAntiAliasing, simpleConversation, simpleTTS };
|
|
424
|
+
export { AUDIO_CONFIG, type AnalyserConfig, type BrowserAudioConfig, BrowserAudioManager, type BrowserAudioOptions, DEFAULT_URLS, Language, type LokutorConfig, StreamResampler, type SynthesizeOptions, TTSClient, type Viseme, VoiceAgent, VoiceAgentClient, type VoiceAgentOptions, VoiceStyle, applyLowPassFilter, bytesToPcm16, calculateRMS, float32ToPcm16, normalizeAudio, pcm16ToBytes, pcm16ToFloat32, resample, resampleWithAntiAliasing, simpleConversation, simpleTTS };
|
package/dist/index.d.ts
CHANGED
|
@@ -90,11 +90,12 @@ interface VoiceAgentOptions {
|
|
|
90
90
|
}
|
|
91
91
|
/**
|
|
92
92
|
* Viseme data for lip-sync animation
|
|
93
|
+
* Format: {"v": index, "c": character, "t": timestamp}
|
|
93
94
|
*/
|
|
94
95
|
interface Viseme {
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
96
|
+
v: number;
|
|
97
|
+
c: string;
|
|
98
|
+
t: number;
|
|
98
99
|
}
|
|
99
100
|
|
|
100
101
|
/**
|
|
@@ -117,10 +118,14 @@ declare class VoiceAgentClient {
|
|
|
117
118
|
private isConnected;
|
|
118
119
|
private messages;
|
|
119
120
|
private visemeListeners;
|
|
121
|
+
private wantVisemes;
|
|
122
|
+
private serverUrl;
|
|
120
123
|
constructor(config: LokutorConfig & {
|
|
121
124
|
prompt: string;
|
|
122
125
|
voice?: VoiceStyle;
|
|
123
126
|
language?: Language;
|
|
127
|
+
visemes?: boolean;
|
|
128
|
+
serverUrl?: string;
|
|
124
129
|
onVisemes?: (visemes: Viseme[]) => void;
|
|
125
130
|
});
|
|
126
131
|
/**
|
|
@@ -323,7 +328,7 @@ declare class BrowserAudioManager {
|
|
|
323
328
|
private mediaStream;
|
|
324
329
|
private nextPlaybackTime;
|
|
325
330
|
private activeSources;
|
|
326
|
-
private
|
|
331
|
+
private audioClockOffset;
|
|
327
332
|
private inputSampleRate;
|
|
328
333
|
private outputSampleRate;
|
|
329
334
|
private autoGainControl;
|
|
@@ -333,6 +338,7 @@ declare class BrowserAudioManager {
|
|
|
333
338
|
private onInputError?;
|
|
334
339
|
private isMuted;
|
|
335
340
|
private isListening;
|
|
341
|
+
private resampler;
|
|
336
342
|
constructor(config?: BrowserAudioConfig);
|
|
337
343
|
/**
|
|
338
344
|
* Initialize the AudioContext and analyser
|
|
@@ -342,60 +348,77 @@ declare class BrowserAudioManager {
|
|
|
342
348
|
* Start capturing audio from the microphone
|
|
343
349
|
*/
|
|
344
350
|
startMicrophone(onAudioInput: (pcm16Data: Uint8Array) => void): Promise<void>;
|
|
345
|
-
/**
|
|
346
|
-
* Internal method to process microphone audio data
|
|
347
|
-
*/
|
|
348
351
|
private _processAudioInput;
|
|
349
|
-
/**
|
|
350
|
-
* Stop capturing microphone input
|
|
351
|
-
*/
|
|
352
352
|
stopMicrophone(): void;
|
|
353
353
|
/**
|
|
354
354
|
* Play back audio received from the server
|
|
355
|
-
* @param pcm16Data Int16 PCM audio data at SPEAKER_SAMPLE_RATE
|
|
356
355
|
*/
|
|
357
356
|
playAudio(pcm16Data: Uint8Array): void;
|
|
358
|
-
/**
|
|
359
|
-
* Internal method to schedule and play audio with sample-accurate timing
|
|
360
|
-
*/
|
|
361
357
|
private _schedulePlayback;
|
|
362
358
|
/**
|
|
363
|
-
*
|
|
359
|
+
* Get the current high-precision audio clock offset for viseme synchronization.
|
|
360
|
+
* Total stream time (in ms) = (audioContext.currentTime - audioClockOffset) * 1000
|
|
364
361
|
*/
|
|
365
|
-
|
|
362
|
+
getAudioClockOffset(): number | null;
|
|
366
363
|
/**
|
|
367
|
-
*
|
|
364
|
+
* Reset the audio clock offset (call when a response is interrupted or finished)
|
|
368
365
|
*/
|
|
366
|
+
resetAudioClock(): void;
|
|
367
|
+
stopPlayback(): void;
|
|
369
368
|
setMuted(muted: boolean): void;
|
|
370
|
-
/**
|
|
371
|
-
* Get current mute state
|
|
372
|
-
*/
|
|
373
369
|
isMicMuted(): boolean;
|
|
370
|
+
getAmplitude(): number;
|
|
371
|
+
getFrequencyData(): Uint8Array;
|
|
372
|
+
getWaveformData(): Uint8Array;
|
|
373
|
+
cleanup(): void;
|
|
374
|
+
getAudioContext(): AudioContext | null;
|
|
375
|
+
}
|
|
376
|
+
|
|
377
|
+
/**
|
|
378
|
+
* High-level AI Voice Agent for browser-based conversations.
|
|
379
|
+
*
|
|
380
|
+
* This class orchestrates microphone input, AI processing, and
|
|
381
|
+
* speaker output, providing a simple interface for building
|
|
382
|
+
* voice assistants with lip-sync support.
|
|
383
|
+
*/
|
|
384
|
+
declare class VoiceAgent {
|
|
385
|
+
private client;
|
|
386
|
+
private audioManager;
|
|
387
|
+
private options;
|
|
388
|
+
private isConnected;
|
|
389
|
+
private visemeQueue;
|
|
390
|
+
constructor(options: VoiceAgentOptions & {
|
|
391
|
+
apiKey: string;
|
|
392
|
+
});
|
|
374
393
|
/**
|
|
375
|
-
*
|
|
376
|
-
*
|
|
394
|
+
* Initialize hardware and connect to the AI server.
|
|
395
|
+
* This must be called in response to a user gesture (like a click)
|
|
396
|
+
* to satisfy browser AudioContext requirements.
|
|
377
397
|
*/
|
|
378
|
-
|
|
398
|
+
connect(): Promise<boolean>;
|
|
379
399
|
/**
|
|
380
|
-
* Get
|
|
400
|
+
* Get the current amplitude/volume of the microphone or output audio.
|
|
401
|
+
* Useful for voice activity visualization.
|
|
402
|
+
* @returns value between 0 and 1
|
|
381
403
|
*/
|
|
382
|
-
|
|
404
|
+
getAmplitude(): number;
|
|
383
405
|
/**
|
|
384
|
-
*
|
|
406
|
+
* Mute or unmute the microphone.
|
|
385
407
|
*/
|
|
386
|
-
|
|
408
|
+
toggleMute(): boolean;
|
|
387
409
|
/**
|
|
388
|
-
*
|
|
410
|
+
* High-precision method to get visemes that should be active
|
|
411
|
+
* at the current playback frame. Use this in a requestAnimationFrame loop.
|
|
389
412
|
*/
|
|
390
|
-
|
|
413
|
+
getFrameVisemes(): Viseme[];
|
|
391
414
|
/**
|
|
392
|
-
*
|
|
415
|
+
* Change the system prompt mid-conversation.
|
|
393
416
|
*/
|
|
394
|
-
|
|
417
|
+
updatePrompt(newPrompt: string): void;
|
|
395
418
|
/**
|
|
396
|
-
*
|
|
419
|
+
* Disconnect and release audio resources.
|
|
397
420
|
*/
|
|
398
|
-
|
|
421
|
+
disconnect(): void;
|
|
399
422
|
}
|
|
400
423
|
|
|
401
|
-
export { AUDIO_CONFIG, type AnalyserConfig, type BrowserAudioConfig, BrowserAudioManager, type BrowserAudioOptions, DEFAULT_URLS, Language, type LokutorConfig, StreamResampler, type SynthesizeOptions, TTSClient, type Viseme, VoiceAgentClient, type VoiceAgentOptions, VoiceStyle, applyLowPassFilter, bytesToPcm16, calculateRMS, float32ToPcm16, normalizeAudio, pcm16ToBytes, pcm16ToFloat32, resample, resampleWithAntiAliasing, simpleConversation, simpleTTS };
|
|
424
|
+
export { AUDIO_CONFIG, type AnalyserConfig, type BrowserAudioConfig, BrowserAudioManager, type BrowserAudioOptions, DEFAULT_URLS, Language, type LokutorConfig, StreamResampler, type SynthesizeOptions, TTSClient, type Viseme, VoiceAgent, VoiceAgentClient, type VoiceAgentOptions, VoiceStyle, applyLowPassFilter, bytesToPcm16, calculateRMS, float32ToPcm16, normalizeAudio, pcm16ToBytes, pcm16ToFloat32, resample, resampleWithAntiAliasing, simpleConversation, simpleTTS };
|
package/dist/index.js
CHANGED
|
@@ -26,6 +26,7 @@ __export(index_exports, {
|
|
|
26
26
|
Language: () => Language,
|
|
27
27
|
StreamResampler: () => StreamResampler,
|
|
28
28
|
TTSClient: () => TTSClient,
|
|
29
|
+
VoiceAgent: () => VoiceAgent,
|
|
29
30
|
VoiceAgentClient: () => VoiceAgentClient,
|
|
30
31
|
VoiceStyle: () => VoiceStyle,
|
|
31
32
|
applyLowPassFilter: () => applyLowPassFilter,
|
|
@@ -65,8 +66,8 @@ var Language = /* @__PURE__ */ ((Language2) => {
|
|
|
65
66
|
return Language2;
|
|
66
67
|
})(Language || {});
|
|
67
68
|
var AUDIO_CONFIG = {
|
|
68
|
-
SAMPLE_RATE:
|
|
69
|
-
SPEAKER_SAMPLE_RATE:
|
|
69
|
+
SAMPLE_RATE: 16e3,
|
|
70
|
+
SPEAKER_SAMPLE_RATE: 44100,
|
|
70
71
|
CHANNELS: 1,
|
|
71
72
|
CHUNK_DURATION_MS: 20,
|
|
72
73
|
get CHUNK_SIZE() {
|
|
@@ -103,17 +104,21 @@ var VoiceAgentClient = class {
|
|
|
103
104
|
isConnected = false;
|
|
104
105
|
messages = [];
|
|
105
106
|
visemeListeners = [];
|
|
107
|
+
wantVisemes = false;
|
|
108
|
+
serverUrl = null;
|
|
106
109
|
constructor(config) {
|
|
107
110
|
this.apiKey = config.apiKey;
|
|
108
111
|
this.prompt = config.prompt;
|
|
109
112
|
this.voice = config.voice || "F1" /* F1 */;
|
|
110
113
|
this.language = config.language || "en" /* ENGLISH */;
|
|
114
|
+
this.serverUrl = config.serverUrl || null;
|
|
111
115
|
this.onTranscription = config.onTranscription;
|
|
112
116
|
this.onResponse = config.onResponse;
|
|
113
117
|
this.onAudioCallback = config.onAudio;
|
|
114
118
|
this.onVisemesCallback = config.onVisemes;
|
|
115
119
|
this.onStatus = config.onStatus;
|
|
116
120
|
this.onError = config.onError;
|
|
121
|
+
this.wantVisemes = config.visemes || false;
|
|
117
122
|
}
|
|
118
123
|
/**
|
|
119
124
|
* Connect to the Lokutor Voice Agent server
|
|
@@ -121,12 +126,12 @@ var VoiceAgentClient = class {
|
|
|
121
126
|
async connect() {
|
|
122
127
|
return new Promise((resolve, reject) => {
|
|
123
128
|
try {
|
|
124
|
-
let url = DEFAULT_URLS.VOICE_AGENT;
|
|
129
|
+
let url = this.serverUrl || DEFAULT_URLS.VOICE_AGENT;
|
|
125
130
|
if (this.apiKey) {
|
|
126
131
|
const separator = url.includes("?") ? "&" : "?";
|
|
127
132
|
url += `${separator}api_key=${this.apiKey}`;
|
|
128
133
|
}
|
|
129
|
-
console.log(`\u{1F517} Connecting to ${
|
|
134
|
+
console.log(`\u{1F517} Connecting to ${url}...`);
|
|
130
135
|
this.ws = new WebSocket(url);
|
|
131
136
|
this.ws.binaryType = "arraybuffer";
|
|
132
137
|
this.ws.onopen = () => {
|
|
@@ -165,7 +170,8 @@ var VoiceAgentClient = class {
|
|
|
165
170
|
this.ws.send(JSON.stringify({ type: "prompt", data: this.prompt }));
|
|
166
171
|
this.ws.send(JSON.stringify({ type: "voice", data: this.voice }));
|
|
167
172
|
this.ws.send(JSON.stringify({ type: "language", data: this.language }));
|
|
168
|
-
|
|
173
|
+
this.ws.send(JSON.stringify({ type: "visemes", data: this.wantVisemes }));
|
|
174
|
+
console.log(`\u2699\uFE0F Configured: voice=${this.voice}, language=${this.language}, visemes=${this.wantVisemes}`);
|
|
169
175
|
}
|
|
170
176
|
/**
|
|
171
177
|
* Send raw PCM audio data to the server
|
|
@@ -498,7 +504,8 @@ var BrowserAudioManager = class {
|
|
|
498
504
|
// Playback scheduling
|
|
499
505
|
nextPlaybackTime = 0;
|
|
500
506
|
activeSources = [];
|
|
501
|
-
|
|
507
|
+
// High-precision clock anchor for viseme sync
|
|
508
|
+
audioClockOffset = null;
|
|
502
509
|
// Configuration
|
|
503
510
|
inputSampleRate;
|
|
504
511
|
outputSampleRate;
|
|
@@ -511,6 +518,7 @@ var BrowserAudioManager = class {
|
|
|
511
518
|
// Audio processing state
|
|
512
519
|
isMuted = false;
|
|
513
520
|
isListening = false;
|
|
521
|
+
resampler = null;
|
|
514
522
|
constructor(config = {}) {
|
|
515
523
|
this.inputSampleRate = config.inputSampleRate ?? AUDIO_CONFIG.SAMPLE_RATE;
|
|
516
524
|
this.outputSampleRate = config.outputSampleRate ?? AUDIO_CONFIG.SPEAKER_SAMPLE_RATE;
|
|
@@ -534,7 +542,6 @@ var BrowserAudioManager = class {
|
|
|
534
542
|
}
|
|
535
543
|
if (this.audioContext.state === "suspended") {
|
|
536
544
|
await this.audioContext.resume();
|
|
537
|
-
console.log("\u{1F442} AudioContext resumed");
|
|
538
545
|
}
|
|
539
546
|
if (analyserConfig?.enabled !== false) {
|
|
540
547
|
this.analyserNode = this.audioContext.createAnalyser();
|
|
@@ -548,6 +555,7 @@ var BrowserAudioManager = class {
|
|
|
548
555
|
if (!this.audioContext) {
|
|
549
556
|
await this.init();
|
|
550
557
|
}
|
|
558
|
+
this.resampler = new StreamResampler(this.audioContext.sampleRate, this.inputSampleRate);
|
|
551
559
|
try {
|
|
552
560
|
this.onAudioInput = onAudioInput;
|
|
553
561
|
this.isListening = true;
|
|
@@ -563,9 +571,7 @@ var BrowserAudioManager = class {
|
|
|
563
571
|
this.scriptProcessor = this.audioContext.createScriptProcessor(
|
|
564
572
|
bufferSize,
|
|
565
573
|
1,
|
|
566
|
-
// input channels
|
|
567
574
|
1
|
|
568
|
-
// output channels
|
|
569
575
|
);
|
|
570
576
|
this.mediaStreamAudioSourceNode.connect(this.scriptProcessor);
|
|
571
577
|
this.scriptProcessor.connect(this.audioContext.destination);
|
|
@@ -582,40 +588,19 @@ var BrowserAudioManager = class {
|
|
|
582
588
|
throw err;
|
|
583
589
|
}
|
|
584
590
|
}
|
|
585
|
-
/**
|
|
586
|
-
* Internal method to process microphone audio data
|
|
587
|
-
*/
|
|
588
591
|
_processAudioInput(event) {
|
|
589
|
-
if (!this.onAudioInput || !this.audioContext || !this.isListening) return;
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
const
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
}
|
|
597
|
-
const hardwareRate = this.audioContext.sampleRate;
|
|
598
|
-
let processedData = new Float32Array(inputData);
|
|
599
|
-
if (hardwareRate !== this.inputSampleRate) {
|
|
600
|
-
processedData = resampleWithAntiAliasing(
|
|
601
|
-
processedData,
|
|
602
|
-
hardwareRate,
|
|
603
|
-
this.inputSampleRate
|
|
604
|
-
);
|
|
592
|
+
if (!this.onAudioInput || !this.audioContext || !this.isListening || this.isMuted) return;
|
|
593
|
+
const inputData = event.inputBuffer.getChannelData(0);
|
|
594
|
+
event.outputBuffer.getChannelData(0).fill(0);
|
|
595
|
+
const resampled = this.resampler ? this.resampler.process(inputData) : inputData;
|
|
596
|
+
if (resampled && resampled.length > 0) {
|
|
597
|
+
const int16Data = float32ToPcm16(resampled);
|
|
598
|
+
this.onAudioInput(new Uint8Array(int16Data.buffer, int16Data.byteOffset, int16Data.byteLength));
|
|
605
599
|
}
|
|
606
|
-
const int16Data = float32ToPcm16(processedData);
|
|
607
|
-
const uint8Data = new Uint8Array(
|
|
608
|
-
int16Data.buffer,
|
|
609
|
-
int16Data.byteOffset,
|
|
610
|
-
int16Data.byteLength
|
|
611
|
-
);
|
|
612
|
-
this.onAudioInput(uint8Data);
|
|
613
600
|
}
|
|
614
|
-
/**
|
|
615
|
-
* Stop capturing microphone input
|
|
616
|
-
*/
|
|
617
601
|
stopMicrophone() {
|
|
618
602
|
this.isListening = false;
|
|
603
|
+
this.resampler = null;
|
|
619
604
|
if (this.mediaStream) {
|
|
620
605
|
this.mediaStream.getTracks().forEach((track) => track.stop());
|
|
621
606
|
this.mediaStream = null;
|
|
@@ -628,17 +613,12 @@ var BrowserAudioManager = class {
|
|
|
628
613
|
this.mediaStreamAudioSourceNode.disconnect();
|
|
629
614
|
this.mediaStreamAudioSourceNode = null;
|
|
630
615
|
}
|
|
631
|
-
console.log("\u{1F3A4} Microphone stopped");
|
|
632
616
|
}
|
|
633
617
|
/**
|
|
634
618
|
* Play back audio received from the server
|
|
635
|
-
* @param pcm16Data Int16 PCM audio data at SPEAKER_SAMPLE_RATE
|
|
636
619
|
*/
|
|
637
620
|
playAudio(pcm16Data) {
|
|
638
|
-
if (!this.audioContext)
|
|
639
|
-
console.warn("AudioContext not initialized");
|
|
640
|
-
return;
|
|
641
|
-
}
|
|
621
|
+
if (!this.audioContext) return;
|
|
642
622
|
const int16Array = new Int16Array(
|
|
643
623
|
pcm16Data.buffer,
|
|
644
624
|
pcm16Data.byteOffset,
|
|
@@ -653,18 +633,17 @@ var BrowserAudioManager = class {
|
|
|
653
633
|
audioBuffer.getChannelData(0).set(float32Data);
|
|
654
634
|
this._schedulePlayback(audioBuffer);
|
|
655
635
|
}
|
|
656
|
-
/**
|
|
657
|
-
* Internal method to schedule and play audio with sample-accurate timing
|
|
658
|
-
*/
|
|
659
636
|
_schedulePlayback(audioBuffer) {
|
|
660
637
|
if (!this.audioContext) return;
|
|
661
638
|
const currentTime = this.audioContext.currentTime;
|
|
662
639
|
const duration = audioBuffer.length / this.outputSampleRate;
|
|
663
640
|
const startTime = Math.max(
|
|
664
641
|
currentTime + 0.01,
|
|
665
|
-
// Minimum 10ms delay
|
|
666
642
|
this.nextPlaybackTime
|
|
667
643
|
);
|
|
644
|
+
if (this.audioClockOffset === null) {
|
|
645
|
+
this.audioClockOffset = startTime;
|
|
646
|
+
}
|
|
668
647
|
this.nextPlaybackTime = startTime + duration;
|
|
669
648
|
const source = this.audioContext.createBufferSource();
|
|
670
649
|
source.buffer = audioBuffer;
|
|
@@ -682,8 +661,18 @@ var BrowserAudioManager = class {
|
|
|
682
661
|
};
|
|
683
662
|
}
|
|
684
663
|
/**
|
|
685
|
-
*
|
|
664
|
+
* Get the current high-precision audio clock offset for viseme synchronization.
|
|
665
|
+
* Total stream time (in ms) = (audioContext.currentTime - audioClockOffset) * 1000
|
|
666
|
+
*/
|
|
667
|
+
getAudioClockOffset() {
|
|
668
|
+
return this.audioClockOffset;
|
|
669
|
+
}
|
|
670
|
+
/**
|
|
671
|
+
* Reset the audio clock offset (call when a response is interrupted or finished)
|
|
686
672
|
*/
|
|
673
|
+
resetAudioClock() {
|
|
674
|
+
this.audioClockOffset = null;
|
|
675
|
+
}
|
|
687
676
|
stopPlayback() {
|
|
688
677
|
this.activeSources.forEach((source) => {
|
|
689
678
|
try {
|
|
@@ -692,26 +681,15 @@ var BrowserAudioManager = class {
|
|
|
692
681
|
}
|
|
693
682
|
});
|
|
694
683
|
this.activeSources = [];
|
|
695
|
-
this.
|
|
696
|
-
this.
|
|
697
|
-
console.log("\u{1F507} Playback stopped");
|
|
684
|
+
this.nextPlaybackTime = 0;
|
|
685
|
+
this.resetAudioClock();
|
|
698
686
|
}
|
|
699
|
-
/**
|
|
700
|
-
* Toggle mute state
|
|
701
|
-
*/
|
|
702
687
|
setMuted(muted) {
|
|
703
688
|
this.isMuted = muted;
|
|
704
689
|
}
|
|
705
|
-
/**
|
|
706
|
-
* Get current mute state
|
|
707
|
-
*/
|
|
708
690
|
isMicMuted() {
|
|
709
691
|
return this.isMuted;
|
|
710
692
|
}
|
|
711
|
-
/**
|
|
712
|
-
* Get current amplitude from analyser (for visualization)
|
|
713
|
-
* Returns value between 0 and 1
|
|
714
|
-
*/
|
|
715
693
|
getAmplitude() {
|
|
716
694
|
if (!this.analyserNode) return 0;
|
|
717
695
|
const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
|
|
@@ -719,31 +697,18 @@ var BrowserAudioManager = class {
|
|
|
719
697
|
const rms = calculateRMS(dataArray);
|
|
720
698
|
return Math.min(rms * 10, 1);
|
|
721
699
|
}
|
|
722
|
-
/**
|
|
723
|
-
* Get frequency data from analyser for visualization
|
|
724
|
-
*/
|
|
725
700
|
getFrequencyData() {
|
|
726
|
-
if (!this.analyserNode)
|
|
727
|
-
return new Uint8Array(0);
|
|
728
|
-
}
|
|
701
|
+
if (!this.analyserNode) return new Uint8Array(0);
|
|
729
702
|
const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
|
|
730
703
|
this.analyserNode.getByteFrequencyData(dataArray);
|
|
731
704
|
return dataArray;
|
|
732
705
|
}
|
|
733
|
-
/**
|
|
734
|
-
* Get time-domain data from analyser for waveform visualization
|
|
735
|
-
*/
|
|
736
706
|
getWaveformData() {
|
|
737
|
-
if (!this.analyserNode)
|
|
738
|
-
return new Uint8Array(0);
|
|
739
|
-
}
|
|
707
|
+
if (!this.analyserNode) return new Uint8Array(0);
|
|
740
708
|
const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
|
|
741
709
|
this.analyserNode.getByteTimeDomainData(dataArray);
|
|
742
710
|
return dataArray;
|
|
743
711
|
}
|
|
744
|
-
/**
|
|
745
|
-
* Cleanup and close AudioContext
|
|
746
|
-
*/
|
|
747
712
|
cleanup() {
|
|
748
713
|
this.stopMicrophone();
|
|
749
714
|
this.stopPlayback();
|
|
@@ -752,17 +717,124 @@ var BrowserAudioManager = class {
|
|
|
752
717
|
this.analyserNode = null;
|
|
753
718
|
}
|
|
754
719
|
}
|
|
720
|
+
getAudioContext() {
|
|
721
|
+
return this.audioContext;
|
|
722
|
+
}
|
|
723
|
+
};
|
|
724
|
+
|
|
725
|
+
// src/voice-agent.ts
|
|
726
|
+
var VoiceAgent = class {
|
|
727
|
+
client;
|
|
728
|
+
audioManager;
|
|
729
|
+
options;
|
|
730
|
+
isConnected = false;
|
|
731
|
+
visemeQueue = [];
|
|
732
|
+
constructor(options) {
|
|
733
|
+
this.options = options;
|
|
734
|
+
this.client = new VoiceAgentClient({
|
|
735
|
+
apiKey: options.apiKey,
|
|
736
|
+
prompt: options.prompt || "You are a helpful and friendly AI assistant.",
|
|
737
|
+
voice: options.voice || "F1" /* F1 */,
|
|
738
|
+
language: options.language || "en" /* ENGLISH */,
|
|
739
|
+
visemes: options.visemes ?? true,
|
|
740
|
+
serverUrl: options.serverUrl,
|
|
741
|
+
onTranscription: (text) => {
|
|
742
|
+
if (options.onTranscription) options.onTranscription(text, true);
|
|
743
|
+
},
|
|
744
|
+
onResponse: (text) => {
|
|
745
|
+
if (options.onTranscription) options.onTranscription(text, false);
|
|
746
|
+
},
|
|
747
|
+
onAudio: (data) => {
|
|
748
|
+
this.audioManager.playAudio(data);
|
|
749
|
+
},
|
|
750
|
+
onVisemes: (visemes) => {
|
|
751
|
+
this.visemeQueue.push(...visemes);
|
|
752
|
+
if (options.onVisemes) options.onVisemes(visemes);
|
|
753
|
+
},
|
|
754
|
+
onStatus: (status) => {
|
|
755
|
+
if (options.onStatusChange) options.onStatusChange(status);
|
|
756
|
+
if (status === "interrupted" || status === "thinking") {
|
|
757
|
+
this.audioManager.stopPlayback();
|
|
758
|
+
this.visemeQueue = [];
|
|
759
|
+
}
|
|
760
|
+
},
|
|
761
|
+
onError: (err) => {
|
|
762
|
+
if (options.onError) options.onError(err);
|
|
763
|
+
}
|
|
764
|
+
});
|
|
765
|
+
this.audioManager = new BrowserAudioManager({
|
|
766
|
+
autoGainControl: true,
|
|
767
|
+
echoCancellation: true,
|
|
768
|
+
noiseSuppression: true
|
|
769
|
+
});
|
|
770
|
+
}
|
|
755
771
|
/**
|
|
756
|
-
*
|
|
772
|
+
* Initialize hardware and connect to the AI server.
|
|
773
|
+
* This must be called in response to a user gesture (like a click)
|
|
774
|
+
* to satisfy browser AudioContext requirements.
|
|
757
775
|
*/
|
|
758
|
-
|
|
759
|
-
|
|
776
|
+
async connect() {
|
|
777
|
+
try {
|
|
778
|
+
await this.audioManager.init();
|
|
779
|
+
const connected = await this.client.connect();
|
|
780
|
+
if (!connected) return false;
|
|
781
|
+
this.isConnected = true;
|
|
782
|
+
await this.audioManager.startMicrophone((pcm16Data) => {
|
|
783
|
+
if (this.isConnected) {
|
|
784
|
+
this.client.sendAudio(pcm16Data);
|
|
785
|
+
}
|
|
786
|
+
});
|
|
787
|
+
return true;
|
|
788
|
+
} catch (err) {
|
|
789
|
+
if (this.options.onError) this.options.onError(err);
|
|
790
|
+
return false;
|
|
791
|
+
}
|
|
760
792
|
}
|
|
761
793
|
/**
|
|
762
|
-
*
|
|
794
|
+
* Get the current amplitude/volume of the microphone or output audio.
|
|
795
|
+
* Useful for voice activity visualization.
|
|
796
|
+
* @returns value between 0 and 1
|
|
763
797
|
*/
|
|
764
|
-
|
|
765
|
-
return this.
|
|
798
|
+
getAmplitude() {
|
|
799
|
+
return this.audioManager.getAmplitude();
|
|
800
|
+
}
|
|
801
|
+
/**
|
|
802
|
+
* Mute or unmute the microphone.
|
|
803
|
+
*/
|
|
804
|
+
toggleMute() {
|
|
805
|
+
const currentState = this.audioManager.isMicMuted();
|
|
806
|
+
this.audioManager.setMuted(!currentState);
|
|
807
|
+
return !currentState;
|
|
808
|
+
}
|
|
809
|
+
/**
|
|
810
|
+
* High-precision method to get visemes that should be active
|
|
811
|
+
* at the current playback frame. Use this in a requestAnimationFrame loop.
|
|
812
|
+
*/
|
|
813
|
+
getFrameVisemes() {
|
|
814
|
+
const offset = this.audioManager.getAudioClockOffset();
|
|
815
|
+
const audioCtx = this.audioManager.getAudioContext();
|
|
816
|
+
if (offset === null || !audioCtx) return [];
|
|
817
|
+
const streamTime = (audioCtx.currentTime - offset) * 1e3;
|
|
818
|
+
const currentBatch = [];
|
|
819
|
+
while (this.visemeQueue.length > 0 && this.visemeQueue[0].t * 1e3 <= streamTime) {
|
|
820
|
+
currentBatch.push(this.visemeQueue.shift());
|
|
821
|
+
}
|
|
822
|
+
return currentBatch;
|
|
823
|
+
}
|
|
824
|
+
/**
|
|
825
|
+
* Change the system prompt mid-conversation.
|
|
826
|
+
*/
|
|
827
|
+
updatePrompt(newPrompt) {
|
|
828
|
+
this.client.updatePrompt(newPrompt);
|
|
829
|
+
}
|
|
830
|
+
/**
|
|
831
|
+
* Disconnect and release audio resources.
|
|
832
|
+
*/
|
|
833
|
+
disconnect() {
|
|
834
|
+
this.isConnected = false;
|
|
835
|
+
this.client.disconnect();
|
|
836
|
+
this.audioManager.cleanup();
|
|
837
|
+
this.visemeQueue = [];
|
|
766
838
|
}
|
|
767
839
|
};
|
|
768
840
|
// Annotate the CommonJS export names for ESM import in node:
|
|
@@ -773,6 +845,7 @@ var BrowserAudioManager = class {
|
|
|
773
845
|
Language,
|
|
774
846
|
StreamResampler,
|
|
775
847
|
TTSClient,
|
|
848
|
+
VoiceAgent,
|
|
776
849
|
VoiceAgentClient,
|
|
777
850
|
VoiceStyle,
|
|
778
851
|
applyLowPassFilter,
|
package/dist/index.mjs
CHANGED
|
@@ -21,8 +21,8 @@ var Language = /* @__PURE__ */ ((Language2) => {
|
|
|
21
21
|
return Language2;
|
|
22
22
|
})(Language || {});
|
|
23
23
|
var AUDIO_CONFIG = {
|
|
24
|
-
SAMPLE_RATE:
|
|
25
|
-
SPEAKER_SAMPLE_RATE:
|
|
24
|
+
SAMPLE_RATE: 16e3,
|
|
25
|
+
SPEAKER_SAMPLE_RATE: 44100,
|
|
26
26
|
CHANNELS: 1,
|
|
27
27
|
CHUNK_DURATION_MS: 20,
|
|
28
28
|
get CHUNK_SIZE() {
|
|
@@ -59,17 +59,21 @@ var VoiceAgentClient = class {
|
|
|
59
59
|
isConnected = false;
|
|
60
60
|
messages = [];
|
|
61
61
|
visemeListeners = [];
|
|
62
|
+
wantVisemes = false;
|
|
63
|
+
serverUrl = null;
|
|
62
64
|
constructor(config) {
|
|
63
65
|
this.apiKey = config.apiKey;
|
|
64
66
|
this.prompt = config.prompt;
|
|
65
67
|
this.voice = config.voice || "F1" /* F1 */;
|
|
66
68
|
this.language = config.language || "en" /* ENGLISH */;
|
|
69
|
+
this.serverUrl = config.serverUrl || null;
|
|
67
70
|
this.onTranscription = config.onTranscription;
|
|
68
71
|
this.onResponse = config.onResponse;
|
|
69
72
|
this.onAudioCallback = config.onAudio;
|
|
70
73
|
this.onVisemesCallback = config.onVisemes;
|
|
71
74
|
this.onStatus = config.onStatus;
|
|
72
75
|
this.onError = config.onError;
|
|
76
|
+
this.wantVisemes = config.visemes || false;
|
|
73
77
|
}
|
|
74
78
|
/**
|
|
75
79
|
* Connect to the Lokutor Voice Agent server
|
|
@@ -77,12 +81,12 @@ var VoiceAgentClient = class {
|
|
|
77
81
|
async connect() {
|
|
78
82
|
return new Promise((resolve, reject) => {
|
|
79
83
|
try {
|
|
80
|
-
let url = DEFAULT_URLS.VOICE_AGENT;
|
|
84
|
+
let url = this.serverUrl || DEFAULT_URLS.VOICE_AGENT;
|
|
81
85
|
if (this.apiKey) {
|
|
82
86
|
const separator = url.includes("?") ? "&" : "?";
|
|
83
87
|
url += `${separator}api_key=${this.apiKey}`;
|
|
84
88
|
}
|
|
85
|
-
console.log(`\u{1F517} Connecting to ${
|
|
89
|
+
console.log(`\u{1F517} Connecting to ${url}...`);
|
|
86
90
|
this.ws = new WebSocket(url);
|
|
87
91
|
this.ws.binaryType = "arraybuffer";
|
|
88
92
|
this.ws.onopen = () => {
|
|
@@ -121,7 +125,8 @@ var VoiceAgentClient = class {
|
|
|
121
125
|
this.ws.send(JSON.stringify({ type: "prompt", data: this.prompt }));
|
|
122
126
|
this.ws.send(JSON.stringify({ type: "voice", data: this.voice }));
|
|
123
127
|
this.ws.send(JSON.stringify({ type: "language", data: this.language }));
|
|
124
|
-
|
|
128
|
+
this.ws.send(JSON.stringify({ type: "visemes", data: this.wantVisemes }));
|
|
129
|
+
console.log(`\u2699\uFE0F Configured: voice=${this.voice}, language=${this.language}, visemes=${this.wantVisemes}`);
|
|
125
130
|
}
|
|
126
131
|
/**
|
|
127
132
|
* Send raw PCM audio data to the server
|
|
@@ -454,7 +459,8 @@ var BrowserAudioManager = class {
|
|
|
454
459
|
// Playback scheduling
|
|
455
460
|
nextPlaybackTime = 0;
|
|
456
461
|
activeSources = [];
|
|
457
|
-
|
|
462
|
+
// High-precision clock anchor for viseme sync
|
|
463
|
+
audioClockOffset = null;
|
|
458
464
|
// Configuration
|
|
459
465
|
inputSampleRate;
|
|
460
466
|
outputSampleRate;
|
|
@@ -467,6 +473,7 @@ var BrowserAudioManager = class {
|
|
|
467
473
|
// Audio processing state
|
|
468
474
|
isMuted = false;
|
|
469
475
|
isListening = false;
|
|
476
|
+
resampler = null;
|
|
470
477
|
constructor(config = {}) {
|
|
471
478
|
this.inputSampleRate = config.inputSampleRate ?? AUDIO_CONFIG.SAMPLE_RATE;
|
|
472
479
|
this.outputSampleRate = config.outputSampleRate ?? AUDIO_CONFIG.SPEAKER_SAMPLE_RATE;
|
|
@@ -490,7 +497,6 @@ var BrowserAudioManager = class {
|
|
|
490
497
|
}
|
|
491
498
|
if (this.audioContext.state === "suspended") {
|
|
492
499
|
await this.audioContext.resume();
|
|
493
|
-
console.log("\u{1F442} AudioContext resumed");
|
|
494
500
|
}
|
|
495
501
|
if (analyserConfig?.enabled !== false) {
|
|
496
502
|
this.analyserNode = this.audioContext.createAnalyser();
|
|
@@ -504,6 +510,7 @@ var BrowserAudioManager = class {
|
|
|
504
510
|
if (!this.audioContext) {
|
|
505
511
|
await this.init();
|
|
506
512
|
}
|
|
513
|
+
this.resampler = new StreamResampler(this.audioContext.sampleRate, this.inputSampleRate);
|
|
507
514
|
try {
|
|
508
515
|
this.onAudioInput = onAudioInput;
|
|
509
516
|
this.isListening = true;
|
|
@@ -519,9 +526,7 @@ var BrowserAudioManager = class {
|
|
|
519
526
|
this.scriptProcessor = this.audioContext.createScriptProcessor(
|
|
520
527
|
bufferSize,
|
|
521
528
|
1,
|
|
522
|
-
// input channels
|
|
523
529
|
1
|
|
524
|
-
// output channels
|
|
525
530
|
);
|
|
526
531
|
this.mediaStreamAudioSourceNode.connect(this.scriptProcessor);
|
|
527
532
|
this.scriptProcessor.connect(this.audioContext.destination);
|
|
@@ -538,40 +543,19 @@ var BrowserAudioManager = class {
|
|
|
538
543
|
throw err;
|
|
539
544
|
}
|
|
540
545
|
}
|
|
541
|
-
/**
|
|
542
|
-
* Internal method to process microphone audio data
|
|
543
|
-
*/
|
|
544
546
|
_processAudioInput(event) {
|
|
545
|
-
if (!this.onAudioInput || !this.audioContext || !this.isListening) return;
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
const
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
}
|
|
553
|
-
const hardwareRate = this.audioContext.sampleRate;
|
|
554
|
-
let processedData = new Float32Array(inputData);
|
|
555
|
-
if (hardwareRate !== this.inputSampleRate) {
|
|
556
|
-
processedData = resampleWithAntiAliasing(
|
|
557
|
-
processedData,
|
|
558
|
-
hardwareRate,
|
|
559
|
-
this.inputSampleRate
|
|
560
|
-
);
|
|
547
|
+
if (!this.onAudioInput || !this.audioContext || !this.isListening || this.isMuted) return;
|
|
548
|
+
const inputData = event.inputBuffer.getChannelData(0);
|
|
549
|
+
event.outputBuffer.getChannelData(0).fill(0);
|
|
550
|
+
const resampled = this.resampler ? this.resampler.process(inputData) : inputData;
|
|
551
|
+
if (resampled && resampled.length > 0) {
|
|
552
|
+
const int16Data = float32ToPcm16(resampled);
|
|
553
|
+
this.onAudioInput(new Uint8Array(int16Data.buffer, int16Data.byteOffset, int16Data.byteLength));
|
|
561
554
|
}
|
|
562
|
-
const int16Data = float32ToPcm16(processedData);
|
|
563
|
-
const uint8Data = new Uint8Array(
|
|
564
|
-
int16Data.buffer,
|
|
565
|
-
int16Data.byteOffset,
|
|
566
|
-
int16Data.byteLength
|
|
567
|
-
);
|
|
568
|
-
this.onAudioInput(uint8Data);
|
|
569
555
|
}
|
|
570
|
-
/**
|
|
571
|
-
* Stop capturing microphone input
|
|
572
|
-
*/
|
|
573
556
|
stopMicrophone() {
|
|
574
557
|
this.isListening = false;
|
|
558
|
+
this.resampler = null;
|
|
575
559
|
if (this.mediaStream) {
|
|
576
560
|
this.mediaStream.getTracks().forEach((track) => track.stop());
|
|
577
561
|
this.mediaStream = null;
|
|
@@ -584,17 +568,12 @@ var BrowserAudioManager = class {
|
|
|
584
568
|
this.mediaStreamAudioSourceNode.disconnect();
|
|
585
569
|
this.mediaStreamAudioSourceNode = null;
|
|
586
570
|
}
|
|
587
|
-
console.log("\u{1F3A4} Microphone stopped");
|
|
588
571
|
}
|
|
589
572
|
/**
|
|
590
573
|
* Play back audio received from the server
|
|
591
|
-
* @param pcm16Data Int16 PCM audio data at SPEAKER_SAMPLE_RATE
|
|
592
574
|
*/
|
|
593
575
|
playAudio(pcm16Data) {
|
|
594
|
-
if (!this.audioContext)
|
|
595
|
-
console.warn("AudioContext not initialized");
|
|
596
|
-
return;
|
|
597
|
-
}
|
|
576
|
+
if (!this.audioContext) return;
|
|
598
577
|
const int16Array = new Int16Array(
|
|
599
578
|
pcm16Data.buffer,
|
|
600
579
|
pcm16Data.byteOffset,
|
|
@@ -609,18 +588,17 @@ var BrowserAudioManager = class {
|
|
|
609
588
|
audioBuffer.getChannelData(0).set(float32Data);
|
|
610
589
|
this._schedulePlayback(audioBuffer);
|
|
611
590
|
}
|
|
612
|
-
/**
|
|
613
|
-
* Internal method to schedule and play audio with sample-accurate timing
|
|
614
|
-
*/
|
|
615
591
|
_schedulePlayback(audioBuffer) {
|
|
616
592
|
if (!this.audioContext) return;
|
|
617
593
|
const currentTime = this.audioContext.currentTime;
|
|
618
594
|
const duration = audioBuffer.length / this.outputSampleRate;
|
|
619
595
|
const startTime = Math.max(
|
|
620
596
|
currentTime + 0.01,
|
|
621
|
-
// Minimum 10ms delay
|
|
622
597
|
this.nextPlaybackTime
|
|
623
598
|
);
|
|
599
|
+
if (this.audioClockOffset === null) {
|
|
600
|
+
this.audioClockOffset = startTime;
|
|
601
|
+
}
|
|
624
602
|
this.nextPlaybackTime = startTime + duration;
|
|
625
603
|
const source = this.audioContext.createBufferSource();
|
|
626
604
|
source.buffer = audioBuffer;
|
|
@@ -638,8 +616,18 @@ var BrowserAudioManager = class {
|
|
|
638
616
|
};
|
|
639
617
|
}
|
|
640
618
|
/**
|
|
641
|
-
*
|
|
619
|
+
* Get the current high-precision audio clock offset for viseme synchronization.
|
|
620
|
+
* Total stream time (in ms) = (audioContext.currentTime - audioClockOffset) * 1000
|
|
621
|
+
*/
|
|
622
|
+
getAudioClockOffset() {
|
|
623
|
+
return this.audioClockOffset;
|
|
624
|
+
}
|
|
625
|
+
/**
|
|
626
|
+
* Reset the audio clock offset (call when a response is interrupted or finished)
|
|
642
627
|
*/
|
|
628
|
+
resetAudioClock() {
|
|
629
|
+
this.audioClockOffset = null;
|
|
630
|
+
}
|
|
643
631
|
stopPlayback() {
|
|
644
632
|
this.activeSources.forEach((source) => {
|
|
645
633
|
try {
|
|
@@ -648,26 +636,15 @@ var BrowserAudioManager = class {
|
|
|
648
636
|
}
|
|
649
637
|
});
|
|
650
638
|
this.activeSources = [];
|
|
651
|
-
this.
|
|
652
|
-
this.
|
|
653
|
-
console.log("\u{1F507} Playback stopped");
|
|
639
|
+
this.nextPlaybackTime = 0;
|
|
640
|
+
this.resetAudioClock();
|
|
654
641
|
}
|
|
655
|
-
/**
|
|
656
|
-
* Toggle mute state
|
|
657
|
-
*/
|
|
658
642
|
setMuted(muted) {
|
|
659
643
|
this.isMuted = muted;
|
|
660
644
|
}
|
|
661
|
-
/**
|
|
662
|
-
* Get current mute state
|
|
663
|
-
*/
|
|
664
645
|
isMicMuted() {
|
|
665
646
|
return this.isMuted;
|
|
666
647
|
}
|
|
667
|
-
/**
|
|
668
|
-
* Get current amplitude from analyser (for visualization)
|
|
669
|
-
* Returns value between 0 and 1
|
|
670
|
-
*/
|
|
671
648
|
getAmplitude() {
|
|
672
649
|
if (!this.analyserNode) return 0;
|
|
673
650
|
const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
|
|
@@ -675,31 +652,18 @@ var BrowserAudioManager = class {
|
|
|
675
652
|
const rms = calculateRMS(dataArray);
|
|
676
653
|
return Math.min(rms * 10, 1);
|
|
677
654
|
}
|
|
678
|
-
/**
|
|
679
|
-
* Get frequency data from analyser for visualization
|
|
680
|
-
*/
|
|
681
655
|
getFrequencyData() {
|
|
682
|
-
if (!this.analyserNode)
|
|
683
|
-
return new Uint8Array(0);
|
|
684
|
-
}
|
|
656
|
+
if (!this.analyserNode) return new Uint8Array(0);
|
|
685
657
|
const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
|
|
686
658
|
this.analyserNode.getByteFrequencyData(dataArray);
|
|
687
659
|
return dataArray;
|
|
688
660
|
}
|
|
689
|
-
/**
|
|
690
|
-
* Get time-domain data from analyser for waveform visualization
|
|
691
|
-
*/
|
|
692
661
|
getWaveformData() {
|
|
693
|
-
if (!this.analyserNode)
|
|
694
|
-
return new Uint8Array(0);
|
|
695
|
-
}
|
|
662
|
+
if (!this.analyserNode) return new Uint8Array(0);
|
|
696
663
|
const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
|
|
697
664
|
this.analyserNode.getByteTimeDomainData(dataArray);
|
|
698
665
|
return dataArray;
|
|
699
666
|
}
|
|
700
|
-
/**
|
|
701
|
-
* Cleanup and close AudioContext
|
|
702
|
-
*/
|
|
703
667
|
cleanup() {
|
|
704
668
|
this.stopMicrophone();
|
|
705
669
|
this.stopPlayback();
|
|
@@ -708,17 +672,124 @@ var BrowserAudioManager = class {
|
|
|
708
672
|
this.analyserNode = null;
|
|
709
673
|
}
|
|
710
674
|
}
|
|
675
|
+
getAudioContext() {
|
|
676
|
+
return this.audioContext;
|
|
677
|
+
}
|
|
678
|
+
};
|
|
679
|
+
|
|
680
|
+
// src/voice-agent.ts
|
|
681
|
+
var VoiceAgent = class {
|
|
682
|
+
client;
|
|
683
|
+
audioManager;
|
|
684
|
+
options;
|
|
685
|
+
isConnected = false;
|
|
686
|
+
visemeQueue = [];
|
|
687
|
+
constructor(options) {
|
|
688
|
+
this.options = options;
|
|
689
|
+
this.client = new VoiceAgentClient({
|
|
690
|
+
apiKey: options.apiKey,
|
|
691
|
+
prompt: options.prompt || "You are a helpful and friendly AI assistant.",
|
|
692
|
+
voice: options.voice || "F1" /* F1 */,
|
|
693
|
+
language: options.language || "en" /* ENGLISH */,
|
|
694
|
+
visemes: options.visemes ?? true,
|
|
695
|
+
serverUrl: options.serverUrl,
|
|
696
|
+
onTranscription: (text) => {
|
|
697
|
+
if (options.onTranscription) options.onTranscription(text, true);
|
|
698
|
+
},
|
|
699
|
+
onResponse: (text) => {
|
|
700
|
+
if (options.onTranscription) options.onTranscription(text, false);
|
|
701
|
+
},
|
|
702
|
+
onAudio: (data) => {
|
|
703
|
+
this.audioManager.playAudio(data);
|
|
704
|
+
},
|
|
705
|
+
onVisemes: (visemes) => {
|
|
706
|
+
this.visemeQueue.push(...visemes);
|
|
707
|
+
if (options.onVisemes) options.onVisemes(visemes);
|
|
708
|
+
},
|
|
709
|
+
onStatus: (status) => {
|
|
710
|
+
if (options.onStatusChange) options.onStatusChange(status);
|
|
711
|
+
if (status === "interrupted" || status === "thinking") {
|
|
712
|
+
this.audioManager.stopPlayback();
|
|
713
|
+
this.visemeQueue = [];
|
|
714
|
+
}
|
|
715
|
+
},
|
|
716
|
+
onError: (err) => {
|
|
717
|
+
if (options.onError) options.onError(err);
|
|
718
|
+
}
|
|
719
|
+
});
|
|
720
|
+
this.audioManager = new BrowserAudioManager({
|
|
721
|
+
autoGainControl: true,
|
|
722
|
+
echoCancellation: true,
|
|
723
|
+
noiseSuppression: true
|
|
724
|
+
});
|
|
725
|
+
}
|
|
711
726
|
/**
|
|
712
|
-
*
|
|
727
|
+
* Initialize hardware and connect to the AI server.
|
|
728
|
+
* This must be called in response to a user gesture (like a click)
|
|
729
|
+
* to satisfy browser AudioContext requirements.
|
|
713
730
|
*/
|
|
714
|
-
|
|
715
|
-
|
|
731
|
+
async connect() {
|
|
732
|
+
try {
|
|
733
|
+
await this.audioManager.init();
|
|
734
|
+
const connected = await this.client.connect();
|
|
735
|
+
if (!connected) return false;
|
|
736
|
+
this.isConnected = true;
|
|
737
|
+
await this.audioManager.startMicrophone((pcm16Data) => {
|
|
738
|
+
if (this.isConnected) {
|
|
739
|
+
this.client.sendAudio(pcm16Data);
|
|
740
|
+
}
|
|
741
|
+
});
|
|
742
|
+
return true;
|
|
743
|
+
} catch (err) {
|
|
744
|
+
if (this.options.onError) this.options.onError(err);
|
|
745
|
+
return false;
|
|
746
|
+
}
|
|
716
747
|
}
|
|
717
748
|
/**
|
|
718
|
-
*
|
|
749
|
+
* Get the current amplitude/volume of the microphone or output audio.
|
|
750
|
+
* Useful for voice activity visualization.
|
|
751
|
+
* @returns value between 0 and 1
|
|
719
752
|
*/
|
|
720
|
-
|
|
721
|
-
return this.
|
|
753
|
+
getAmplitude() {
|
|
754
|
+
return this.audioManager.getAmplitude();
|
|
755
|
+
}
|
|
756
|
+
/**
|
|
757
|
+
* Mute or unmute the microphone.
|
|
758
|
+
*/
|
|
759
|
+
toggleMute() {
|
|
760
|
+
const currentState = this.audioManager.isMicMuted();
|
|
761
|
+
this.audioManager.setMuted(!currentState);
|
|
762
|
+
return !currentState;
|
|
763
|
+
}
|
|
764
|
+
/**
|
|
765
|
+
* High-precision method to get visemes that should be active
|
|
766
|
+
* at the current playback frame. Use this in a requestAnimationFrame loop.
|
|
767
|
+
*/
|
|
768
|
+
getFrameVisemes() {
|
|
769
|
+
const offset = this.audioManager.getAudioClockOffset();
|
|
770
|
+
const audioCtx = this.audioManager.getAudioContext();
|
|
771
|
+
if (offset === null || !audioCtx) return [];
|
|
772
|
+
const streamTime = (audioCtx.currentTime - offset) * 1e3;
|
|
773
|
+
const currentBatch = [];
|
|
774
|
+
while (this.visemeQueue.length > 0 && this.visemeQueue[0].t * 1e3 <= streamTime) {
|
|
775
|
+
currentBatch.push(this.visemeQueue.shift());
|
|
776
|
+
}
|
|
777
|
+
return currentBatch;
|
|
778
|
+
}
|
|
779
|
+
/**
|
|
780
|
+
* Change the system prompt mid-conversation.
|
|
781
|
+
*/
|
|
782
|
+
updatePrompt(newPrompt) {
|
|
783
|
+
this.client.updatePrompt(newPrompt);
|
|
784
|
+
}
|
|
785
|
+
/**
|
|
786
|
+
* Disconnect and release audio resources.
|
|
787
|
+
*/
|
|
788
|
+
disconnect() {
|
|
789
|
+
this.isConnected = false;
|
|
790
|
+
this.client.disconnect();
|
|
791
|
+
this.audioManager.cleanup();
|
|
792
|
+
this.visemeQueue = [];
|
|
722
793
|
}
|
|
723
794
|
};
|
|
724
795
|
export {
|
|
@@ -728,6 +799,7 @@ export {
|
|
|
728
799
|
Language,
|
|
729
800
|
StreamResampler,
|
|
730
801
|
TTSClient,
|
|
802
|
+
VoiceAgent,
|
|
731
803
|
VoiceAgentClient,
|
|
732
804
|
VoiceStyle,
|
|
733
805
|
applyLowPassFilter,
|