@lokutor/sdk 1.1.11 → 1.1.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-UI24THO7.mjs +44 -0
- package/dist/index.d.mts +60 -3
- package/dist/index.d.ts +60 -3
- package/dist/index.js +293 -53
- package/dist/index.mjs +137 -53
- package/dist/node-audio-5HOWE6MC.mjs +94 -0
- package/package.json +1 -1
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
// src/types.ts
|
|
2
|
+
var VoiceStyle = /* @__PURE__ */ ((VoiceStyle2) => {
|
|
3
|
+
VoiceStyle2["F1"] = "F1";
|
|
4
|
+
VoiceStyle2["F2"] = "F2";
|
|
5
|
+
VoiceStyle2["F3"] = "F3";
|
|
6
|
+
VoiceStyle2["F4"] = "F4";
|
|
7
|
+
VoiceStyle2["F5"] = "F5";
|
|
8
|
+
VoiceStyle2["M1"] = "M1";
|
|
9
|
+
VoiceStyle2["M2"] = "M2";
|
|
10
|
+
VoiceStyle2["M3"] = "M3";
|
|
11
|
+
VoiceStyle2["M4"] = "M4";
|
|
12
|
+
VoiceStyle2["M5"] = "M5";
|
|
13
|
+
return VoiceStyle2;
|
|
14
|
+
})(VoiceStyle || {});
|
|
15
|
+
var Language = /* @__PURE__ */ ((Language2) => {
|
|
16
|
+
Language2["ENGLISH"] = "en";
|
|
17
|
+
Language2["SPANISH"] = "es";
|
|
18
|
+
Language2["FRENCH"] = "fr";
|
|
19
|
+
Language2["PORTUGUESE"] = "pt";
|
|
20
|
+
Language2["KOREAN"] = "ko";
|
|
21
|
+
return Language2;
|
|
22
|
+
})(Language || {});
|
|
23
|
+
var AUDIO_CONFIG = {
|
|
24
|
+
SAMPLE_RATE: 16e3,
|
|
25
|
+
SAMPLE_RATE_INPUT: 16e3,
|
|
26
|
+
SPEAKER_SAMPLE_RATE: 44100,
|
|
27
|
+
SAMPLE_RATE_OUTPUT: 44100,
|
|
28
|
+
CHANNELS: 1,
|
|
29
|
+
CHUNK_DURATION_MS: 20,
|
|
30
|
+
get CHUNK_SIZE() {
|
|
31
|
+
return Math.floor(this.SAMPLE_RATE * this.CHUNK_DURATION_MS / 1e3);
|
|
32
|
+
}
|
|
33
|
+
};
|
|
34
|
+
var DEFAULT_URLS = {
|
|
35
|
+
VOICE_AGENT: "wss://api.lokutor.com/ws/agent",
|
|
36
|
+
TTS: "wss://api.lokutor.com/ws/tts"
|
|
37
|
+
};
|
|
38
|
+
|
|
39
|
+
export {
|
|
40
|
+
VoiceStyle,
|
|
41
|
+
Language,
|
|
42
|
+
AUDIO_CONFIG,
|
|
43
|
+
DEFAULT_URLS
|
|
44
|
+
};
|
package/dist/index.d.mts
CHANGED
|
@@ -28,7 +28,9 @@ declare enum Language {
|
|
|
28
28
|
*/
|
|
29
29
|
declare const AUDIO_CONFIG: {
|
|
30
30
|
SAMPLE_RATE: number;
|
|
31
|
+
SAMPLE_RATE_INPUT: number;
|
|
31
32
|
SPEAKER_SAMPLE_RATE: number;
|
|
33
|
+
SAMPLE_RATE_OUTPUT: number;
|
|
32
34
|
CHANNELS: number;
|
|
33
35
|
CHUNK_DURATION_MS: number;
|
|
34
36
|
readonly CHUNK_SIZE: number;
|
|
@@ -97,7 +99,43 @@ interface Viseme {
|
|
|
97
99
|
c: string;
|
|
98
100
|
t: number;
|
|
99
101
|
}
|
|
102
|
+
/**
|
|
103
|
+
* Tool definition for LLM function calling (OpenAI format)
|
|
104
|
+
*/
|
|
105
|
+
interface ToolDefinition {
|
|
106
|
+
type: 'function';
|
|
107
|
+
function: {
|
|
108
|
+
name: string;
|
|
109
|
+
description: string;
|
|
110
|
+
parameters: {
|
|
111
|
+
type: 'object';
|
|
112
|
+
properties: Record<string, any>;
|
|
113
|
+
required?: string[];
|
|
114
|
+
};
|
|
115
|
+
};
|
|
116
|
+
}
|
|
117
|
+
/**
|
|
118
|
+
* Event data for tool execution
|
|
119
|
+
*/
|
|
120
|
+
interface ToolCall {
|
|
121
|
+
name: string;
|
|
122
|
+
arguments: string;
|
|
123
|
+
}
|
|
100
124
|
|
|
125
|
+
/**
|
|
126
|
+
* Interface for audio hardware management (Browser/Node parity)
|
|
127
|
+
*/
|
|
128
|
+
interface AudioManager {
|
|
129
|
+
init(): Promise<void>;
|
|
130
|
+
startMicrophone(onAudioInput: (pcm16Data: Uint8Array) => void): Promise<void>;
|
|
131
|
+
stopMicrophone(): void;
|
|
132
|
+
playAudio(pcm16Data: Uint8Array): void;
|
|
133
|
+
stopPlayback(): void;
|
|
134
|
+
cleanup(): void;
|
|
135
|
+
isMicMuted(): boolean;
|
|
136
|
+
setMuted(muted: boolean): void;
|
|
137
|
+
getAmplitude(): number;
|
|
138
|
+
}
|
|
101
139
|
/**
|
|
102
140
|
* Main client for Lokutor Voice Agent SDK
|
|
103
141
|
*
|
|
@@ -109,6 +147,7 @@ declare class VoiceAgentClient {
|
|
|
109
147
|
prompt: string;
|
|
110
148
|
voice: VoiceStyle;
|
|
111
149
|
language: Language;
|
|
150
|
+
tools: ToolDefinition[];
|
|
112
151
|
private onTranscription?;
|
|
113
152
|
private onResponse?;
|
|
114
153
|
private onAudioCallback?;
|
|
@@ -121,6 +160,8 @@ declare class VoiceAgentClient {
|
|
|
121
160
|
private wantVisemes;
|
|
122
161
|
private audioManager;
|
|
123
162
|
private enableAudio;
|
|
163
|
+
private currentGeneration;
|
|
164
|
+
private listeners;
|
|
124
165
|
private isUserDisconnect;
|
|
125
166
|
private reconnecting;
|
|
126
167
|
private reconnectAttempts;
|
|
@@ -132,11 +173,20 @@ declare class VoiceAgentClient {
|
|
|
132
173
|
visemes?: boolean;
|
|
133
174
|
onVisemes?: (visemes: Viseme[]) => void;
|
|
134
175
|
enableAudio?: boolean;
|
|
176
|
+
tools?: ToolDefinition[];
|
|
135
177
|
});
|
|
136
178
|
/**
|
|
137
179
|
* Connect to the Lokutor Voice Agent server
|
|
180
|
+
* @param customAudioManager Optional replacement for the default audio hardware handler
|
|
138
181
|
*/
|
|
139
|
-
connect(): Promise<boolean>;
|
|
182
|
+
connect(customAudioManager?: AudioManager): Promise<boolean>;
|
|
183
|
+
/**
|
|
184
|
+
* The "Golden Path" - Starts a managed session with hardware handled automatically.
|
|
185
|
+
* This is the recommended way to start a conversation in both Browser and Node.js.
|
|
186
|
+
*/
|
|
187
|
+
startManaged(config?: {
|
|
188
|
+
audioManager?: AudioManager;
|
|
189
|
+
}): Promise<this>;
|
|
140
190
|
/**
|
|
141
191
|
* Send initial configuration to the server
|
|
142
192
|
*/
|
|
@@ -154,7 +204,13 @@ declare class VoiceAgentClient {
|
|
|
154
204
|
* Handle incoming text messages (metadata/transcriptions)
|
|
155
205
|
*/
|
|
156
206
|
private handleTextMessage;
|
|
157
|
-
|
|
207
|
+
/**
|
|
208
|
+
* Register an event listener (for Python parity)
|
|
209
|
+
*/
|
|
210
|
+
on(event: string, callback: Function): this;
|
|
211
|
+
/**
|
|
212
|
+
* Internal emitter for all events
|
|
213
|
+
*/
|
|
158
214
|
private emit;
|
|
159
215
|
onAudio(callback: (data: Uint8Array) => void): void;
|
|
160
216
|
onVisemes(callback: (visemes: Viseme[]) => void): void;
|
|
@@ -211,6 +267,7 @@ declare class TTSClient {
|
|
|
211
267
|
visemes?: boolean;
|
|
212
268
|
onAudio?: (data: Uint8Array) => void;
|
|
213
269
|
onVisemes?: (visemes: any[]) => void;
|
|
270
|
+
onTTFB?: (ms: number) => void;
|
|
214
271
|
onError?: (error: any) => void;
|
|
215
272
|
}): Promise<void>;
|
|
216
273
|
}
|
|
@@ -418,4 +475,4 @@ declare class BrowserAudioManager {
|
|
|
418
475
|
isRecording(): boolean;
|
|
419
476
|
}
|
|
420
477
|
|
|
421
|
-
export { AUDIO_CONFIG, type AnalyserConfig, type BrowserAudioConfig, BrowserAudioManager, type BrowserAudioOptions, DEFAULT_URLS, Language, type LokutorConfig, StreamResampler, type SynthesizeOptions, TTSClient, type Viseme, VoiceAgentClient, type VoiceAgentOptions, VoiceStyle, applyLowPassFilter, bytesToPcm16, calculateRMS, float32ToPcm16, normalizeAudio, pcm16ToBytes, pcm16ToFloat32, resample, resampleWithAntiAliasing, simpleConversation, simpleTTS };
|
|
478
|
+
export { AUDIO_CONFIG, type AnalyserConfig, type AudioManager, type BrowserAudioConfig, BrowserAudioManager, type BrowserAudioOptions, DEFAULT_URLS, Language, type LokutorConfig, StreamResampler, type SynthesizeOptions, TTSClient, type ToolCall, type ToolDefinition, type Viseme, VoiceAgentClient, type VoiceAgentOptions, VoiceStyle, applyLowPassFilter, bytesToPcm16, calculateRMS, float32ToPcm16, normalizeAudio, pcm16ToBytes, pcm16ToFloat32, resample, resampleWithAntiAliasing, simpleConversation, simpleTTS };
|
package/dist/index.d.ts
CHANGED
|
@@ -28,7 +28,9 @@ declare enum Language {
|
|
|
28
28
|
*/
|
|
29
29
|
declare const AUDIO_CONFIG: {
|
|
30
30
|
SAMPLE_RATE: number;
|
|
31
|
+
SAMPLE_RATE_INPUT: number;
|
|
31
32
|
SPEAKER_SAMPLE_RATE: number;
|
|
33
|
+
SAMPLE_RATE_OUTPUT: number;
|
|
32
34
|
CHANNELS: number;
|
|
33
35
|
CHUNK_DURATION_MS: number;
|
|
34
36
|
readonly CHUNK_SIZE: number;
|
|
@@ -97,7 +99,43 @@ interface Viseme {
|
|
|
97
99
|
c: string;
|
|
98
100
|
t: number;
|
|
99
101
|
}
|
|
102
|
+
/**
|
|
103
|
+
* Tool definition for LLM function calling (OpenAI format)
|
|
104
|
+
*/
|
|
105
|
+
interface ToolDefinition {
|
|
106
|
+
type: 'function';
|
|
107
|
+
function: {
|
|
108
|
+
name: string;
|
|
109
|
+
description: string;
|
|
110
|
+
parameters: {
|
|
111
|
+
type: 'object';
|
|
112
|
+
properties: Record<string, any>;
|
|
113
|
+
required?: string[];
|
|
114
|
+
};
|
|
115
|
+
};
|
|
116
|
+
}
|
|
117
|
+
/**
|
|
118
|
+
* Event data for tool execution
|
|
119
|
+
*/
|
|
120
|
+
interface ToolCall {
|
|
121
|
+
name: string;
|
|
122
|
+
arguments: string;
|
|
123
|
+
}
|
|
100
124
|
|
|
125
|
+
/**
|
|
126
|
+
* Interface for audio hardware management (Browser/Node parity)
|
|
127
|
+
*/
|
|
128
|
+
interface AudioManager {
|
|
129
|
+
init(): Promise<void>;
|
|
130
|
+
startMicrophone(onAudioInput: (pcm16Data: Uint8Array) => void): Promise<void>;
|
|
131
|
+
stopMicrophone(): void;
|
|
132
|
+
playAudio(pcm16Data: Uint8Array): void;
|
|
133
|
+
stopPlayback(): void;
|
|
134
|
+
cleanup(): void;
|
|
135
|
+
isMicMuted(): boolean;
|
|
136
|
+
setMuted(muted: boolean): void;
|
|
137
|
+
getAmplitude(): number;
|
|
138
|
+
}
|
|
101
139
|
/**
|
|
102
140
|
* Main client for Lokutor Voice Agent SDK
|
|
103
141
|
*
|
|
@@ -109,6 +147,7 @@ declare class VoiceAgentClient {
|
|
|
109
147
|
prompt: string;
|
|
110
148
|
voice: VoiceStyle;
|
|
111
149
|
language: Language;
|
|
150
|
+
tools: ToolDefinition[];
|
|
112
151
|
private onTranscription?;
|
|
113
152
|
private onResponse?;
|
|
114
153
|
private onAudioCallback?;
|
|
@@ -121,6 +160,8 @@ declare class VoiceAgentClient {
|
|
|
121
160
|
private wantVisemes;
|
|
122
161
|
private audioManager;
|
|
123
162
|
private enableAudio;
|
|
163
|
+
private currentGeneration;
|
|
164
|
+
private listeners;
|
|
124
165
|
private isUserDisconnect;
|
|
125
166
|
private reconnecting;
|
|
126
167
|
private reconnectAttempts;
|
|
@@ -132,11 +173,20 @@ declare class VoiceAgentClient {
|
|
|
132
173
|
visemes?: boolean;
|
|
133
174
|
onVisemes?: (visemes: Viseme[]) => void;
|
|
134
175
|
enableAudio?: boolean;
|
|
176
|
+
tools?: ToolDefinition[];
|
|
135
177
|
});
|
|
136
178
|
/**
|
|
137
179
|
* Connect to the Lokutor Voice Agent server
|
|
180
|
+
* @param customAudioManager Optional replacement for the default audio hardware handler
|
|
138
181
|
*/
|
|
139
|
-
connect(): Promise<boolean>;
|
|
182
|
+
connect(customAudioManager?: AudioManager): Promise<boolean>;
|
|
183
|
+
/**
|
|
184
|
+
* The "Golden Path" - Starts a managed session with hardware handled automatically.
|
|
185
|
+
* This is the recommended way to start a conversation in both Browser and Node.js.
|
|
186
|
+
*/
|
|
187
|
+
startManaged(config?: {
|
|
188
|
+
audioManager?: AudioManager;
|
|
189
|
+
}): Promise<this>;
|
|
140
190
|
/**
|
|
141
191
|
* Send initial configuration to the server
|
|
142
192
|
*/
|
|
@@ -154,7 +204,13 @@ declare class VoiceAgentClient {
|
|
|
154
204
|
* Handle incoming text messages (metadata/transcriptions)
|
|
155
205
|
*/
|
|
156
206
|
private handleTextMessage;
|
|
157
|
-
|
|
207
|
+
/**
|
|
208
|
+
* Register an event listener (for Python parity)
|
|
209
|
+
*/
|
|
210
|
+
on(event: string, callback: Function): this;
|
|
211
|
+
/**
|
|
212
|
+
* Internal emitter for all events
|
|
213
|
+
*/
|
|
158
214
|
private emit;
|
|
159
215
|
onAudio(callback: (data: Uint8Array) => void): void;
|
|
160
216
|
onVisemes(callback: (visemes: Viseme[]) => void): void;
|
|
@@ -211,6 +267,7 @@ declare class TTSClient {
|
|
|
211
267
|
visemes?: boolean;
|
|
212
268
|
onAudio?: (data: Uint8Array) => void;
|
|
213
269
|
onVisemes?: (visemes: any[]) => void;
|
|
270
|
+
onTTFB?: (ms: number) => void;
|
|
214
271
|
onError?: (error: any) => void;
|
|
215
272
|
}): Promise<void>;
|
|
216
273
|
}
|
|
@@ -418,4 +475,4 @@ declare class BrowserAudioManager {
|
|
|
418
475
|
isRecording(): boolean;
|
|
419
476
|
}
|
|
420
477
|
|
|
421
|
-
export { AUDIO_CONFIG, type AnalyserConfig, type BrowserAudioConfig, BrowserAudioManager, type BrowserAudioOptions, DEFAULT_URLS, Language, type LokutorConfig, StreamResampler, type SynthesizeOptions, TTSClient, type Viseme, VoiceAgentClient, type VoiceAgentOptions, VoiceStyle, applyLowPassFilter, bytesToPcm16, calculateRMS, float32ToPcm16, normalizeAudio, pcm16ToBytes, pcm16ToFloat32, resample, resampleWithAntiAliasing, simpleConversation, simpleTTS };
|
|
478
|
+
export { AUDIO_CONFIG, type AnalyserConfig, type AudioManager, type BrowserAudioConfig, BrowserAudioManager, type BrowserAudioOptions, DEFAULT_URLS, Language, type LokutorConfig, StreamResampler, type SynthesizeOptions, TTSClient, type ToolCall, type ToolDefinition, type Viseme, VoiceAgentClient, type VoiceAgentOptions, VoiceStyle, applyLowPassFilter, bytesToPcm16, calculateRMS, float32ToPcm16, normalizeAudio, pcm16ToBytes, pcm16ToFloat32, resample, resampleWithAntiAliasing, simpleConversation, simpleTTS };
|
package/dist/index.js
CHANGED
|
@@ -1,8 +1,13 @@
|
|
|
1
1
|
"use strict";
|
|
2
|
+
var __create = Object.create;
|
|
2
3
|
var __defProp = Object.defineProperty;
|
|
3
4
|
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
4
5
|
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
6
|
+
var __getProtoOf = Object.getPrototypeOf;
|
|
5
7
|
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
8
|
+
var __esm = (fn, res) => function __init() {
|
|
9
|
+
return fn && (res = (0, fn[__getOwnPropNames(fn)[0]])(fn = 0)), res;
|
|
10
|
+
};
|
|
6
11
|
var __export = (target, all) => {
|
|
7
12
|
for (var name in all)
|
|
8
13
|
__defProp(target, name, { get: all[name], enumerable: true });
|
|
@@ -15,8 +20,159 @@ var __copyProps = (to, from, except, desc) => {
|
|
|
15
20
|
}
|
|
16
21
|
return to;
|
|
17
22
|
};
|
|
23
|
+
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
|
|
24
|
+
// If the importer is in node compatibility mode or this is not an ESM
|
|
25
|
+
// file that has been converted to a CommonJS file using a Babel-
|
|
26
|
+
// compatible transform (i.e. "__esModule" has not been set), then set
|
|
27
|
+
// "default" to the CommonJS "module.exports" for node compatibility.
|
|
28
|
+
isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
|
|
29
|
+
mod
|
|
30
|
+
));
|
|
18
31
|
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
19
32
|
|
|
33
|
+
// src/types.ts
|
|
34
|
+
var VoiceStyle, Language, AUDIO_CONFIG, DEFAULT_URLS;
|
|
35
|
+
var init_types = __esm({
|
|
36
|
+
"src/types.ts"() {
|
|
37
|
+
"use strict";
|
|
38
|
+
VoiceStyle = /* @__PURE__ */ ((VoiceStyle2) => {
|
|
39
|
+
VoiceStyle2["F1"] = "F1";
|
|
40
|
+
VoiceStyle2["F2"] = "F2";
|
|
41
|
+
VoiceStyle2["F3"] = "F3";
|
|
42
|
+
VoiceStyle2["F4"] = "F4";
|
|
43
|
+
VoiceStyle2["F5"] = "F5";
|
|
44
|
+
VoiceStyle2["M1"] = "M1";
|
|
45
|
+
VoiceStyle2["M2"] = "M2";
|
|
46
|
+
VoiceStyle2["M3"] = "M3";
|
|
47
|
+
VoiceStyle2["M4"] = "M4";
|
|
48
|
+
VoiceStyle2["M5"] = "M5";
|
|
49
|
+
return VoiceStyle2;
|
|
50
|
+
})(VoiceStyle || {});
|
|
51
|
+
Language = /* @__PURE__ */ ((Language2) => {
|
|
52
|
+
Language2["ENGLISH"] = "en";
|
|
53
|
+
Language2["SPANISH"] = "es";
|
|
54
|
+
Language2["FRENCH"] = "fr";
|
|
55
|
+
Language2["PORTUGUESE"] = "pt";
|
|
56
|
+
Language2["KOREAN"] = "ko";
|
|
57
|
+
return Language2;
|
|
58
|
+
})(Language || {});
|
|
59
|
+
AUDIO_CONFIG = {
|
|
60
|
+
SAMPLE_RATE: 16e3,
|
|
61
|
+
SAMPLE_RATE_INPUT: 16e3,
|
|
62
|
+
SPEAKER_SAMPLE_RATE: 44100,
|
|
63
|
+
SAMPLE_RATE_OUTPUT: 44100,
|
|
64
|
+
CHANNELS: 1,
|
|
65
|
+
CHUNK_DURATION_MS: 20,
|
|
66
|
+
get CHUNK_SIZE() {
|
|
67
|
+
return Math.floor(this.SAMPLE_RATE * this.CHUNK_DURATION_MS / 1e3);
|
|
68
|
+
}
|
|
69
|
+
};
|
|
70
|
+
DEFAULT_URLS = {
|
|
71
|
+
VOICE_AGENT: "wss://api.lokutor.com/ws/agent",
|
|
72
|
+
TTS: "wss://api.lokutor.com/ws/tts"
|
|
73
|
+
};
|
|
74
|
+
}
|
|
75
|
+
});
|
|
76
|
+
|
|
77
|
+
// src/node-audio.ts
|
|
78
|
+
var node_audio_exports = {};
|
|
79
|
+
__export(node_audio_exports, {
|
|
80
|
+
NodeAudioManager: () => NodeAudioManager
|
|
81
|
+
});
|
|
82
|
+
var NodeAudioManager;
|
|
83
|
+
var init_node_audio = __esm({
|
|
84
|
+
"src/node-audio.ts"() {
|
|
85
|
+
"use strict";
|
|
86
|
+
init_types();
|
|
87
|
+
NodeAudioManager = class {
|
|
88
|
+
speaker = null;
|
|
89
|
+
recorder = null;
|
|
90
|
+
recordingStream = null;
|
|
91
|
+
isMuted = false;
|
|
92
|
+
isListening = false;
|
|
93
|
+
constructor() {
|
|
94
|
+
}
|
|
95
|
+
async init() {
|
|
96
|
+
try {
|
|
97
|
+
const Speaker = await import("speaker").catch(() => null);
|
|
98
|
+
if (!Speaker) {
|
|
99
|
+
console.warn('\u26A0\uFE0F Package "speaker" is missing. Hardware output will be disabled.');
|
|
100
|
+
console.warn("\u{1F449} Run: npm install speaker");
|
|
101
|
+
}
|
|
102
|
+
} catch (e) {
|
|
103
|
+
console.error("Error initializing Node audio:", e);
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
async startMicrophone(onAudioInput) {
|
|
107
|
+
if (this.isListening) return;
|
|
108
|
+
try {
|
|
109
|
+
const recorder = await import("node-record-lpcm16").catch(() => null);
|
|
110
|
+
if (!recorder) {
|
|
111
|
+
throw new Error('Package "node-record-lpcm16" is missing. Microphone input failed.\n\u{1F449} Run: npm install node-record-lpcm16');
|
|
112
|
+
}
|
|
113
|
+
console.log("\u{1F3A4} Starting microphone (Node.js)...");
|
|
114
|
+
this.recordingStream = recorder.record({
|
|
115
|
+
sampleRate: AUDIO_CONFIG.SAMPLE_RATE,
|
|
116
|
+
threshold: 0,
|
|
117
|
+
verbose: false,
|
|
118
|
+
recordProgram: "sox"
|
|
119
|
+
// default
|
|
120
|
+
});
|
|
121
|
+
this.recordingStream.stream().on("data", (chunk) => {
|
|
122
|
+
if (!this.isMuted && onAudioInput) {
|
|
123
|
+
onAudioInput(new Uint8Array(chunk));
|
|
124
|
+
}
|
|
125
|
+
});
|
|
126
|
+
this.isListening = true;
|
|
127
|
+
} catch (e) {
|
|
128
|
+
console.error("Failed to start microphone:", e.message);
|
|
129
|
+
throw e;
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
stopMicrophone() {
|
|
133
|
+
if (this.recordingStream) {
|
|
134
|
+
this.recordingStream.stop();
|
|
135
|
+
this.recordingStream = null;
|
|
136
|
+
}
|
|
137
|
+
this.isListening = false;
|
|
138
|
+
}
|
|
139
|
+
async playAudio(pcm16Data) {
|
|
140
|
+
try {
|
|
141
|
+
if (!this.speaker) {
|
|
142
|
+
const Speaker = (await import("speaker")).default;
|
|
143
|
+
this.speaker = new Speaker({
|
|
144
|
+
channels: AUDIO_CONFIG.CHANNELS,
|
|
145
|
+
bitDepth: 16,
|
|
146
|
+
sampleRate: AUDIO_CONFIG.SPEAKER_SAMPLE_RATE
|
|
147
|
+
});
|
|
148
|
+
}
|
|
149
|
+
this.speaker.write(Buffer.from(pcm16Data));
|
|
150
|
+
} catch (e) {
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
stopPlayback() {
|
|
154
|
+
if (this.speaker) {
|
|
155
|
+
this.speaker.end();
|
|
156
|
+
this.speaker = null;
|
|
157
|
+
}
|
|
158
|
+
}
|
|
159
|
+
cleanup() {
|
|
160
|
+
this.stopMicrophone();
|
|
161
|
+
this.stopPlayback();
|
|
162
|
+
}
|
|
163
|
+
isMicMuted() {
|
|
164
|
+
return this.isMuted;
|
|
165
|
+
}
|
|
166
|
+
setMuted(muted) {
|
|
167
|
+
this.isMuted = muted;
|
|
168
|
+
}
|
|
169
|
+
getAmplitude() {
|
|
170
|
+
return 0;
|
|
171
|
+
}
|
|
172
|
+
};
|
|
173
|
+
}
|
|
174
|
+
});
|
|
175
|
+
|
|
20
176
|
// src/index.ts
|
|
21
177
|
var index_exports = {};
|
|
22
178
|
__export(index_exports, {
|
|
@@ -41,42 +197,13 @@ __export(index_exports, {
|
|
|
41
197
|
simpleTTS: () => simpleTTS
|
|
42
198
|
});
|
|
43
199
|
module.exports = __toCommonJS(index_exports);
|
|
200
|
+
init_types();
|
|
44
201
|
|
|
45
|
-
// src/
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
VoiceStyle2["F4"] = "F4";
|
|
51
|
-
VoiceStyle2["F5"] = "F5";
|
|
52
|
-
VoiceStyle2["M1"] = "M1";
|
|
53
|
-
VoiceStyle2["M2"] = "M2";
|
|
54
|
-
VoiceStyle2["M3"] = "M3";
|
|
55
|
-
VoiceStyle2["M4"] = "M4";
|
|
56
|
-
VoiceStyle2["M5"] = "M5";
|
|
57
|
-
return VoiceStyle2;
|
|
58
|
-
})(VoiceStyle || {});
|
|
59
|
-
var Language = /* @__PURE__ */ ((Language2) => {
|
|
60
|
-
Language2["ENGLISH"] = "en";
|
|
61
|
-
Language2["SPANISH"] = "es";
|
|
62
|
-
Language2["FRENCH"] = "fr";
|
|
63
|
-
Language2["PORTUGUESE"] = "pt";
|
|
64
|
-
Language2["KOREAN"] = "ko";
|
|
65
|
-
return Language2;
|
|
66
|
-
})(Language || {});
|
|
67
|
-
var AUDIO_CONFIG = {
|
|
68
|
-
SAMPLE_RATE: 16e3,
|
|
69
|
-
SPEAKER_SAMPLE_RATE: 44100,
|
|
70
|
-
CHANNELS: 1,
|
|
71
|
-
CHUNK_DURATION_MS: 20,
|
|
72
|
-
get CHUNK_SIZE() {
|
|
73
|
-
return Math.floor(this.SAMPLE_RATE * this.CHUNK_DURATION_MS / 1e3);
|
|
74
|
-
}
|
|
75
|
-
};
|
|
76
|
-
var DEFAULT_URLS = {
|
|
77
|
-
VOICE_AGENT: "wss://api.lokutor.com/ws/agent",
|
|
78
|
-
TTS: "wss://api.lokutor.com/ws/tts"
|
|
79
|
-
};
|
|
202
|
+
// src/client.ts
|
|
203
|
+
init_types();
|
|
204
|
+
|
|
205
|
+
// src/browser-audio.ts
|
|
206
|
+
init_types();
|
|
80
207
|
|
|
81
208
|
// src/audio-utils.ts
|
|
82
209
|
function pcm16ToFloat32(int16Data) {
|
|
@@ -506,6 +633,7 @@ var VoiceAgentClient = class {
|
|
|
506
633
|
prompt;
|
|
507
634
|
voice;
|
|
508
635
|
language;
|
|
636
|
+
tools = [];
|
|
509
637
|
// Callbacks
|
|
510
638
|
onTranscription;
|
|
511
639
|
onResponse;
|
|
@@ -519,6 +647,8 @@ var VoiceAgentClient = class {
|
|
|
519
647
|
wantVisemes = false;
|
|
520
648
|
audioManager = null;
|
|
521
649
|
enableAudio = false;
|
|
650
|
+
currentGeneration = 0;
|
|
651
|
+
listeners = {};
|
|
522
652
|
// Connection resilience
|
|
523
653
|
isUserDisconnect = false;
|
|
524
654
|
reconnecting = false;
|
|
@@ -537,17 +667,23 @@ var VoiceAgentClient = class {
|
|
|
537
667
|
this.onError = config.onError;
|
|
538
668
|
this.wantVisemes = config.visemes || false;
|
|
539
669
|
this.enableAudio = config.enableAudio ?? false;
|
|
670
|
+
this.tools = config.tools || [];
|
|
540
671
|
}
|
|
541
672
|
/**
|
|
542
673
|
* Connect to the Lokutor Voice Agent server
|
|
674
|
+
* @param customAudioManager Optional replacement for the default audio hardware handler
|
|
543
675
|
*/
|
|
544
|
-
async connect() {
|
|
676
|
+
async connect(customAudioManager) {
|
|
545
677
|
this.isUserDisconnect = false;
|
|
546
|
-
if (this.enableAudio) {
|
|
547
|
-
if (
|
|
678
|
+
if (this.enableAudio || customAudioManager) {
|
|
679
|
+
if (customAudioManager) {
|
|
680
|
+
this.audioManager = customAudioManager;
|
|
681
|
+
} else if (!this.audioManager && typeof window !== "undefined") {
|
|
548
682
|
this.audioManager = new BrowserAudioManager();
|
|
549
683
|
}
|
|
550
|
-
|
|
684
|
+
if (this.audioManager) {
|
|
685
|
+
await this.audioManager.init();
|
|
686
|
+
}
|
|
551
687
|
}
|
|
552
688
|
return new Promise((resolve, reject) => {
|
|
553
689
|
try {
|
|
@@ -608,6 +744,34 @@ var VoiceAgentClient = class {
|
|
|
608
744
|
}
|
|
609
745
|
});
|
|
610
746
|
}
|
|
747
|
+
/**
|
|
748
|
+
* The "Golden Path" - Starts a managed session with hardware handled automatically.
|
|
749
|
+
* This is the recommended way to start a conversation in both Browser and Node.js.
|
|
750
|
+
*/
|
|
751
|
+
async startManaged(config) {
|
|
752
|
+
this.enableAudio = true;
|
|
753
|
+
if (config?.audioManager) {
|
|
754
|
+
this.audioManager = config.audioManager;
|
|
755
|
+
} else if (!this.audioManager) {
|
|
756
|
+
if (typeof window !== "undefined") {
|
|
757
|
+
this.audioManager = new BrowserAudioManager();
|
|
758
|
+
} else {
|
|
759
|
+
try {
|
|
760
|
+
const { NodeAudioManager: NodeAudioManager2 } = await Promise.resolve().then(() => (init_node_audio(), node_audio_exports));
|
|
761
|
+
this.audioManager = new NodeAudioManager2();
|
|
762
|
+
} catch (e) {
|
|
763
|
+
console.error('\u274C Failed to load NodeAudioManager. Please ensure "speaker" and "node-record-lpcm16" are installed.');
|
|
764
|
+
}
|
|
765
|
+
}
|
|
766
|
+
}
|
|
767
|
+
await this.connect();
|
|
768
|
+
if (this.audioManager && this.isConnected) {
|
|
769
|
+
await this.audioManager.startMicrophone((data) => {
|
|
770
|
+
this.sendAudio(data);
|
|
771
|
+
});
|
|
772
|
+
}
|
|
773
|
+
return this;
|
|
774
|
+
}
|
|
611
775
|
/**
|
|
612
776
|
* Send initial configuration to the server
|
|
613
777
|
*/
|
|
@@ -617,7 +781,10 @@ var VoiceAgentClient = class {
|
|
|
617
781
|
this.ws.send(JSON.stringify({ type: "voice", data: this.voice }));
|
|
618
782
|
this.ws.send(JSON.stringify({ type: "language", data: this.language }));
|
|
619
783
|
this.ws.send(JSON.stringify({ type: "visemes", data: this.wantVisemes }));
|
|
620
|
-
|
|
784
|
+
if (this.tools && this.tools.length > 0) {
|
|
785
|
+
this.ws.send(JSON.stringify({ type: "tools", data: this.tools }));
|
|
786
|
+
}
|
|
787
|
+
console.log(`\u2699\uFE0F Configured: voice=${this.voice}, language=${this.language}, visemes=${this.wantVisemes}, tools=${this.tools.length}`);
|
|
621
788
|
}
|
|
622
789
|
/**
|
|
623
790
|
* Send raw PCM audio data to the server
|
|
@@ -631,7 +798,11 @@ var VoiceAgentClient = class {
|
|
|
631
798
|
/**
|
|
632
799
|
* Handle incoming binary data (audio response)
|
|
633
800
|
*/
|
|
634
|
-
handleBinaryMessage(data) {
|
|
801
|
+
handleBinaryMessage(data, generation) {
|
|
802
|
+
if (generation !== void 0 && generation < this.currentGeneration) {
|
|
803
|
+
console.log(`\u{1F5D1}\uFE0F Discarding ghost audio (Gen ${generation} < ${this.currentGeneration})`);
|
|
804
|
+
return;
|
|
805
|
+
}
|
|
635
806
|
if (this.audioManager) {
|
|
636
807
|
this.audioManager.playAudio(data);
|
|
637
808
|
}
|
|
@@ -647,7 +818,7 @@ var VoiceAgentClient = class {
|
|
|
647
818
|
case "audio":
|
|
648
819
|
if (msg.data) {
|
|
649
820
|
const buffer = base64ToUint8Array(msg.data);
|
|
650
|
-
this.handleBinaryMessage(buffer);
|
|
821
|
+
this.handleBinaryMessage(buffer, msg.generation);
|
|
651
822
|
}
|
|
652
823
|
break;
|
|
653
824
|
case "transcript":
|
|
@@ -666,6 +837,14 @@ var VoiceAgentClient = class {
|
|
|
666
837
|
}
|
|
667
838
|
break;
|
|
668
839
|
case "status":
|
|
840
|
+
if (msg.data === "thinking") {
|
|
841
|
+
const newGen = msg.generation || 0;
|
|
842
|
+
if (newGen > this.currentGeneration) {
|
|
843
|
+
console.log(`\u{1F9E0} New thought (Gen ${newGen}) - Clearing audio queue`);
|
|
844
|
+
this.currentGeneration = newGen;
|
|
845
|
+
if (this.audioManager) this.audioManager.stopPlayback();
|
|
846
|
+
}
|
|
847
|
+
}
|
|
669
848
|
if (msg.data === "interrupted" && this.audioManager) {
|
|
670
849
|
this.audioManager.stopPlayback();
|
|
671
850
|
}
|
|
@@ -687,25 +866,58 @@ var VoiceAgentClient = class {
|
|
|
687
866
|
if (this.onError) this.onError(msg.data);
|
|
688
867
|
console.error(`\u274C Server error: ${msg.data}`);
|
|
689
868
|
break;
|
|
869
|
+
case "tool_call":
|
|
870
|
+
console.log(`\u{1F6E0}\uFE0F Tool Call: ${msg.name}(${msg.arguments})`);
|
|
871
|
+
break;
|
|
690
872
|
}
|
|
691
873
|
} catch (e) {
|
|
692
874
|
}
|
|
693
875
|
}
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
876
|
+
/**
|
|
877
|
+
* Register an event listener (for Python parity)
|
|
878
|
+
*/
|
|
879
|
+
on(event, callback) {
|
|
880
|
+
if (!this.listeners[event]) {
|
|
881
|
+
this.listeners[event] = [];
|
|
882
|
+
}
|
|
883
|
+
this.listeners[event].push(callback);
|
|
884
|
+
return this;
|
|
885
|
+
}
|
|
886
|
+
/**
|
|
887
|
+
* Internal emitter for all events
|
|
888
|
+
*/
|
|
889
|
+
emit(event, ...args) {
|
|
890
|
+
const legacyMap = {
|
|
891
|
+
"transcription": "onTranscription",
|
|
892
|
+
"response": "onResponse",
|
|
893
|
+
"audio": "onAudioCallback",
|
|
894
|
+
"visemes": "onVisemesCallback",
|
|
895
|
+
"status": "onStatus",
|
|
896
|
+
"error": "onError"
|
|
897
|
+
};
|
|
898
|
+
const legacyKey = legacyMap[event];
|
|
899
|
+
if (legacyKey && this[legacyKey]) {
|
|
900
|
+
try {
|
|
901
|
+
this[legacyKey](...args);
|
|
902
|
+
} catch (e) {
|
|
903
|
+
console.error(`Error in legacy callback ${legacyKey}:`, e);
|
|
904
|
+
}
|
|
905
|
+
}
|
|
906
|
+
if (this.listeners[event]) {
|
|
907
|
+
this.listeners[event].forEach((cb) => {
|
|
908
|
+
try {
|
|
909
|
+
cb(...args);
|
|
910
|
+
} catch (e) {
|
|
911
|
+
console.error(`Error in listener for ${event}:`, e);
|
|
912
|
+
}
|
|
913
|
+
});
|
|
702
914
|
}
|
|
703
915
|
}
|
|
704
916
|
onAudio(callback) {
|
|
705
|
-
this.
|
|
917
|
+
this.on("audio", callback);
|
|
706
918
|
}
|
|
707
919
|
onVisemes(callback) {
|
|
708
|
-
this.
|
|
920
|
+
this.on("visemes", callback);
|
|
709
921
|
}
|
|
710
922
|
/**
|
|
711
923
|
* Disconnect from the server
|
|
@@ -784,15 +996,28 @@ var TTSClient = class {
|
|
|
784
996
|
*/
|
|
785
997
|
synthesize(options) {
|
|
786
998
|
return new Promise((resolve, reject) => {
|
|
999
|
+
let activityTimeout;
|
|
1000
|
+
let ws;
|
|
1001
|
+
let startTime;
|
|
1002
|
+
let firstByteReceived = false;
|
|
1003
|
+
const refreshTimeout = () => {
|
|
1004
|
+
if (activityTimeout) clearTimeout(activityTimeout);
|
|
1005
|
+
activityTimeout = setTimeout(() => {
|
|
1006
|
+
console.log("\u23F1\uFE0F TTS synthesis reached inactivity timeout (2s) - resolving");
|
|
1007
|
+
if (ws) ws.close();
|
|
1008
|
+
resolve();
|
|
1009
|
+
}, 2e3);
|
|
1010
|
+
};
|
|
787
1011
|
try {
|
|
788
1012
|
let url = DEFAULT_URLS.TTS;
|
|
789
1013
|
if (this.apiKey) {
|
|
790
1014
|
const separator = url.includes("?") ? "&" : "?";
|
|
791
1015
|
url += `${separator}api_key=${this.apiKey}`;
|
|
792
1016
|
}
|
|
793
|
-
|
|
1017
|
+
ws = new WebSocket(url);
|
|
794
1018
|
ws.binaryType = "arraybuffer";
|
|
795
1019
|
ws.onopen = () => {
|
|
1020
|
+
refreshTimeout();
|
|
796
1021
|
const req = {
|
|
797
1022
|
text: options.text,
|
|
798
1023
|
voice: options.voice || "F1" /* F1 */,
|
|
@@ -802,9 +1027,16 @@ var TTSClient = class {
|
|
|
802
1027
|
visemes: options.visemes || false
|
|
803
1028
|
};
|
|
804
1029
|
ws.send(JSON.stringify(req));
|
|
1030
|
+
startTime = Date.now();
|
|
805
1031
|
};
|
|
806
1032
|
ws.onmessage = async (event) => {
|
|
1033
|
+
refreshTimeout();
|
|
807
1034
|
if (event.data instanceof ArrayBuffer) {
|
|
1035
|
+
if (!firstByteReceived) {
|
|
1036
|
+
const ttfb = Date.now() - startTime;
|
|
1037
|
+
if (options.onTTFB) options.onTTFB(ttfb);
|
|
1038
|
+
firstByteReceived = true;
|
|
1039
|
+
}
|
|
808
1040
|
if (options.onAudio) options.onAudio(new Uint8Array(event.data));
|
|
809
1041
|
} else {
|
|
810
1042
|
try {
|
|
@@ -812,18 +1044,26 @@ var TTSClient = class {
|
|
|
812
1044
|
if (Array.isArray(msg) && options.onVisemes) {
|
|
813
1045
|
options.onVisemes(msg);
|
|
814
1046
|
}
|
|
1047
|
+
if (msg.type === "eos") {
|
|
1048
|
+
if (activityTimeout) clearTimeout(activityTimeout);
|
|
1049
|
+
ws.close();
|
|
1050
|
+
resolve();
|
|
1051
|
+
}
|
|
815
1052
|
} catch (e) {
|
|
816
1053
|
}
|
|
817
1054
|
}
|
|
818
1055
|
};
|
|
819
1056
|
ws.onerror = (err) => {
|
|
1057
|
+
if (activityTimeout) clearTimeout(activityTimeout);
|
|
820
1058
|
if (options.onError) options.onError(err);
|
|
821
1059
|
reject(err);
|
|
822
1060
|
};
|
|
823
1061
|
ws.onclose = () => {
|
|
1062
|
+
if (activityTimeout) clearTimeout(activityTimeout);
|
|
824
1063
|
resolve();
|
|
825
1064
|
};
|
|
826
1065
|
} catch (err) {
|
|
1066
|
+
if (activityTimeout) clearTimeout(activityTimeout);
|
|
827
1067
|
if (options.onError) options.onError(err);
|
|
828
1068
|
reject(err);
|
|
829
1069
|
}
|
package/dist/index.mjs
CHANGED
|
@@ -1,38 +1,9 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
VoiceStyle2["F5"] = "F5";
|
|
8
|
-
VoiceStyle2["M1"] = "M1";
|
|
9
|
-
VoiceStyle2["M2"] = "M2";
|
|
10
|
-
VoiceStyle2["M3"] = "M3";
|
|
11
|
-
VoiceStyle2["M4"] = "M4";
|
|
12
|
-
VoiceStyle2["M5"] = "M5";
|
|
13
|
-
return VoiceStyle2;
|
|
14
|
-
})(VoiceStyle || {});
|
|
15
|
-
var Language = /* @__PURE__ */ ((Language2) => {
|
|
16
|
-
Language2["ENGLISH"] = "en";
|
|
17
|
-
Language2["SPANISH"] = "es";
|
|
18
|
-
Language2["FRENCH"] = "fr";
|
|
19
|
-
Language2["PORTUGUESE"] = "pt";
|
|
20
|
-
Language2["KOREAN"] = "ko";
|
|
21
|
-
return Language2;
|
|
22
|
-
})(Language || {});
|
|
23
|
-
var AUDIO_CONFIG = {
|
|
24
|
-
SAMPLE_RATE: 16e3,
|
|
25
|
-
SPEAKER_SAMPLE_RATE: 44100,
|
|
26
|
-
CHANNELS: 1,
|
|
27
|
-
CHUNK_DURATION_MS: 20,
|
|
28
|
-
get CHUNK_SIZE() {
|
|
29
|
-
return Math.floor(this.SAMPLE_RATE * this.CHUNK_DURATION_MS / 1e3);
|
|
30
|
-
}
|
|
31
|
-
};
|
|
32
|
-
var DEFAULT_URLS = {
|
|
33
|
-
VOICE_AGENT: "wss://api.lokutor.com/ws/agent",
|
|
34
|
-
TTS: "wss://api.lokutor.com/ws/tts"
|
|
35
|
-
};
|
|
1
|
+
import {
|
|
2
|
+
AUDIO_CONFIG,
|
|
3
|
+
DEFAULT_URLS,
|
|
4
|
+
Language,
|
|
5
|
+
VoiceStyle
|
|
6
|
+
} from "./chunk-UI24THO7.mjs";
|
|
36
7
|
|
|
37
8
|
// src/audio-utils.ts
|
|
38
9
|
function pcm16ToFloat32(int16Data) {
|
|
@@ -462,6 +433,7 @@ var VoiceAgentClient = class {
|
|
|
462
433
|
prompt;
|
|
463
434
|
voice;
|
|
464
435
|
language;
|
|
436
|
+
tools = [];
|
|
465
437
|
// Callbacks
|
|
466
438
|
onTranscription;
|
|
467
439
|
onResponse;
|
|
@@ -475,6 +447,8 @@ var VoiceAgentClient = class {
|
|
|
475
447
|
wantVisemes = false;
|
|
476
448
|
audioManager = null;
|
|
477
449
|
enableAudio = false;
|
|
450
|
+
currentGeneration = 0;
|
|
451
|
+
listeners = {};
|
|
478
452
|
// Connection resilience
|
|
479
453
|
isUserDisconnect = false;
|
|
480
454
|
reconnecting = false;
|
|
@@ -493,17 +467,23 @@ var VoiceAgentClient = class {
|
|
|
493
467
|
this.onError = config.onError;
|
|
494
468
|
this.wantVisemes = config.visemes || false;
|
|
495
469
|
this.enableAudio = config.enableAudio ?? false;
|
|
470
|
+
this.tools = config.tools || [];
|
|
496
471
|
}
|
|
497
472
|
/**
|
|
498
473
|
* Connect to the Lokutor Voice Agent server
|
|
474
|
+
* @param customAudioManager Optional replacement for the default audio hardware handler
|
|
499
475
|
*/
|
|
500
|
-
async connect() {
|
|
476
|
+
async connect(customAudioManager) {
|
|
501
477
|
this.isUserDisconnect = false;
|
|
502
|
-
if (this.enableAudio) {
|
|
503
|
-
if (
|
|
478
|
+
if (this.enableAudio || customAudioManager) {
|
|
479
|
+
if (customAudioManager) {
|
|
480
|
+
this.audioManager = customAudioManager;
|
|
481
|
+
} else if (!this.audioManager && typeof window !== "undefined") {
|
|
504
482
|
this.audioManager = new BrowserAudioManager();
|
|
505
483
|
}
|
|
506
|
-
|
|
484
|
+
if (this.audioManager) {
|
|
485
|
+
await this.audioManager.init();
|
|
486
|
+
}
|
|
507
487
|
}
|
|
508
488
|
return new Promise((resolve, reject) => {
|
|
509
489
|
try {
|
|
@@ -564,6 +544,34 @@ var VoiceAgentClient = class {
|
|
|
564
544
|
}
|
|
565
545
|
});
|
|
566
546
|
}
|
|
547
|
+
/**
|
|
548
|
+
* The "Golden Path" - Starts a managed session with hardware handled automatically.
|
|
549
|
+
* This is the recommended way to start a conversation in both Browser and Node.js.
|
|
550
|
+
*/
|
|
551
|
+
async startManaged(config) {
|
|
552
|
+
this.enableAudio = true;
|
|
553
|
+
if (config?.audioManager) {
|
|
554
|
+
this.audioManager = config.audioManager;
|
|
555
|
+
} else if (!this.audioManager) {
|
|
556
|
+
if (typeof window !== "undefined") {
|
|
557
|
+
this.audioManager = new BrowserAudioManager();
|
|
558
|
+
} else {
|
|
559
|
+
try {
|
|
560
|
+
const { NodeAudioManager } = await import("./node-audio-5HOWE6MC.mjs");
|
|
561
|
+
this.audioManager = new NodeAudioManager();
|
|
562
|
+
} catch (e) {
|
|
563
|
+
console.error('\u274C Failed to load NodeAudioManager. Please ensure "speaker" and "node-record-lpcm16" are installed.');
|
|
564
|
+
}
|
|
565
|
+
}
|
|
566
|
+
}
|
|
567
|
+
await this.connect();
|
|
568
|
+
if (this.audioManager && this.isConnected) {
|
|
569
|
+
await this.audioManager.startMicrophone((data) => {
|
|
570
|
+
this.sendAudio(data);
|
|
571
|
+
});
|
|
572
|
+
}
|
|
573
|
+
return this;
|
|
574
|
+
}
|
|
567
575
|
/**
|
|
568
576
|
* Send initial configuration to the server
|
|
569
577
|
*/
|
|
@@ -573,7 +581,10 @@ var VoiceAgentClient = class {
|
|
|
573
581
|
this.ws.send(JSON.stringify({ type: "voice", data: this.voice }));
|
|
574
582
|
this.ws.send(JSON.stringify({ type: "language", data: this.language }));
|
|
575
583
|
this.ws.send(JSON.stringify({ type: "visemes", data: this.wantVisemes }));
|
|
576
|
-
|
|
584
|
+
if (this.tools && this.tools.length > 0) {
|
|
585
|
+
this.ws.send(JSON.stringify({ type: "tools", data: this.tools }));
|
|
586
|
+
}
|
|
587
|
+
console.log(`\u2699\uFE0F Configured: voice=${this.voice}, language=${this.language}, visemes=${this.wantVisemes}, tools=${this.tools.length}`);
|
|
577
588
|
}
|
|
578
589
|
/**
|
|
579
590
|
* Send raw PCM audio data to the server
|
|
@@ -587,7 +598,11 @@ var VoiceAgentClient = class {
|
|
|
587
598
|
/**
|
|
588
599
|
* Handle incoming binary data (audio response)
|
|
589
600
|
*/
|
|
590
|
-
handleBinaryMessage(data) {
|
|
601
|
+
handleBinaryMessage(data, generation) {
|
|
602
|
+
if (generation !== void 0 && generation < this.currentGeneration) {
|
|
603
|
+
console.log(`\u{1F5D1}\uFE0F Discarding ghost audio (Gen ${generation} < ${this.currentGeneration})`);
|
|
604
|
+
return;
|
|
605
|
+
}
|
|
591
606
|
if (this.audioManager) {
|
|
592
607
|
this.audioManager.playAudio(data);
|
|
593
608
|
}
|
|
@@ -603,7 +618,7 @@ var VoiceAgentClient = class {
|
|
|
603
618
|
case "audio":
|
|
604
619
|
if (msg.data) {
|
|
605
620
|
const buffer = base64ToUint8Array(msg.data);
|
|
606
|
-
this.handleBinaryMessage(buffer);
|
|
621
|
+
this.handleBinaryMessage(buffer, msg.generation);
|
|
607
622
|
}
|
|
608
623
|
break;
|
|
609
624
|
case "transcript":
|
|
@@ -622,6 +637,14 @@ var VoiceAgentClient = class {
|
|
|
622
637
|
}
|
|
623
638
|
break;
|
|
624
639
|
case "status":
|
|
640
|
+
if (msg.data === "thinking") {
|
|
641
|
+
const newGen = msg.generation || 0;
|
|
642
|
+
if (newGen > this.currentGeneration) {
|
|
643
|
+
console.log(`\u{1F9E0} New thought (Gen ${newGen}) - Clearing audio queue`);
|
|
644
|
+
this.currentGeneration = newGen;
|
|
645
|
+
if (this.audioManager) this.audioManager.stopPlayback();
|
|
646
|
+
}
|
|
647
|
+
}
|
|
625
648
|
if (msg.data === "interrupted" && this.audioManager) {
|
|
626
649
|
this.audioManager.stopPlayback();
|
|
627
650
|
}
|
|
@@ -643,25 +666,58 @@ var VoiceAgentClient = class {
|
|
|
643
666
|
if (this.onError) this.onError(msg.data);
|
|
644
667
|
console.error(`\u274C Server error: ${msg.data}`);
|
|
645
668
|
break;
|
|
669
|
+
case "tool_call":
|
|
670
|
+
console.log(`\u{1F6E0}\uFE0F Tool Call: ${msg.name}(${msg.arguments})`);
|
|
671
|
+
break;
|
|
646
672
|
}
|
|
647
673
|
} catch (e) {
|
|
648
674
|
}
|
|
649
675
|
}
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
676
|
+
/**
|
|
677
|
+
* Register an event listener (for Python parity)
|
|
678
|
+
*/
|
|
679
|
+
on(event, callback) {
|
|
680
|
+
if (!this.listeners[event]) {
|
|
681
|
+
this.listeners[event] = [];
|
|
682
|
+
}
|
|
683
|
+
this.listeners[event].push(callback);
|
|
684
|
+
return this;
|
|
685
|
+
}
|
|
686
|
+
/**
|
|
687
|
+
* Internal emitter for all events
|
|
688
|
+
*/
|
|
689
|
+
emit(event, ...args) {
|
|
690
|
+
const legacyMap = {
|
|
691
|
+
"transcription": "onTranscription",
|
|
692
|
+
"response": "onResponse",
|
|
693
|
+
"audio": "onAudioCallback",
|
|
694
|
+
"visemes": "onVisemesCallback",
|
|
695
|
+
"status": "onStatus",
|
|
696
|
+
"error": "onError"
|
|
697
|
+
};
|
|
698
|
+
const legacyKey = legacyMap[event];
|
|
699
|
+
if (legacyKey && this[legacyKey]) {
|
|
700
|
+
try {
|
|
701
|
+
this[legacyKey](...args);
|
|
702
|
+
} catch (e) {
|
|
703
|
+
console.error(`Error in legacy callback ${legacyKey}:`, e);
|
|
704
|
+
}
|
|
705
|
+
}
|
|
706
|
+
if (this.listeners[event]) {
|
|
707
|
+
this.listeners[event].forEach((cb) => {
|
|
708
|
+
try {
|
|
709
|
+
cb(...args);
|
|
710
|
+
} catch (e) {
|
|
711
|
+
console.error(`Error in listener for ${event}:`, e);
|
|
712
|
+
}
|
|
713
|
+
});
|
|
658
714
|
}
|
|
659
715
|
}
|
|
660
716
|
onAudio(callback) {
|
|
661
|
-
this.
|
|
717
|
+
this.on("audio", callback);
|
|
662
718
|
}
|
|
663
719
|
onVisemes(callback) {
|
|
664
|
-
this.
|
|
720
|
+
this.on("visemes", callback);
|
|
665
721
|
}
|
|
666
722
|
/**
|
|
667
723
|
* Disconnect from the server
|
|
@@ -740,15 +796,28 @@ var TTSClient = class {
|
|
|
740
796
|
*/
|
|
741
797
|
synthesize(options) {
|
|
742
798
|
return new Promise((resolve, reject) => {
|
|
799
|
+
let activityTimeout;
|
|
800
|
+
let ws;
|
|
801
|
+
let startTime;
|
|
802
|
+
let firstByteReceived = false;
|
|
803
|
+
const refreshTimeout = () => {
|
|
804
|
+
if (activityTimeout) clearTimeout(activityTimeout);
|
|
805
|
+
activityTimeout = setTimeout(() => {
|
|
806
|
+
console.log("\u23F1\uFE0F TTS synthesis reached inactivity timeout (2s) - resolving");
|
|
807
|
+
if (ws) ws.close();
|
|
808
|
+
resolve();
|
|
809
|
+
}, 2e3);
|
|
810
|
+
};
|
|
743
811
|
try {
|
|
744
812
|
let url = DEFAULT_URLS.TTS;
|
|
745
813
|
if (this.apiKey) {
|
|
746
814
|
const separator = url.includes("?") ? "&" : "?";
|
|
747
815
|
url += `${separator}api_key=${this.apiKey}`;
|
|
748
816
|
}
|
|
749
|
-
|
|
817
|
+
ws = new WebSocket(url);
|
|
750
818
|
ws.binaryType = "arraybuffer";
|
|
751
819
|
ws.onopen = () => {
|
|
820
|
+
refreshTimeout();
|
|
752
821
|
const req = {
|
|
753
822
|
text: options.text,
|
|
754
823
|
voice: options.voice || "F1" /* F1 */,
|
|
@@ -758,9 +827,16 @@ var TTSClient = class {
|
|
|
758
827
|
visemes: options.visemes || false
|
|
759
828
|
};
|
|
760
829
|
ws.send(JSON.stringify(req));
|
|
830
|
+
startTime = Date.now();
|
|
761
831
|
};
|
|
762
832
|
ws.onmessage = async (event) => {
|
|
833
|
+
refreshTimeout();
|
|
763
834
|
if (event.data instanceof ArrayBuffer) {
|
|
835
|
+
if (!firstByteReceived) {
|
|
836
|
+
const ttfb = Date.now() - startTime;
|
|
837
|
+
if (options.onTTFB) options.onTTFB(ttfb);
|
|
838
|
+
firstByteReceived = true;
|
|
839
|
+
}
|
|
764
840
|
if (options.onAudio) options.onAudio(new Uint8Array(event.data));
|
|
765
841
|
} else {
|
|
766
842
|
try {
|
|
@@ -768,18 +844,26 @@ var TTSClient = class {
|
|
|
768
844
|
if (Array.isArray(msg) && options.onVisemes) {
|
|
769
845
|
options.onVisemes(msg);
|
|
770
846
|
}
|
|
847
|
+
if (msg.type === "eos") {
|
|
848
|
+
if (activityTimeout) clearTimeout(activityTimeout);
|
|
849
|
+
ws.close();
|
|
850
|
+
resolve();
|
|
851
|
+
}
|
|
771
852
|
} catch (e) {
|
|
772
853
|
}
|
|
773
854
|
}
|
|
774
855
|
};
|
|
775
856
|
ws.onerror = (err) => {
|
|
857
|
+
if (activityTimeout) clearTimeout(activityTimeout);
|
|
776
858
|
if (options.onError) options.onError(err);
|
|
777
859
|
reject(err);
|
|
778
860
|
};
|
|
779
861
|
ws.onclose = () => {
|
|
862
|
+
if (activityTimeout) clearTimeout(activityTimeout);
|
|
780
863
|
resolve();
|
|
781
864
|
};
|
|
782
865
|
} catch (err) {
|
|
866
|
+
if (activityTimeout) clearTimeout(activityTimeout);
|
|
783
867
|
if (options.onError) options.onError(err);
|
|
784
868
|
reject(err);
|
|
785
869
|
}
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import {
|
|
2
|
+
AUDIO_CONFIG
|
|
3
|
+
} from "./chunk-UI24THO7.mjs";
|
|
4
|
+
|
|
5
|
+
// src/node-audio.ts
|
|
6
|
+
var NodeAudioManager = class {
|
|
7
|
+
speaker = null;
|
|
8
|
+
recorder = null;
|
|
9
|
+
recordingStream = null;
|
|
10
|
+
isMuted = false;
|
|
11
|
+
isListening = false;
|
|
12
|
+
constructor() {
|
|
13
|
+
}
|
|
14
|
+
async init() {
|
|
15
|
+
try {
|
|
16
|
+
const Speaker = await import("speaker").catch(() => null);
|
|
17
|
+
if (!Speaker) {
|
|
18
|
+
console.warn('\u26A0\uFE0F Package "speaker" is missing. Hardware output will be disabled.');
|
|
19
|
+
console.warn("\u{1F449} Run: npm install speaker");
|
|
20
|
+
}
|
|
21
|
+
} catch (e) {
|
|
22
|
+
console.error("Error initializing Node audio:", e);
|
|
23
|
+
}
|
|
24
|
+
}
|
|
25
|
+
async startMicrophone(onAudioInput) {
|
|
26
|
+
if (this.isListening) return;
|
|
27
|
+
try {
|
|
28
|
+
const recorder = await import("node-record-lpcm16").catch(() => null);
|
|
29
|
+
if (!recorder) {
|
|
30
|
+
throw new Error('Package "node-record-lpcm16" is missing. Microphone input failed.\n\u{1F449} Run: npm install node-record-lpcm16');
|
|
31
|
+
}
|
|
32
|
+
console.log("\u{1F3A4} Starting microphone (Node.js)...");
|
|
33
|
+
this.recordingStream = recorder.record({
|
|
34
|
+
sampleRate: AUDIO_CONFIG.SAMPLE_RATE,
|
|
35
|
+
threshold: 0,
|
|
36
|
+
verbose: false,
|
|
37
|
+
recordProgram: "sox"
|
|
38
|
+
// default
|
|
39
|
+
});
|
|
40
|
+
this.recordingStream.stream().on("data", (chunk) => {
|
|
41
|
+
if (!this.isMuted && onAudioInput) {
|
|
42
|
+
onAudioInput(new Uint8Array(chunk));
|
|
43
|
+
}
|
|
44
|
+
});
|
|
45
|
+
this.isListening = true;
|
|
46
|
+
} catch (e) {
|
|
47
|
+
console.error("Failed to start microphone:", e.message);
|
|
48
|
+
throw e;
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
stopMicrophone() {
|
|
52
|
+
if (this.recordingStream) {
|
|
53
|
+
this.recordingStream.stop();
|
|
54
|
+
this.recordingStream = null;
|
|
55
|
+
}
|
|
56
|
+
this.isListening = false;
|
|
57
|
+
}
|
|
58
|
+
async playAudio(pcm16Data) {
|
|
59
|
+
try {
|
|
60
|
+
if (!this.speaker) {
|
|
61
|
+
const Speaker = (await import("speaker")).default;
|
|
62
|
+
this.speaker = new Speaker({
|
|
63
|
+
channels: AUDIO_CONFIG.CHANNELS,
|
|
64
|
+
bitDepth: 16,
|
|
65
|
+
sampleRate: AUDIO_CONFIG.SPEAKER_SAMPLE_RATE
|
|
66
|
+
});
|
|
67
|
+
}
|
|
68
|
+
this.speaker.write(Buffer.from(pcm16Data));
|
|
69
|
+
} catch (e) {
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
stopPlayback() {
|
|
73
|
+
if (this.speaker) {
|
|
74
|
+
this.speaker.end();
|
|
75
|
+
this.speaker = null;
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
cleanup() {
|
|
79
|
+
this.stopMicrophone();
|
|
80
|
+
this.stopPlayback();
|
|
81
|
+
}
|
|
82
|
+
isMicMuted() {
|
|
83
|
+
return this.isMuted;
|
|
84
|
+
}
|
|
85
|
+
setMuted(muted) {
|
|
86
|
+
this.isMuted = muted;
|
|
87
|
+
}
|
|
88
|
+
getAmplitude() {
|
|
89
|
+
return 0;
|
|
90
|
+
}
|
|
91
|
+
};
|
|
92
|
+
export {
|
|
93
|
+
NodeAudioManager
|
|
94
|
+
};
|