@ch4p/cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +190 -0
- package/dist/agent-LRE3CXL3.js +761 -0
- package/dist/audit-BIGXGJSQ.js +12 -0
- package/dist/canvas-OIXBPYUL.js +313 -0
- package/dist/chunk-6BURGD2Y.js +290 -0
- package/dist/chunk-7EUURDQ5.js +6987 -0
- package/dist/chunk-CNLYUY2K.js +220 -0
- package/dist/chunk-GEFQONOB.js +1720 -0
- package/dist/chunk-IRNN57EQ.js +1810 -0
- package/dist/chunk-NMGPBPNU.js +154 -0
- package/dist/chunk-NRFRTZVP.js +289 -0
- package/dist/chunk-P6OXQDX2.js +4359 -0
- package/dist/chunk-PGZ24EFT.js +3388 -0
- package/dist/chunk-TEVLTQYT.js +185 -0
- package/dist/chunk-WL32AHUY.js +655 -0
- package/dist/chunk-YSCX2QQQ.js +88 -0
- package/dist/dist-VZE4JK7Q.js +25 -0
- package/dist/doctor-D6CEAUAC.js +274 -0
- package/dist/gateway-KZYTXCIQ.js +2062 -0
- package/dist/identity-S45KJKEO.js +215 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.js +200 -0
- package/dist/install-WROGJA5J.js +378 -0
- package/dist/message-JHZX6JBD.js +189 -0
- package/dist/onboard-ZVZLMMZP.js +849 -0
- package/dist/pairing-UOUSOB7K.js +147 -0
- package/dist/skills-UPJVALNY.js +138 -0
- package/dist/status-6HMA7CDS.js +94 -0
- package/dist/tools-2X5MBDEX.js +50 -0
- package/package.json +79 -0
|
@@ -0,0 +1,655 @@
|
|
|
1
|
+
// ../../packages/voice/dist/index.js
|
|
2
|
+
import { EventEmitter } from "events";
|
|
3
|
+
import { EventEmitter as EventEmitter2 } from "events";
|
|
4
|
+
import { spawn, execSync } from "child_process";
|
|
5
|
+
import { writeFileSync, unlinkSync, mkdtempSync } from "fs";
|
|
6
|
+
import { join } from "path";
|
|
7
|
+
import { tmpdir } from "os";
|
|
8
|
+
import { execFile as execFileCb } from "child_process";
|
|
9
|
+
import { promisify } from "util";
|
|
10
|
+
import { EventEmitter as EventEmitter3 } from "events";
|
|
11
|
+
var VoiceProcessor = class {
  stt;
  tts;
  config;
  /**
   * Pipes messages through optional STT/TTS providers.
   *
   * @param opts - `{ stt?, tts?, config }`; `config.enabled` gates all work.
   */
  constructor(opts) {
    this.stt = opts.stt;
    this.tts = opts.tts;
    this.config = opts.config;
  }
  /**
   * Process an inbound message, transcribing any audio attachments.
   *
   * If no STT provider is configured or the message has no audio attachments,
   * the message is returned unchanged.
   *
   * Each audio attachment that carries a `data` Buffer is transcribed by the
   * provider; attachments that only have a `url` are skipped (the caller is
   * responsible for downloading the audio first).
   *
   * The transcript is prepended to the message text as "[Voice message]: ...".
   *
   * @param msg - The inbound message to process.
   * @returns The message with transcripts prepended to the text.
   */
  async processInbound(msg) {
    if (!this.stt || !this.config.enabled) {
      return msg;
    }
    const audioAttachments = (msg.attachments ?? []).filter(
      (a) => a.type === "audio"
    );
    if (audioAttachments.length === 0) {
      return msg;
    }
    // The transcriptions are independent of each other, so run them in
    // parallel instead of awaiting one at a time. Promise.all preserves the
    // attachment order in the results.
    const results = await Promise.all(
      audioAttachments
        .filter((a) => a.data)
        .map((a) => this.stt.transcribe(a.data, a.mimeType ?? "audio/webm"))
    );
    // Drop empty/undefined transcripts, matching the old truthiness check.
    const transcripts = results.filter((t) => Boolean(t));
    if (transcripts.length === 0) {
      return msg;
    }
    const voiceText = transcripts.map((t) => `[Voice message]: ${t}`).join("\n");
    const newText = msg.text ? `${voiceText}\n${msg.text}` : voiceText;
    return { ...msg, text: newText };
  }
  /**
   * Process an outbound message, synthesizing audio from the text.
   *
   * If no TTS provider is configured, voice is disabled, or the message has
   * no text, the message is returned unchanged. Otherwise the synthesized
   * audio is appended as an attachment.
   *
   * @param msg - The outbound message to process.
   * @returns The message with an audio attachment added.
   */
  async processOutbound(msg) {
    if (!this.tts || !this.config.enabled) {
      return msg;
    }
    if (!msg.text) {
      return msg;
    }
    const { audio, mimeType } = await this.tts.synthesize(msg.text);
    // Derive the filename extension from the actual synthesized format
    // instead of hard-coding ".mp3"; unknown types still fall back to mp3,
    // which keeps the previous behavior for audio/mpeg providers.
    const extByMime = {
      "audio/mpeg": "mp3",
      "audio/mp3": "mp3",
      "audio/wav": "wav",
      "audio/ogg": "ogg",
      "audio/webm": "webm",
      "audio/mp4": "m4a"
    };
    const audioAttachment = {
      type: "audio",
      data: audio,
      mimeType,
      filename: `response.${extByMime[mimeType] ?? "mp3"}`
    };
    const existingAttachments = msg.attachments ?? [];
    return {
      ...msg,
      attachments: [...existingAttachments, audioAttachment]
    };
  }
};
|
|
95
|
+
var WhisperSTT = class {
  apiKey;
  /** @param config - `{ apiKey }` for the OpenAI API. */
  constructor(config) {
    this.apiKey = config.apiKey;
  }
  /**
   * Transcribe an audio buffer to text using OpenAI Whisper.
   *
   * @param audio - Raw audio data as a Buffer.
   * @param mimeType - MIME type of the audio (e.g. 'audio/webm', 'audio/mp3').
   * @returns The transcribed text.
   * @throws Error when the HTTP response is not OK.
   */
  async transcribe(audio, mimeType) {
    const extension = mimeTypeToExtension(mimeType);
    // Whisper's multipart endpoint infers format from the filename extension.
    const payload = new FormData();
    payload.append("file", new Blob([audio], { type: mimeType }), `audio.${extension}`);
    payload.append("model", "whisper-1");
    const response = await fetch("https://api.openai.com/v1/audio/transcriptions", {
      method: "POST",
      headers: {
        Authorization: `Bearer ${this.apiKey}`
      },
      body: payload
    });
    if (!response.ok) {
      const detail = await response.text();
      throw new Error(`Whisper STT failed (${response.status}): ${detail}`);
    }
    const parsed = await response.json();
    return parsed.text;
  }
};
|
|
128
|
+
/**
 * Map an audio MIME type to a file extension (without the dot).
 * Unknown types default to "webm".
 */
function mimeTypeToExtension(mimeType) {
  switch (mimeType) {
    case "audio/mp3":
    case "audio/mpeg":
      return "mp3";
    case "audio/mp4":
      return "mp4";
    case "audio/wav":
      return "wav";
    case "audio/ogg":
      return "ogg";
    case "audio/flac":
      return "flac";
    case "audio/x-m4a":
      return "m4a";
    case "audio/webm":
    default:
      return "webm";
  }
}
|
|
141
|
+
var DeepgramSTT = class {
  apiKey;
  /** @param config - `{ apiKey }` for the Deepgram API. */
  constructor(config) {
    this.apiKey = config.apiKey;
  }
  /**
   * Transcribe an audio buffer to text using Deepgram Nova-3.
   *
   * @param audio - Raw audio data as a Buffer.
   * @param mimeType - MIME type of the audio (e.g. 'audio/webm', 'audio/wav').
   * @returns The transcribed text, or "" when the response carries none.
   * @throws Error when the HTTP response is not OK.
   */
  async transcribe(audio, mimeType) {
    const res = await fetch(
      "https://api.deepgram.com/v1/listen?model=nova-3",
      {
        method: "POST",
        headers: {
          Authorization: `Token ${this.apiKey}`,
          "Content-Type": mimeType
        },
        body: audio
      }
    );
    if (!res.ok) {
      const body = await res.text();
      throw new Error(`Deepgram STT failed (${res.status}): ${body}`);
    }
    const json = await res.json();
    // Fix: the old code accessed `json.results.channels[0]` unconditionally,
    // so an OK response without the expected shape threw a TypeError. Chain
    // optionally from `results` down and fall back to "".
    return json.results?.channels?.[0]?.alternatives?.[0]?.transcript ?? "";
  }
};
|
|
173
|
+
// Default ElevenLabs voice used when the config does not name one.
var DEFAULT_VOICE_ID = "21m00Tcm4TlvDq8ikWAM";
var ElevenLabsTTS = class {
  apiKey;
  voiceId;
  /** @param config - `{ apiKey, voiceId? }`; voiceId defaults to DEFAULT_VOICE_ID. */
  constructor(config) {
    this.apiKey = config.apiKey;
    this.voiceId = config.voiceId ?? DEFAULT_VOICE_ID;
  }
  /**
   * Synthesize text into audio using ElevenLabs.
   *
   * @param text - The text to convert to speech.
   * @returns An object containing the audio Buffer and its MIME type.
   * @throws Error when the HTTP response is not OK.
   */
  async synthesize(text) {
    const endpoint = `https://api.elevenlabs.io/v1/text-to-speech/${this.voiceId}`;
    const requestBody = JSON.stringify({
      text,
      model_id: "eleven_flash_v2_5"
    });
    const response = await fetch(endpoint, {
      method: "POST",
      headers: {
        "xi-api-key": this.apiKey,
        "Content-Type": "application/json",
        Accept: "audio/mpeg"
      },
      body: requestBody
    });
    if (!response.ok) {
      const detail = await response.text();
      throw new Error(`ElevenLabs TTS failed (${response.status}): ${detail}`);
    }
    const bytes = await response.arrayBuffer();
    return {
      audio: Buffer.from(bytes),
      mimeType: "audio/mpeg"
    };
  }
};
|
|
212
|
+
var VoiceActivityDetector = class extends EventEmitter {
  energyThreshold;
  silenceDurationMs;
  minSpeechDurationMs;
  isSpeaking = false;
  speechStartTime = 0;
  lastSpeechTime = 0;
  speechBuffers = [];
  silenceCheckTimer = null;
  /**
   * Energy-based voice activity detector over Int16LE PCM.
   * Emits `speech_start` and `speech_end` ({ audio, durationMs }).
   */
  constructor(config = {}) {
    super();
    this.energyThreshold = config.energyThreshold ?? 500;
    this.silenceDurationMs = config.silenceDurationMs ?? 800;
    this.minSpeechDurationMs = config.minSpeechDurationMs ?? 300;
  }
  /**
   * Process a chunk of raw PCM Int16LE audio samples.
   *
   * Call this repeatedly with audio data from the microphone.
   * The VAD will emit `speech_start` and `speech_end` events as appropriate.
   */
  processSamples(pcmBuffer) {
    const level = this.computeRMS(pcmBuffer);
    const timestamp = Date.now();
    const loud = level >= this.energyThreshold;
    // Quiet audio outside a speech segment carries no information.
    if (!loud && !this.isSpeaking) {
      return;
    }
    if (loud) {
      this.lastSpeechTime = timestamp;
      if (!this.isSpeaking) {
        // Transition silence -> speech: start a fresh segment.
        this.isSpeaking = true;
        this.speechStartTime = timestamp;
        this.speechBuffers = [];
        this.speechBuffers.push(Buffer.from(pcmBuffer));
        this.emit("speech_start");
        this.scheduleSilenceCheck();
        return;
      }
    }
    // Ongoing segment: keep buffering (including trailing quiet chunks).
    this.speechBuffers.push(Buffer.from(pcmBuffer));
    this.scheduleSilenceCheck();
  }
  /** Reset internal state. Call when stopping the listener. */
  reset() {
    if (this.silenceCheckTimer) {
      clearTimeout(this.silenceCheckTimer);
      this.silenceCheckTimer = null;
    }
    this.isSpeaking = false;
    this.speechStartTime = 0;
    this.lastSpeechTime = 0;
    this.speechBuffers = [];
  }
  // -------------------------------------------------------------------------
  // Private
  // -------------------------------------------------------------------------
  /** Compute root mean square energy of Int16LE PCM samples. */
  computeRMS(buffer) {
    const sampleCount = buffer.length / 2;
    if (sampleCount === 0) return 0;
    let total = 0;
    for (let offset = 0; offset + 1 < buffer.length; offset += 2) {
      const value = buffer.readInt16LE(offset);
      total += value * value;
    }
    return Math.sqrt(total / sampleCount);
  }
  /** Schedule a check for sustained silence after speech. */
  scheduleSilenceCheck() {
    if (this.silenceCheckTimer) {
      clearTimeout(this.silenceCheckTimer);
    }
    // Small cushion past the silence window so the timer only fires once
    // the window can actually have elapsed.
    const delay = this.silenceDurationMs + 50;
    this.silenceCheckTimer = setTimeout(() => {
      this.silenceCheckTimer = null;
      if (!this.isSpeaking) return;
      if (Date.now() - this.lastSpeechTime >= this.silenceDurationMs) {
        this.endSpeech();
      }
    }, delay);
  }
  /** Finalize a speech segment. */
  endSpeech() {
    if (!this.isSpeaking) return;
    const segmentLength = Date.now() - this.speechStartTime;
    this.isSpeaking = false;
    // Segments shorter than the minimum are treated as noise and dropped.
    if (segmentLength < this.minSpeechDurationMs) {
      this.speechBuffers = [];
      return;
    }
    const audio = Buffer.concat(this.speechBuffers);
    this.speechBuffers = [];
    this.emit("speech_end", { audio, durationMs: segmentLength });
  }
};
|
|
306
|
+
var MicCapture = class _MicCapture extends EventEmitter2 {
  process = null;
  sampleRate;
  channels;
  bitDepth;
  device;
  /**
   * Microphone capture backed by SoX's `rec`. Emits `data` with raw PCM
   * Int16LE buffers, plus `error` and `close`.
   */
  constructor(config = {}) {
    super();
    this.sampleRate = config.sampleRate ?? 16e3;
    this.channels = config.channels ?? 1;
    this.bitDepth = config.bitDepth ?? 16;
    this.device = config.device;
  }
  /**
   * Check whether `rec` (SoX) is available on PATH.
   */
  static isAvailable() {
    const probe = process.platform === "win32" ? "where rec" : "which rec";
    try {
      execSync(probe, { stdio: "ignore" });
      return true;
    } catch {
      return false;
    }
  }
  /**
   * Start capturing audio from the system microphone.
   *
   * Emits `data` events with raw PCM Int16LE buffers.
   * Call `stop()` to terminate the capture.
   *
   * @throws If `rec` is not available or the process fails to start.
   */
  start() {
    if (this.process) {
      throw new Error("MicCapture is already running. Call stop() first.");
    }
    if (!_MicCapture.isAvailable()) {
      throw new Error(
        "SoX `rec` command not found on PATH. Install SoX: macOS `brew install sox`, Linux `apt install sox`."
      );
    }
    // rec flags: quiet, raw little-endian signed-integer PCM at the
    // configured bit depth / sample rate / channel count, written to stdout.
    const recArgs = [
      "-q",
      "-t", "raw",
      "-b", String(this.bitDepth),
      "-r", String(this.sampleRate),
      "-c", String(this.channels),
      "-e", "signed-integer",
      "-L",
      "-"
    ];
    const childEnv = { ...process.env };
    if (this.device) {
      // SoX reads the input device from AUDIODEV.
      childEnv["AUDIODEV"] = this.device;
    }
    const child = spawn("rec", recArgs, {
      stdio: ["ignore", "pipe", "ignore"],
      env: childEnv
    });
    this.process = child;
    child.stdout.on("data", (chunk) => this.emit("data", chunk));
    child.on("error", (err) => {
      this.emit("error", err);
      this.process = null;
    });
    child.on("close", () => {
      this.process = null;
      this.emit("close");
    });
  }
  /**
   * Stop the microphone capture. No-op when not running.
   */
  stop() {
    if (!this.process) return;
    try {
      this.process.kill("SIGTERM");
    } catch {
      // Process may already be gone; nothing to do.
    }
    this.process = null;
  }
  /**
   * Whether the microphone is currently capturing.
   */
  get isRunning() {
    return this.process !== null;
  }
};
|
|
409
|
+
var execFile = promisify(execFileCb);
// MIME type -> temp-file extension for the playback commands below.
var EXTENSION_MAP = {
  "audio/mpeg": ".mp3",
  "audio/mp3": ".mp3",
  "audio/wav": ".wav",
  "audio/x-wav": ".wav",
  "audio/ogg": ".ogg",
  "audio/flac": ".flac",
  "audio/aac": ".aac",
  "audio/mp4": ".m4a",
  "audio/webm": ".webm"
};
var AudioPlayback = class {
  tempDir = null;
  /**
   * Play an audio buffer through the system speakers.
   *
   * Writes the buffer to a temp file, invokes the platform player
   * (afplay on macOS, SoX `play` or ALSA `aplay` elsewhere), and removes
   * the temp file after playback completes.
   *
   * @param audio - Raw audio data.
   * @param mimeType - MIME type of the audio (e.g. 'audio/mpeg').
   * @throws If no suitable playback command is available.
   */
  async play(audio, mimeType) {
    // Lazily create one temp directory per instance and reuse it.
    this.tempDir ??= mkdtempSync(join(tmpdir(), "ch4p-audio-"));
    const extension = EXTENSION_MAP[mimeType] ?? ".mp3";
    const target = join(this.tempDir, `playback-${Date.now()}${extension}`);
    try {
      writeFileSync(target, audio);
      if (process.platform === "darwin") {
        await execFile("afplay", [target]);
        return;
      }
      try {
        await execFile("play", ["-q", target]);
      } catch {
        // SoX missing/failed: ALSA's aplay only handles WAV.
        if (extension !== ".wav") {
          throw new Error(
            "No audio playback command found. Install SoX (`apt install sox`) or use WAV format with ALSA (`aplay`)."
          );
        }
        await execFile("aplay", ["-q", target]);
      }
    } finally {
      try {
        unlinkSync(target);
      } catch {
        // Best-effort cleanup; the file may never have been written.
      }
    }
  }
};
|
|
464
|
+
var WakeListener = class extends EventEmitter3 {
  stt;
  tts;
  playback;
  config;
  mic = null;
  vad = null;
  running = false;
  processing = false;
  /**
   * Always-on voice listener: captures microphone audio, segments speech
   * with a VAD, transcribes segments, and emits `wake` events (optionally
   * gated on a configured wake word).
   *
   * @param opts - `{ stt, tts, config }`. `stt`/`tts` may be undefined;
   *   without an STT provider no transcription is attempted.
   */
  constructor(opts) {
    super();
    this.stt = opts.stt;
    this.tts = opts.tts;
    this.config = opts.config;
    this.playback = new AudioPlayback();
  }
  /**
   * Start listening for voice input.
   *
   * Opens the system microphone, runs VAD, and transcribes detected speech.
   * Emits `wake` events when speech is successfully transcribed.
   *
   * @throws If microphone capture is not available (SoX not installed).
   */
  start() {
    if (this.running) return;
    if (!MicCapture.isAvailable()) {
      throw new Error(
        "Voice wake requires SoX for microphone capture. Install: macOS `brew install sox`, Linux `apt install sox`."
      );
    }
    this.running = true;
    this.vad = new VoiceActivityDetector({
      energyThreshold: this.config.energyThreshold,
      silenceDurationMs: this.config.silenceDurationMs,
      minSpeechDurationMs: this.config.minSpeechDurationMs
    });
    this.mic = new MicCapture({
      device: this.config.device
    });
    // Drop samples while a previous segment is still being transcribed so
    // overlapping requests are not queued up.
    this.mic.on("data", (chunk) => {
      if (this.vad && !this.processing) {
        this.vad.processSamples(chunk);
      }
    });
    this.mic.on("error", (err) => {
      this.emit("error", err);
    });
    this.vad.on("speech_end", (segment) => {
      // Fire-and-forget: errors are surfaced via the `error` event.
      void this.handleSpeechSegment(segment.audio, segment.durationMs);
    });
    this.mic.start();
    this.emit("listening");
  }
  /**
   * Stop listening and release the microphone.
   */
  stop() {
    if (!this.running) return;
    this.running = false;
    // Fix: clear the in-flight flag so a subsequent start() never begins
    // with samples being dropped because a transcription was mid-flight
    // when stop() was called.
    this.processing = false;
    if (this.mic) {
      this.mic.stop();
      this.mic.removeAllListeners();
      this.mic = null;
    }
    if (this.vad) {
      this.vad.reset();
      this.vad.removeAllListeners();
      this.vad = null;
    }
    this.emit("stopped");
  }
  /**
   * Speak a text response through the system speakers.
   *
   * Uses the configured TTS provider to synthesize audio, then plays it
   * via the platform-native playback command. If no TTS is configured,
   * this is a no-op.
   */
  async speak(text) {
    if (!this.tts || !text) return;
    try {
      const { audio, mimeType } = await this.tts.synthesize(text);
      await this.playback.play(audio, mimeType);
    } catch (err) {
      this.emit("error", err instanceof Error ? err : new Error(String(err)));
    }
  }
  /** Whether the listener is currently active. */
  get isListening() {
    return this.running;
  }
  // -------------------------------------------------------------------------
  // Private
  // -------------------------------------------------------------------------
  /**
   * Process a completed speech segment: transcribe and optionally filter by
   * wake word.
   */
  async handleSpeechSegment(audio, durationMs) {
    // Fix: mirror speak()'s `!this.tts` guard — without an STT provider
    // there is nothing to transcribe, so bail out instead of emitting a
    // TypeError for every detected segment.
    if (!this.running || !this.stt) return;
    this.processing = true;
    try {
      const wavBuffer = this.wrapPCMAsWAV(audio);
      const transcript = await this.stt.transcribe(wavBuffer, "audio/wav");
      if (!transcript || transcript.trim().length === 0) {
        return;
      }
      const text = transcript.trim();
      if (this.config.wakeWord) {
        // Only segments that begin with the wake word pass through; the
        // wake word itself is stripped from the emitted text.
        const wakeWord = this.config.wakeWord.toLowerCase();
        const lowerText = text.toLowerCase();
        if (!lowerText.startsWith(wakeWord)) {
          return;
        }
        const strippedText = text.slice(wakeWord.length).trim();
        if (strippedText.length === 0) {
          return;
        }
        this.emit("wake", {
          text: strippedText,
          durationMs,
          wakeWordStripped: true
        });
      } else {
        this.emit("wake", {
          text,
          durationMs,
          wakeWordStripped: false
        });
      }
    } catch (err) {
      this.emit("error", err instanceof Error ? err : new Error(String(err)));
    } finally {
      this.processing = false;
    }
  }
  /**
   * Wrap raw PCM Int16LE data in a WAV container.
   *
   * STT providers (Whisper, Deepgram) expect audio in a recognized format.
   * This creates a minimal 44-byte WAV header (16 kHz, mono, 16-bit —
   * matching MicCapture's defaults) for the raw PCM data.
   */
  wrapPCMAsWAV(pcm) {
    const sampleRate = 16e3;
    const channels = 1;
    const bitsPerSample = 16;
    const byteRate = sampleRate * channels * (bitsPerSample / 8);
    const blockAlign = channels * (bitsPerSample / 8);
    const dataSize = pcm.length;
    const header = Buffer.alloc(44);
    header.write("RIFF", 0);
    header.writeUInt32LE(36 + dataSize, 4);
    header.write("WAVE", 8);
    header.write("fmt ", 12);
    header.writeUInt32LE(16, 16);
    header.writeUInt16LE(1, 20);
    header.writeUInt16LE(channels, 22);
    header.writeUInt32LE(sampleRate, 24);
    header.writeUInt32LE(byteRate, 28);
    header.writeUInt16LE(blockAlign, 32);
    header.writeUInt16LE(bitsPerSample, 34);
    header.write("data", 36);
    header.writeUInt32LE(dataSize, 40);
    return Buffer.concat([header, pcm]);
  }
};
|
|
631
|
+
|
|
632
|
+
// src/system-prompt.ts
|
|
633
|
+
/**
 * Build the assistant system prompt, appending capability sections for
 * memory, web search, and any registered skills.
 *
 * @param opts - `{ hasMemory?, hasSearch?, skillRegistry? }` where
 *   skillRegistry exposes `size` and `getDescriptions()`.
 * @returns The assembled prompt string.
 */
function buildSystemPrompt(opts = {}) {
  const parts = [
    "You are ch4p, a personal AI assistant. You are helpful, concise, and security-conscious. When asked to perform actions, respect the configured autonomy level."
  ];
  if (opts.hasMemory) {
    parts.push(" You have persistent memory \u2014 you can recall information from previous conversations and learn from interactions over time. Use the memory_store and memory_recall tools to explicitly save or retrieve specific information when helpful.");
  }
  if (opts.hasSearch) {
    parts.push(" You have web search capability \u2014 use the web_search tool to find current information, look up facts, or research topics when needed.");
  }
  const registry = opts.skillRegistry;
  if (registry && registry.size > 0) {
    const skillLines = registry
      .getDescriptions()
      .map((s) => ` - ${s.name}: ${s.description}`);
    parts.push(
      "\n\nAvailable skills (use the `load_skill` tool with the skill name to get full instructions):\n" + skillLines.join("\n")
    );
  }
  return parts.join("");
}
|
|
647
|
+
|
|
648
|
+
// Public exports of this voice-module bundle.
export {
  VoiceProcessor,
  WhisperSTT,
  DeepgramSTT,
  ElevenLabsTTS,
  WakeListener,
  buildSystemPrompt
};
|