agent-voice 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/daemon.js ADDED
@@ -0,0 +1,473 @@
1
+ import {
2
+ BIT_DEPTH,
3
+ CHANNELS,
4
+ SAMPLE_RATE
5
+ } from "./chunk-YU5FF2L7.js";
6
+
7
+ // src/daemon.ts
8
+ import { rmSync as rmSync3 } from "fs";
9
+ import { createRequire } from "module";
10
+ import { createServer } from "net";
11
+
12
+ // src/config.ts
13
+ import { chmodSync, mkdirSync, readFileSync, writeFileSync } from "fs";
14
+ import { homedir } from "os";
15
+ import { join } from "path";
// Root directory for agent-voice state, located inside the user's home directory.
// Every path below is derived from it.
var CONFIG_DIR = join(homedir(), ".agent-voice");
// JSON configuration file parsed by readConfig().
var CONFIG_PATH = join(CONFIG_DIR, "config.json");
// Path the daemon's server listens on; removed at startup and on shutdown.
var DAEMON_SOCKET_PATH = join(CONFIG_DIR, "daemon.sock");
// File that receives the daemon's process id once the server is listening.
var DAEMON_PID_PATH = join(CONFIG_DIR, "daemon.pid");
// Directory holding debug output.
var LOG_DIR = join(CONFIG_DIR, "logs");
// Directory holding captured WAV files (written only when audio debugging is enabled).
var AUDIO_LOG_DIR = join(LOG_DIR, "audio");
// Event log; one JSON document is appended per line.
var EVENTS_LOG_PATH = join(LOG_DIR, "events.ndjson");
// Values used by resolveDaemonConfig() when config.json has no `daemon` override.
var DAEMON_DEFAULTS = {
  idleTimeoutMinutes: 30,
  audioRingBufferSize: 50
};
/**
 * Load the user configuration from CONFIG_PATH.
 *
 * A missing, unreadable, or syntactically invalid file yields an empty object.
 * So does a file whose JSON is valid but is not an object (`null`, a number,
 * a string, an array): callers immediately read properties such as `.auth`,
 * `.debug` and `.daemon`, and a bare `null` would make those reads throw.
 *
 * @returns {object} The parsed configuration object, or `{}`.
 */
function readConfig() {
  let parsed;
  try {
    parsed = JSON.parse(readFileSync(CONFIG_PATH, "utf-8"));
  } catch {
    // Best effort by design: absence of a config file is the normal first-run state.
    return {};
  }
  if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
    return {};
  }
  return parsed;
}
/**
 * Find the credentials to use for API calls.
 *
 * The `auth` object stored in the config file wins when it carries an
 * `apiKey`; otherwise the OPENAI_API_KEY environment variable is used.
 *
 * @returns {object} The stored `auth` object, or `{ apiKey }` built from the environment.
 * @throws {Error} When neither source provides a key.
 */
function resolveAuth() {
  const storedAuth = readConfig().auth;
  if (storedAuth?.apiKey) return storedAuth;

  const envKey = process.env.OPENAI_API_KEY;
  if (envKey) return { apiKey: envKey };

  throw new Error(
    "No API key found. Run `agent-voice auth` or set OPENAI_API_KEY."
  );
}
/**
 * Report whether debug event logging is switched on.
 *
 * The AGENT_VOICE_DEBUG environment variable (exactly "1") takes precedence;
 * the config file is only consulted when the variable is not set to "1".
 *
 * @returns {boolean}
 */
function isDebugEnabled() {
  return process.env.AGENT_VOICE_DEBUG === "1" || readConfig().debug === true;
}
/**
 * Report whether audio capture to disk is switched on.
 *
 * Enabled by AGENT_VOICE_DEBUG_AUDIO="1", or by a top-level config key that is
 * literally named "debug.audio" (a flat key, not a nested `debug.audio` path).
 *
 * @returns {boolean}
 */
function isDebugAudioEnabled() {
  const forcedByEnv = process.env.AGENT_VOICE_DEBUG_AUDIO === "1";
  return forcedByEnv || readConfig()["debug.audio"] === true;
}
/**
 * Produce the effective daemon settings: each value comes from the `daemon`
 * section of the config file when present (not null/undefined), otherwise
 * from DAEMON_DEFAULTS. Values are passed through without validation.
 *
 * @returns {{ idleTimeoutMinutes: *, audioRingBufferSize: * }}
 */
function resolveDaemonConfig() {
  const overrides = readConfig().daemon;
  const idleTimeoutMinutes =
    overrides?.idleTimeoutMinutes ?? DAEMON_DEFAULTS.idleTimeoutMinutes;
  const audioRingBufferSize =
    overrides?.audioRingBufferSize ?? DAEMON_DEFAULTS.audioRingBufferSize;
  return { idleTimeoutMinutes, audioRingBufferSize };
}
61
+
62
+ // src/daemon-lifecycle.ts
63
+ import { spawn } from "child_process";
64
+ import {
65
+ existsSync,
66
+ mkdirSync as mkdirSync2,
67
+ readFileSync as readFileSync2,
68
+ rmSync,
69
+ writeFileSync as writeFileSync2
70
+ } from "fs";
71
+ import { connect } from "net";
72
+ import { dirname, join as join2 } from "path";
73
+
74
+ // src/daemon-protocol.ts
75
+ import { z } from "zod";
// Schema for messages a client sends to the daemon, discriminated on `type`.
var DaemonRequest = z.discriminatedUnion("type", [
  // Speak `message` with `voice`; `id` is echoed back in the daemon's replies.
  z.object({
    type: z.literal("say"),
    id: z.string(),
    message: z.string(),
    voice: z.string()
  }),
  // Speak `message` and return a transcript. `timeout` and `ack` are forwarded
  // unchanged to the ask implementation (the unit of `timeout` is not visible here).
  z.object({
    type: z.literal("ask"),
    id: z.string(),
    message: z.string(),
    voice: z.string(),
    timeout: z.number(),
    ack: z.boolean()
  }),
  // Liveness probe; answered with a "pong" response.
  z.object({ type: z.literal("ping") }),
  // Ask the daemon to stop and exit.
  z.object({ type: z.literal("shutdown") })
]);
// Schema for one trace event forwarded to clients inside a "log" response.
var TraceEntry = z.object({
  // Numeric timestamp in milliseconds — NOTE(review): whether this is absolute or
  // relative to command start is not visible here; confirm against the producer.
  atMs: z.number(),
  // Event name.
  event: z.string(),
  // Optional free-form key/value payload.
  detail: z.record(z.unknown()).optional()
});
// Schema for messages the daemon sends back to a client, discriminated on `type`.
var DaemonResponse = z.discriminatedUnion("type", [
  // A "say" request with this `id` finished.
  z.object({ type: z.literal("say:done"), id: z.string() }),
  // An "ask" request with this `id` finished with the given transcript.
  z.object({
    type: z.literal("ask:done"),
    id: z.string(),
    transcript: z.string()
  }),
  // A request failed; the daemon uses the id "unknown" when the request could not be validated.
  z.object({ type: z.literal("error"), id: z.string(), message: z.string() }),
  // Reply to "ping": milliseconds since the daemon started and the number of commands counted so far.
  z.object({
    type: z.literal("pong"),
    uptime: z.number(),
    commandCount: z.number()
  }),
  // A trace event emitted while the request with this `id` is running.
  z.object({ type: z.literal("log"), id: z.string(), entry: TraceEntry })
]);
/**
 * Serialize a message into one wire frame.
 *
 * Frame layout: a 4-byte big-endian unsigned length, followed by that many
 * bytes of UTF-8 JSON. The JSON text is terminated by a newline, and the
 * newline is counted in the length.
 *
 * @param {*} msg Value to serialize with JSON.stringify.
 * @returns {Buffer} The complete frame (header + payload).
 */
function encodeMessage(msg) {
  const body = Buffer.from(JSON.stringify(msg) + "\n", "utf-8");
  const frame = Buffer.alloc(4 + body.length);
  frame.writeUInt32BE(body.length, 0);
  body.copy(frame, 4);
  return frame;
}
/**
 * Create an incremental decoder for the length-prefixed frames produced by
 * encodeMessage (4-byte big-endian length, then that many bytes of UTF-8 JSON).
 *
 * The returned function accepts arbitrary stream chunks: frames may be split
 * across chunks or several frames may arrive in one chunk.
 *
 * A frame whose payload is not valid JSON no longer throws out of the stream
 * `data` handler (which would be an uncaught exception and would also leave
 * later frames of the same chunk unprocessed). Instead the frame is skipped,
 * `onError` is told about it when provided, and decoding continues with the
 * next frame. Exceptions thrown by `onMessage` itself still propagate.
 *
 * @param {(msg: *) => void} onMessage Called once per decoded frame, in order.
 * @param {(err: Error) => void} [onError] Called once per undecodable frame.
 * @returns {(chunk: Buffer) => void} Feed function for incoming data.
 */
function createMessageParser(onMessage, onError) {
  let buffer = Buffer.alloc(0);
  return (chunk) => {
    buffer = Buffer.concat([buffer, chunk]);
    while (buffer.length >= 4) {
      const length = buffer.readUInt32BE(0);
      // Wait for more data until the whole frame has arrived.
      if (buffer.length < 4 + length) break;
      const payload = buffer.subarray(4, 4 + length).toString("utf-8");
      // Consume the frame before decoding so a bad payload cannot be re-read.
      buffer = buffer.subarray(4 + length);
      let message;
      try {
        message = JSON.parse(payload);
      } catch (err) {
        if (typeof onError === "function") onError(err);
        continue;
      }
      onMessage(message);
    }
  };
}
135
+
136
+ // src/daemon-lifecycle.ts
/**
 * Record the daemon's process id in DAEMON_PID_PATH, creating the containing
 * directory first if needed. The file content is the pid followed by a newline.
 *
 * @param {number} pid Process id to store.
 */
function writeDaemonPid(pid) {
  const pidDir = dirname(DAEMON_PID_PATH);
  mkdirSync2(pidDir, { recursive: true });
  const contents = `${pid}\n`;
  writeFileSync2(DAEMON_PID_PATH, contents);
}
/**
 * Delete the pid file. Any failure (for example, the file is already gone)
 * is deliberately ignored.
 */
function removeDaemonPid() {
  try {
    rmSync(DAEMON_PID_PATH);
  } catch {
    // Best effort: nothing useful can be done if removal fails.
  }
}
148
+
149
+ // src/daemon-log.ts
150
+ import {
151
+ appendFileSync,
152
+ mkdirSync as mkdirSync3,
153
+ readdirSync,
154
+ rmSync as rmSync2,
155
+ writeFileSync as writeFileSync3
156
+ } from "fs";
157
+ import { join as join3 } from "path";
/** Create LOG_DIR (and any missing parents); a no-op when it already exists. */
function ensureLogDir() {
  const options = { recursive: true };
  mkdirSync3(LOG_DIR, options);
}
/** Create AUDIO_LOG_DIR (and any missing parents); a no-op when it already exists. */
function ensureAudioDir() {
  const options = { recursive: true };
  mkdirSync3(AUDIO_LOG_DIR, options);
}
/**
 * Append one entry to the events log as a single JSON line.
 * Does nothing unless debug logging is enabled.
 *
 * @param {*} entry Value serialized with JSON.stringify.
 */
function appendLogEntry(entry) {
  if (isDebugEnabled()) {
    ensureLogDir();
    const line = JSON.stringify(entry) + "\n";
    appendFileSync(EVENTS_LOG_PATH, line);
  }
}
/**
 * Build a logger bound to one command invocation.
 *
 * Every record written carries an ISO timestamp plus the given `cmd` and `id`.
 *
 * @param {string} cmd Command name stored on each record.
 * @param {string} id Request id stored on each record.
 * @returns {{
 *   log(event: string, detail?: object): void,
 *   trace(event: { event: string, atMs: number, detail?: object }): void,
 *   readonly startMs: number
 * }} `log` writes an event as given; `trace` writes a trace entry with its
 *   `atMs` folded into the detail; `startMs` is the creation time (Date.now()).
 */
function createCommandLogger(cmd, id) {
  const createdAt = Date.now();
  const write = (event, detail) => {
    appendLogEntry({
      ts: new Date().toISOString(),
      cmd,
      id,
      event,
      detail
    });
  };
  return {
    log(event, detail) {
      write(event, detail);
    },
    trace(event) {
      write(event.event, { ...event.detail, atMs: event.atMs });
    },
    get startMs() {
      return createdAt;
    }
  };
}
/**
 * Wrap raw PCM bytes in a 44-byte RIFF/WAVE header.
 *
 * The format fields come from the shared SAMPLE_RATE, CHANNELS and BIT_DEPTH
 * constants; the audio format tag is 1 (PCM).
 *
 * @param {Buffer} pcm16 Raw sample data; its byte length becomes the data chunk size.
 * @returns {Buffer} Header followed by the sample data.
 */
function createWavBuffer(pcm16) {
  const bytesPerSample = BIT_DEPTH / 8;
  const dataBytes = pcm16.length;

  const wavHeader = Buffer.alloc(44);
  // RIFF chunk descriptor
  wavHeader.write("RIFF", 0);
  wavHeader.writeUInt32LE(36 + dataBytes, 4); // bytes remaining after this field
  wavHeader.write("WAVE", 8);
  // "fmt " sub-chunk
  wavHeader.write("fmt ", 12);
  wavHeader.writeUInt32LE(16, 16); // size of the fmt sub-chunk body
  wavHeader.writeUInt16LE(1, 20); // audio format: PCM
  wavHeader.writeUInt16LE(CHANNELS, 22);
  wavHeader.writeUInt32LE(SAMPLE_RATE, 24);
  wavHeader.writeUInt32LE(SAMPLE_RATE * CHANNELS * bytesPerSample, 28); // byte rate
  wavHeader.writeUInt16LE(CHANNELS * bytesPerSample, 32); // block align
  wavHeader.writeUInt16LE(BIT_DEPTH, 34);
  // "data" sub-chunk
  wavHeader.write("data", 36);
  wavHeader.writeUInt32LE(dataBytes, 40);

  return Buffer.concat([wavHeader, pcm16]);
}
/**
 * Persist captured audio streams as WAV files for debugging.
 *
 * Does nothing (and returns `[]`) unless audio debugging is enabled. Each
 * non-empty stream is written to `<AUDIO_LOG_DIR>/<id>-<name>.wav`, after
 * which the ring buffer limit is enforced.
 *
 * `id` originates from the client request, so path separators and NUL bytes
 * in it are replaced with `_` before it is used in a file name. Without this
 * an id such as "../../x" would write outside the audio directory.
 *
 * @param {string} id Request id used as the file name prefix.
 * @param {Object<string, Buffer[]>} streams Stream name -> captured chunks.
 * @returns {string[]} Paths of the files written.
 */
function writeAudioCapture(id, streams) {
  if (!isDebugAudioEnabled()) return [];
  ensureAudioDir();
  const safeId = String(id).replace(/[\\/\0]/g, "_");
  const written = [];
  for (const [name, chunks] of Object.entries(streams)) {
    if (chunks.length === 0) continue;
    const path = join3(AUDIO_LOG_DIR, `${safeId}-${name}.wav`);
    writeFileSync3(path, createWavBuffer(Buffer.concat(chunks)));
    written.push(path);
  }
  enforceRingBuffer();
  return written;
}
/**
 * Trim the audio capture directory to the configured number of commands.
 *
 * Files named `<id>-assistant.wav`, `<id>-mic.wav` or `<id>-model-input.wav`
 * are grouped by `<id>`. When there are more distinct ids than
 * `audioRingBufferSize`, all files of the ids that sort first
 * (lexicographically) are deleted. Deletion failures and an unreadable
 * directory are ignored.
 *
 * NOTE(review): eviction order is the string sort order of the ids; this is
 * "oldest first" only if ids sort chronologically — confirm against the id
 * generator.
 */
function enforceRingBuffer() {
  const limit = resolveDaemonConfig().audioRingBufferSize;

  let wavNames;
  try {
    wavNames = readdirSync(AUDIO_LOG_DIR)
      .filter((entry) => entry.endsWith(".wav"))
      .sort();
  } catch {
    return;
  }

  // Map each recognised capture file to the command id it belongs to.
  const capturePattern = /^(.+)-(?:assistant|mic|model-input)\.wav$/;
  const idByFile = new Map();
  for (const fileName of wavNames) {
    const parts = capturePattern.exec(fileName);
    if (parts) idByFile.set(fileName, parts[1]);
  }

  const orderedIds = [...new Set(idByFile.values())].sort();
  const surplus = orderedIds.length - limit;
  if (surplus <= 0) return;

  const doomedIds = new Set(orderedIds.slice(0, surplus));
  for (const [fileName, commandId] of idByFile) {
    if (!doomedIds.has(commandId)) continue;
    try {
      rmSync2(join3(AUDIO_LOG_DIR, fileName));
    } catch {
      // Best effort: a file that cannot be removed is simply left behind.
    }
  }
}
257
+
258
+ // src/daemon.ts
// CommonJS-style `require` for this ES module; used to load "agent-voice-audio".
var require2 = createRequire(import.meta.url);
// The started audio engine and the mode it was created for ({ engine, mode }),
// or null when no engine exists.
var engineState = null;
// Number of commands executed, excluding "ping" and "shutdown"; reported in "pong".
var commandCount = 0;
// Timestamp (Date.now()) taken when this module was loaded; basis for the "pong" uptime.
var startedAt = Date.now();
// Handle of the pending idle-shutdown timer, or null before the first one is armed.
var idleTimer = null;
/**
 * (Re)arm the idle timer: after `idleTimeoutMinutes` without a reset the
 * daemon shuts itself down. The timeout is re-read from the configuration on
 * every call.
 *
 * Node treats a delay above 2^31-1 ms, or a NaN delay, as 1 ms. A large or
 * non-numeric configured value would therefore shut the daemon down almost
 * immediately. To avoid that, the delay is capped at the largest value Node
 * honours (about 24.8 days), and a value that is not a number falls back to
 * the built-in default.
 */
function resetIdleTimer() {
  if (idleTimer) clearTimeout(idleTimer);
  const { idleTimeoutMinutes } = resolveDaemonConfig();
  const MAX_TIMER_DELAY_MS = 2147483647;
  const requestedMs = Number(idleTimeoutMinutes) * 60 * 1e3;
  const delayMs = Number.isNaN(requestedMs)
    ? DAEMON_DEFAULTS.idleTimeoutMinutes * 60 * 1e3
    : Math.min(requestedMs, MAX_TIMER_DELAY_MS);
  idleTimer = setTimeout(() => {
    shutdown();
  }, delayMs);
}
/**
 * Return a started audio engine for the requested mode.
 *
 * The engine from the previous command is reused when it was created for the
 * same mode. Otherwise it is stopped and closed (errors ignored) and a new
 * one is created and started. Echo cancellation and a 30 ms stream delay are
 * enabled only in "ask" mode.
 *
 * @param {"say"|"ask"} mode Mode the engine is needed for.
 * @returns {object} The AudioEngine instance.
 */
function getOrCreateEngine(mode) {
  if (engineState !== null) {
    if (engineState.mode === mode) return engineState.engine;
    // Mode changed: tear the old engine down before building a new one.
    try {
      engineState.engine.stop();
      engineState.engine.close();
    } catch {
      // Best effort: a failing teardown must not block the new engine.
    }
    engineState = null;
  }

  const isAsk = mode === "ask";
  const { AudioEngine } = require2("agent-voice-audio");
  const engine = new AudioEngine({
    sampleRate: SAMPLE_RATE,
    channels: 1,
    enableAec: isAsk,
    streamDelayMs: isAsk ? 30 : void 0
  });
  engine.start();
  engineState = { engine, mode };
  return engine;
}
/**
 * Wrap a shared engine so that a command can use it without controlling its
 * lifecycle: `start`, `stop` and `close` on the wrapper do nothing, while the
 * audio methods are the engine's own methods bound to the engine.
 *
 * @param {object} engine Engine providing play, readProcessedCapture,
 *   readRawCapture, setStreamDelayMs and getStats.
 * @returns {object} The wrapper object.
 */
function createEngineProxy(engine) {
  const forwarded = [
    "play",
    "readProcessedCapture",
    "readRawCapture",
    "setStreamDelayMs",
    "getStats"
  ];
  return {
    // Lifecycle calls are swallowed; the daemon owns the real engine.
    start() {},
    stop() {},
    close() {},
    ...Object.fromEntries(
      forwarded.map((method) => [method, engine[method].bind(engine)])
    )
  };
}
// Pending commands as { request, socket }, executed strictly one at a time.
var commandQueue = [];
// True while a processQueue() loop is draining the queue.
var processing = false;
/**
 * Drain the command queue sequentially. Re-entrant calls return immediately
 * while a drain is already running; the running loop picks up newly queued
 * items.
 *
 * A command that throws no longer leaves `processing` stuck at true (which
 * would stall every later command) and no longer rejects this un-awaited
 * promise. The failure is reported to the requesting client and the loop
 * moves on to the next item.
 *
 * @returns {Promise<void>}
 */
async function processQueue() {
  if (processing) return;
  processing = true;
  try {
    while (commandQueue.length > 0) {
      const item = commandQueue.shift();
      if (!item) break;
      try {
        await executeCommand(item.request, item.socket);
      } catch (err) {
        const message = err instanceof Error ? err.message : String(err);
        try {
          send(item.socket, {
            type: "error",
            id: item.request.id ?? "unknown",
            message
          });
        } catch {
          // The client is unreachable; nothing more can be reported.
        }
      }
    }
  } finally {
    processing = false;
  }
}
/**
 * Write one framed message to a client, unless its socket is already destroyed.
 *
 * @param {object} socket Client connection (exposes `destroyed` and `write`).
 * @param {object} msg Message to encode with encodeMessage.
 */
function send(socket, msg) {
  if (socket.destroyed) return;
  socket.write(encodeMessage(msg));
}
/**
 * Execute one validated request.
 *
 * "ping" is answered with uptime and command count; "shutdown" stops the
 * daemon. Neither counts as a command nor touches the idle timer. Anything
 * else bumps the command counter, re-arms the idle timer and is dispatched
 * to the say/ask handler.
 *
 * @param {object} request Parsed request.
 * @param {object} socket Connection the request arrived on.
 * @returns {Promise<void>}
 */
async function executeCommand(request, socket) {
  switch (request.type) {
    case "ping":
      send(socket, {
        type: "pong",
        uptime: Date.now() - startedAt,
        commandCount
      });
      return;
    case "shutdown":
      shutdown();
      return;
    default:
      break;
  }

  commandCount++;
  resetIdleTimer();

  switch (request.type) {
    case "say":
      await executeSay(request, socket);
      break;
    case "ask":
      await executeAsk(request, socket);
      break;
    default:
      break;
  }
}
/**
 * Run a "say" request and answer the client with "say:done" or "error".
 *
 * Trace events are logged and streamed to the client as "log" messages while
 * the command runs. Assistant audio is collected and, if audio debugging is
 * on, written to disk whether the command succeeded or failed.
 *
 * Logging and audio capture are diagnostics only, so they are wrapped so
 * that a failure in them (e.g. an unwritable log directory) cannot skip the
 * reply or reject this function. Before, such a failure in the error path
 * left the client waiting forever.
 *
 * @param {object} request Validated "say" request (id, message, voice).
 * @param {object} socket Connection to reply on.
 * @returns {Promise<void>}
 */
async function executeSay(request, socket) {
  const logger = createCommandLogger("say", request.id);
  const assistantChunks = [];
  // Run a diagnostic side effect; its failure must never affect the command.
  const bestEffort = (fn) => {
    try {
      fn();
    } catch {
      // Intentionally ignored: diagnostics are optional.
    }
  };
  let reply;
  try {
    const engine = getOrCreateEngine("say");
    const proxy = createEngineProxy(engine);
    const auth = resolveAuth();
    const { say } = await import("./say-6EJTKNJJ.js");
    await say(request.message, {
      voice: request.voice,
      auth,
      // The command gets a proxy so it cannot stop the shared engine.
      createAudioEngine: () => proxy,
      onAssistantAudio(pcm16) {
        assistantChunks.push(Buffer.from(pcm16));
      },
      onTrace(event) {
        bestEffort(() => logger.trace(event));
        send(socket, { type: "log", id: request.id, entry: event });
      }
    });
    bestEffort(() => logger.log("done"));
    reply = { type: "say:done", id: request.id };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    bestEffort(() => logger.log("error", { message }));
    reply = { type: "error", id: request.id, message };
  }
  bestEffort(() => writeAudioCapture(request.id, { assistant: assistantChunks }));
  send(socket, reply);
}
/**
 * Run an "ask" request and answer the client with "ask:done" (carrying the
 * transcript) or "error".
 *
 * Trace events are logged and streamed to the client as "log" messages while
 * the command runs. Three audio streams are collected from the callbacks —
 * assistant audio, microphone audio and the frames reported as sent — and,
 * if audio debugging is on, written to disk whether the command succeeded
 * or failed.
 *
 * Logging and audio capture are diagnostics only, so they are wrapped so
 * that a failure in them (e.g. an unwritable log directory) cannot skip the
 * reply or reject this function. Before, such a failure in the error path
 * left the client waiting forever.
 *
 * @param {object} request Validated "ask" request (id, message, voice, timeout, ack).
 * @param {object} socket Connection to reply on.
 * @returns {Promise<void>}
 */
async function executeAsk(request, socket) {
  const logger = createCommandLogger("ask", request.id);
  const assistantChunks = [];
  const micChunks = [];
  const modelInputChunks = [];
  // Run a diagnostic side effect; its failure must never affect the command.
  const bestEffort = (fn) => {
    try {
      fn();
    } catch {
      // Intentionally ignored: diagnostics are optional.
    }
  };
  let reply;
  try {
    const engine = getOrCreateEngine("ask");
    const proxy = createEngineProxy(engine);
    const auth = resolveAuth();
    const { ask } = await import("./ask-5J4JCHM4.js");
    const transcript = await ask(request.message, {
      voice: request.voice,
      timeout: request.timeout,
      ack: request.ack,
      auth,
      // The command gets a proxy so it cannot stop the shared engine.
      createAudioEngine: () => proxy,
      onAssistantAudio(pcm16) {
        assistantChunks.push(Buffer.from(pcm16));
      },
      onMicAudio(pcm16) {
        micChunks.push(Buffer.from(pcm16));
      },
      onAudioFrameSent(pcm16) {
        modelInputChunks.push(Buffer.from(pcm16));
      },
      onTrace(event) {
        bestEffort(() => logger.trace(event));
        send(socket, { type: "log", id: request.id, entry: event });
      }
    });
    bestEffort(() => logger.log("done", { transcript }));
    reply = { type: "ask:done", id: request.id, transcript };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    bestEffort(() => logger.log("error", { message }));
    reply = { type: "error", id: request.id, message };
  }
  bestEffort(() =>
    writeAudioCapture(request.id, {
      assistant: assistantChunks,
      mic: micChunks,
      "model-input": modelInputChunks
    })
  );
  send(socket, reply);
}
/**
 * Socket server: decodes framed requests from each client, validates them
 * against DaemonRequest and queues valid ones for sequential execution.
 * Invalid requests are answered with an "error" response whose id is "unknown".
 *
 * A frame that cannot be decoded (for example, a payload that is not JSON)
 * is also answered with an "error" response. Before, the exception escaped
 * the `data` handler uncaught and terminated the daemon. Both routes are
 * covered: the error callback handed to the parser, and a try/catch around
 * the parser call for a parser that throws instead.
 */
var server = createServer((socket) => {
  const rejectRequest = (reason) => {
    send(socket, {
      type: "error",
      id: "unknown",
      message: `Invalid request: ${reason}`
    });
  };
  const describe = (err) => (err instanceof Error ? err.message : String(err));
  const parse = createMessageParser(
    (msg) => {
      const result = DaemonRequest.safeParse(msg);
      if (!result.success) {
        rejectRequest(result.error.message);
        return;
      }
      commandQueue.push({ request: result.data, socket });
      // Fire and forget: the queue drains in the background.
      void processQueue();
    },
    (err) => rejectRequest(describe(err))
  );
  socket.on("data", (chunk) => {
    try {
      parse(chunk);
    } catch (err) {
      rejectRequest(describe(err));
    }
  });
  // Swallow connection errors (e.g. client went away) so they are not fatal.
  socket.on("error", () => {
  });
});
/**
 * Stop the daemon: cancel the idle timer, stop accepting connections, tear
 * down the audio engine, remove the pid file and the socket file, then exit
 * the process with status 0. Teardown and removal failures are ignored.
 */
function shutdown() {
  if (idleTimer) clearTimeout(idleTimer);
  server.close();

  const active = engineState;
  if (active) {
    try {
      active.engine.stop();
      active.engine.close();
    } catch {
      // Best effort: the process is about to exit anyway.
    }
    engineState = null;
  }

  removeDaemonPid();
  try {
    rmSync3(DAEMON_SOCKET_PATH);
  } catch {
    // Best effort: the socket file may already be gone.
  }
  process.exit(0);
}
// Daemon entry point.
// Remove a socket file left at the path (removal errors ignored) so that
// listen() can bind it.
try {
  rmSync3(DAEMON_SOCKET_PATH);
} catch {
  // Nothing to clean up.
}
// Once listening: publish the pid and arm the idle-shutdown timer.
server.listen(DAEMON_SOCKET_PATH, () => {
  writeDaemonPid(process.pid);
  resetIdleTimer();
});
// Termination signals go through the same cleanup path as a "shutdown" request.
for (const signal of ["SIGTERM", "SIGINT"]) {
  process.on(signal, shutdown);
}
package/dist/index.js CHANGED
@@ -197,6 +197,7 @@ async function ask(message, options = {}) {
197
197
  let lastAssistantAudioAt = 0;
198
198
  let nearEndEvidenceSeen = false;
199
199
  let nearEndEvidenceAtMs = 0;
200
+ let nearEndEvidenceConfirmed = false;
200
201
  let cleaned = false;
201
202
  let settled = false;
202
203
  async function cleanup() {
@@ -265,6 +266,19 @@ async function ask(message, options = {}) {
265
266
  if (rms >= minSpeechRms) {
266
267
  nearEndEvidenceSeen = true;
267
268
  nearEndEvidenceAtMs = Date.now();
269
+ if (!nearEndEvidenceConfirmed && speechStartedAtMs > 0) {
270
+ const evidencePreRollMs = readEnvInt(
271
+ "AGENT_VOICE_SPEECH_EVIDENCE_PREROLL_MS",
272
+ 200
273
+ );
274
+ const evidencePostRollMs = readEnvInt(
275
+ "AGENT_VOICE_SPEECH_EVIDENCE_POSTROLL_MS",
276
+ 1500
277
+ );
278
+ if (nearEndEvidenceAtMs >= speechStartedAtMs - evidencePreRollMs && nearEndEvidenceAtMs <= speechStartedAtMs + evidencePostRollMs) {
279
+ nearEndEvidenceConfirmed = true;
280
+ }
281
+ }
268
282
  trace("audio:near_end_evidence", { rms, minSpeechRms });
269
283
  }
270
284
  onAudioFrameSent?.(frame);
@@ -303,29 +317,14 @@ async function ask(message, options = {}) {
303
317
  }
304
318
  logEvent("realtime:transcript", `text="${text}"`);
305
319
  trace("realtime:transcript", { text });
306
- if (speechDetected) {
307
- const evidencePreRollMs = readEnvInt(
308
- "AGENT_VOICE_SPEECH_EVIDENCE_PREROLL_MS",
309
- 200
310
- );
311
- const evidencePostRollMs = readEnvInt(
312
- "AGENT_VOICE_SPEECH_EVIDENCE_POSTROLL_MS",
313
- 1500
314
- );
315
- const evidenceEarliestMs = speechStartedAtMs - evidencePreRollMs;
316
- const evidenceLatestMs = speechStartedAtMs + evidencePostRollMs;
317
- const hasTimelyNearEndEvidence = nearEndEvidenceSeen && nearEndEvidenceAtMs >= evidenceEarliestMs && nearEndEvidenceAtMs <= evidenceLatestMs;
318
- if (!hasTimelyNearEndEvidence) {
319
- trace("realtime:transcript_ignored_no_near_end_evidence", {
320
- text,
321
- speechStartedAtMs,
322
- nearEndEvidenceSeen,
323
- nearEndEvidenceAtMs,
324
- evidenceEarliestMs,
325
- evidenceLatestMs
326
- });
327
- return;
328
- }
320
+ if (speechDetected && !nearEndEvidenceConfirmed) {
321
+ trace("realtime:transcript_ignored_no_near_end_evidence", {
322
+ text,
323
+ speechStartedAtMs,
324
+ nearEndEvidenceSeen,
325
+ nearEndEvidenceAtMs
326
+ });
327
+ return;
329
328
  }
330
329
  if (transcriptTimer) {
331
330
  clearTimeout(transcriptTimer);
@@ -339,6 +338,15 @@ async function ask(message, options = {}) {
339
338
  trace("realtime:speech_started");
340
339
  speechDetected = true;
341
340
  speechStartedAtMs = Date.now();
341
+ if (nearEndEvidenceSeen && !nearEndEvidenceConfirmed) {
342
+ const evidencePreRollMs = readEnvInt(
343
+ "AGENT_VOICE_SPEECH_EVIDENCE_PREROLL_MS",
344
+ 200
345
+ );
346
+ if (nearEndEvidenceAtMs >= speechStartedAtMs - evidencePreRollMs) {
347
+ nearEndEvidenceConfirmed = true;
348
+ }
349
+ }
342
350
  if (timeoutTimer) {
343
351
  clearTimeout(timeoutTimer);
344
352
  timeoutTimer = null;
@@ -424,6 +432,11 @@ import { homedir } from "os";
424
432
  import { join } from "path";
425
433
  var CONFIG_DIR = join(homedir(), ".agent-voice");
426
434
  var CONFIG_PATH = join(CONFIG_DIR, "config.json");
435
+ var DAEMON_SOCKET_PATH = join(CONFIG_DIR, "daemon.sock");
436
+ var DAEMON_PID_PATH = join(CONFIG_DIR, "daemon.pid");
437
+ var LOG_DIR = join(CONFIG_DIR, "logs");
438
+ var AUDIO_LOG_DIR = join(LOG_DIR, "audio");
439
+ var EVENTS_LOG_PATH = join(LOG_DIR, "events.ndjson");
427
440
  function readConfig() {
428
441
  try {
429
442
  return JSON.parse(readFileSync(CONFIG_PATH, "utf-8"));