agent-voice 0.2.3 → 0.2.4

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -127,7 +127,19 @@ async function ask(message, options = {}) {
127
127
  if (!heardAssistantAudio) return;
128
128
  for (const frame of processedFrames) {
129
129
  const rms = pcm16Rms(frame);
130
- const minSpeechRms = readEnvInt("AGENT_VOICE_MIN_SPEECH_RMS", 550);
130
+ const configuredMinSpeechRms = readEnvInt(
131
+ "AGENT_VOICE_MIN_SPEECH_RMS",
132
+ 220
133
+ );
134
+ const relaxAfterMs = readEnvInt(
135
+ "AGENT_VOICE_MIN_SPEECH_RMS_RELAX_AFTER_MS",
136
+ 500
137
+ );
138
+ const relaxedMinSpeechRms = readEnvInt(
139
+ "AGENT_VOICE_MIN_SPEECH_RMS_RELAXED",
140
+ 120
141
+ );
142
+ const minSpeechRms = speechDetected && speechStartedAtMs > 0 && Date.now() - speechStartedAtMs >= relaxAfterMs ? relaxedMinSpeechRms : configuredMinSpeechRms;
131
143
  if (rms >= minSpeechRms) {
132
144
  nearEndEvidenceSeen = true;
133
145
  nearEndEvidenceAtMs = Date.now();
@@ -170,19 +182,25 @@ async function ask(message, options = {}) {
170
182
  logEvent("realtime:transcript", `text="${text}"`);
171
183
  trace("realtime:transcript", { text });
172
184
  if (speechDetected) {
173
- const evidenceWindowMs = readEnvInt(
174
- "AGENT_VOICE_SPEECH_EVIDENCE_WINDOW_MS",
175
- 1200
185
+ const evidencePreRollMs = readEnvInt(
186
+ "AGENT_VOICE_SPEECH_EVIDENCE_PREROLL_MS",
187
+ 200
188
+ );
189
+ const evidencePostRollMs = readEnvInt(
190
+ "AGENT_VOICE_SPEECH_EVIDENCE_POSTROLL_MS",
191
+ 1500
176
192
  );
177
- const evidenceAgeMs = nearEndEvidenceSeen ? Math.abs(nearEndEvidenceAtMs - speechStartedAtMs) : Number.POSITIVE_INFINITY;
178
- if (!nearEndEvidenceSeen || evidenceAgeMs > evidenceWindowMs) {
193
+ const evidenceEarliestMs = speechStartedAtMs - evidencePreRollMs;
194
+ const evidenceLatestMs = speechStartedAtMs + evidencePostRollMs;
195
+ const hasTimelyNearEndEvidence = nearEndEvidenceSeen && nearEndEvidenceAtMs >= evidenceEarliestMs && nearEndEvidenceAtMs <= evidenceLatestMs;
196
+ if (!hasTimelyNearEndEvidence) {
179
197
  trace("realtime:transcript_ignored_no_near_end_evidence", {
180
198
  text,
181
199
  speechStartedAtMs,
182
200
  nearEndEvidenceSeen,
183
201
  nearEndEvidenceAtMs,
184
- evidenceAgeMs,
185
- evidenceWindowMs
202
+ evidenceEarliestMs,
203
+ evidenceLatestMs
186
204
  });
187
205
  return;
188
206
  }
package/dist/cli.js CHANGED
@@ -28,8 +28,8 @@ async function withSuppressedNativeOutput() {
28
28
  openSync("/dev/null", "w");
29
29
  closeSync(2);
30
30
  openSync("/dev/null", "w");
31
- const { ask } = await import("./ask-OIE6HL2H.js");
32
- const { say } = await import("./say-ZVF6EX52.js");
31
+ const { ask } = await import("./ask-KM3JPI36.js");
32
+ const { say } = await import("./say-OEQQFOCC.js");
33
33
  function writeResult(text) {
34
34
  writeSync(savedStdout, `${text}
35
35
  `);
@@ -187,14 +187,50 @@ ${files.modelInputFile}`
187
187
  process.exit(1);
188
188
  }
189
189
  });
190
- program.command("say").description("Speak a message without listening for a response").option("-m, --message <text>", "Text message to speak").option("--voice <name>", "OpenAI voice", defaultVoice).action(async (opts) => {
190
+ program.command("say").description("Speak a message without listening for a response").option("-m, --message <text>", "Text message to speak").option("--voice <name>", "OpenAI voice", defaultVoice).option(
191
+ "--debug-audio-dir <dir>",
192
+ "Write say audio debug WAV to this directory"
193
+ ).action(async (opts) => {
191
194
  const { say, writeError } = await withSuppressedNativeOutput();
195
+ const assistantChunks = [];
192
196
  try {
193
197
  const auth = resolveAuth();
194
198
  const message = await getMessage(opts.message);
195
- await say(message, { voice: opts.voice, auth });
199
+ await say(message, {
200
+ voice: opts.voice,
201
+ auth,
202
+ onAssistantAudio: opts.debugAudioDir ? (pcm16) => assistantChunks.push(Buffer.from(pcm16)) : void 0
203
+ });
204
+ if (opts.debugAudioDir) {
205
+ mkdirSync(opts.debugAudioDir, { recursive: true });
206
+ const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
207
+ const file = join(
208
+ opts.debugAudioDir,
209
+ `say-${stamp}-assistant-output.wav`
210
+ );
211
+ writeFileSync(file, createWavBuffer(Buffer.concat(assistantChunks)));
212
+ writeError(`debug audio written:
213
+ ${file}`);
214
+ }
196
215
  process.exit(0);
197
216
  } catch (err) {
217
+ if (opts.debugAudioDir && assistantChunks.length > 0) {
218
+ try {
219
+ mkdirSync(opts.debugAudioDir, { recursive: true });
220
+ const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
221
+ const file = join(
222
+ opts.debugAudioDir,
223
+ `say-${stamp}-assistant-output.wav`
224
+ );
225
+ writeFileSync(
226
+ file,
227
+ createWavBuffer(Buffer.concat(assistantChunks))
228
+ );
229
+ writeError(`debug audio written:
230
+ ${file}`);
231
+ } catch {
232
+ }
233
+ }
198
234
  writeError(`${err instanceof Error ? err.message : err}`);
199
235
  process.exit(1);
200
236
  }
package/dist/index.d.ts CHANGED
@@ -91,6 +91,7 @@ type SayOptions = {
91
91
  enableAec?: boolean;
92
92
  streamDelayMs?: number;
93
93
  }) => RustAudioEngine;
94
+ onAssistantAudio?: (pcm16: Buffer) => void;
94
95
  onTrace?: (event: {
95
96
  atMs: number;
96
97
  event: string;
package/dist/index.js CHANGED
@@ -249,7 +249,19 @@ async function ask(message, options = {}) {
249
249
  if (!heardAssistantAudio) return;
250
250
  for (const frame of processedFrames) {
251
251
  const rms = pcm16Rms(frame);
252
- const minSpeechRms = readEnvInt("AGENT_VOICE_MIN_SPEECH_RMS", 550);
252
+ const configuredMinSpeechRms = readEnvInt(
253
+ "AGENT_VOICE_MIN_SPEECH_RMS",
254
+ 220
255
+ );
256
+ const relaxAfterMs = readEnvInt(
257
+ "AGENT_VOICE_MIN_SPEECH_RMS_RELAX_AFTER_MS",
258
+ 500
259
+ );
260
+ const relaxedMinSpeechRms = readEnvInt(
261
+ "AGENT_VOICE_MIN_SPEECH_RMS_RELAXED",
262
+ 120
263
+ );
264
+ const minSpeechRms = speechDetected && speechStartedAtMs > 0 && Date.now() - speechStartedAtMs >= relaxAfterMs ? relaxedMinSpeechRms : configuredMinSpeechRms;
253
265
  if (rms >= minSpeechRms) {
254
266
  nearEndEvidenceSeen = true;
255
267
  nearEndEvidenceAtMs = Date.now();
@@ -292,19 +304,25 @@ async function ask(message, options = {}) {
292
304
  logEvent("realtime:transcript", `text="${text}"`);
293
305
  trace("realtime:transcript", { text });
294
306
  if (speechDetected) {
295
- const evidenceWindowMs = readEnvInt(
296
- "AGENT_VOICE_SPEECH_EVIDENCE_WINDOW_MS",
297
- 1200
307
+ const evidencePreRollMs = readEnvInt(
308
+ "AGENT_VOICE_SPEECH_EVIDENCE_PREROLL_MS",
309
+ 200
310
+ );
311
+ const evidencePostRollMs = readEnvInt(
312
+ "AGENT_VOICE_SPEECH_EVIDENCE_POSTROLL_MS",
313
+ 1500
298
314
  );
299
- const evidenceAgeMs = nearEndEvidenceSeen ? Math.abs(nearEndEvidenceAtMs - speechStartedAtMs) : Number.POSITIVE_INFINITY;
300
- if (!nearEndEvidenceSeen || evidenceAgeMs > evidenceWindowMs) {
315
+ const evidenceEarliestMs = speechStartedAtMs - evidencePreRollMs;
316
+ const evidenceLatestMs = speechStartedAtMs + evidencePostRollMs;
317
+ const hasTimelyNearEndEvidence = nearEndEvidenceSeen && nearEndEvidenceAtMs >= evidenceEarliestMs && nearEndEvidenceAtMs <= evidenceLatestMs;
318
+ if (!hasTimelyNearEndEvidence) {
301
319
  trace("realtime:transcript_ignored_no_near_end_evidence", {
302
320
  text,
303
321
  speechStartedAtMs,
304
322
  nearEndEvidenceSeen,
305
323
  nearEndEvidenceAtMs,
306
- evidenceAgeMs,
307
- evidenceWindowMs
324
+ evidenceEarliestMs,
325
+ evidenceLatestMs
308
326
  });
309
327
  return;
310
328
  }
@@ -439,6 +457,7 @@ async function say(message, options = {}) {
439
457
  auth,
440
458
  createSession,
441
459
  createAudioEngine,
460
+ onAssistantAudio,
442
461
  onTrace
443
462
  } = options;
444
463
  const { AudioEngine } = require3("agent-voice-audio");
@@ -460,6 +479,9 @@ async function say(message, options = {}) {
460
479
  let completionTailTimer = null;
461
480
  let drainPollTimer = null;
462
481
  let drainDeadlineTimer = null;
482
+ let playoutDeadlineTimer = null;
483
+ let firstAudioAtMs = 0;
484
+ let totalReceivedSamples = 0;
463
485
  function cleanup() {
464
486
  if (cleaned) return;
465
487
  cleaned = true;
@@ -467,6 +489,7 @@ async function say(message, options = {}) {
467
489
  if (completionTailTimer) clearTimeout(completionTailTimer);
468
490
  if (drainPollTimer) clearInterval(drainPollTimer);
469
491
  if (drainDeadlineTimer) clearTimeout(drainDeadlineTimer);
492
+ if (playoutDeadlineTimer) clearTimeout(playoutDeadlineTimer);
470
493
  try {
471
494
  engine.stop();
472
495
  engine.close();
@@ -487,6 +510,28 @@ async function say(message, options = {}) {
487
510
  cleanup();
488
511
  reject(error);
489
512
  }
513
+ function waitForWallClockPlayout() {
514
+ if (settled) return;
515
+ if (firstAudioAtMs <= 0 || totalReceivedSamples <= 0) {
516
+ resolveOnce();
517
+ return;
518
+ }
519
+ const expectedPlayoutMs = Math.ceil(
520
+ totalReceivedSamples / SAMPLE_RATE * 1e3
521
+ );
522
+ const playoutTailMs = 140;
523
+ const dueAtMs = firstAudioAtMs + expectedPlayoutMs + playoutTailMs;
524
+ const waitMs = Math.max(0, dueAtMs - Date.now());
525
+ trace("playout:wall_clock_wait", {
526
+ totalReceivedSamples,
527
+ expectedPlayoutMs,
528
+ playoutTailMs,
529
+ waitMs
530
+ });
531
+ playoutDeadlineTimer = setTimeout(() => {
532
+ resolveOnce();
533
+ }, waitMs);
534
+ }
490
535
  function waitForPlaybackDrain() {
491
536
  if (settled) return;
492
537
  if (!engine.getStats) {
@@ -520,7 +565,7 @@ async function say(message, options = {}) {
520
565
  if (pending <= 0) {
521
566
  zeroStreak += 1;
522
567
  if (zeroStreak >= 3) {
523
- resolveOnce();
568
+ waitForWallClockPlayout();
524
569
  }
525
570
  return;
526
571
  }
@@ -529,12 +574,12 @@ async function say(message, options = {}) {
529
574
  trace("drain:no_progress_timeout", {
530
575
  pendingPlaybackSamples: pending
531
576
  });
532
- resolveOnce();
577
+ waitForWallClockPlayout();
533
578
  }
534
579
  }, 20);
535
580
  drainDeadlineTimer = setTimeout(() => {
536
581
  trace("drain:deadline");
537
- resolveOnce();
582
+ waitForWallClockPlayout();
538
583
  }, absoluteDeadlineMs);
539
584
  }
540
585
  function scheduleTailResolve(delayMs) {
@@ -551,7 +596,12 @@ async function say(message, options = {}) {
551
596
  ack: false,
552
597
  auth,
553
598
  onAudioDelta(pcm16) {
599
+ if (firstAudioAtMs <= 0) {
600
+ firstAudioAtMs = Date.now();
601
+ }
602
+ totalReceivedSamples += Math.floor(pcm16.length / 2);
554
603
  engine.play(pcm16);
604
+ onAssistantAudio?.(pcm16);
555
605
  trace("realtime:audio_delta", { bytes: pcm16.length });
556
606
  },
557
607
  onAudioDone() {
@@ -16,6 +16,7 @@ async function say(message, options = {}) {
16
16
  auth,
17
17
  createSession,
18
18
  createAudioEngine,
19
+ onAssistantAudio,
19
20
  onTrace
20
21
  } = options;
21
22
  const { AudioEngine } = require2("agent-voice-audio");
@@ -37,6 +38,9 @@ async function say(message, options = {}) {
37
38
  let completionTailTimer = null;
38
39
  let drainPollTimer = null;
39
40
  let drainDeadlineTimer = null;
41
+ let playoutDeadlineTimer = null;
42
+ let firstAudioAtMs = 0;
43
+ let totalReceivedSamples = 0;
40
44
  function cleanup() {
41
45
  if (cleaned) return;
42
46
  cleaned = true;
@@ -44,6 +48,7 @@ async function say(message, options = {}) {
44
48
  if (completionTailTimer) clearTimeout(completionTailTimer);
45
49
  if (drainPollTimer) clearInterval(drainPollTimer);
46
50
  if (drainDeadlineTimer) clearTimeout(drainDeadlineTimer);
51
+ if (playoutDeadlineTimer) clearTimeout(playoutDeadlineTimer);
47
52
  try {
48
53
  engine.stop();
49
54
  engine.close();
@@ -64,6 +69,28 @@ async function say(message, options = {}) {
64
69
  cleanup();
65
70
  reject(error);
66
71
  }
72
+ function waitForWallClockPlayout() {
73
+ if (settled) return;
74
+ if (firstAudioAtMs <= 0 || totalReceivedSamples <= 0) {
75
+ resolveOnce();
76
+ return;
77
+ }
78
+ const expectedPlayoutMs = Math.ceil(
79
+ totalReceivedSamples / SAMPLE_RATE * 1e3
80
+ );
81
+ const playoutTailMs = 140;
82
+ const dueAtMs = firstAudioAtMs + expectedPlayoutMs + playoutTailMs;
83
+ const waitMs = Math.max(0, dueAtMs - Date.now());
84
+ trace("playout:wall_clock_wait", {
85
+ totalReceivedSamples,
86
+ expectedPlayoutMs,
87
+ playoutTailMs,
88
+ waitMs
89
+ });
90
+ playoutDeadlineTimer = setTimeout(() => {
91
+ resolveOnce();
92
+ }, waitMs);
93
+ }
67
94
  function waitForPlaybackDrain() {
68
95
  if (settled) return;
69
96
  if (!engine.getStats) {
@@ -97,7 +124,7 @@ async function say(message, options = {}) {
97
124
  if (pending <= 0) {
98
125
  zeroStreak += 1;
99
126
  if (zeroStreak >= 3) {
100
- resolveOnce();
127
+ waitForWallClockPlayout();
101
128
  }
102
129
  return;
103
130
  }
@@ -106,12 +133,12 @@ async function say(message, options = {}) {
106
133
  trace("drain:no_progress_timeout", {
107
134
  pendingPlaybackSamples: pending
108
135
  });
109
- resolveOnce();
136
+ waitForWallClockPlayout();
110
137
  }
111
138
  }, 20);
112
139
  drainDeadlineTimer = setTimeout(() => {
113
140
  trace("drain:deadline");
114
- resolveOnce();
141
+ waitForWallClockPlayout();
115
142
  }, absoluteDeadlineMs);
116
143
  }
117
144
  function scheduleTailResolve(delayMs) {
@@ -128,7 +155,12 @@ async function say(message, options = {}) {
128
155
  ack: false,
129
156
  auth,
130
157
  onAudioDelta(pcm16) {
158
+ if (firstAudioAtMs <= 0) {
159
+ firstAudioAtMs = Date.now();
160
+ }
161
+ totalReceivedSamples += Math.floor(pcm16.length / 2);
131
162
  engine.play(pcm16);
163
+ onAssistantAudio?.(pcm16);
132
164
  trace("realtime:audio_delta", { bytes: pcm16.length });
133
165
  },
134
166
  onAudioDone() {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-voice",
3
- "version": "0.2.3",
3
+ "version": "0.2.4",
4
4
  "description": "CLI for AI agents to interact with humans via voice",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",