agent-voice 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,22 +10,38 @@ import {
10
10
  // src/ask.ts
11
11
  import { createRequire } from "module";
12
12
  var require2 = createRequire(import.meta.url);
13
/**
 * Compute the root-mean-square amplitude of signed 16-bit little-endian PCM.
 *
 * Accepts a Node `Buffer` or any other byte view (`Uint8Array`, typed-array
 * view); the samples are read through a `DataView`, so the input does not need
 * Buffer-only methods such as `readInt16LE`.
 *
 * @param {Uint8Array} pcm16 - Raw PCM bytes. A trailing odd byte is ignored.
 * @returns {number} RMS of the samples, or 0 when there is no complete sample.
 */
function pcm16Rms(pcm16) {
  // byteLength (not length) so multi-byte typed arrays are sized correctly.
  const samples = Math.floor(pcm16.byteLength / 2);
  if (samples === 0) return 0;
  // Honour byteOffset: Buffers are frequently views into a larger pool.
  const view = new DataView(pcm16.buffer, pcm16.byteOffset, pcm16.byteLength);
  let sumSquares = 0;
  for (let i = 0; i < samples; i++) {
    const value = view.getInt16(i * 2, true);
    sumSquares += value * value;
  }
  return Math.sqrt(sumSquares / samples);
}
23
/**
 * Read an integer setting from `process.env`.
 *
 * Only a complete decimal integer (optional sign, surrounding whitespace
 * allowed) is accepted. Anything else — unset, empty, or partially numeric
 * text such as "1e3", "0x10" or "12abc" — yields `fallback`, because
 * `Number.parseInt` alone would silently return a truncated prefix.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value returned when the variable is unusable.
 * @returns {number} The parsed integer, or `fallback`.
 */
function readEnvInt(name, fallback) {
  const raw = process.env[name];
  if (raw == null) return fallback;
  const text = raw.trim();
  if (!/^[+-]?\d+$/.test(text)) return fallback;
  const parsed = Number.parseInt(text, 10);
  // Guard against digit strings too large to be represented exactly.
  return Number.isSafeInteger(parsed) ? parsed : fallback;
}
13
29
  async function ask(message, options = {}) {
14
30
  const {
15
31
  voice = DEFAULT_VOICE,
16
32
  timeout = 30,
17
33
  ack = false,
18
34
  auth,
35
+ createSession,
36
+ createAudioEngine,
37
+ onTrace,
19
38
  onAudioFrameSent,
20
39
  onAssistantAudio,
21
40
  onMicAudio
22
41
  } = options;
23
42
  const { AudioEngine } = require2("agent-voice-audio");
24
- const streamDelayMs = Number.parseInt(
25
- process.env.AGENT_VOICE_AEC_STREAM_DELAY_MS ?? "30",
26
- 10
27
- );
28
- const engine = new AudioEngine({
43
+ const streamDelayMs = readEnvInt("AGENT_VOICE_AEC_STREAM_DELAY_MS", 30);
44
+ const engine = (createAudioEngine ?? ((engineOptions) => new AudioEngine(engineOptions)))({
29
45
  sampleRate: SAMPLE_RATE,
30
46
  channels: 1,
31
47
  enableAec: true,
@@ -41,7 +57,11 @@ async function ask(message, options = {}) {
41
57
  process.stderr.write(`[ask ${elapsed}ms] ${event}${suffix}
42
58
  `);
43
59
  }
60
+ function trace(event, detail) {
61
+ onTrace?.({ atMs: Date.now() - startMs, event, detail });
62
+ }
44
63
  logEvent("start");
64
+ trace("start");
45
65
  return new Promise((resolve, reject) => {
46
66
  let transcript = "";
47
67
  let timeoutTimer = null;
@@ -49,15 +69,19 @@ async function ask(message, options = {}) {
49
69
  let transcriptTimer = null;
50
70
  let capturePollTimer = null;
51
71
  let speechDetected = false;
72
+ let speechStartedAtMs = 0;
52
73
  let initialResponseDone = false;
53
74
  let heardAssistantAudio = false;
54
75
  let lastAssistantAudioAt = 0;
76
+ let nearEndEvidenceSeen = false;
77
+ let nearEndEvidenceAtMs = 0;
55
78
  let cleaned = false;
56
79
  let settled = false;
57
80
  async function cleanup() {
58
81
  if (cleaned) return;
59
82
  cleaned = true;
60
83
  logEvent("cleanup:start");
84
+ trace("cleanup:start");
61
85
  if (timeoutTimer) clearTimeout(timeoutTimer);
62
86
  if (responseStartTimer) clearTimeout(responseStartTimer);
63
87
  if (transcriptTimer) clearTimeout(transcriptTimer);
@@ -69,6 +93,7 @@ async function ask(message, options = {}) {
69
93
  }
70
94
  session.close();
71
95
  logEvent("cleanup:done");
96
+ trace("cleanup:done");
72
97
  }
73
98
  function resolveOnce(value) {
74
99
  if (settled) return;
@@ -93,41 +118,75 @@ async function ask(message, options = {}) {
93
118
  `audio engine capture read failed: ${err instanceof Error ? err.message : String(err)}`
94
119
  )
95
120
  );
121
+ trace("audio:capture_read_error", {
122
+ error: err instanceof Error ? err.message : String(err)
123
+ });
96
124
  return;
97
125
  }
98
126
  for (const frame of rawFrames) onMicAudio?.(frame);
99
127
  if (!heardAssistantAudio) return;
100
128
  for (const frame of processedFrames) {
129
+ const rms = pcm16Rms(frame);
130
+ const minSpeechRms = readEnvInt("AGENT_VOICE_MIN_SPEECH_RMS", 550);
131
+ if (rms >= minSpeechRms) {
132
+ nearEndEvidenceSeen = true;
133
+ nearEndEvidenceAtMs = Date.now();
134
+ trace("audio:near_end_evidence", { rms, minSpeechRms });
135
+ }
101
136
  onAudioFrameSent?.(frame);
102
137
  session.sendAudio(frame);
103
138
  }
139
+ if (processedFrames.length > 0) {
140
+ trace("audio:sent_capture", { frames: processedFrames.length });
141
+ }
104
142
  }, 10);
105
- const session = createRealtimeSession({
143
+ const session = (createSession ?? createRealtimeSession)({
106
144
  voice,
107
145
  mode: "default",
108
146
  ack,
109
147
  auth,
110
148
  onAudioDelta(pcm16) {
111
149
  logEvent("realtime:audio_delta", `bytes=${pcm16.length}`);
150
+ trace("realtime:audio_delta", { bytes: pcm16.length });
112
151
  heardAssistantAudio = true;
113
152
  lastAssistantAudioAt = Date.now();
114
153
  onAssistantAudio?.(pcm16);
115
154
  engine.play(pcm16);
116
155
  },
117
156
  onTranscript(text) {
118
- const echoGuardMs = Number.parseInt(
119
- process.env.AGENT_VOICE_ECHO_GUARD_MS ?? "1500",
120
- 10
121
- );
157
+ const echoGuardMs = readEnvInt("AGENT_VOICE_ECHO_GUARD_MS", 1500);
122
158
  const sinceAssistantMs = Date.now() - lastAssistantAudioAt;
123
159
  if (heardAssistantAudio && sinceAssistantMs < echoGuardMs) {
124
160
  logEvent(
125
161
  "realtime:transcript_ignored_echo_guard",
126
162
  `since_assistant_ms=${sinceAssistantMs} text="${text}"`
127
163
  );
164
+ trace("realtime:transcript_ignored_echo_guard", {
165
+ sinceAssistantMs,
166
+ text
167
+ });
128
168
  return;
129
169
  }
130
170
  logEvent("realtime:transcript", `text="${text}"`);
171
+ trace("realtime:transcript", { text });
172
+ if (speechDetected) {
173
+ const evidenceWindowMs = readEnvInt(
174
+ "AGENT_VOICE_SPEECH_EVIDENCE_WINDOW_MS",
175
+ 1200
176
+ );
177
+ const evidenceAgeMs = nearEndEvidenceSeen ? Math.abs(nearEndEvidenceAtMs - speechStartedAtMs) : Number.POSITIVE_INFINITY;
178
+ if (!nearEndEvidenceSeen || evidenceAgeMs > evidenceWindowMs) {
179
+ trace("realtime:transcript_ignored_no_near_end_evidence", {
180
+ text,
181
+ speechStartedAtMs,
182
+ nearEndEvidenceSeen,
183
+ nearEndEvidenceAtMs,
184
+ evidenceAgeMs,
185
+ evidenceWindowMs
186
+ });
187
+ return;
188
+ }
189
+ }
131
190
  if (transcriptTimer) {
132
191
  clearTimeout(transcriptTimer);
133
192
  transcriptTimer = null;
@@ -137,7 +196,9 @@ async function ask(message, options = {}) {
137
196
  },
138
197
  onSpeechStarted() {
139
198
  logEvent("realtime:speech_started");
199
+ trace("realtime:speech_started");
140
200
  speechDetected = true;
201
+ speechStartedAtMs = Date.now();
141
202
  if (timeoutTimer) {
142
203
  clearTimeout(timeoutTimer);
143
204
  timeoutTimer = null;
@@ -145,6 +206,9 @@ async function ask(message, options = {}) {
145
206
  if (transcriptTimer) clearTimeout(transcriptTimer);
146
207
  transcriptTimer = setTimeout(() => {
147
208
  logEvent("timeout:no_transcript_after_speech");
209
+ trace("timeout:no_transcript_after_speech", {
210
+ timeoutSeconds: timeout
211
+ });
148
212
  rejectOnce(
149
213
  new Error(
150
214
  `No transcript received within ${timeout}s after speech started`
@@ -160,10 +224,12 @@ async function ask(message, options = {}) {
160
224
  },
161
225
  onInitialResponseDone() {
162
226
  logEvent("realtime:initial_response_done");
227
+ trace("realtime:initial_response_done");
163
228
  initialResponseDone = true;
164
229
  timeoutTimer = setTimeout(() => {
165
230
  if (!speechDetected) {
166
231
  logEvent("timeout:no_speech");
232
+ trace("timeout:no_speech", { timeoutSeconds: timeout });
167
233
  rejectOnce(
168
234
  new Error(`No speech detected within ${timeout}s timeout`)
169
235
  );
@@ -172,21 +238,26 @@ async function ask(message, options = {}) {
172
238
  },
173
239
  onDone() {
174
240
  logEvent("realtime:done");
241
+ trace("realtime:done");
175
242
  if (ack) resolveOnce(transcript);
176
243
  },
177
244
  onError(error) {
178
245
  logEvent("realtime:error", error);
246
+ trace("realtime:error", { error });
179
247
  rejectOnce(new Error(error));
180
248
  }
181
249
  });
182
250
  session.connect().then(
183
251
  () => {
184
252
  logEvent("realtime:connected");
253
+ trace("realtime:connected");
185
254
  logEvent("realtime:send_message");
255
+ trace("realtime:send_message");
186
256
  session.sendMessage(message);
187
257
  responseStartTimer = setTimeout(() => {
188
258
  if (!heardAssistantAudio) {
189
259
  logEvent("timeout:no_assistant_audio");
260
+ trace("timeout:no_assistant_audio");
190
261
  rejectOnce(
191
262
  new Error("No assistant audio received after sending message")
192
263
  );
@@ -198,6 +269,9 @@ async function ask(message, options = {}) {
198
269
  "realtime:connect_error",
199
270
  err instanceof Error ? err.message : String(err)
200
271
  );
272
+ trace("realtime:connect_error", {
273
+ error: err instanceof Error ? err.message : String(err)
274
+ });
201
275
  rejectOnce(err instanceof Error ? err : new Error(String(err)));
202
276
  }
203
277
  );
package/dist/cli.js CHANGED
@@ -12,7 +12,13 @@ import {
12
12
  } from "./chunk-AHLLYIEW.js";
13
13
 
14
14
  // src/cli.ts
15
- import { closeSync, mkdirSync, openSync, writeFileSync, writeSync } from "fs";
15
+ import {
16
+ closeSync,
17
+ mkdirSync,
18
+ openSync,
19
+ writeFileSync,
20
+ writeSync
21
+ } from "fs";
16
22
  import { join } from "path";
17
23
  import { Command } from "commander";
18
24
  async function withSuppressedNativeOutput() {
@@ -22,8 +28,8 @@ async function withSuppressedNativeOutput() {
22
28
  openSync("/dev/null", "w");
23
29
  closeSync(2);
24
30
  openSync("/dev/null", "w");
25
- const { ask } = await import("./ask-A32EH5QX.js");
26
- const { say } = await import("./say-ELJAIWUM.js");
31
+ const { ask } = await import("./ask-OIE6HL2H.js");
32
+ const { say } = await import("./say-ZVF6EX52.js");
27
33
  function writeResult(text) {
28
34
  writeSync(savedStdout, `${text}
29
35
  `);
@@ -78,7 +84,10 @@ function writeDebugAudio(dir, assistantChunks, micChunks, modelInputChunks) {
78
84
  const modelInputFile = join(dir, `ask-${stamp}-model-input.wav`);
79
85
  writeFileSync(assistantFile, createWavBuffer(Buffer.concat(assistantChunks)));
80
86
  writeFileSync(micFile, createWavBuffer(Buffer.concat(micChunks)));
81
- writeFileSync(modelInputFile, createWavBuffer(Buffer.concat(modelInputChunks)));
87
+ writeFileSync(
88
+ modelInputFile,
89
+ createWavBuffer(Buffer.concat(modelInputChunks))
90
+ );
82
91
  return { assistantFile, micFile, modelInputFile };
83
92
  }
84
93
  var program = new Command().name("agent-voice").description("AI agent voice interaction CLI");
@@ -120,7 +129,10 @@ voicesCmd.command("set <voice>").description("Set the default voice").action((vo
120
129
  `);
121
130
  process.exit(0);
122
131
  });
123
- program.command("ask").description("Speak a message and listen for a response").option("-m, --message <text>", "Text message to speak").option("--voice <name>", "OpenAI voice", defaultVoice).option("--timeout <seconds>", "Seconds to wait for user speech", "120").option("--ack", "Speak an acknowledgment after the user responds").option("--debug-audio-dir <dir>", "Write ask audio debug WAVs to this directory").action(async (opts) => {
132
+ program.command("ask").description("Speak a message and listen for a response").option("-m, --message <text>", "Text message to speak").option("--voice <name>", "OpenAI voice", defaultVoice).option("--timeout <seconds>", "Seconds to wait for user speech", "120").option("--ack", "Speak an acknowledgment after the user responds").option(
133
+ "--debug-audio-dir <dir>",
134
+ "Write ask audio debug WAVs to this directory"
135
+ ).action(async (opts) => {
124
136
  const { ask, writeResult, writeError } = await withSuppressedNativeOutput();
125
137
  const assistantChunks = [];
126
138
  const micChunks = [];
package/dist/index.d.ts CHANGED
@@ -5,11 +5,65 @@ type AuthConfig = {
5
5
  declare function resolveAuth(): AuthConfig;
6
6
  declare function resolveVoice(): string;
7
7
 
8
+ declare const VOICES: readonly ["alloy", "ash", "ballad", "coral", "echo", "fable", "nova", "onyx", "sage", "shimmer", "verse"];
9
+ type Voice = (typeof VOICES)[number];
10
+ declare const DEFAULT_VOICE: Voice;
11
+ type Mode = "default" | "say";
12
+
13
+ type RealtimeSessionOptions = {
14
+ voice: string;
15
+ mode: Mode;
16
+ ack: boolean;
17
+ auth?: AuthConfig;
18
+ onAudioDelta: (pcm16: Buffer) => void;
19
+ onAudioDone?: () => void;
20
+ onTranscript: (text: string) => void;
21
+ onSpeechStarted: () => void;
22
+ onInitialResponseDone: () => void;
23
+ onDone: () => void;
24
+ onError: (error: string) => void;
25
+ };
26
+ type RealtimeSession = {
27
+ connect(): Promise<void>;
28
+ sendMessage(text: string): void;
29
+ sendAudio(pcm16: Buffer): void;
30
+ close(): void;
31
+ };
32
+
33
+ type RustAudioEngine$1 = {
34
+ start(): void;
35
+ stop(): void;
36
+ close(): void;
37
+ play(pcm16: Buffer): void;
38
+ readProcessedCapture(maxFrames?: number): Buffer[];
39
+ readRawCapture(maxFrames?: number): Buffer[];
40
+ setStreamDelayMs(delayMs: number): void;
41
+ getStats(): {
42
+ captureFrames: number;
43
+ processedFrames: number;
44
+ playbackUnderruns: number;
45
+ droppedRawFrames: number;
46
+ droppedProcessedFrames: number;
47
+ };
48
+ };
8
49
  type AskOptions = {
9
50
  voice?: string;
10
51
  timeout?: number;
11
52
  ack?: boolean;
12
53
  auth?: AuthConfig;
54
+ createSession?: (options: RealtimeSessionOptions) => RealtimeSession;
55
+ createAudioEngine?: (options: {
56
+ sampleRate?: number;
57
+ channels?: number;
58
+ enableAec?: boolean;
59
+ streamDelayMs?: number;
60
+ maxCaptureFrames?: number;
61
+ }) => RustAudioEngine$1;
62
+ onTrace?: (event: {
63
+ atMs: number;
64
+ event: string;
65
+ detail?: Record<string, unknown>;
66
+ }) => void;
13
67
  createPlayer?: unknown;
14
68
  createRecorder?: unknown;
15
69
  onAudioFrameSent?: (pcm16: Buffer) => void;
@@ -18,15 +72,32 @@ type AskOptions = {
18
72
  };
19
73
  declare function ask(message: string, options?: AskOptions): Promise<string>;
20
74
 
75
+ type RustAudioEngine = {
76
+ start(): void;
77
+ stop(): void;
78
+ close(): void;
79
+ play(pcm16: Buffer): void;
80
+ getStats?(): {
81
+ pendingPlaybackSamples?: number;
82
+ };
83
+ };
21
84
  type SayOptions = {
22
85
  voice?: string;
23
86
  auth?: AuthConfig;
87
+ createSession?: (options: RealtimeSessionOptions) => RealtimeSession;
88
+ createAudioEngine?: (options: {
89
+ sampleRate?: number;
90
+ channels?: number;
91
+ enableAec?: boolean;
92
+ streamDelayMs?: number;
93
+ }) => RustAudioEngine;
94
+ onTrace?: (event: {
95
+ atMs: number;
96
+ event: string;
97
+ detail?: Record<string, unknown>;
98
+ }) => void;
24
99
  createPlayer?: unknown;
25
100
  };
26
101
  declare function say(message: string, options?: SayOptions): Promise<void>;
27
102
 
28
- declare const VOICES: readonly ["alloy", "ash", "ballad", "coral", "echo", "fable", "nova", "onyx", "sage", "shimmer", "verse"];
29
- type Voice = (typeof VOICES)[number];
30
- declare const DEFAULT_VOICE: Voice;
31
-
32
103
  export { type AskOptions, type AuthConfig, DEFAULT_VOICE, type SayOptions, VOICES, type Voice, ask, resolveAuth, resolveVoice, say };
package/dist/index.js CHANGED
@@ -132,22 +132,38 @@ var DEFAULT_VOICE = "ash";
132
132
 
133
133
  // src/ask.ts
134
134
  var require2 = createRequire(import.meta.url);
135
/**
 * Compute the root-mean-square amplitude of signed 16-bit little-endian PCM.
 *
 * Accepts a Node `Buffer` or any other byte view (`Uint8Array`, typed-array
 * view); the samples are read through a `DataView`, so the input does not need
 * Buffer-only methods such as `readInt16LE`.
 *
 * @param {Uint8Array} pcm16 - Raw PCM bytes. A trailing odd byte is ignored.
 * @returns {number} RMS of the samples, or 0 when there is no complete sample.
 */
function pcm16Rms(pcm16) {
  // byteLength (not length) so multi-byte typed arrays are sized correctly.
  const samples = Math.floor(pcm16.byteLength / 2);
  if (samples === 0) return 0;
  // Honour byteOffset: Buffers are frequently views into a larger pool.
  const view = new DataView(pcm16.buffer, pcm16.byteOffset, pcm16.byteLength);
  let sumSquares = 0;
  for (let i = 0; i < samples; i++) {
    const value = view.getInt16(i * 2, true);
    sumSquares += value * value;
  }
  return Math.sqrt(sumSquares / samples);
}
145
/**
 * Read an integer setting from `process.env`.
 *
 * Only a complete decimal integer (optional sign, surrounding whitespace
 * allowed) is accepted. Anything else — unset, empty, or partially numeric
 * text such as "1e3", "0x10" or "12abc" — yields `fallback`, because
 * `Number.parseInt` alone would silently return a truncated prefix.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value returned when the variable is unusable.
 * @returns {number} The parsed integer, or `fallback`.
 */
function readEnvInt(name, fallback) {
  const raw = process.env[name];
  if (raw == null) return fallback;
  const text = raw.trim();
  if (!/^[+-]?\d+$/.test(text)) return fallback;
  const parsed = Number.parseInt(text, 10);
  // Guard against digit strings too large to be represented exactly.
  return Number.isSafeInteger(parsed) ? parsed : fallback;
}
135
151
  async function ask(message, options = {}) {
136
152
  const {
137
153
  voice = DEFAULT_VOICE,
138
154
  timeout = 30,
139
155
  ack = false,
140
156
  auth,
157
+ createSession,
158
+ createAudioEngine,
159
+ onTrace,
141
160
  onAudioFrameSent,
142
161
  onAssistantAudio,
143
162
  onMicAudio
144
163
  } = options;
145
164
  const { AudioEngine } = require2("agent-voice-audio");
146
- const streamDelayMs = Number.parseInt(
147
- process.env.AGENT_VOICE_AEC_STREAM_DELAY_MS ?? "30",
148
- 10
149
- );
150
- const engine = new AudioEngine({
165
+ const streamDelayMs = readEnvInt("AGENT_VOICE_AEC_STREAM_DELAY_MS", 30);
166
+ const engine = (createAudioEngine ?? ((engineOptions) => new AudioEngine(engineOptions)))({
151
167
  sampleRate: SAMPLE_RATE,
152
168
  channels: 1,
153
169
  enableAec: true,
@@ -163,7 +179,11 @@ async function ask(message, options = {}) {
163
179
  process.stderr.write(`[ask ${elapsed}ms] ${event}${suffix}
164
180
  `);
165
181
  }
182
+ function trace(event, detail) {
183
+ onTrace?.({ atMs: Date.now() - startMs, event, detail });
184
+ }
166
185
  logEvent("start");
186
+ trace("start");
167
187
  return new Promise((resolve, reject) => {
168
188
  let transcript = "";
169
189
  let timeoutTimer = null;
@@ -171,15 +191,19 @@ async function ask(message, options = {}) {
171
191
  let transcriptTimer = null;
172
192
  let capturePollTimer = null;
173
193
  let speechDetected = false;
194
+ let speechStartedAtMs = 0;
174
195
  let initialResponseDone = false;
175
196
  let heardAssistantAudio = false;
176
197
  let lastAssistantAudioAt = 0;
198
+ let nearEndEvidenceSeen = false;
199
+ let nearEndEvidenceAtMs = 0;
177
200
  let cleaned = false;
178
201
  let settled = false;
179
202
  async function cleanup() {
180
203
  if (cleaned) return;
181
204
  cleaned = true;
182
205
  logEvent("cleanup:start");
206
+ trace("cleanup:start");
183
207
  if (timeoutTimer) clearTimeout(timeoutTimer);
184
208
  if (responseStartTimer) clearTimeout(responseStartTimer);
185
209
  if (transcriptTimer) clearTimeout(transcriptTimer);
@@ -191,6 +215,7 @@ async function ask(message, options = {}) {
191
215
  }
192
216
  session.close();
193
217
  logEvent("cleanup:done");
218
+ trace("cleanup:done");
194
219
  }
195
220
  function resolveOnce(value) {
196
221
  if (settled) return;
@@ -215,41 +240,75 @@ async function ask(message, options = {}) {
215
240
  `audio engine capture read failed: ${err instanceof Error ? err.message : String(err)}`
216
241
  )
217
242
  );
243
+ trace("audio:capture_read_error", {
244
+ error: err instanceof Error ? err.message : String(err)
245
+ });
218
246
  return;
219
247
  }
220
248
  for (const frame of rawFrames) onMicAudio?.(frame);
221
249
  if (!heardAssistantAudio) return;
222
250
  for (const frame of processedFrames) {
251
+ const rms = pcm16Rms(frame);
252
+ const minSpeechRms = readEnvInt("AGENT_VOICE_MIN_SPEECH_RMS", 550);
253
+ if (rms >= minSpeechRms) {
254
+ nearEndEvidenceSeen = true;
255
+ nearEndEvidenceAtMs = Date.now();
256
+ trace("audio:near_end_evidence", { rms, minSpeechRms });
257
+ }
223
258
  onAudioFrameSent?.(frame);
224
259
  session.sendAudio(frame);
225
260
  }
261
+ if (processedFrames.length > 0) {
262
+ trace("audio:sent_capture", { frames: processedFrames.length });
263
+ }
226
264
  }, 10);
227
- const session = createRealtimeSession({
265
+ const session = (createSession ?? createRealtimeSession)({
228
266
  voice,
229
267
  mode: "default",
230
268
  ack,
231
269
  auth,
232
270
  onAudioDelta(pcm16) {
233
271
  logEvent("realtime:audio_delta", `bytes=${pcm16.length}`);
272
+ trace("realtime:audio_delta", { bytes: pcm16.length });
234
273
  heardAssistantAudio = true;
235
274
  lastAssistantAudioAt = Date.now();
236
275
  onAssistantAudio?.(pcm16);
237
276
  engine.play(pcm16);
238
277
  },
239
278
  onTranscript(text) {
240
- const echoGuardMs = Number.parseInt(
241
- process.env.AGENT_VOICE_ECHO_GUARD_MS ?? "1500",
242
- 10
243
- );
279
+ const echoGuardMs = readEnvInt("AGENT_VOICE_ECHO_GUARD_MS", 1500);
244
280
  const sinceAssistantMs = Date.now() - lastAssistantAudioAt;
245
281
  if (heardAssistantAudio && sinceAssistantMs < echoGuardMs) {
246
282
  logEvent(
247
283
  "realtime:transcript_ignored_echo_guard",
248
284
  `since_assistant_ms=${sinceAssistantMs} text="${text}"`
249
285
  );
286
+ trace("realtime:transcript_ignored_echo_guard", {
287
+ sinceAssistantMs,
288
+ text
289
+ });
250
290
  return;
251
291
  }
252
292
  logEvent("realtime:transcript", `text="${text}"`);
293
+ trace("realtime:transcript", { text });
294
+ if (speechDetected) {
295
+ const evidenceWindowMs = readEnvInt(
296
+ "AGENT_VOICE_SPEECH_EVIDENCE_WINDOW_MS",
297
+ 1200
298
+ );
299
+ const evidenceAgeMs = nearEndEvidenceSeen ? Math.abs(nearEndEvidenceAtMs - speechStartedAtMs) : Number.POSITIVE_INFINITY;
300
+ if (!nearEndEvidenceSeen || evidenceAgeMs > evidenceWindowMs) {
301
+ trace("realtime:transcript_ignored_no_near_end_evidence", {
302
+ text,
303
+ speechStartedAtMs,
304
+ nearEndEvidenceSeen,
305
+ nearEndEvidenceAtMs,
306
+ evidenceAgeMs,
307
+ evidenceWindowMs
308
+ });
309
+ return;
310
+ }
311
+ }
253
312
  if (transcriptTimer) {
254
313
  clearTimeout(transcriptTimer);
255
314
  transcriptTimer = null;
@@ -259,7 +318,9 @@ async function ask(message, options = {}) {
259
318
  },
260
319
  onSpeechStarted() {
261
320
  logEvent("realtime:speech_started");
321
+ trace("realtime:speech_started");
262
322
  speechDetected = true;
323
+ speechStartedAtMs = Date.now();
263
324
  if (timeoutTimer) {
264
325
  clearTimeout(timeoutTimer);
265
326
  timeoutTimer = null;
@@ -267,6 +328,9 @@ async function ask(message, options = {}) {
267
328
  if (transcriptTimer) clearTimeout(transcriptTimer);
268
329
  transcriptTimer = setTimeout(() => {
269
330
  logEvent("timeout:no_transcript_after_speech");
331
+ trace("timeout:no_transcript_after_speech", {
332
+ timeoutSeconds: timeout
333
+ });
270
334
  rejectOnce(
271
335
  new Error(
272
336
  `No transcript received within ${timeout}s after speech started`
@@ -282,10 +346,12 @@ async function ask(message, options = {}) {
282
346
  },
283
347
  onInitialResponseDone() {
284
348
  logEvent("realtime:initial_response_done");
349
+ trace("realtime:initial_response_done");
285
350
  initialResponseDone = true;
286
351
  timeoutTimer = setTimeout(() => {
287
352
  if (!speechDetected) {
288
353
  logEvent("timeout:no_speech");
354
+ trace("timeout:no_speech", { timeoutSeconds: timeout });
289
355
  rejectOnce(
290
356
  new Error(`No speech detected within ${timeout}s timeout`)
291
357
  );
@@ -294,21 +360,26 @@ async function ask(message, options = {}) {
294
360
  },
295
361
  onDone() {
296
362
  logEvent("realtime:done");
363
+ trace("realtime:done");
297
364
  if (ack) resolveOnce(transcript);
298
365
  },
299
366
  onError(error) {
300
367
  logEvent("realtime:error", error);
368
+ trace("realtime:error", { error });
301
369
  rejectOnce(new Error(error));
302
370
  }
303
371
  });
304
372
  session.connect().then(
305
373
  () => {
306
374
  logEvent("realtime:connected");
375
+ trace("realtime:connected");
307
376
  logEvent("realtime:send_message");
377
+ trace("realtime:send_message");
308
378
  session.sendMessage(message);
309
379
  responseStartTimer = setTimeout(() => {
310
380
  if (!heardAssistantAudio) {
311
381
  logEvent("timeout:no_assistant_audio");
382
+ trace("timeout:no_assistant_audio");
312
383
  rejectOnce(
313
384
  new Error("No assistant audio received after sending message")
314
385
  );
@@ -320,6 +391,9 @@ async function ask(message, options = {}) {
320
391
  "realtime:connect_error",
321
392
  err instanceof Error ? err.message : String(err)
322
393
  );
394
+ trace("realtime:connect_error", {
395
+ error: err instanceof Error ? err.message : String(err)
396
+ });
323
397
  rejectOnce(err instanceof Error ? err : new Error(String(err)));
324
398
  }
325
399
  );
@@ -360,30 +434,46 @@ function resolveVoice() {
360
434
  import { createRequire as createRequire2 } from "module";
361
435
  var require3 = createRequire2(import.meta.url);
362
436
  async function say(message, options = {}) {
363
- const { voice = DEFAULT_VOICE, auth } = options;
437
+ const {
438
+ voice = DEFAULT_VOICE,
439
+ auth,
440
+ createSession,
441
+ createAudioEngine,
442
+ onTrace
443
+ } = options;
364
444
  const { AudioEngine } = require3("agent-voice-audio");
365
- const engine = new AudioEngine({
445
+ const startMs = Date.now();
446
+ function trace(event, detail) {
447
+ onTrace?.({ atMs: Date.now() - startMs, event, detail });
448
+ }
449
+ const engine = (createAudioEngine ?? ((engineOptions) => new AudioEngine(engineOptions)))({
366
450
  sampleRate: SAMPLE_RATE,
367
451
  channels: 1,
368
452
  enableAec: false
369
453
  });
370
454
  engine.start();
455
+ trace("start");
371
456
  return new Promise((resolve, reject) => {
372
457
  let cleaned = false;
373
458
  let settled = false;
374
459
  let responseDoneFallbackTimer = null;
375
460
  let completionTailTimer = null;
461
+ let drainPollTimer = null;
462
+ let drainDeadlineTimer = null;
376
463
  function cleanup() {
377
464
  if (cleaned) return;
378
465
  cleaned = true;
379
466
  if (responseDoneFallbackTimer) clearTimeout(responseDoneFallbackTimer);
380
467
  if (completionTailTimer) clearTimeout(completionTailTimer);
468
+ if (drainPollTimer) clearInterval(drainPollTimer);
469
+ if (drainDeadlineTimer) clearTimeout(drainDeadlineTimer);
381
470
  try {
382
471
  engine.stop();
383
472
  engine.close();
384
473
  } catch {
385
474
  }
386
475
  session.close();
476
+ trace("cleanup");
387
477
  }
388
478
  function resolveOnce() {
389
479
  if (settled) return;
@@ -397,23 +487,76 @@ async function say(message, options = {}) {
397
487
  cleanup();
398
488
  reject(error);
399
489
  }
490
+ function waitForPlaybackDrain() {
491
+ if (settled) return;
492
+ if (!engine.getStats) {
493
+ trace("drain:no_stats");
494
+ resolveOnce();
495
+ return;
496
+ }
497
+ const absoluteDeadlineMs = 2e4;
498
+ const maxNoProgressMs = 1200;
499
+ const drainStartMs = Date.now();
500
+ let lastProgressAtMs = drainStartMs;
501
+ let lastPending = Number.POSITIVE_INFINITY;
502
+ trace("drain:deadline_scheduled", {
503
+ absoluteDeadlineMs,
504
+ maxNoProgressMs
505
+ });
506
+ let zeroStreak = 0;
507
+ drainPollTimer = setInterval(() => {
508
+ if (settled) return;
509
+ let pending = 0;
510
+ try {
511
+ pending = Number(engine.getStats?.().pendingPlaybackSamples ?? 0);
512
+ } catch {
513
+ pending = 0;
514
+ }
515
+ trace("drain:poll", { pendingPlaybackSamples: pending });
516
+ if (pending < lastPending) {
517
+ lastPending = pending;
518
+ lastProgressAtMs = Date.now();
519
+ }
520
+ if (pending <= 0) {
521
+ zeroStreak += 1;
522
+ if (zeroStreak >= 3) {
523
+ resolveOnce();
524
+ }
525
+ return;
526
+ }
527
+ zeroStreak = 0;
528
+ if (Date.now() - lastProgressAtMs > maxNoProgressMs) {
529
+ trace("drain:no_progress_timeout", {
530
+ pendingPlaybackSamples: pending
531
+ });
532
+ resolveOnce();
533
+ }
534
+ }, 20);
535
+ drainDeadlineTimer = setTimeout(() => {
536
+ trace("drain:deadline");
537
+ resolveOnce();
538
+ }, absoluteDeadlineMs);
539
+ }
400
540
  function scheduleTailResolve(delayMs) {
401
541
  if (settled) return;
402
542
  if (completionTailTimer) clearTimeout(completionTailTimer);
403
543
  completionTailTimer = setTimeout(() => {
404
- resolveOnce();
544
+ waitForPlaybackDrain();
405
545
  }, delayMs);
546
+ trace("tail_scheduled", { delayMs });
406
547
  }
407
- const session = createRealtimeSession({
548
+ const session = (createSession ?? createRealtimeSession)({
408
549
  voice,
409
550
  mode: "say",
410
551
  ack: false,
411
552
  auth,
412
553
  onAudioDelta(pcm16) {
413
554
  engine.play(pcm16);
555
+ trace("realtime:audio_delta", { bytes: pcm16.length });
414
556
  },
415
557
  onAudioDone() {
416
558
  scheduleTailResolve(140);
559
+ trace("realtime:audio_done");
417
560
  },
418
561
  onTranscript() {
419
562
  },
@@ -424,14 +567,17 @@ async function say(message, options = {}) {
424
567
  responseDoneFallbackTimer = setTimeout(() => {
425
568
  scheduleTailResolve(220);
426
569
  }, 700);
570
+ trace("realtime:initial_response_done");
427
571
  },
428
572
  onDone() {
429
573
  },
430
574
  onError(error) {
575
+ trace("realtime:error", { error });
431
576
  rejectOnce(new Error(error));
432
577
  }
433
578
  });
434
579
  session.connect().then(() => {
580
+ trace("realtime:connected");
435
581
  session.sendMessage(message);
436
582
  }, reject);
437
583
  });
@@ -0,0 +1,164 @@
1
+ #!/usr/bin/env node
2
+ import {
3
+ createRealtimeSession
4
+ } from "./chunk-UYBFONQE.js";
5
+ import {
6
+ DEFAULT_VOICE,
7
+ SAMPLE_RATE
8
+ } from "./chunk-AHLLYIEW.js";
9
+
10
+ // src/say.ts
11
+ import { createRequire } from "module";
12
+ var require2 = createRequire(import.meta.url);
13
/**
 * Speak `message` through a realtime session and resolve once the audio engine
 * reports that playback has drained.
 *
 * @param {string} message - Text handed to `session.sendMessage`.
 * @param {object} [options]
 * @param {string} [options.voice] - Voice name; defaults to `DEFAULT_VOICE`.
 * @param {object} [options.auth] - Passed through to the session factory.
 * @param {Function} [options.createSession] - Session factory override
 *   (defaults to `createRealtimeSession`).
 * @param {Function} [options.createAudioEngine] - Audio engine factory
 *   override. When supplied, the native "agent-voice-audio" module is not
 *   loaded at all.
 * @param {Function} [options.onTrace] - Called with `{ atMs, event, detail }`
 *   for each lifecycle event; `atMs` is relative to the start of this call.
 * @returns {Promise<void>} Resolves after playback drains (or a drain timeout
 *   elapses). Rejects when the session reports an error, when connecting
 *   fails, or when sending the message throws; the engine and session are
 *   released in every case.
 */
async function say(message, options = {}) {
  const {
    voice = DEFAULT_VOICE,
    auth,
    createSession,
    createAudioEngine,
    onTrace
  } = options;
  const startMs = Date.now();
  function trace(event, detail) {
    onTrace?.({ atMs: Date.now() - startMs, event, detail });
  }
  function toError(err) {
    return err instanceof Error ? err : new Error(String(err));
  }
  // Only touch the native module when the caller has not injected an engine,
  // so an injected engine works without the native addon being installed.
  const makeEngine = createAudioEngine ?? ((engineOptions) => {
    const { AudioEngine } = require2("agent-voice-audio");
    return new AudioEngine(engineOptions);
  });
  const engine = makeEngine({
    sampleRate: SAMPLE_RATE,
    channels: 1,
    enableAec: false
  });
  engine.start();
  trace("start");
  return new Promise((resolve, reject) => {
    let cleaned = false;
    let settled = false;
    let responseDoneFallbackTimer = null;
    let completionTailTimer = null;
    let drainPollTimer = null;
    let drainDeadlineTimer = null;
    // Release timers, the audio engine and the session exactly once.
    function cleanup() {
      if (cleaned) return;
      cleaned = true;
      if (responseDoneFallbackTimer) clearTimeout(responseDoneFallbackTimer);
      if (completionTailTimer) clearTimeout(completionTailTimer);
      if (drainPollTimer) clearInterval(drainPollTimer);
      if (drainDeadlineTimer) clearTimeout(drainDeadlineTimer);
      try {
        engine.stop();
        engine.close();
      } catch {
        // Best effort: a failing engine shutdown must not block settling.
      }
      session.close();
      trace("cleanup");
    }
    function resolveOnce() {
      if (settled) return;
      settled = true;
      cleanup();
      resolve();
    }
    function rejectOnce(error) {
      if (settled) return;
      settled = true;
      cleanup();
      reject(error);
    }
    // Poll the engine until its pending playback count stays at zero, stops
    // shrinking for too long, or the absolute deadline passes.
    function waitForPlaybackDrain() {
      if (settled) return;
      // Both the audio-done tail and the response-done fallback tail can end
      // up here. cleanup() only clears the most recently stored handles, so a
      // second poll loop would leave the first interval running forever.
      if (drainPollTimer !== null) return;
      if (!engine.getStats) {
        trace("drain:no_stats");
        resolveOnce();
        return;
      }
      const absoluteDeadlineMs = 2e4;
      const maxNoProgressMs = 1200;
      let lastProgressAtMs = Date.now();
      let lastPending = Number.POSITIVE_INFINITY;
      trace("drain:deadline_scheduled", {
        absoluteDeadlineMs,
        maxNoProgressMs
      });
      let zeroStreak = 0;
      drainPollTimer = setInterval(() => {
        if (settled) return;
        let pending = 0;
        try {
          pending = Number(engine.getStats?.().pendingPlaybackSamples ?? 0);
        } catch {
          // Stats unavailable: treat as nothing pending rather than hanging.
          pending = 0;
        }
        trace("drain:poll", { pendingPlaybackSamples: pending });
        if (pending < lastPending) {
          lastPending = pending;
          lastProgressAtMs = Date.now();
        }
        if (pending <= 0) {
          // Require three consecutive empty polls before declaring drained.
          zeroStreak += 1;
          if (zeroStreak >= 3) {
            resolveOnce();
          }
          return;
        }
        zeroStreak = 0;
        if (Date.now() - lastProgressAtMs > maxNoProgressMs) {
          trace("drain:no_progress_timeout", {
            pendingPlaybackSamples: pending
          });
          resolveOnce();
        }
      }, 20);
      drainDeadlineTimer = setTimeout(() => {
        trace("drain:deadline");
        resolveOnce();
      }, absoluteDeadlineMs);
    }
    // (Re)arm the short tail delay that precedes the drain wait.
    function scheduleTailResolve(delayMs) {
      if (settled) return;
      if (completionTailTimer) clearTimeout(completionTailTimer);
      completionTailTimer = setTimeout(() => {
        waitForPlaybackDrain();
      }, delayMs);
      trace("tail_scheduled", { delayMs });
    }
    const session = (createSession ?? createRealtimeSession)({
      voice,
      mode: "say",
      ack: false,
      auth,
      onAudioDelta(pcm16) {
        engine.play(pcm16);
        trace("realtime:audio_delta", { bytes: pcm16.length });
      },
      onAudioDone() {
        scheduleTailResolve(140);
        trace("realtime:audio_done");
      },
      onTranscript() {
      },
      onSpeechStarted() {
      },
      onInitialResponseDone() {
        // Fallback in case no audio-done event arrives.
        if (responseDoneFallbackTimer) clearTimeout(responseDoneFallbackTimer);
        responseDoneFallbackTimer = setTimeout(() => {
          scheduleTailResolve(220);
        }, 700);
        trace("realtime:initial_response_done");
      },
      onDone() {
      },
      onError(error) {
        trace("realtime:error", { error });
        rejectOnce(new Error(error));
      }
    });
    session.connect().then(
      () => {
        trace("realtime:connected");
        session.sendMessage(message);
      },
      (err) => {
        trace("realtime:connect_error", { error: toError(err).message });
        throw err;
      }
    ).catch((err) => {
      // Route every failure through rejectOnce so the engine and session are
      // released instead of leaking behind a bare reject or a pending promise.
      rejectOnce(toError(err));
    });
  });
}
162
+ export {
163
+ say
164
+ };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-voice",
3
- "version": "0.2.1",
3
+ "version": "0.2.3",
4
4
  "description": "CLI for AI agents to interact with humans via voice",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -1,92 +0,0 @@
1
- #!/usr/bin/env node
2
- import {
3
- createRealtimeSession
4
- } from "./chunk-UYBFONQE.js";
5
- import {
6
- DEFAULT_VOICE,
7
- SAMPLE_RATE
8
- } from "./chunk-AHLLYIEW.js";
9
-
10
- // src/say.ts
11
- import { createRequire } from "module";
12
- var require2 = createRequire(import.meta.url);
13
- async function say(message, options = {}) {
14
- const { voice = DEFAULT_VOICE, auth } = options;
15
- const { AudioEngine } = require2("agent-voice-audio");
16
- const engine = new AudioEngine({
17
- sampleRate: SAMPLE_RATE,
18
- channels: 1,
19
- enableAec: false
20
- });
21
- engine.start();
22
- return new Promise((resolve, reject) => {
23
- let cleaned = false;
24
- let settled = false;
25
- let responseDoneFallbackTimer = null;
26
- let completionTailTimer = null;
27
- function cleanup() {
28
- if (cleaned) return;
29
- cleaned = true;
30
- if (responseDoneFallbackTimer) clearTimeout(responseDoneFallbackTimer);
31
- if (completionTailTimer) clearTimeout(completionTailTimer);
32
- try {
33
- engine.stop();
34
- engine.close();
35
- } catch {
36
- }
37
- session.close();
38
- }
39
- function resolveOnce() {
40
- if (settled) return;
41
- settled = true;
42
- cleanup();
43
- resolve();
44
- }
45
- function rejectOnce(error) {
46
- if (settled) return;
47
- settled = true;
48
- cleanup();
49
- reject(error);
50
- }
51
- function scheduleTailResolve(delayMs) {
52
- if (settled) return;
53
- if (completionTailTimer) clearTimeout(completionTailTimer);
54
- completionTailTimer = setTimeout(() => {
55
- resolveOnce();
56
- }, delayMs);
57
- }
58
- const session = createRealtimeSession({
59
- voice,
60
- mode: "say",
61
- ack: false,
62
- auth,
63
- onAudioDelta(pcm16) {
64
- engine.play(pcm16);
65
- },
66
- onAudioDone() {
67
- scheduleTailResolve(140);
68
- },
69
- onTranscript() {
70
- },
71
- onSpeechStarted() {
72
- },
73
- onInitialResponseDone() {
74
- if (responseDoneFallbackTimer) clearTimeout(responseDoneFallbackTimer);
75
- responseDoneFallbackTimer = setTimeout(() => {
76
- scheduleTailResolve(220);
77
- }, 700);
78
- },
79
- onDone() {
80
- },
81
- onError(error) {
82
- rejectOnce(new Error(error));
83
- }
84
- });
85
- session.connect().then(() => {
86
- session.sendMessage(message);
87
- }, reject);
88
- });
89
- }
90
- export {
91
- say
92
- };