agent-voice 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env node
2
2
  import {
3
3
  createRealtimeSession
4
- } from "./chunk-VV2VNOC4.js";
4
+ } from "./chunk-UYBFONQE.js";
5
5
  import {
6
6
  DEFAULT_VOICE,
7
7
  SAMPLE_RATE
@@ -10,22 +10,38 @@ import {
10
10
  // src/ask.ts
11
11
  import { createRequire } from "module";
12
12
  var require2 = createRequire(import.meta.url);
13
/**
 * Compute the root-mean-square amplitude of a buffer of signed 16-bit
 * little-endian PCM samples.
 *
 * A trailing odd byte (half a sample) is ignored. A buffer holding no complete
 * sample yields 0.
 *
 * @param {Buffer} pcm16 - Raw PCM bytes, two bytes per sample.
 * @returns {number} RMS of the sample values, or 0 when there are no samples.
 */
function pcm16Rms(pcm16) {
  const sampleCount = Math.floor(pcm16.length / 2);
  if (sampleCount === 0) {
    return 0;
  }
  const endOffset = sampleCount * 2;
  let energy = 0;
  // Walk the buffer by byte offset, one 2-byte sample at a time.
  for (let offset = 0; offset < endOffset; offset += 2) {
    const amplitude = pcm16.readInt16LE(offset);
    energy += amplitude * amplitude;
  }
  return Math.sqrt(energy / sampleCount);
}
23
/**
 * Read an integer setting from an environment variable.
 *
 * The value is parsed with base-10 `Number.parseInt`, so leading digits are
 * accepted and any trailing non-digit text is ignored.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value returned when the variable is unset or
 *   does not parse to a finite number.
 * @returns {number} The parsed integer, or `fallback`.
 */
function readEnvInt(name, fallback) {
  const text = process.env[name];
  if (text === undefined || text === null) {
    return fallback;
  }
  const candidate = Number.parseInt(text, 10);
  if (Number.isFinite(candidate)) {
    return candidate;
  }
  return fallback;
}
13
29
  async function ask(message, options = {}) {
14
30
  const {
15
31
  voice = DEFAULT_VOICE,
16
32
  timeout = 30,
17
33
  ack = false,
18
34
  auth,
35
+ createSession,
36
+ createAudioEngine,
37
+ onTrace,
19
38
  onAudioFrameSent,
20
39
  onAssistantAudio,
21
40
  onMicAudio
22
41
  } = options;
23
42
  const { AudioEngine } = require2("agent-voice-audio");
24
- const streamDelayMs = Number.parseInt(
25
- process.env.AGENT_VOICE_AEC_STREAM_DELAY_MS ?? "30",
26
- 10
27
- );
28
- const engine = new AudioEngine({
43
+ const streamDelayMs = readEnvInt("AGENT_VOICE_AEC_STREAM_DELAY_MS", 30);
44
+ const engine = (createAudioEngine ?? ((engineOptions) => new AudioEngine(engineOptions)))({
29
45
  sampleRate: SAMPLE_RATE,
30
46
  channels: 1,
31
47
  enableAec: true,
@@ -41,7 +57,11 @@ async function ask(message, options = {}) {
41
57
  process.stderr.write(`[ask ${elapsed}ms] ${event}${suffix}
42
58
  `);
43
59
  }
60
+ function trace(event, detail) {
61
+ onTrace?.({ atMs: Date.now() - startMs, event, detail });
62
+ }
44
63
  logEvent("start");
64
+ trace("start");
45
65
  return new Promise((resolve, reject) => {
46
66
  let transcript = "";
47
67
  let timeoutTimer = null;
@@ -49,15 +69,19 @@ async function ask(message, options = {}) {
49
69
  let transcriptTimer = null;
50
70
  let capturePollTimer = null;
51
71
  let speechDetected = false;
72
+ let speechStartedAtMs = 0;
52
73
  let initialResponseDone = false;
53
74
  let heardAssistantAudio = false;
54
75
  let lastAssistantAudioAt = 0;
76
+ let nearEndEvidenceSeen = false;
77
+ let nearEndEvidenceAtMs = 0;
55
78
  let cleaned = false;
56
79
  let settled = false;
57
80
  async function cleanup() {
58
81
  if (cleaned) return;
59
82
  cleaned = true;
60
83
  logEvent("cleanup:start");
84
+ trace("cleanup:start");
61
85
  if (timeoutTimer) clearTimeout(timeoutTimer);
62
86
  if (responseStartTimer) clearTimeout(responseStartTimer);
63
87
  if (transcriptTimer) clearTimeout(transcriptTimer);
@@ -69,6 +93,7 @@ async function ask(message, options = {}) {
69
93
  }
70
94
  session.close();
71
95
  logEvent("cleanup:done");
96
+ trace("cleanup:done");
72
97
  }
73
98
  function resolveOnce(value) {
74
99
  if (settled) return;
@@ -93,41 +118,75 @@ async function ask(message, options = {}) {
93
118
  `audio engine capture read failed: ${err instanceof Error ? err.message : String(err)}`
94
119
  )
95
120
  );
121
+ trace("audio:capture_read_error", {
122
+ error: err instanceof Error ? err.message : String(err)
123
+ });
96
124
  return;
97
125
  }
98
126
  for (const frame of rawFrames) onMicAudio?.(frame);
99
127
  if (!heardAssistantAudio) return;
100
128
  for (const frame of processedFrames) {
129
+ const rms = pcm16Rms(frame);
130
+ const minSpeechRms = readEnvInt("AGENT_VOICE_MIN_SPEECH_RMS", 550);
131
+ if (rms >= minSpeechRms) {
132
+ nearEndEvidenceSeen = true;
133
+ nearEndEvidenceAtMs = Date.now();
134
+ trace("audio:near_end_evidence", { rms, minSpeechRms });
135
+ }
101
136
  onAudioFrameSent?.(frame);
102
137
  session.sendAudio(frame);
103
138
  }
139
+ if (processedFrames.length > 0) {
140
+ trace("audio:sent_capture", { frames: processedFrames.length });
141
+ }
104
142
  }, 10);
105
- const session = createRealtimeSession({
143
+ const session = (createSession ?? createRealtimeSession)({
106
144
  voice,
107
145
  mode: "default",
108
146
  ack,
109
147
  auth,
110
148
  onAudioDelta(pcm16) {
111
149
  logEvent("realtime:audio_delta", `bytes=${pcm16.length}`);
150
+ trace("realtime:audio_delta", { bytes: pcm16.length });
112
151
  heardAssistantAudio = true;
113
152
  lastAssistantAudioAt = Date.now();
114
153
  onAssistantAudio?.(pcm16);
115
154
  engine.play(pcm16);
116
155
  },
117
156
  onTranscript(text) {
118
- const echoGuardMs = Number.parseInt(
119
- process.env.AGENT_VOICE_ECHO_GUARD_MS ?? "1500",
120
- 10
121
- );
157
+ const echoGuardMs = readEnvInt("AGENT_VOICE_ECHO_GUARD_MS", 1500);
122
158
  const sinceAssistantMs = Date.now() - lastAssistantAudioAt;
123
159
  if (heardAssistantAudio && sinceAssistantMs < echoGuardMs) {
124
160
  logEvent(
125
161
  "realtime:transcript_ignored_echo_guard",
126
162
  `since_assistant_ms=${sinceAssistantMs} text="${text}"`
127
163
  );
164
+ trace("realtime:transcript_ignored_echo_guard", {
165
+ sinceAssistantMs,
166
+ text
167
+ });
128
168
  return;
129
169
  }
130
170
  logEvent("realtime:transcript", `text="${text}"`);
171
+ trace("realtime:transcript", { text });
172
+ if (speechDetected) {
173
+ const evidenceWindowMs = readEnvInt(
174
+ "AGENT_VOICE_SPEECH_EVIDENCE_WINDOW_MS",
175
+ 1200
176
+ );
177
+ const evidenceAgeMs = nearEndEvidenceSeen ? Math.abs(nearEndEvidenceAtMs - speechStartedAtMs) : Number.POSITIVE_INFINITY;
178
+ if (!nearEndEvidenceSeen || evidenceAgeMs > evidenceWindowMs) {
179
+ trace("realtime:transcript_ignored_no_near_end_evidence", {
180
+ text,
181
+ speechStartedAtMs,
182
+ nearEndEvidenceSeen,
183
+ nearEndEvidenceAtMs,
184
+ evidenceAgeMs,
185
+ evidenceWindowMs
186
+ });
187
+ return;
188
+ }
189
+ }
131
190
  if (transcriptTimer) {
132
191
  clearTimeout(transcriptTimer);
133
192
  transcriptTimer = null;
@@ -137,7 +196,9 @@ async function ask(message, options = {}) {
137
196
  },
138
197
  onSpeechStarted() {
139
198
  logEvent("realtime:speech_started");
199
+ trace("realtime:speech_started");
140
200
  speechDetected = true;
201
+ speechStartedAtMs = Date.now();
141
202
  if (timeoutTimer) {
142
203
  clearTimeout(timeoutTimer);
143
204
  timeoutTimer = null;
@@ -145,6 +206,9 @@ async function ask(message, options = {}) {
145
206
  if (transcriptTimer) clearTimeout(transcriptTimer);
146
207
  transcriptTimer = setTimeout(() => {
147
208
  logEvent("timeout:no_transcript_after_speech");
209
+ trace("timeout:no_transcript_after_speech", {
210
+ timeoutSeconds: timeout
211
+ });
148
212
  rejectOnce(
149
213
  new Error(
150
214
  `No transcript received within ${timeout}s after speech started`
@@ -160,10 +224,12 @@ async function ask(message, options = {}) {
160
224
  },
161
225
  onInitialResponseDone() {
162
226
  logEvent("realtime:initial_response_done");
227
+ trace("realtime:initial_response_done");
163
228
  initialResponseDone = true;
164
229
  timeoutTimer = setTimeout(() => {
165
230
  if (!speechDetected) {
166
231
  logEvent("timeout:no_speech");
232
+ trace("timeout:no_speech", { timeoutSeconds: timeout });
167
233
  rejectOnce(
168
234
  new Error(`No speech detected within ${timeout}s timeout`)
169
235
  );
@@ -172,21 +238,26 @@ async function ask(message, options = {}) {
172
238
  },
173
239
  onDone() {
174
240
  logEvent("realtime:done");
241
+ trace("realtime:done");
175
242
  if (ack) resolveOnce(transcript);
176
243
  },
177
244
  onError(error) {
178
245
  logEvent("realtime:error", error);
246
+ trace("realtime:error", { error });
179
247
  rejectOnce(new Error(error));
180
248
  }
181
249
  });
182
250
  session.connect().then(
183
251
  () => {
184
252
  logEvent("realtime:connected");
253
+ trace("realtime:connected");
185
254
  logEvent("realtime:send_message");
255
+ trace("realtime:send_message");
186
256
  session.sendMessage(message);
187
257
  responseStartTimer = setTimeout(() => {
188
258
  if (!heardAssistantAudio) {
189
259
  logEvent("timeout:no_assistant_audio");
260
+ trace("timeout:no_assistant_audio");
190
261
  rejectOnce(
191
262
  new Error("No assistant audio received after sending message")
192
263
  );
@@ -198,6 +269,9 @@ async function ask(message, options = {}) {
198
269
  "realtime:connect_error",
199
270
  err instanceof Error ? err.message : String(err)
200
271
  );
272
+ trace("realtime:connect_error", {
273
+ error: err instanceof Error ? err.message : String(err)
274
+ });
201
275
  rejectOnce(err instanceof Error ? err : new Error(String(err)));
202
276
  }
203
277
  );
@@ -43,6 +43,9 @@ function createRealtimeSession(options) {
43
43
  const pcm16 = Buffer.from(event.delta, "base64");
44
44
  options.onAudioDelta(pcm16);
45
45
  });
46
+ rt.on("response.audio.done", () => {
47
+ options.onAudioDone?.();
48
+ });
46
49
  rt.on("conversation.item.input_audio_transcription.completed", (event) => {
47
50
  options.onTranscript(event.transcript);
48
51
  });
package/dist/cli.js CHANGED
@@ -12,7 +12,13 @@ import {
12
12
  } from "./chunk-AHLLYIEW.js";
13
13
 
14
14
  // src/cli.ts
15
- import { closeSync, mkdirSync, openSync, writeFileSync, writeSync } from "fs";
15
+ import {
16
+ closeSync,
17
+ mkdirSync,
18
+ openSync,
19
+ writeFileSync,
20
+ writeSync
21
+ } from "fs";
16
22
  import { join } from "path";
17
23
  import { Command } from "commander";
18
24
  async function withSuppressedNativeOutput() {
@@ -22,8 +28,8 @@ async function withSuppressedNativeOutput() {
22
28
  openSync("/dev/null", "w");
23
29
  closeSync(2);
24
30
  openSync("/dev/null", "w");
25
- const { ask } = await import("./ask-GUSXGYSY.js");
26
- const { say } = await import("./say-W56HCNK4.js");
31
+ const { ask } = await import("./ask-OIE6HL2H.js");
32
+ const { say } = await import("./say-ZVF6EX52.js");
27
33
  function writeResult(text) {
28
34
  writeSync(savedStdout, `${text}
29
35
  `);
@@ -78,7 +84,10 @@ function writeDebugAudio(dir, assistantChunks, micChunks, modelInputChunks) {
78
84
  const modelInputFile = join(dir, `ask-${stamp}-model-input.wav`);
79
85
  writeFileSync(assistantFile, createWavBuffer(Buffer.concat(assistantChunks)));
80
86
  writeFileSync(micFile, createWavBuffer(Buffer.concat(micChunks)));
81
- writeFileSync(modelInputFile, createWavBuffer(Buffer.concat(modelInputChunks)));
87
+ writeFileSync(
88
+ modelInputFile,
89
+ createWavBuffer(Buffer.concat(modelInputChunks))
90
+ );
82
91
  return { assistantFile, micFile, modelInputFile };
83
92
  }
84
93
  var program = new Command().name("agent-voice").description("AI agent voice interaction CLI");
@@ -120,7 +129,10 @@ voicesCmd.command("set <voice>").description("Set the default voice").action((vo
120
129
  `);
121
130
  process.exit(0);
122
131
  });
123
- program.command("ask").description("Speak a message and listen for a response").option("-m, --message <text>", "Text message to speak").option("--voice <name>", "OpenAI voice", defaultVoice).option("--timeout <seconds>", "Seconds to wait for user speech", "120").option("--ack", "Speak an acknowledgment after the user responds").option("--debug-audio-dir <dir>", "Write ask audio debug WAVs to this directory").action(async (opts) => {
132
+ program.command("ask").description("Speak a message and listen for a response").option("-m, --message <text>", "Text message to speak").option("--voice <name>", "OpenAI voice", defaultVoice).option("--timeout <seconds>", "Seconds to wait for user speech", "120").option("--ack", "Speak an acknowledgment after the user responds").option(
133
+ "--debug-audio-dir <dir>",
134
+ "Write ask audio debug WAVs to this directory"
135
+ ).action(async (opts) => {
124
136
  const { ask, writeResult, writeError } = await withSuppressedNativeOutput();
125
137
  const assistantChunks = [];
126
138
  const micChunks = [];
package/dist/index.d.ts CHANGED
@@ -5,11 +5,65 @@ type AuthConfig = {
5
5
  declare function resolveAuth(): AuthConfig;
6
6
  declare function resolveVoice(): string;
7
7
 
8
+ declare const VOICES: readonly ["alloy", "ash", "ballad", "coral", "echo", "fable", "nova", "onyx", "sage", "shimmer", "verse"];
9
+ type Voice = (typeof VOICES)[number];
10
+ declare const DEFAULT_VOICE: Voice;
11
+ type Mode = "default" | "say";
12
+
13
+ type RealtimeSessionOptions = {
14
+ voice: string;
15
+ mode: Mode;
16
+ ack: boolean;
17
+ auth?: AuthConfig;
18
+ onAudioDelta: (pcm16: Buffer) => void;
19
+ onAudioDone?: () => void;
20
+ onTranscript: (text: string) => void;
21
+ onSpeechStarted: () => void;
22
+ onInitialResponseDone: () => void;
23
+ onDone: () => void;
24
+ onError: (error: string) => void;
25
+ };
26
+ type RealtimeSession = {
27
+ connect(): Promise<void>;
28
+ sendMessage(text: string): void;
29
+ sendAudio(pcm16: Buffer): void;
30
+ close(): void;
31
+ };
32
+
33
+ type RustAudioEngine$1 = {
34
+ start(): void;
35
+ stop(): void;
36
+ close(): void;
37
+ play(pcm16: Buffer): void;
38
+ readProcessedCapture(maxFrames?: number): Buffer[];
39
+ readRawCapture(maxFrames?: number): Buffer[];
40
+ setStreamDelayMs(delayMs: number): void;
41
+ getStats(): {
42
+ captureFrames: number;
43
+ processedFrames: number;
44
+ playbackUnderruns: number;
45
+ droppedRawFrames: number;
46
+ droppedProcessedFrames: number;
47
+ };
48
+ };
8
49
  type AskOptions = {
9
50
  voice?: string;
10
51
  timeout?: number;
11
52
  ack?: boolean;
12
53
  auth?: AuthConfig;
54
+ createSession?: (options: RealtimeSessionOptions) => RealtimeSession;
55
+ createAudioEngine?: (options: {
56
+ sampleRate?: number;
57
+ channels?: number;
58
+ enableAec?: boolean;
59
+ streamDelayMs?: number;
60
+ maxCaptureFrames?: number;
61
+ }) => RustAudioEngine$1;
62
+ onTrace?: (event: {
63
+ atMs: number;
64
+ event: string;
65
+ detail?: Record<string, unknown>;
66
+ }) => void;
13
67
  createPlayer?: unknown;
14
68
  createRecorder?: unknown;
15
69
  onAudioFrameSent?: (pcm16: Buffer) => void;
@@ -18,15 +72,32 @@ type AskOptions = {
18
72
  };
19
73
  declare function ask(message: string, options?: AskOptions): Promise<string>;
20
74
 
75
+ type RustAudioEngine = {
76
+ start(): void;
77
+ stop(): void;
78
+ close(): void;
79
+ play(pcm16: Buffer): void;
80
+ getStats?(): {
81
+ pendingPlaybackSamples?: number;
82
+ };
83
+ };
21
84
  type SayOptions = {
22
85
  voice?: string;
23
86
  auth?: AuthConfig;
87
+ createSession?: (options: RealtimeSessionOptions) => RealtimeSession;
88
+ createAudioEngine?: (options: {
89
+ sampleRate?: number;
90
+ channels?: number;
91
+ enableAec?: boolean;
92
+ streamDelayMs?: number;
93
+ }) => RustAudioEngine;
94
+ onTrace?: (event: {
95
+ atMs: number;
96
+ event: string;
97
+ detail?: Record<string, unknown>;
98
+ }) => void;
24
99
  createPlayer?: unknown;
25
100
  };
26
101
  declare function say(message: string, options?: SayOptions): Promise<void>;
27
102
 
28
- declare const VOICES: readonly ["alloy", "ash", "ballad", "coral", "echo", "fable", "nova", "onyx", "sage", "shimmer", "verse"];
29
- type Voice = (typeof VOICES)[number];
30
- declare const DEFAULT_VOICE: Voice;
31
-
32
103
  export { type AskOptions, type AuthConfig, DEFAULT_VOICE, type SayOptions, VOICES, type Voice, ask, resolveAuth, resolveVoice, say };
package/dist/index.js CHANGED
@@ -44,6 +44,9 @@ function createRealtimeSession(options) {
44
44
  const pcm16 = Buffer.from(event.delta, "base64");
45
45
  options.onAudioDelta(pcm16);
46
46
  });
47
+ rt.on("response.audio.done", () => {
48
+ options.onAudioDone?.();
49
+ });
47
50
  rt.on("conversation.item.input_audio_transcription.completed", (event) => {
48
51
  options.onTranscript(event.transcript);
49
52
  });
@@ -129,22 +132,38 @@ var DEFAULT_VOICE = "ash";
129
132
 
130
133
  // src/ask.ts
131
134
  var require2 = createRequire(import.meta.url);
135
/**
 * Root-mean-square level of signed 16-bit little-endian PCM audio.
 *
 * Only complete 2-byte samples are counted; a dangling final byte is skipped.
 *
 * @param {Buffer} pcm16 - PCM bytes, two per sample.
 * @returns {number} The RMS of the samples, or 0 if the buffer has none.
 */
function pcm16Rms(pcm16) {
  const total = Math.floor(pcm16.length / 2);
  if (total === 0) {
    return 0;
  }
  let squared = 0;
  let index = 0;
  while (index < total) {
    const sample = pcm16.readInt16LE(index * 2);
    squared += sample * sample;
    index += 1;
  }
  const meanSquare = squared / total;
  return Math.sqrt(meanSquare);
}
145
/**
 * Look up an environment variable and interpret it as a base-10 integer.
 *
 * @param {string} name - Name of the environment variable to read.
 * @param {number} fallback - Returned when the variable is missing or its
 *   value does not parse to a finite number.
 * @returns {number} The parsed integer, otherwise `fallback`.
 */
function readEnvInt(name, fallback) {
  const rawValue = process.env[name];
  const isMissing = rawValue === undefined || rawValue === null;
  if (isMissing) {
    return fallback;
  }
  const asInt = Number.parseInt(rawValue, 10);
  if (!Number.isFinite(asInt)) {
    return fallback;
  }
  return asInt;
}
132
151
  async function ask(message, options = {}) {
133
152
  const {
134
153
  voice = DEFAULT_VOICE,
135
154
  timeout = 30,
136
155
  ack = false,
137
156
  auth,
157
+ createSession,
158
+ createAudioEngine,
159
+ onTrace,
138
160
  onAudioFrameSent,
139
161
  onAssistantAudio,
140
162
  onMicAudio
141
163
  } = options;
142
164
  const { AudioEngine } = require2("agent-voice-audio");
143
- const streamDelayMs = Number.parseInt(
144
- process.env.AGENT_VOICE_AEC_STREAM_DELAY_MS ?? "30",
145
- 10
146
- );
147
- const engine = new AudioEngine({
165
+ const streamDelayMs = readEnvInt("AGENT_VOICE_AEC_STREAM_DELAY_MS", 30);
166
+ const engine = (createAudioEngine ?? ((engineOptions) => new AudioEngine(engineOptions)))({
148
167
  sampleRate: SAMPLE_RATE,
149
168
  channels: 1,
150
169
  enableAec: true,
@@ -160,7 +179,11 @@ async function ask(message, options = {}) {
160
179
  process.stderr.write(`[ask ${elapsed}ms] ${event}${suffix}
161
180
  `);
162
181
  }
182
+ function trace(event, detail) {
183
+ onTrace?.({ atMs: Date.now() - startMs, event, detail });
184
+ }
163
185
  logEvent("start");
186
+ trace("start");
164
187
  return new Promise((resolve, reject) => {
165
188
  let transcript = "";
166
189
  let timeoutTimer = null;
@@ -168,15 +191,19 @@ async function ask(message, options = {}) {
168
191
  let transcriptTimer = null;
169
192
  let capturePollTimer = null;
170
193
  let speechDetected = false;
194
+ let speechStartedAtMs = 0;
171
195
  let initialResponseDone = false;
172
196
  let heardAssistantAudio = false;
173
197
  let lastAssistantAudioAt = 0;
198
+ let nearEndEvidenceSeen = false;
199
+ let nearEndEvidenceAtMs = 0;
174
200
  let cleaned = false;
175
201
  let settled = false;
176
202
  async function cleanup() {
177
203
  if (cleaned) return;
178
204
  cleaned = true;
179
205
  logEvent("cleanup:start");
206
+ trace("cleanup:start");
180
207
  if (timeoutTimer) clearTimeout(timeoutTimer);
181
208
  if (responseStartTimer) clearTimeout(responseStartTimer);
182
209
  if (transcriptTimer) clearTimeout(transcriptTimer);
@@ -188,6 +215,7 @@ async function ask(message, options = {}) {
188
215
  }
189
216
  session.close();
190
217
  logEvent("cleanup:done");
218
+ trace("cleanup:done");
191
219
  }
192
220
  function resolveOnce(value) {
193
221
  if (settled) return;
@@ -212,41 +240,75 @@ async function ask(message, options = {}) {
212
240
  `audio engine capture read failed: ${err instanceof Error ? err.message : String(err)}`
213
241
  )
214
242
  );
243
+ trace("audio:capture_read_error", {
244
+ error: err instanceof Error ? err.message : String(err)
245
+ });
215
246
  return;
216
247
  }
217
248
  for (const frame of rawFrames) onMicAudio?.(frame);
218
249
  if (!heardAssistantAudio) return;
219
250
  for (const frame of processedFrames) {
251
+ const rms = pcm16Rms(frame);
252
+ const minSpeechRms = readEnvInt("AGENT_VOICE_MIN_SPEECH_RMS", 550);
253
+ if (rms >= minSpeechRms) {
254
+ nearEndEvidenceSeen = true;
255
+ nearEndEvidenceAtMs = Date.now();
256
+ trace("audio:near_end_evidence", { rms, minSpeechRms });
257
+ }
220
258
  onAudioFrameSent?.(frame);
221
259
  session.sendAudio(frame);
222
260
  }
261
+ if (processedFrames.length > 0) {
262
+ trace("audio:sent_capture", { frames: processedFrames.length });
263
+ }
223
264
  }, 10);
224
- const session = createRealtimeSession({
265
+ const session = (createSession ?? createRealtimeSession)({
225
266
  voice,
226
267
  mode: "default",
227
268
  ack,
228
269
  auth,
229
270
  onAudioDelta(pcm16) {
230
271
  logEvent("realtime:audio_delta", `bytes=${pcm16.length}`);
272
+ trace("realtime:audio_delta", { bytes: pcm16.length });
231
273
  heardAssistantAudio = true;
232
274
  lastAssistantAudioAt = Date.now();
233
275
  onAssistantAudio?.(pcm16);
234
276
  engine.play(pcm16);
235
277
  },
236
278
  onTranscript(text) {
237
- const echoGuardMs = Number.parseInt(
238
- process.env.AGENT_VOICE_ECHO_GUARD_MS ?? "1500",
239
- 10
240
- );
279
+ const echoGuardMs = readEnvInt("AGENT_VOICE_ECHO_GUARD_MS", 1500);
241
280
  const sinceAssistantMs = Date.now() - lastAssistantAudioAt;
242
281
  if (heardAssistantAudio && sinceAssistantMs < echoGuardMs) {
243
282
  logEvent(
244
283
  "realtime:transcript_ignored_echo_guard",
245
284
  `since_assistant_ms=${sinceAssistantMs} text="${text}"`
246
285
  );
286
+ trace("realtime:transcript_ignored_echo_guard", {
287
+ sinceAssistantMs,
288
+ text
289
+ });
247
290
  return;
248
291
  }
249
292
  logEvent("realtime:transcript", `text="${text}"`);
293
+ trace("realtime:transcript", { text });
294
+ if (speechDetected) {
295
+ const evidenceWindowMs = readEnvInt(
296
+ "AGENT_VOICE_SPEECH_EVIDENCE_WINDOW_MS",
297
+ 1200
298
+ );
299
+ const evidenceAgeMs = nearEndEvidenceSeen ? Math.abs(nearEndEvidenceAtMs - speechStartedAtMs) : Number.POSITIVE_INFINITY;
300
+ if (!nearEndEvidenceSeen || evidenceAgeMs > evidenceWindowMs) {
301
+ trace("realtime:transcript_ignored_no_near_end_evidence", {
302
+ text,
303
+ speechStartedAtMs,
304
+ nearEndEvidenceSeen,
305
+ nearEndEvidenceAtMs,
306
+ evidenceAgeMs,
307
+ evidenceWindowMs
308
+ });
309
+ return;
310
+ }
311
+ }
250
312
  if (transcriptTimer) {
251
313
  clearTimeout(transcriptTimer);
252
314
  transcriptTimer = null;
@@ -256,7 +318,9 @@ async function ask(message, options = {}) {
256
318
  },
257
319
  onSpeechStarted() {
258
320
  logEvent("realtime:speech_started");
321
+ trace("realtime:speech_started");
259
322
  speechDetected = true;
323
+ speechStartedAtMs = Date.now();
260
324
  if (timeoutTimer) {
261
325
  clearTimeout(timeoutTimer);
262
326
  timeoutTimer = null;
@@ -264,6 +328,9 @@ async function ask(message, options = {}) {
264
328
  if (transcriptTimer) clearTimeout(transcriptTimer);
265
329
  transcriptTimer = setTimeout(() => {
266
330
  logEvent("timeout:no_transcript_after_speech");
331
+ trace("timeout:no_transcript_after_speech", {
332
+ timeoutSeconds: timeout
333
+ });
267
334
  rejectOnce(
268
335
  new Error(
269
336
  `No transcript received within ${timeout}s after speech started`
@@ -279,10 +346,12 @@ async function ask(message, options = {}) {
279
346
  },
280
347
  onInitialResponseDone() {
281
348
  logEvent("realtime:initial_response_done");
349
+ trace("realtime:initial_response_done");
282
350
  initialResponseDone = true;
283
351
  timeoutTimer = setTimeout(() => {
284
352
  if (!speechDetected) {
285
353
  logEvent("timeout:no_speech");
354
+ trace("timeout:no_speech", { timeoutSeconds: timeout });
286
355
  rejectOnce(
287
356
  new Error(`No speech detected within ${timeout}s timeout`)
288
357
  );
@@ -291,21 +360,26 @@ async function ask(message, options = {}) {
291
360
  },
292
361
  onDone() {
293
362
  logEvent("realtime:done");
363
+ trace("realtime:done");
294
364
  if (ack) resolveOnce(transcript);
295
365
  },
296
366
  onError(error) {
297
367
  logEvent("realtime:error", error);
368
+ trace("realtime:error", { error });
298
369
  rejectOnce(new Error(error));
299
370
  }
300
371
  });
301
372
  session.connect().then(
302
373
  () => {
303
374
  logEvent("realtime:connected");
375
+ trace("realtime:connected");
304
376
  logEvent("realtime:send_message");
377
+ trace("realtime:send_message");
305
378
  session.sendMessage(message);
306
379
  responseStartTimer = setTimeout(() => {
307
380
  if (!heardAssistantAudio) {
308
381
  logEvent("timeout:no_assistant_audio");
382
+ trace("timeout:no_assistant_audio");
309
383
  rejectOnce(
310
384
  new Error("No assistant audio received after sending message")
311
385
  );
@@ -317,6 +391,9 @@ async function ask(message, options = {}) {
317
391
  "realtime:connect_error",
318
392
  err instanceof Error ? err.message : String(err)
319
393
  );
394
+ trace("realtime:connect_error", {
395
+ error: err instanceof Error ? err.message : String(err)
396
+ });
320
397
  rejectOnce(err instanceof Error ? err : new Error(String(err)));
321
398
  }
322
399
  );
@@ -357,50 +434,150 @@ function resolveVoice() {
357
434
  import { createRequire as createRequire2 } from "module";
358
435
  var require3 = createRequire2(import.meta.url);
359
436
  async function say(message, options = {}) {
360
- const { voice = DEFAULT_VOICE, auth } = options;
437
+ const {
438
+ voice = DEFAULT_VOICE,
439
+ auth,
440
+ createSession,
441
+ createAudioEngine,
442
+ onTrace
443
+ } = options;
361
444
  const { AudioEngine } = require3("agent-voice-audio");
362
- const engine = new AudioEngine({
445
+ const startMs = Date.now();
446
+ function trace(event, detail) {
447
+ onTrace?.({ atMs: Date.now() - startMs, event, detail });
448
+ }
449
+ const engine = (createAudioEngine ?? ((engineOptions) => new AudioEngine(engineOptions)))({
363
450
  sampleRate: SAMPLE_RATE,
364
451
  channels: 1,
365
452
  enableAec: false
366
453
  });
367
454
  engine.start();
455
+ trace("start");
368
456
  return new Promise((resolve, reject) => {
369
457
  let cleaned = false;
458
+ let settled = false;
459
+ let responseDoneFallbackTimer = null;
460
+ let completionTailTimer = null;
461
+ let drainPollTimer = null;
462
+ let drainDeadlineTimer = null;
370
463
  function cleanup() {
371
464
  if (cleaned) return;
372
465
  cleaned = true;
466
+ if (responseDoneFallbackTimer) clearTimeout(responseDoneFallbackTimer);
467
+ if (completionTailTimer) clearTimeout(completionTailTimer);
468
+ if (drainPollTimer) clearInterval(drainPollTimer);
469
+ if (drainDeadlineTimer) clearTimeout(drainDeadlineTimer);
373
470
  try {
374
471
  engine.stop();
375
472
  engine.close();
376
473
  } catch {
377
474
  }
378
475
  session.close();
476
+ trace("cleanup");
379
477
  }
380
- const session = createRealtimeSession({
478
+ function resolveOnce() {
479
+ if (settled) return;
480
+ settled = true;
481
+ cleanup();
482
+ resolve();
483
+ }
484
+ function rejectOnce(error) {
485
+ if (settled) return;
486
+ settled = true;
487
+ cleanup();
488
+ reject(error);
489
+ }
490
+ function waitForPlaybackDrain() {
491
+ if (settled) return;
492
+ if (!engine.getStats) {
493
+ trace("drain:no_stats");
494
+ resolveOnce();
495
+ return;
496
+ }
497
+ const absoluteDeadlineMs = 2e4;
498
+ const maxNoProgressMs = 1200;
499
+ const drainStartMs = Date.now();
500
+ let lastProgressAtMs = drainStartMs;
501
+ let lastPending = Number.POSITIVE_INFINITY;
502
+ trace("drain:deadline_scheduled", {
503
+ absoluteDeadlineMs,
504
+ maxNoProgressMs
505
+ });
506
+ let zeroStreak = 0;
507
+ drainPollTimer = setInterval(() => {
508
+ if (settled) return;
509
+ let pending = 0;
510
+ try {
511
+ pending = Number(engine.getStats?.().pendingPlaybackSamples ?? 0);
512
+ } catch {
513
+ pending = 0;
514
+ }
515
+ trace("drain:poll", { pendingPlaybackSamples: pending });
516
+ if (pending < lastPending) {
517
+ lastPending = pending;
518
+ lastProgressAtMs = Date.now();
519
+ }
520
+ if (pending <= 0) {
521
+ zeroStreak += 1;
522
+ if (zeroStreak >= 3) {
523
+ resolveOnce();
524
+ }
525
+ return;
526
+ }
527
+ zeroStreak = 0;
528
+ if (Date.now() - lastProgressAtMs > maxNoProgressMs) {
529
+ trace("drain:no_progress_timeout", {
530
+ pendingPlaybackSamples: pending
531
+ });
532
+ resolveOnce();
533
+ }
534
+ }, 20);
535
+ drainDeadlineTimer = setTimeout(() => {
536
+ trace("drain:deadline");
537
+ resolveOnce();
538
+ }, absoluteDeadlineMs);
539
+ }
540
+ function scheduleTailResolve(delayMs) {
541
+ if (settled) return;
542
+ if (completionTailTimer) clearTimeout(completionTailTimer);
543
+ completionTailTimer = setTimeout(() => {
544
+ waitForPlaybackDrain();
545
+ }, delayMs);
546
+ trace("tail_scheduled", { delayMs });
547
+ }
548
+ const session = (createSession ?? createRealtimeSession)({
381
549
  voice,
382
550
  mode: "say",
383
551
  ack: false,
384
552
  auth,
385
553
  onAudioDelta(pcm16) {
386
554
  engine.play(pcm16);
555
+ trace("realtime:audio_delta", { bytes: pcm16.length });
556
+ },
557
+ onAudioDone() {
558
+ scheduleTailResolve(140);
559
+ trace("realtime:audio_done");
387
560
  },
388
561
  onTranscript() {
389
562
  },
390
563
  onSpeechStarted() {
391
564
  },
392
565
  onInitialResponseDone() {
393
- cleanup();
394
- resolve();
566
+ if (responseDoneFallbackTimer) clearTimeout(responseDoneFallbackTimer);
567
+ responseDoneFallbackTimer = setTimeout(() => {
568
+ scheduleTailResolve(220);
569
+ }, 700);
570
+ trace("realtime:initial_response_done");
395
571
  },
396
572
  onDone() {
397
573
  },
398
574
  onError(error) {
399
- cleanup();
400
- reject(new Error(error));
575
+ trace("realtime:error", { error });
576
+ rejectOnce(new Error(error));
401
577
  }
402
578
  });
403
579
  session.connect().then(() => {
580
+ trace("realtime:connected");
404
581
  session.sendMessage(message);
405
582
  }, reject);
406
583
  });
@@ -0,0 +1,164 @@
1
+ #!/usr/bin/env node
2
+ import {
3
+ createRealtimeSession
4
+ } from "./chunk-UYBFONQE.js";
5
+ import {
6
+ DEFAULT_VOICE,
7
+ SAMPLE_RATE
8
+ } from "./chunk-AHLLYIEW.js";
9
+
10
+ // src/say.ts
11
+ import { createRequire } from "module";
12
+ var require2 = createRequire(import.meta.url);
13
/**
 * Speak `message` through a realtime session and resolve once playback has
 * finished (or a drain timeout has elapsed).
 *
 * Flow: an audio engine is created with echo cancellation disabled and
 * started, a session is opened in "say" mode, the message is sent, and each
 * audio delta is handed to `engine.play`. After the session signals that audio
 * is done (or, as a fallback, that the initial response is done), the function
 * waits for the engine's pending playback samples to reach zero before
 * resolving.
 *
 * @param {string} message - Text to be spoken.
 * @param {object} [options]
 * @param {string} [options.voice] - Voice name; defaults to `DEFAULT_VOICE`.
 * @param {object} [options.auth] - Passed through to the session factory.
 * @param {Function} [options.createSession] - Session factory override; the
 *   built-in `createRealtimeSession` is used when omitted.
 * @param {Function} [options.createAudioEngine] - Audio engine factory
 *   override; `new AudioEngine(opts)` is used when omitted.
 * @param {Function} [options.onTrace] - Receives `{ atMs, event, detail }`
 *   records, where `atMs` is milliseconds since `say` was called.
 * @returns {Promise<void>} Resolves after playback drains; rejects on a
 *   session error or when connecting fails. The engine and session are
 *   released in both cases.
 */
async function say(message, options = {}) {
  const {
    voice = DEFAULT_VOICE,
    auth,
    createSession,
    createAudioEngine,
    onTrace
  } = options;
  const { AudioEngine } = require2("agent-voice-audio");
  const startMs = Date.now();
  function trace(event, detail) {
    onTrace?.({ atMs: Date.now() - startMs, event, detail });
  }
  const makeEngine = createAudioEngine ?? ((engineOptions) => new AudioEngine(engineOptions));
  const engine = makeEngine({
    sampleRate: SAMPLE_RATE,
    channels: 1,
    enableAec: false
  });
  engine.start();
  trace("start");
  return new Promise((resolve, reject) => {
    let cleaned = false;
    let settled = false;
    let responseDoneFallbackTimer = null;
    let completionTailTimer = null;
    let drainPollTimer = null;
    let drainDeadlineTimer = null;

    // Release every timer, the audio engine and the session. Safe to call twice.
    function cleanup() {
      if (cleaned) return;
      cleaned = true;
      if (responseDoneFallbackTimer) clearTimeout(responseDoneFallbackTimer);
      if (completionTailTimer) clearTimeout(completionTailTimer);
      if (drainPollTimer) clearInterval(drainPollTimer);
      if (drainDeadlineTimer) clearTimeout(drainDeadlineTimer);
      try {
        engine.stop();
        engine.close();
      } catch {
        // Best effort: a failing engine shutdown must not block settling.
      }
      session.close();
      trace("cleanup");
    }

    function resolveOnce() {
      if (settled) return;
      settled = true;
      cleanup();
      resolve();
    }

    function rejectOnce(error) {
      if (settled) return;
      settled = true;
      cleanup();
      reject(error);
    }

    // Poll the engine until its pending playback reaches zero, then resolve.
    function waitForPlaybackDrain() {
      if (settled) return;
      // A drain may be requested twice (audio-done tail, then the
      // initial-response fallback tail). Starting a second poll would
      // overwrite the timer handles and leave the first interval running
      // forever, so keep the drain that is already in progress.
      if (drainPollTimer || drainDeadlineTimer) return;
      if (!engine.getStats) {
        // No way to observe the playback queue; resolve right away.
        trace("drain:no_stats");
        resolveOnce();
        return;
      }
      const absoluteDeadlineMs = 2e4;
      const maxNoProgressMs = 1200;
      let lastProgressAtMs = Date.now();
      let lastPending = Number.POSITIVE_INFINITY;
      trace("drain:deadline_scheduled", {
        absoluteDeadlineMs,
        maxNoProgressMs
      });
      let zeroStreak = 0;
      drainPollTimer = setInterval(() => {
        if (settled) return;
        let pending = 0;
        try {
          pending = Number(engine.getStats?.().pendingPlaybackSamples ?? 0);
        } catch {
          // A failing stats read is treated as "nothing pending".
          pending = 0;
        }
        trace("drain:poll", { pendingPlaybackSamples: pending });
        if (pending < lastPending) {
          lastPending = pending;
          lastProgressAtMs = Date.now();
        }
        if (pending <= 0) {
          // Require three consecutive empty polls before declaring done.
          zeroStreak += 1;
          if (zeroStreak >= 3) {
            resolveOnce();
          }
          return;
        }
        zeroStreak = 0;
        if (Date.now() - lastProgressAtMs > maxNoProgressMs) {
          // The queue stopped shrinking; give up waiting.
          trace("drain:no_progress_timeout", {
            pendingPlaybackSamples: pending
          });
          resolveOnce();
        }
      }, 20);
      // Hard upper bound on how long draining may take.
      drainDeadlineTimer = setTimeout(() => {
        trace("drain:deadline");
        resolveOnce();
      }, absoluteDeadlineMs);
    }

    // (Re)start the short delay that precedes the playback drain.
    function scheduleTailResolve(delayMs) {
      if (settled) return;
      if (completionTailTimer) clearTimeout(completionTailTimer);
      completionTailTimer = setTimeout(() => {
        waitForPlaybackDrain();
      }, delayMs);
      trace("tail_scheduled", { delayMs });
    }

    const session = (createSession ?? createRealtimeSession)({
      voice,
      mode: "say",
      ack: false,
      auth,
      onAudioDelta(pcm16) {
        engine.play(pcm16);
        trace("realtime:audio_delta", { bytes: pcm16.length });
      },
      onAudioDone() {
        scheduleTailResolve(140);
        trace("realtime:audio_done");
      },
      onTranscript() {
      },
      onSpeechStarted() {
      },
      onInitialResponseDone() {
        // Fallback path: schedules the tail 700 ms after the response is done.
        if (responseDoneFallbackTimer) clearTimeout(responseDoneFallbackTimer);
        responseDoneFallbackTimer = setTimeout(() => {
          scheduleTailResolve(220);
        }, 700);
        trace("realtime:initial_response_done");
      },
      onDone() {
      },
      onError(error) {
        trace("realtime:error", { error });
        rejectOnce(new Error(error));
      }
    });
    session.connect().then(
      () => {
        trace("realtime:connected");
        session.sendMessage(message);
      },
      (err) => {
        // Go through rejectOnce so the started engine and the session are
        // released when the connection cannot be established.
        trace("realtime:connect_error", {
          error: err instanceof Error ? err.message : String(err)
        });
        rejectOnce(err instanceof Error ? err : new Error(String(err)));
      }
    );
  });
}
162
+ export {
163
+ say
164
+ };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-voice",
3
- "version": "0.2.0",
3
+ "version": "0.2.2",
4
4
  "description": "CLI for AI agents to interact with humans via voice",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -18,7 +18,7 @@
18
18
  "dist"
19
19
  ],
20
20
  "dependencies": {
21
- "agent-voice-audio": "^0.2.0",
21
+ "agent-voice-audio": "^0.2.1",
22
22
  "@inquirer/prompts": "^8.2.0",
23
23
  "commander": "^13.1.0",
24
24
  "openai": "^4.96.0",
@@ -1,64 +0,0 @@
1
- #!/usr/bin/env node
2
- import {
3
- createRealtimeSession
4
- } from "./chunk-VV2VNOC4.js";
5
- import {
6
- DEFAULT_VOICE,
7
- SAMPLE_RATE
8
- } from "./chunk-AHLLYIEW.js";
9
-
10
- // src/say.ts
11
- import { createRequire } from "module";
12
- var require2 = createRequire(import.meta.url);
13
- async function say(message, options = {}) {
14
- const { voice = DEFAULT_VOICE, auth } = options;
15
- const { AudioEngine } = require2("agent-voice-audio");
16
- const engine = new AudioEngine({
17
- sampleRate: SAMPLE_RATE,
18
- channels: 1,
19
- enableAec: false
20
- });
21
- engine.start();
22
- return new Promise((resolve, reject) => {
23
- let cleaned = false;
24
- function cleanup() {
25
- if (cleaned) return;
26
- cleaned = true;
27
- try {
28
- engine.stop();
29
- engine.close();
30
- } catch {
31
- }
32
- session.close();
33
- }
34
- const session = createRealtimeSession({
35
- voice,
36
- mode: "say",
37
- ack: false,
38
- auth,
39
- onAudioDelta(pcm16) {
40
- engine.play(pcm16);
41
- },
42
- onTranscript() {
43
- },
44
- onSpeechStarted() {
45
- },
46
- onInitialResponseDone() {
47
- cleanup();
48
- resolve();
49
- },
50
- onDone() {
51
- },
52
- onError(error) {
53
- cleanup();
54
- reject(new Error(error));
55
- }
56
- });
57
- session.connect().then(() => {
58
- session.sendMessage(message);
59
- }, reject);
60
- });
61
- }
62
- export {
63
- say
64
- };