agent-voice 0.1.3 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,208 @@
1
+ #!/usr/bin/env node
2
+ import {
3
+ createRealtimeSession
4
+ } from "./chunk-UYBFONQE.js";
5
+ import {
6
+ DEFAULT_VOICE,
7
+ SAMPLE_RATE
8
+ } from "./chunk-AHLLYIEW.js";
9
+
10
+ // src/ask.ts
11
+ import { createRequire } from "module";
12
+ var require2 = createRequire(import.meta.url);
13
+ async function ask(message, options = {}) {
14
+ const {
15
+ voice = DEFAULT_VOICE,
16
+ timeout = 30,
17
+ ack = false,
18
+ auth,
19
+ onAudioFrameSent,
20
+ onAssistantAudio,
21
+ onMicAudio
22
+ } = options;
23
+ const { AudioEngine } = require2("agent-voice-audio");
24
+ const streamDelayMs = Number.parseInt(
25
+ process.env.AGENT_VOICE_AEC_STREAM_DELAY_MS ?? "30",
26
+ 10
27
+ );
28
+ const engine = new AudioEngine({
29
+ sampleRate: SAMPLE_RATE,
30
+ channels: 1,
31
+ enableAec: true,
32
+ streamDelayMs
33
+ });
34
+ engine.start();
35
+ const debug = process.env.AGENT_VOICE_DEBUG_ASK_EVENTS === "1";
36
+ const startMs = Date.now();
37
+ function logEvent(event, detail) {
38
+ if (!debug) return;
39
+ const elapsed = Date.now() - startMs;
40
+ const suffix = detail ? ` ${detail}` : "";
41
+ process.stderr.write(`[ask ${elapsed}ms] ${event}${suffix}
42
+ `);
43
+ }
44
+ logEvent("start");
45
+ return new Promise((resolve, reject) => {
46
+ let transcript = "";
47
+ let timeoutTimer = null;
48
+ let responseStartTimer = null;
49
+ let transcriptTimer = null;
50
+ let capturePollTimer = null;
51
+ let speechDetected = false;
52
+ let initialResponseDone = false;
53
+ let heardAssistantAudio = false;
54
+ let lastAssistantAudioAt = 0;
55
+ let cleaned = false;
56
+ let settled = false;
57
+ async function cleanup() {
58
+ if (cleaned) return;
59
+ cleaned = true;
60
+ logEvent("cleanup:start");
61
+ if (timeoutTimer) clearTimeout(timeoutTimer);
62
+ if (responseStartTimer) clearTimeout(responseStartTimer);
63
+ if (transcriptTimer) clearTimeout(transcriptTimer);
64
+ if (capturePollTimer) clearInterval(capturePollTimer);
65
+ try {
66
+ engine.stop();
67
+ engine.close();
68
+ } catch {
69
+ }
70
+ session.close();
71
+ logEvent("cleanup:done");
72
+ }
73
+ function resolveOnce(value) {
74
+ if (settled) return;
75
+ settled = true;
76
+ cleanup().then(() => resolve(value));
77
+ }
78
+ function rejectOnce(error) {
79
+ if (settled) return;
80
+ settled = true;
81
+ cleanup().then(() => reject(error));
82
+ }
83
+ capturePollTimer = setInterval(() => {
84
+ if (settled) return;
85
+ let rawFrames = [];
86
+ let processedFrames = [];
87
+ try {
88
+ rawFrames = engine.readRawCapture(64);
89
+ processedFrames = engine.readProcessedCapture(64);
90
+ } catch (err) {
91
+ rejectOnce(
92
+ new Error(
93
+ `audio engine capture read failed: ${err instanceof Error ? err.message : String(err)}`
94
+ )
95
+ );
96
+ return;
97
+ }
98
+ for (const frame of rawFrames) onMicAudio?.(frame);
99
+ if (!heardAssistantAudio) return;
100
+ for (const frame of processedFrames) {
101
+ onAudioFrameSent?.(frame);
102
+ session.sendAudio(frame);
103
+ }
104
+ }, 10);
105
+ const session = createRealtimeSession({
106
+ voice,
107
+ mode: "default",
108
+ ack,
109
+ auth,
110
+ onAudioDelta(pcm16) {
111
+ logEvent("realtime:audio_delta", `bytes=${pcm16.length}`);
112
+ heardAssistantAudio = true;
113
+ lastAssistantAudioAt = Date.now();
114
+ onAssistantAudio?.(pcm16);
115
+ engine.play(pcm16);
116
+ },
117
+ onTranscript(text) {
118
+ const echoGuardMs = Number.parseInt(
119
+ process.env.AGENT_VOICE_ECHO_GUARD_MS ?? "1500",
120
+ 10
121
+ );
122
+ const sinceAssistantMs = Date.now() - lastAssistantAudioAt;
123
+ if (heardAssistantAudio && sinceAssistantMs < echoGuardMs) {
124
+ logEvent(
125
+ "realtime:transcript_ignored_echo_guard",
126
+ `since_assistant_ms=${sinceAssistantMs} text="${text}"`
127
+ );
128
+ return;
129
+ }
130
+ logEvent("realtime:transcript", `text="${text}"`);
131
+ if (transcriptTimer) {
132
+ clearTimeout(transcriptTimer);
133
+ transcriptTimer = null;
134
+ }
135
+ transcript = text;
136
+ if (!ack) resolveOnce(transcript);
137
+ },
138
+ onSpeechStarted() {
139
+ logEvent("realtime:speech_started");
140
+ speechDetected = true;
141
+ if (timeoutTimer) {
142
+ clearTimeout(timeoutTimer);
143
+ timeoutTimer = null;
144
+ }
145
+ if (transcriptTimer) clearTimeout(transcriptTimer);
146
+ transcriptTimer = setTimeout(() => {
147
+ logEvent("timeout:no_transcript_after_speech");
148
+ rejectOnce(
149
+ new Error(
150
+ `No transcript received within ${timeout}s after speech started`
151
+ )
152
+ );
153
+ }, timeout * 1e3);
154
+ if (!initialResponseDone && heardAssistantAudio) {
155
+ try {
156
+ engine.play(Buffer.alloc(0));
157
+ } catch {
158
+ }
159
+ }
160
+ },
161
+ onInitialResponseDone() {
162
+ logEvent("realtime:initial_response_done");
163
+ initialResponseDone = true;
164
+ timeoutTimer = setTimeout(() => {
165
+ if (!speechDetected) {
166
+ logEvent("timeout:no_speech");
167
+ rejectOnce(
168
+ new Error(`No speech detected within ${timeout}s timeout`)
169
+ );
170
+ }
171
+ }, timeout * 1e3);
172
+ },
173
+ onDone() {
174
+ logEvent("realtime:done");
175
+ if (ack) resolveOnce(transcript);
176
+ },
177
+ onError(error) {
178
+ logEvent("realtime:error", error);
179
+ rejectOnce(new Error(error));
180
+ }
181
+ });
182
+ session.connect().then(
183
+ () => {
184
+ logEvent("realtime:connected");
185
+ logEvent("realtime:send_message");
186
+ session.sendMessage(message);
187
+ responseStartTimer = setTimeout(() => {
188
+ if (!heardAssistantAudio) {
189
+ logEvent("timeout:no_assistant_audio");
190
+ rejectOnce(
191
+ new Error("No assistant audio received after sending message")
192
+ );
193
+ }
194
+ }, 1e4);
195
+ },
196
+ (err) => {
197
+ logEvent(
198
+ "realtime:connect_error",
199
+ err instanceof Error ? err.message : String(err)
200
+ );
201
+ rejectOnce(err instanceof Error ? err : new Error(String(err)));
202
+ }
203
+ );
204
+ });
205
+ }
206
+ export {
207
+ ask
208
+ };
@@ -1,8 +1,8 @@
1
1
  #!/usr/bin/env node
2
2
  import {
3
3
  writeAuthConfig
4
- } from "./chunk-7ERYR6ZY.js";
5
- import "./chunk-D3AGL5JD.js";
4
+ } from "./chunk-RGYWLATZ.js";
5
+ import "./chunk-AHLLYIEW.js";
6
6
 
7
7
  // src/auth.ts
8
8
  import { input, password } from "@inquirer/prompts";
@@ -3,6 +3,7 @@
3
3
  // src/types.ts
4
4
  var SAMPLE_RATE = 24e3;
5
5
  var CHANNELS = 1;
6
+ var BIT_DEPTH = 16;
6
7
  var VOICES = [
7
8
  "alloy",
8
9
  "ash",
@@ -21,6 +22,7 @@ var DEFAULT_VOICE = "ash";
21
22
  export {
22
23
  SAMPLE_RATE,
23
24
  CHANNELS,
25
+ BIT_DEPTH,
24
26
  VOICES,
25
27
  DEFAULT_VOICE
26
28
  };
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env node
2
2
  import {
3
3
  DEFAULT_VOICE
4
- } from "./chunk-D3AGL5JD.js";
4
+ } from "./chunk-AHLLYIEW.js";
5
5
 
6
6
  // src/config.ts
7
7
  import { chmodSync, mkdirSync, readFileSync, writeFileSync } from "fs";
@@ -1,71 +1,4 @@
1
1
  #!/usr/bin/env node
2
- import {
3
- CHANNELS,
4
- SAMPLE_RATE
5
- } from "./chunk-D3AGL5JD.js";
6
-
7
- // src/audio.ts
8
- import { AudioIO, SampleFormat16Bit } from "naudiodon2";
9
- function createAudioPlayer() {
10
- const stream = AudioIO({
11
- outOptions: {
12
- channelCount: CHANNELS,
13
- sampleFormat: SampleFormat16Bit,
14
- sampleRate: SAMPLE_RATE,
15
- closeOnError: true
16
- }
17
- });
18
- let closed = false;
19
- return {
20
- write(pcm16) {
21
- return stream.write(pcm16);
22
- },
23
- start() {
24
- stream.start();
25
- },
26
- drain() {
27
- if (closed) return Promise.resolve();
28
- closed = true;
29
- return new Promise((resolve) => {
30
- stream.quit(() => resolve());
31
- });
32
- },
33
- close() {
34
- if (closed) return;
35
- closed = true;
36
- stream.quit();
37
- }
38
- };
39
- }
40
- function createAudioRecorder() {
41
- const stream = AudioIO({
42
- inOptions: {
43
- channelCount: CHANNELS,
44
- sampleFormat: SampleFormat16Bit,
45
- sampleRate: SAMPLE_RATE,
46
- closeOnError: true
47
- }
48
- });
49
- let stopped = false;
50
- return {
51
- onData(cb) {
52
- stream.on("data", cb);
53
- },
54
- start() {
55
- stream.start();
56
- },
57
- stop() {
58
- if (stopped) return;
59
- stopped = true;
60
- stream.quit();
61
- },
62
- close() {
63
- if (stopped) return;
64
- stopped = true;
65
- stream.quit();
66
- }
67
- };
68
- }
69
2
 
70
3
  // src/realtime.ts
71
4
  import { OpenAIRealtimeWS } from "openai/beta/realtime/ws";
@@ -110,6 +43,9 @@ function createRealtimeSession(options) {
110
43
  const pcm16 = Buffer.from(event.delta, "base64");
111
44
  options.onAudioDelta(pcm16);
112
45
  });
46
+ rt.on("response.audio.done", () => {
47
+ options.onAudioDone?.();
48
+ });
113
49
  rt.on("conversation.item.input_audio_transcription.completed", (event) => {
114
50
  options.onTranscript(event.transcript);
115
51
  });
@@ -177,7 +113,5 @@ ${text}`
177
113
  }
178
114
 
179
115
  export {
180
- createAudioPlayer,
181
- createAudioRecorder,
182
116
  createRealtimeSession
183
117
  };
package/dist/cli.js CHANGED
@@ -3,13 +3,17 @@ import {
3
3
  resolveAuth,
4
4
  resolveVoice,
5
5
  writeVoiceConfig
6
- } from "./chunk-7ERYR6ZY.js";
6
+ } from "./chunk-RGYWLATZ.js";
7
7
  import {
8
+ BIT_DEPTH,
9
+ CHANNELS,
10
+ SAMPLE_RATE,
8
11
  VOICES
9
- } from "./chunk-D3AGL5JD.js";
12
+ } from "./chunk-AHLLYIEW.js";
10
13
 
11
14
  // src/cli.ts
12
- import { closeSync, openSync, writeSync } from "fs";
15
+ import { closeSync, mkdirSync, openSync, writeFileSync, writeSync } from "fs";
16
+ import { join } from "path";
13
17
  import { Command } from "commander";
14
18
  async function withSuppressedNativeOutput() {
15
19
  const savedStdout = openSync("/dev/fd/1", "w");
@@ -18,8 +22,8 @@ async function withSuppressedNativeOutput() {
18
22
  openSync("/dev/null", "w");
19
23
  closeSync(2);
20
24
  openSync("/dev/null", "w");
21
- const { ask } = await import("./ask-6HS5WYJU.js");
22
- const { say } = await import("./say-PKBQ2ZDL.js");
25
+ const { ask } = await import("./ask-A32EH5QX.js");
26
+ const { say } = await import("./say-ELJAIWUM.js");
23
27
  function writeResult(text) {
24
28
  writeSync(savedStdout, `${text}
25
29
  `);
@@ -45,10 +49,42 @@ async function getMessage(flag) {
45
49
  if (stdin) return stdin;
46
50
  throw new Error("No message provided. Use -m or pipe via stdin.");
47
51
  }
52
+ function createWavBuffer(pcm16) {
53
+ const header = Buffer.alloc(44);
54
+ const dataSize = pcm16.length;
55
+ const fileSize = 36 + dataSize;
56
+ const byteRate = SAMPLE_RATE * CHANNELS * (BIT_DEPTH / 8);
57
+ const blockAlign = CHANNELS * (BIT_DEPTH / 8);
58
+ header.write("RIFF", 0);
59
+ header.writeUInt32LE(fileSize, 4);
60
+ header.write("WAVE", 8);
61
+ header.write("fmt ", 12);
62
+ header.writeUInt32LE(16, 16);
63
+ header.writeUInt16LE(1, 20);
64
+ header.writeUInt16LE(CHANNELS, 22);
65
+ header.writeUInt32LE(SAMPLE_RATE, 24);
66
+ header.writeUInt32LE(byteRate, 28);
67
+ header.writeUInt16LE(blockAlign, 32);
68
+ header.writeUInt16LE(BIT_DEPTH, 34);
69
+ header.write("data", 36);
70
+ header.writeUInt32LE(dataSize, 40);
71
+ return Buffer.concat([header, pcm16]);
72
+ }
73
+ function writeDebugAudio(dir, assistantChunks, micChunks, modelInputChunks) {
74
+ mkdirSync(dir, { recursive: true });
75
+ const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
76
+ const assistantFile = join(dir, `ask-${stamp}-assistant-output.wav`);
77
+ const micFile = join(dir, `ask-${stamp}-mic-input.wav`);
78
+ const modelInputFile = join(dir, `ask-${stamp}-model-input.wav`);
79
+ writeFileSync(assistantFile, createWavBuffer(Buffer.concat(assistantChunks)));
80
+ writeFileSync(micFile, createWavBuffer(Buffer.concat(micChunks)));
81
+ writeFileSync(modelInputFile, createWavBuffer(Buffer.concat(modelInputChunks)));
82
+ return { assistantFile, micFile, modelInputFile };
83
+ }
48
84
  var program = new Command().name("agent-voice").description("AI agent voice interaction CLI");
49
85
  program.command("auth").description("Configure API key and base URL").option("--api-url <url>", "Base URL for the API").option("--api-key <key>", "API key").option("--no-verify", "Skip API key verification").action(async (opts) => {
50
86
  try {
51
- const { auth } = await import("./auth-BRJKBMOE.js");
87
+ const { auth } = await import("./auth-KET5DNSE.js");
52
88
  await auth({
53
89
  apiUrl: opts.apiUrl,
54
90
  apiKey: opts.apiKey,
@@ -84,8 +120,11 @@ voicesCmd.command("set <voice>").description("Set the default voice").action((vo
84
120
  `);
85
121
  process.exit(0);
86
122
  });
87
- program.command("ask").description("Speak a message and listen for a response").option("-m, --message <text>", "Text message to speak").option("--voice <name>", "OpenAI voice", defaultVoice).option("--timeout <seconds>", "Seconds to wait for user speech", "120").option("--ack", "Speak an acknowledgment after the user responds").action(async (opts) => {
123
+ program.command("ask").description("Speak a message and listen for a response").option("-m, --message <text>", "Text message to speak").option("--voice <name>", "OpenAI voice", defaultVoice).option("--timeout <seconds>", "Seconds to wait for user speech", "120").option("--ack", "Speak an acknowledgment after the user responds").option("--debug-audio-dir <dir>", "Write ask audio debug WAVs to this directory").action(async (opts) => {
88
124
  const { ask, writeResult, writeError } = await withSuppressedNativeOutput();
125
+ const assistantChunks = [];
126
+ const micChunks = [];
127
+ const modelInputChunks = [];
89
128
  try {
90
129
  const auth = resolveAuth();
91
130
  const message = await getMessage(opts.message);
@@ -93,11 +132,45 @@ program.command("ask").description("Speak a message and listen for a response").
93
132
  voice: opts.voice,
94
133
  timeout: Number.parseInt(opts.timeout, 10),
95
134
  ack: opts.ack ?? false,
96
- auth
135
+ auth,
136
+ onAssistantAudio: opts.debugAudioDir ? (pcm16) => assistantChunks.push(Buffer.from(pcm16)) : void 0,
137
+ onMicAudio: opts.debugAudioDir ? (pcm16) => micChunks.push(Buffer.from(pcm16)) : void 0,
138
+ onAudioFrameSent: opts.debugAudioDir ? (pcm16) => modelInputChunks.push(Buffer.from(pcm16)) : void 0
97
139
  });
140
+ if (opts.debugAudioDir) {
141
+ const files = writeDebugAudio(
142
+ opts.debugAudioDir,
143
+ assistantChunks,
144
+ micChunks,
145
+ modelInputChunks
146
+ );
147
+ writeError(
148
+ `debug audio written:
149
+ ${files.assistantFile}
150
+ ${files.micFile}
151
+ ${files.modelInputFile}`
152
+ );
153
+ }
98
154
  writeResult(transcript);
99
155
  process.exit(0);
100
156
  } catch (err) {
157
+ if (opts.debugAudioDir) {
158
+ try {
159
+ const files = writeDebugAudio(
160
+ opts.debugAudioDir,
161
+ assistantChunks,
162
+ micChunks,
163
+ modelInputChunks
164
+ );
165
+ writeError(
166
+ `debug audio written:
167
+ ${files.assistantFile}
168
+ ${files.micFile}
169
+ ${files.modelInputFile}`
170
+ );
171
+ } catch {
172
+ }
173
+ }
101
174
  writeError(`${err instanceof Error ? err.message : err}`);
102
175
  process.exit(1);
103
176
  }
package/dist/index.d.ts CHANGED
@@ -1,16 +1,3 @@
1
- type AudioPlayer = {
2
- write(pcm16: Buffer): boolean;
3
- start(): void;
4
- drain(): Promise<void>;
5
- close(): void;
6
- };
7
- type AudioRecorder = {
8
- onData(cb: (pcm16: Buffer) => void): void;
9
- start(): void;
10
- stop(): void;
11
- close(): void;
12
- };
13
-
14
1
  type AuthConfig = {
15
2
  apiKey: string;
16
3
  baseUrl?: string;
@@ -23,15 +10,18 @@ type AskOptions = {
23
10
  timeout?: number;
24
11
  ack?: boolean;
25
12
  auth?: AuthConfig;
26
- createPlayer?: () => AudioPlayer;
27
- createRecorder?: () => AudioRecorder;
13
+ createPlayer?: unknown;
14
+ createRecorder?: unknown;
15
+ onAudioFrameSent?: (pcm16: Buffer) => void;
16
+ onAssistantAudio?: (pcm16: Buffer) => void;
17
+ onMicAudio?: (pcm16: Buffer) => void;
28
18
  };
29
19
  declare function ask(message: string, options?: AskOptions): Promise<string>;
30
20
 
31
21
  type SayOptions = {
32
22
  voice?: string;
33
23
  auth?: AuthConfig;
34
- createPlayer?: () => AudioPlayer;
24
+ createPlayer?: unknown;
35
25
  };
36
26
  declare function say(message: string, options?: SayOptions): Promise<void>;
37
27
 
package/dist/index.js CHANGED
@@ -1,120 +1,5 @@
1
- // src/audio.ts
2
- import { AudioIO, SampleFormat16Bit } from "naudiodon2";
3
-
4
- // src/types.ts
5
- var SAMPLE_RATE = 24e3;
6
- var CHANNELS = 1;
7
- var VOICES = [
8
- "alloy",
9
- "ash",
10
- "ballad",
11
- "coral",
12
- "echo",
13
- "fable",
14
- "nova",
15
- "onyx",
16
- "sage",
17
- "shimmer",
18
- "verse"
19
- ];
20
- var DEFAULT_VOICE = "ash";
21
-
22
- // src/audio.ts
23
- function createAudioPlayer() {
24
- const stream = AudioIO({
25
- outOptions: {
26
- channelCount: CHANNELS,
27
- sampleFormat: SampleFormat16Bit,
28
- sampleRate: SAMPLE_RATE,
29
- closeOnError: true
30
- }
31
- });
32
- let closed = false;
33
- return {
34
- write(pcm16) {
35
- return stream.write(pcm16);
36
- },
37
- start() {
38
- stream.start();
39
- },
40
- drain() {
41
- if (closed) return Promise.resolve();
42
- closed = true;
43
- return new Promise((resolve) => {
44
- stream.quit(() => resolve());
45
- });
46
- },
47
- close() {
48
- if (closed) return;
49
- closed = true;
50
- stream.quit();
51
- }
52
- };
53
- }
54
- function createAudioRecorder() {
55
- const stream = AudioIO({
56
- inOptions: {
57
- channelCount: CHANNELS,
58
- sampleFormat: SampleFormat16Bit,
59
- sampleRate: SAMPLE_RATE,
60
- closeOnError: true
61
- }
62
- });
63
- let stopped = false;
64
- return {
65
- onData(cb) {
66
- stream.on("data", cb);
67
- },
68
- start() {
69
- stream.start();
70
- },
71
- stop() {
72
- if (stopped) return;
73
- stopped = true;
74
- stream.quit();
75
- },
76
- close() {
77
- if (stopped) return;
78
- stopped = true;
79
- stream.quit();
80
- }
81
- };
82
- }
83
-
84
- // src/echo-canceller.ts
85
- import { EchoCanceller } from "agent-voice-aec";
86
- var FRAME_SIZE = 480;
87
- var FILTER_LENGTH = 4800;
88
- var FRAME_BYTES = FRAME_SIZE * 2;
89
- function createEchoCanceller() {
90
- const aec = new EchoCanceller(FRAME_SIZE, FILTER_LENGTH, SAMPLE_RATE);
91
- let playbackBuffer = Buffer.alloc(0);
92
- let captureBuffer = Buffer.alloc(0);
93
- return {
94
- playback(pcm16) {
95
- playbackBuffer = Buffer.concat([playbackBuffer, pcm16]);
96
- while (playbackBuffer.length >= FRAME_BYTES) {
97
- aec.playback(playbackBuffer.subarray(0, FRAME_BYTES));
98
- playbackBuffer = playbackBuffer.subarray(FRAME_BYTES);
99
- }
100
- },
101
- capture(pcm16) {
102
- captureBuffer = Buffer.concat([captureBuffer, pcm16]);
103
- const frames = [];
104
- while (captureBuffer.length >= FRAME_BYTES) {
105
- const out = aec.capture(captureBuffer.subarray(0, FRAME_BYTES));
106
- frames.push(out);
107
- captureBuffer = captureBuffer.subarray(FRAME_BYTES);
108
- }
109
- return frames;
110
- },
111
- reset() {
112
- aec.reset();
113
- playbackBuffer = Buffer.alloc(0);
114
- captureBuffer = Buffer.alloc(0);
115
- }
116
- };
117
- }
1
+ // src/ask.ts
2
+ import { createRequire } from "module";
118
3
 
119
4
  // src/realtime.ts
120
5
  import { OpenAIRealtimeWS } from "openai/beta/realtime/ws";
@@ -159,6 +44,9 @@ function createRealtimeSession(options) {
159
44
  const pcm16 = Buffer.from(event.delta, "base64");
160
45
  options.onAudioDelta(pcm16);
161
46
  });
47
+ rt.on("response.audio.done", () => {
48
+ options.onAudioDone?.();
49
+ });
162
50
  rt.on("conversation.item.input_audio_transcription.completed", (event) => {
163
51
  options.onTranscript(event.transcript);
164
52
  });
@@ -225,100 +113,216 @@ ${text}`
225
113
  };
226
114
  }
227
115
 
116
+ // src/types.ts
117
+ var SAMPLE_RATE = 24e3;
118
+ var VOICES = [
119
+ "alloy",
120
+ "ash",
121
+ "ballad",
122
+ "coral",
123
+ "echo",
124
+ "fable",
125
+ "nova",
126
+ "onyx",
127
+ "sage",
128
+ "shimmer",
129
+ "verse"
130
+ ];
131
+ var DEFAULT_VOICE = "ash";
132
+
228
133
  // src/ask.ts
134
+ var require2 = createRequire(import.meta.url);
229
135
  async function ask(message, options = {}) {
230
136
  const {
231
137
  voice = DEFAULT_VOICE,
232
138
  timeout = 30,
233
139
  ack = false,
234
140
  auth,
235
- createPlayer = createAudioPlayer,
236
- createRecorder = createAudioRecorder
141
+ onAudioFrameSent,
142
+ onAssistantAudio,
143
+ onMicAudio
237
144
  } = options;
238
- const player = createPlayer();
239
- player.start();
240
- const echoCanceller = createEchoCanceller();
145
+ const { AudioEngine } = require2("agent-voice-audio");
146
+ const streamDelayMs = Number.parseInt(
147
+ process.env.AGENT_VOICE_AEC_STREAM_DELAY_MS ?? "30",
148
+ 10
149
+ );
150
+ const engine = new AudioEngine({
151
+ sampleRate: SAMPLE_RATE,
152
+ channels: 1,
153
+ enableAec: true,
154
+ streamDelayMs
155
+ });
156
+ engine.start();
157
+ const debug = process.env.AGENT_VOICE_DEBUG_ASK_EVENTS === "1";
158
+ const startMs = Date.now();
159
+ function logEvent(event, detail) {
160
+ if (!debug) return;
161
+ const elapsed = Date.now() - startMs;
162
+ const suffix = detail ? ` ${detail}` : "";
163
+ process.stderr.write(`[ask ${elapsed}ms] ${event}${suffix}
164
+ `);
165
+ }
166
+ logEvent("start");
241
167
  return new Promise((resolve, reject) => {
242
- let recorder = null;
243
- let recorderStarted = false;
244
168
  let transcript = "";
245
169
  let timeoutTimer = null;
170
+ let responseStartTimer = null;
171
+ let transcriptTimer = null;
172
+ let capturePollTimer = null;
246
173
  let speechDetected = false;
247
174
  let initialResponseDone = false;
248
- let interrupted = false;
175
+ let heardAssistantAudio = false;
176
+ let lastAssistantAudioAt = 0;
249
177
  let cleaned = false;
250
- let resolved = false;
178
+ let settled = false;
251
179
  async function cleanup() {
252
180
  if (cleaned) return;
253
181
  cleaned = true;
182
+ logEvent("cleanup:start");
254
183
  if (timeoutTimer) clearTimeout(timeoutTimer);
255
- recorder?.stop();
256
- recorder?.close();
257
- await player.drain();
184
+ if (responseStartTimer) clearTimeout(responseStartTimer);
185
+ if (transcriptTimer) clearTimeout(transcriptTimer);
186
+ if (capturePollTimer) clearInterval(capturePollTimer);
187
+ try {
188
+ engine.stop();
189
+ engine.close();
190
+ } catch {
191
+ }
258
192
  session.close();
193
+ logEvent("cleanup:done");
259
194
  }
260
- function finish() {
261
- if (resolved) return;
262
- resolved = true;
263
- cleanup().then(() => resolve(transcript));
195
+ function resolveOnce(value) {
196
+ if (settled) return;
197
+ settled = true;
198
+ cleanup().then(() => resolve(value));
264
199
  }
265
- function startRecorder() {
266
- if (recorderStarted) return;
267
- recorderStarted = true;
268
- recorder = createRecorder();
269
- recorder.onData((pcm16) => {
270
- const cleaned2 = echoCanceller.capture(pcm16);
271
- for (const frame of cleaned2) {
272
- session.sendAudio(frame);
273
- }
274
- });
275
- recorder.start();
200
+ function rejectOnce(error) {
201
+ if (settled) return;
202
+ settled = true;
203
+ cleanup().then(() => reject(error));
276
204
  }
205
+ capturePollTimer = setInterval(() => {
206
+ if (settled) return;
207
+ let rawFrames = [];
208
+ let processedFrames = [];
209
+ try {
210
+ rawFrames = engine.readRawCapture(64);
211
+ processedFrames = engine.readProcessedCapture(64);
212
+ } catch (err) {
213
+ rejectOnce(
214
+ new Error(
215
+ `audio engine capture read failed: ${err instanceof Error ? err.message : String(err)}`
216
+ )
217
+ );
218
+ return;
219
+ }
220
+ for (const frame of rawFrames) onMicAudio?.(frame);
221
+ if (!heardAssistantAudio) return;
222
+ for (const frame of processedFrames) {
223
+ onAudioFrameSent?.(frame);
224
+ session.sendAudio(frame);
225
+ }
226
+ }, 10);
277
227
  const session = createRealtimeSession({
278
228
  voice,
279
229
  mode: "default",
280
230
  ack,
281
231
  auth,
282
232
  onAudioDelta(pcm16) {
283
- echoCanceller.playback(pcm16);
284
- player.write(pcm16);
285
- startRecorder();
233
+ logEvent("realtime:audio_delta", `bytes=${pcm16.length}`);
234
+ heardAssistantAudio = true;
235
+ lastAssistantAudioAt = Date.now();
236
+ onAssistantAudio?.(pcm16);
237
+ engine.play(pcm16);
286
238
  },
287
239
  onTranscript(text) {
240
+ const echoGuardMs = Number.parseInt(
241
+ process.env.AGENT_VOICE_ECHO_GUARD_MS ?? "1500",
242
+ 10
243
+ );
244
+ const sinceAssistantMs = Date.now() - lastAssistantAudioAt;
245
+ if (heardAssistantAudio && sinceAssistantMs < echoGuardMs) {
246
+ logEvent(
247
+ "realtime:transcript_ignored_echo_guard",
248
+ `since_assistant_ms=${sinceAssistantMs} text="${text}"`
249
+ );
250
+ return;
251
+ }
252
+ logEvent("realtime:transcript", `text="${text}"`);
253
+ if (transcriptTimer) {
254
+ clearTimeout(transcriptTimer);
255
+ transcriptTimer = null;
256
+ }
288
257
  transcript = text;
289
- if (!ack) finish();
258
+ if (!ack) resolveOnce(transcript);
290
259
  },
291
260
  onSpeechStarted() {
261
+ logEvent("realtime:speech_started");
292
262
  speechDetected = true;
293
263
  if (timeoutTimer) {
294
264
  clearTimeout(timeoutTimer);
295
265
  timeoutTimer = null;
296
266
  }
297
- if (!initialResponseDone) {
298
- interrupted = true;
299
- player.close();
267
+ if (transcriptTimer) clearTimeout(transcriptTimer);
268
+ transcriptTimer = setTimeout(() => {
269
+ logEvent("timeout:no_transcript_after_speech");
270
+ rejectOnce(
271
+ new Error(
272
+ `No transcript received within ${timeout}s after speech started`
273
+ )
274
+ );
275
+ }, timeout * 1e3);
276
+ if (!initialResponseDone && heardAssistantAudio) {
277
+ try {
278
+ engine.play(Buffer.alloc(0));
279
+ } catch {
280
+ }
300
281
  }
301
282
  },
302
283
  onInitialResponseDone() {
284
+ logEvent("realtime:initial_response_done");
303
285
  initialResponseDone = true;
304
286
  timeoutTimer = setTimeout(() => {
305
287
  if (!speechDetected) {
306
- cleanup();
307
- reject(new Error(`No speech detected within ${timeout}s timeout`));
288
+ logEvent("timeout:no_speech");
289
+ rejectOnce(
290
+ new Error(`No speech detected within ${timeout}s timeout`)
291
+ );
308
292
  }
309
293
  }, timeout * 1e3);
310
294
  },
311
295
  onDone() {
312
- if (ack) finish();
296
+ logEvent("realtime:done");
297
+ if (ack) resolveOnce(transcript);
313
298
  },
314
- async onError(error) {
315
- await cleanup();
316
- reject(new Error(error));
299
+ onError(error) {
300
+ logEvent("realtime:error", error);
301
+ rejectOnce(new Error(error));
317
302
  }
318
303
  });
319
- session.connect().then(() => {
320
- session.sendMessage(message);
321
- }, reject);
304
+ session.connect().then(
305
+ () => {
306
+ logEvent("realtime:connected");
307
+ logEvent("realtime:send_message");
308
+ session.sendMessage(message);
309
+ responseStartTimer = setTimeout(() => {
310
+ if (!heardAssistantAudio) {
311
+ logEvent("timeout:no_assistant_audio");
312
+ rejectOnce(
313
+ new Error("No assistant audio received after sending message")
314
+ );
315
+ }
316
+ }, 1e4);
317
+ },
318
+ (err) => {
319
+ logEvent(
320
+ "realtime:connect_error",
321
+ err instanceof Error ? err.message : String(err)
322
+ );
323
+ rejectOnce(err instanceof Error ? err : new Error(String(err)));
324
+ }
325
+ );
322
326
  });
323
327
  }
324
328
 
@@ -353,48 +357,78 @@ function resolveVoice() {
353
357
  }
354
358
 
355
359
  // src/say.ts
360
+ import { createRequire as createRequire2 } from "module";
361
+ var require3 = createRequire2(import.meta.url);
356
362
  async function say(message, options = {}) {
357
- const {
358
- voice = DEFAULT_VOICE,
359
- auth,
360
- createPlayer = createAudioPlayer
361
- } = options;
362
- const player = createPlayer();
363
- player.start();
363
+ const { voice = DEFAULT_VOICE, auth } = options;
364
+ const { AudioEngine } = require3("agent-voice-audio");
365
+ const engine = new AudioEngine({
366
+ sampleRate: SAMPLE_RATE,
367
+ channels: 1,
368
+ enableAec: false
369
+ });
370
+ engine.start();
364
371
  return new Promise((resolve, reject) => {
365
372
  let cleaned = false;
373
+ let settled = false;
374
+ let responseDoneFallbackTimer = null;
375
+ let completionTailTimer = null;
366
376
  function cleanup() {
367
377
  if (cleaned) return;
368
378
  cleaned = true;
379
+ if (responseDoneFallbackTimer) clearTimeout(responseDoneFallbackTimer);
380
+ if (completionTailTimer) clearTimeout(completionTailTimer);
381
+ try {
382
+ engine.stop();
383
+ engine.close();
384
+ } catch {
385
+ }
369
386
  session.close();
370
387
  }
388
+ function resolveOnce() {
389
+ if (settled) return;
390
+ settled = true;
391
+ cleanup();
392
+ resolve();
393
+ }
394
+ function rejectOnce(error) {
395
+ if (settled) return;
396
+ settled = true;
397
+ cleanup();
398
+ reject(error);
399
+ }
400
+ function scheduleTailResolve(delayMs) {
401
+ if (settled) return;
402
+ if (completionTailTimer) clearTimeout(completionTailTimer);
403
+ completionTailTimer = setTimeout(() => {
404
+ resolveOnce();
405
+ }, delayMs);
406
+ }
371
407
  const session = createRealtimeSession({
372
408
  voice,
373
409
  mode: "say",
374
410
  ack: false,
375
411
  auth,
376
412
  onAudioDelta(pcm16) {
377
- player.write(pcm16);
413
+ engine.play(pcm16);
414
+ },
415
+ onAudioDone() {
416
+ scheduleTailResolve(140);
378
417
  },
379
418
  onTranscript() {
380
419
  },
381
420
  onSpeechStarted() {
382
421
  },
383
- async onInitialResponseDone() {
384
- try {
385
- await player.drain();
386
- } catch {
387
- player.close();
388
- }
389
- cleanup();
390
- resolve();
422
+ onInitialResponseDone() {
423
+ if (responseDoneFallbackTimer) clearTimeout(responseDoneFallbackTimer);
424
+ responseDoneFallbackTimer = setTimeout(() => {
425
+ scheduleTailResolve(220);
426
+ }, 700);
391
427
  },
392
428
  onDone() {
393
429
  },
394
430
  onError(error) {
395
- player.close();
396
- cleanup();
397
- reject(new Error(error));
431
+ rejectOnce(new Error(error));
398
432
  }
399
433
  });
400
434
  session.connect().then(() => {
@@ -0,0 +1,92 @@
1
+ #!/usr/bin/env node
2
+ import {
3
+ createRealtimeSession
4
+ } from "./chunk-UYBFONQE.js";
5
+ import {
6
+ DEFAULT_VOICE,
7
+ SAMPLE_RATE
8
+ } from "./chunk-AHLLYIEW.js";
9
+
10
+ // src/say.ts
11
+ import { createRequire } from "module";
12
+ var require2 = createRequire(import.meta.url);
13
+ async function say(message, options = {}) {
14
+ const { voice = DEFAULT_VOICE, auth } = options;
15
+ const { AudioEngine } = require2("agent-voice-audio");
16
+ const engine = new AudioEngine({
17
+ sampleRate: SAMPLE_RATE,
18
+ channels: 1,
19
+ enableAec: false
20
+ });
21
+ engine.start();
22
+ return new Promise((resolve, reject) => {
23
+ let cleaned = false;
24
+ let settled = false;
25
+ let responseDoneFallbackTimer = null;
26
+ let completionTailTimer = null;
27
+ function cleanup() {
28
+ if (cleaned) return;
29
+ cleaned = true;
30
+ if (responseDoneFallbackTimer) clearTimeout(responseDoneFallbackTimer);
31
+ if (completionTailTimer) clearTimeout(completionTailTimer);
32
+ try {
33
+ engine.stop();
34
+ engine.close();
35
+ } catch {
36
+ }
37
+ session.close();
38
+ }
39
+ function resolveOnce() {
40
+ if (settled) return;
41
+ settled = true;
42
+ cleanup();
43
+ resolve();
44
+ }
45
+ function rejectOnce(error) {
46
+ if (settled) return;
47
+ settled = true;
48
+ cleanup();
49
+ reject(error);
50
+ }
51
+ function scheduleTailResolve(delayMs) {
52
+ if (settled) return;
53
+ if (completionTailTimer) clearTimeout(completionTailTimer);
54
+ completionTailTimer = setTimeout(() => {
55
+ resolveOnce();
56
+ }, delayMs);
57
+ }
58
+ const session = createRealtimeSession({
59
+ voice,
60
+ mode: "say",
61
+ ack: false,
62
+ auth,
63
+ onAudioDelta(pcm16) {
64
+ engine.play(pcm16);
65
+ },
66
+ onAudioDone() {
67
+ scheduleTailResolve(140);
68
+ },
69
+ onTranscript() {
70
+ },
71
+ onSpeechStarted() {
72
+ },
73
+ onInitialResponseDone() {
74
+ if (responseDoneFallbackTimer) clearTimeout(responseDoneFallbackTimer);
75
+ responseDoneFallbackTimer = setTimeout(() => {
76
+ scheduleTailResolve(220);
77
+ }, 700);
78
+ },
79
+ onDone() {
80
+ },
81
+ onError(error) {
82
+ rejectOnce(new Error(error));
83
+ }
84
+ });
85
+ session.connect().then(() => {
86
+ session.sendMessage(message);
87
+ }, reject);
88
+ });
89
+ }
90
+ export {
91
+ say
92
+ };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-voice",
3
- "version": "0.1.3",
3
+ "version": "0.2.1",
4
4
  "description": "CLI for AI agents to interact with humans via voice",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -18,12 +18,11 @@
18
18
  "dist"
19
19
  ],
20
20
  "dependencies": {
21
+ "agent-voice-audio": "^0.2.1",
21
22
  "@inquirer/prompts": "^8.2.0",
22
23
  "commander": "^13.1.0",
23
- "naudiodon2": "^2.1.0",
24
24
  "openai": "^4.96.0",
25
- "ws": "^8.18.0",
26
- "agent-voice-aec": "0.1.1"
25
+ "ws": "^8.18.0"
27
26
  },
28
27
  "devDependencies": {
29
28
  "@types/node": "^22.12.0",
@@ -1,145 +0,0 @@
1
- #!/usr/bin/env node
2
- import {
3
- createAudioPlayer,
4
- createAudioRecorder,
5
- createRealtimeSession
6
- } from "./chunk-AQ5LP2XD.js";
7
- import {
8
- DEFAULT_VOICE,
9
- SAMPLE_RATE
10
- } from "./chunk-D3AGL5JD.js";
11
-
12
- // src/echo-canceller.ts
13
- import { EchoCanceller } from "agent-voice-aec";
14
// Frame geometry handed to the EchoCanceller constructor.
const FRAME_SIZE = 480;
const FILTER_LENGTH = 4800;
// Bytes per frame: FRAME_SIZE samples at 2 bytes each (the `pcm16` argument
// naming suggests 16-bit PCM — confirm against the audio source).
const FRAME_BYTES = FRAME_SIZE * 2;

/**
 * Wrap an `EchoCanceller` so callers can feed it buffers of arbitrary length.
 *
 * Incoming bytes are accumulated per direction and forwarded to the canceller
 * in slices of exactly FRAME_BYTES; any remainder waits for the next call.
 *
 * @returns {{playback: function(Buffer): void,
 *            capture: function(Buffer): Array,
 *            reset: function(): void}}
 */
function createEchoCanceller() {
  const canceller = new EchoCanceller(FRAME_SIZE, FILTER_LENGTH, SAMPLE_RATE);
  let pendingPlayback = Buffer.alloc(0);
  let pendingCapture = Buffer.alloc(0);

  // Feed speaker-bound audio to the canceller, one whole frame at a time.
  function playback(pcm16) {
    pendingPlayback = Buffer.concat([pendingPlayback, pcm16]);
    for (
      ;
      pendingPlayback.length >= FRAME_BYTES;
      pendingPlayback = pendingPlayback.subarray(FRAME_BYTES)
    ) {
      canceller.playback(pendingPlayback.subarray(0, FRAME_BYTES));
    }
  }

  // Run microphone audio through the canceller and return the processed
  // frames (an empty array when less than one full frame is buffered).
  function capture(pcm16) {
    const processed = [];
    pendingCapture = Buffer.concat([pendingCapture, pcm16]);
    for (
      ;
      pendingCapture.length >= FRAME_BYTES;
      pendingCapture = pendingCapture.subarray(FRAME_BYTES)
    ) {
      processed.push(canceller.capture(pendingCapture.subarray(0, FRAME_BYTES)));
    }
    return processed;
  }

  // Reset the canceller and discard any partially buffered frames.
  function reset() {
    canceller.reset();
    pendingPlayback = Buffer.alloc(0);
    pendingCapture = Buffer.alloc(0);
  }

  return { playback, capture, reset };
}
46
-
47
- // src/ask.ts
48
/**
 * Play the session's spoken response to `message`, stream the microphone back
 * through the echo canceller, and resolve with the resulting transcript.
 *
 * The recorder is started on the first audio delta. With `ack` false the
 * promise resolves on the first `onTranscript`; with `ack` true it resolves on
 * `onDone`. If no speech starts within `timeout` seconds after
 * `onInitialResponseDone`, the promise rejects.
 *
 * @param {*} message - Payload passed to `session.sendMessage`.
 * @param {object} [options]
 * @param {*} [options.voice=DEFAULT_VOICE] - Voice forwarded to the session.
 * @param {number} [options.timeout=30] - Seconds to wait for speech to start.
 * @param {boolean} [options.ack=false] - Wait for `onDone` instead of the
 *   first transcript.
 * @param {*} [options.auth] - Forwarded untouched to `createRealtimeSession`.
 * @param {Function} [options.createPlayer=createAudioPlayer] - Player factory.
 * @param {Function} [options.createRecorder=createAudioRecorder] - Recorder
 *   factory.
 * @returns {Promise<string>} The last transcript text received.
 */
async function ask(message, options = {}) {
  const {
    voice = DEFAULT_VOICE,
    timeout = 30,
    ack = false,
    auth,
    createPlayer = createAudioPlayer,
    createRecorder = createAudioRecorder
  } = options;
  const player = createPlayer();
  player.start();
  const echoCanceller = createEchoCanceller();
  return new Promise((resolve, reject) => {
    let recorder = null;
    let recorderStarted = false;
    let transcript = "";
    let timeoutTimer = null;
    let speechDetected = false;
    let initialResponseDone = false;
    let cleaned = false;
    let settled = false;

    // Idempotent teardown: timer, recorder, player, then the session.
    async function cleanup() {
      if (cleaned) return;
      cleaned = true;
      if (timeoutTimer) clearTimeout(timeoutTimer);
      recorder?.stop();
      recorder?.close();
      try {
        await player.drain();
      } catch {
        // The player may already be closed (onSpeechStarted closes it when the
        // user interrupts), in which case there is nothing left to flush.
      }
      session.close();
    }

    // Run teardown, then settle exactly once — even if teardown itself fails,
    // so the caller is never left with a promise that stays pending.
    function settle(action) {
      if (settled) return;
      settled = true;
      cleanup().then(action, action);
    }

    function finish() {
      settle(() => resolve(transcript));
    }

    function fail(error) {
      settle(() => reject(error));
    }

    function startRecorder() {
      // Never open the microphone after teardown has begun.
      if (recorderStarted || cleaned) return;
      recorderStarted = true;
      recorder = createRecorder();
      recorder.onData((pcm16) => {
        for (const frame of echoCanceller.capture(pcm16)) {
          session.sendAudio(frame);
        }
      });
      recorder.start();
    }

    const session = createRealtimeSession({
      voice,
      mode: "default",
      ack,
      auth,
      onAudioDelta(pcm16) {
        // Register the outgoing audio as the echo reference before playing it.
        echoCanceller.playback(pcm16);
        player.write(pcm16);
        startRecorder();
      },
      onTranscript(text) {
        transcript = text;
        if (!ack) finish();
      },
      onSpeechStarted() {
        speechDetected = true;
        if (timeoutTimer) {
          clearTimeout(timeoutTimer);
          timeoutTimer = null;
        }
        // Speech before the initial response finished: stop playback.
        if (!initialResponseDone) {
          player.close();
        }
      },
      onInitialResponseDone() {
        initialResponseDone = true;
        // The timer could only ever be a no-op in these cases.
        if (settled || speechDetected) return;
        if (timeoutTimer) clearTimeout(timeoutTimer);
        timeoutTimer = setTimeout(() => {
          if (!speechDetected) {
            fail(new Error(`No speech detected within ${timeout}s timeout`));
          }
        }, timeout * 1e3);
      },
      onDone() {
        if (ack) finish();
      },
      onError(error) {
        fail(new Error(error));
      }
    });

    // Connect failures and synchronous sendMessage errors go through fail()
    // so the player and session are released before rejecting.
    session.connect().then(() => {
      session.sendMessage(message);
    }).catch(fail);
  });
}
143
- export {
144
- ask
145
- };
@@ -1,62 +0,0 @@
1
- #!/usr/bin/env node
2
- import {
3
- createAudioPlayer,
4
- createRealtimeSession
5
- } from "./chunk-AQ5LP2XD.js";
6
- import {
7
- DEFAULT_VOICE
8
- } from "./chunk-D3AGL5JD.js";
9
-
10
- // src/say.ts
11
/**
 * Speak `message` through a realtime voice session using an audio player and
 * resolve once the player has drained after the initial response.
 *
 * @param {*} message - Payload passed to `session.sendMessage`.
 * @param {object} [options]
 * @param {*} [options.voice=DEFAULT_VOICE] - Voice forwarded to the session.
 * @param {*} [options.auth] - Forwarded untouched to `createRealtimeSession`.
 * @param {Function} [options.createPlayer=createAudioPlayer] - Player factory.
 * @returns {Promise<void>} Rejects when the session reports an error, when
 *   connecting fails, or when `sendMessage` throws.
 */
async function say(message, options = {}) {
  const {
    voice = DEFAULT_VOICE,
    auth,
    createPlayer = createAudioPlayer
  } = options;
  const player = createPlayer();
  player.start();
  return new Promise((resolve, reject) => {
    let cleaned = false;
    let settled = false;
    let playerClosed = false;

    // Close the session once; never let a failing close block settlement.
    function cleanup() {
      if (cleaned) return;
      cleaned = true;
      try {
        session.close();
      } catch {
        // Best-effort: the session may never have connected.
      }
    }

    // Close the player at most once (error and drain-failure paths can race).
    function closePlayer() {
      if (playerClosed) return;
      playerClosed = true;
      try {
        player.close();
      } catch {
        // Best-effort: the original error is what the caller needs to see.
      }
    }

    function fail(error) {
      if (settled) return;
      settled = true;
      closePlayer();
      cleanup();
      reject(error);
    }

    const session = createRealtimeSession({
      voice,
      mode: "say",
      ack: false,
      auth,
      onAudioDelta(pcm16) {
        player.write(pcm16);
      },
      onTranscript() {
      },
      onSpeechStarted() {
      },
      async onInitialResponseDone() {
        if (settled) return;
        try {
          // Let queued audio finish before tearing the session down.
          await player.drain();
        } catch {
          closePlayer();
        }
        // An error may have settled the promise while we were draining.
        if (settled) return;
        settled = true;
        cleanup();
        resolve();
      },
      onDone() {
      },
      onError(error) {
        fail(new Error(error));
      }
    });

    // Connect failures and synchronous sendMessage errors go through fail()
    // so the player and session are released before rejecting.
    session.connect().then(() => {
      session.sendMessage(message);
    }).catch(fail);
  });
}
60
- export {
61
- say
62
- };