agent-voice 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,208 @@
1
+ #!/usr/bin/env node
2
+ import {
3
+ createRealtimeSession
4
+ } from "./chunk-VV2VNOC4.js";
5
+ import {
6
+ DEFAULT_VOICE,
7
+ SAMPLE_RATE
8
+ } from "./chunk-AHLLYIEW.js";
9
+
10
+ // src/ask.ts
11
+ import { createRequire } from "module";
12
+ var require2 = createRequire(import.meta.url);
13
/**
 * Speak `message` to the user through a realtime voice session and resolve
 * with the transcript of what the user says back.
 *
 * Assistant audio is played through an `AudioEngine` created with echo
 * cancellation enabled. Every 10 ms the engine's capture queues are polled;
 * raw frames are reported to `onMicAudio`, and once assistant audio has been
 * heard the processed frames are forwarded to the session.
 *
 * Environment variables read:
 *   - `AGENT_VOICE_AEC_STREAM_DELAY_MS` (integer, default 30): passed to the
 *     engine as `streamDelayMs`.
 *   - `AGENT_VOICE_ECHO_GUARD_MS` (integer, default 1500): transcripts that
 *     arrive sooner than this after the last assistant audio are ignored.
 *   - `AGENT_VOICE_DEBUG_ASK_EVENTS=1`: trace events to stderr.
 * Malformed integer values fall back to the default instead of becoming NaN.
 *
 * @param {string} message - Text handed to `session.sendMessage`.
 * @param {object} [options]
 * @param {string} [options.voice=DEFAULT_VOICE] - Voice passed to the session.
 * @param {number} [options.timeout=30] - Seconds to wait for speech once the
 *   initial response is done, and for a transcript once speech has started.
 * @param {boolean} [options.ack=false] - When false, resolve on the first
 *   accepted transcript; when true, resolve when the session reports done.
 * @param {object} [options.auth] - Passed through to the session unchanged.
 * @param {(pcm16: Buffer) => void} [options.onAudioFrameSent] - Called with
 *   each processed capture frame just before it is sent to the session.
 * @param {(pcm16: Buffer) => void} [options.onAssistantAudio] - Called with
 *   each assistant audio chunk before it is played.
 * @param {(pcm16: Buffer) => void} [options.onMicAudio] - Called with each raw
 *   capture frame.
 * @returns {Promise<string>} The transcript. Rejects with an `Error` on
 *   connection/session errors, capture read failures, or any of the timeouts.
 */
async function ask(message, options = {}) {
  const {
    voice = DEFAULT_VOICE,
    timeout = 30,
    ack = false,
    auth,
    onAudioFrameSent,
    onAssistantAudio,
    onMicAudio
  } = options;

  // Read an integer env var; unset, empty or non-numeric values use `fallback`.
  const readIntEnv = (name, fallback) => {
    const parsed = Number.parseInt(process.env[name] ?? "", 10);
    return Number.isNaN(parsed) ? fallback : parsed;
  };
  const toError = (err) => (err instanceof Error ? err : new Error(String(err)));

  const streamDelayMs = readIntEnv("AGENT_VOICE_AEC_STREAM_DELAY_MS", 30);
  const echoGuardMs = readIntEnv("AGENT_VOICE_ECHO_GUARD_MS", 1500);

  const { AudioEngine } = require2("agent-voice-audio");
  const engine = new AudioEngine({
    sampleRate: SAMPLE_RATE,
    channels: 1,
    enableAec: true,
    streamDelayMs
  });
  engine.start();

  const debug = process.env.AGENT_VOICE_DEBUG_ASK_EVENTS === "1";
  const startMs = Date.now();
  function logEvent(event, detail) {
    if (!debug) return;
    const elapsed = Date.now() - startMs;
    const suffix = detail ? ` ${detail}` : "";
    process.stderr.write(`[ask ${elapsed}ms] ${event}${suffix}\n`);
  }
  logEvent("start");

  return new Promise((resolve, reject) => {
    // Assigned below; stays null if session creation itself throws.
    let session = null;
    let transcript = "";
    let timeoutTimer = null;
    let responseStartTimer = null;
    let transcriptTimer = null;
    let capturePollTimer = null;
    let speechDetected = false;
    let initialResponseDone = false;
    let heardAssistantAudio = false;
    let lastAssistantAudioAt = 0;
    let cleaned = false;
    let settled = false;

    // Release timers, the audio engine and the session. Every step is
    // best-effort so that a failing teardown can never keep the returned
    // promise from settling.
    function cleanup() {
      if (cleaned) return;
      cleaned = true;
      logEvent("cleanup:start");
      if (timeoutTimer) clearTimeout(timeoutTimer);
      if (responseStartTimer) clearTimeout(responseStartTimer);
      if (transcriptTimer) clearTimeout(transcriptTimer);
      if (capturePollTimer) clearInterval(capturePollTimer);
      try {
        engine.stop();
      } catch (err) {
        logEvent("cleanup:engine_stop_failed", toError(err).message);
      }
      try {
        // Attempted even when stop() failed so the engine is not leaked.
        engine.close();
      } catch (err) {
        logEvent("cleanup:engine_close_failed", toError(err).message);
      }
      try {
        session?.close();
      } catch (err) {
        logEvent("cleanup:session_close_failed", toError(err).message);
      }
      logEvent("cleanup:done");
    }

    function resolveOnce(value) {
      if (settled) return;
      settled = true;
      cleanup();
      resolve(value);
    }

    function rejectOnce(error) {
      if (settled) return;
      settled = true;
      cleanup();
      reject(error);
    }

    capturePollTimer = setInterval(() => {
      if (settled) return;
      let rawFrames = [];
      let processedFrames = [];
      try {
        rawFrames = engine.readRawCapture(64);
        processedFrames = engine.readProcessedCapture(64);
      } catch (err) {
        rejectOnce(
          new Error(
            `audio engine capture read failed: ${err instanceof Error ? err.message : String(err)}`
          )
        );
        return;
      }
      for (const frame of rawFrames) onMicAudio?.(frame);
      // Processed frames read before the assistant has produced audio are
      // dropped rather than sent to the session.
      if (!heardAssistantAudio) return;
      for (const frame of processedFrames) {
        onAudioFrameSent?.(frame);
        session.sendAudio(frame);
      }
    }, 10);

    function onConnected() {
      // The call may already have settled (e.g. via onError) while the
      // connection was being established; arming a timer now would leak it.
      if (settled) return;
      logEvent("realtime:connected");
      logEvent("realtime:send_message");
      try {
        session.sendMessage(message);
      } catch (err) {
        rejectOnce(toError(err));
        return;
      }
      responseStartTimer = setTimeout(() => {
        if (!heardAssistantAudio) {
          logEvent("timeout:no_assistant_audio");
          rejectOnce(
            new Error("No assistant audio received after sending message")
          );
        }
      }, 1e4);
    }

    function onConnectFailed(err) {
      logEvent(
        "realtime:connect_error",
        err instanceof Error ? err.message : String(err)
      );
      rejectOnce(toError(err));
    }

    try {
      session = createRealtimeSession({
        voice,
        mode: "default",
        ack,
        auth,
        onAudioDelta(pcm16) {
          logEvent("realtime:audio_delta", `bytes=${pcm16.length}`);
          heardAssistantAudio = true;
          lastAssistantAudioAt = Date.now();
          onAssistantAudio?.(pcm16);
          engine.play(pcm16);
        },
        onTranscript(text) {
          const sinceAssistantMs = Date.now() - lastAssistantAudioAt;
          if (heardAssistantAudio && sinceAssistantMs < echoGuardMs) {
            logEvent(
              "realtime:transcript_ignored_echo_guard",
              `since_assistant_ms=${sinceAssistantMs} text="${text}"`
            );
            return;
          }
          logEvent("realtime:transcript", `text="${text}"`);
          if (transcriptTimer) {
            clearTimeout(transcriptTimer);
            transcriptTimer = null;
          }
          transcript = text;
          if (!ack) resolveOnce(transcript);
        },
        onSpeechStarted() {
          logEvent("realtime:speech_started");
          speechDetected = true;
          if (timeoutTimer) {
            clearTimeout(timeoutTimer);
            timeoutTimer = null;
          }
          if (transcriptTimer) clearTimeout(transcriptTimer);
          transcriptTimer = setTimeout(() => {
            logEvent("timeout:no_transcript_after_speech");
            rejectOnce(
              new Error(
                `No transcript received within ${timeout}s after speech started`
              )
            );
          }, timeout * 1e3);
          if (!initialResponseDone && heardAssistantAudio) {
            // NOTE(review): an empty buffer is played when the user speaks
            // over the initial response; presumably this interrupts or
            // flushes playback in the engine. Confirm against the
            // agent-voice-audio API.
            try {
              engine.play(Buffer.alloc(0));
            } catch {
              // Best-effort; a failure here must not abort the conversation.
            }
          }
        },
        onInitialResponseDone() {
          logEvent("realtime:initial_response_done");
          initialResponseDone = true;
          timeoutTimer = setTimeout(() => {
            if (!speechDetected) {
              logEvent("timeout:no_speech");
              rejectOnce(
                new Error(`No speech detected within ${timeout}s timeout`)
              );
            }
          }, timeout * 1e3);
        },
        onDone() {
          logEvent("realtime:done");
          if (ack) resolveOnce(transcript);
        },
        onError(error) {
          logEvent("realtime:error", error);
          rejectOnce(new Error(error));
        }
      });
      session.connect().then(onConnected, onConnectFailed);
    } catch (err) {
      // A synchronous failure while creating or connecting the session must
      // still stop the engine and the poll interval started above.
      rejectOnce(toError(err));
    }
  });
}
206
+ export {
207
+ ask
208
+ };
@@ -0,0 +1,63 @@
1
+ #!/usr/bin/env node
2
+ import {
3
+ writeAuthConfig
4
+ } from "./chunk-RGYWLATZ.js";
5
+ import "./chunk-AHLLYIEW.js";
6
+
7
+ // src/auth.ts
8
+ import { input, password } from "@inquirer/prompts";
9
+ import OpenAI from "openai";
10
+ var DEFAULT_BASE_URL = "https://api.openai.com/v1";
11
/**
 * Check that the credentials are usable by listing models through an OpenAI
 * client built from them. Resolves with nothing; any error raised by the
 * client call propagates to the caller.
 *
 * @param {string} apiKey - API key handed to the client.
 * @param {string} baseURL - Base URL handed to the client.
 * @returns {Promise<void>}
 */
async function verifyAuth(apiKey, baseURL) {
  const credentials = { apiKey, baseURL };
  const probe = new OpenAI(credentials);
  await probe.models.list();
}
15
/**
 * Read standard input until it ends and return everything received as a
 * UTF-8 string with surrounding whitespace removed.
 *
 * @returns {Promise<string>} The trimmed input; empty when nothing was piped.
 */
async function readKeyFromStdin() {
  const received = [];
  for await (const piece of process.stdin) {
    received.push(piece);
  }
  const text = Buffer.concat(received).toString("utf-8");
  return text.trim();
}
20
/**
 * Collect an API key and base URL, optionally verify them, and persist them
 * with `writeAuthConfig`.
 *
 * The flow is non-interactive as soon as any flag is supplied (`apiUrl`,
 * `apiKey`, or `noVerify === true`): the key comes from `flags.apiKey` or,
 * failing that, from stdin. Otherwise the user is prompted for both values
 * and verification always runs.
 *
 * @param {object} [flags]
 * @param {string} [flags.apiUrl] - Base URL; defaults to DEFAULT_BASE_URL.
 * @param {string} [flags.apiKey] - API key.
 * @param {boolean} [flags.noVerify] - Skip verification (non-interactive only).
 * @returns {Promise<void>}
 * @throws {Error} When no API key could be obtained; verification errors
 *   propagate unchanged.
 */
async function auth(flags = {}) {
  const { apiUrl, apiKey: keyFlag, noVerify } = flags;
  const interactive = apiUrl == null && keyFlag == null && noVerify !== true;

  let baseUrl;
  let apiKey;
  let shouldVerify;

  if (interactive) {
    baseUrl = await input({
      message: "Base URL",
      default: DEFAULT_BASE_URL
    });
    apiKey = await password({
      message: "API key"
    });
    if (!apiKey) {
      throw new Error("API key is required.");
    }
    shouldVerify = true;
  } else {
    baseUrl = apiUrl ?? DEFAULT_BASE_URL;
    // Stdin is only consumed when no usable key was passed as a flag.
    apiKey = keyFlag ? keyFlag : await readKeyFromStdin();
    if (!apiKey) {
      throw new Error(
        "No API key provided. Pass --api-key or pipe via stdin."
      );
    }
    shouldVerify = !noVerify;
  }

  if (shouldVerify) {
    process.stderr.write("Verifying...\n");
    await verifyAuth(apiKey, baseUrl);
  }

  // The base URL is stored only when it differs from the default.
  const config = baseUrl !== DEFAULT_BASE_URL ? { apiKey, baseUrl } : { apiKey };
  writeAuthConfig(config);
  process.stderr.write("Auth config saved to ~/.agent-voice/config.json\n");
}
61
+ export {
62
+ auth
63
+ };
@@ -3,6 +3,7 @@
3
3
  // src/types.ts
4
4
  var SAMPLE_RATE = 24e3;
5
5
  var CHANNELS = 1;
6
+ var BIT_DEPTH = 16;
6
7
  var VOICES = [
7
8
  "alloy",
8
9
  "ash",
@@ -21,6 +22,7 @@ var DEFAULT_VOICE = "ash";
21
22
  export {
22
23
  SAMPLE_RATE,
23
24
  CHANNELS,
25
+ BIT_DEPTH,
24
26
  VOICES,
25
27
  DEFAULT_VOICE
26
28
  };
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env node
2
2
  import {
3
3
  DEFAULT_VOICE
4
- } from "./chunk-D3AGL5JD.js";
4
+ } from "./chunk-AHLLYIEW.js";
5
5
 
6
6
  // src/config.ts
7
7
  import { chmodSync, mkdirSync, readFileSync, writeFileSync } from "fs";
@@ -1,71 +1,4 @@
1
1
  #!/usr/bin/env node
2
- import {
3
- CHANNELS,
4
- SAMPLE_RATE
5
- } from "./chunk-D3AGL5JD.js";
6
-
7
- // src/audio.ts
8
- import { AudioIO, SampleFormat16Bit } from "naudiodon2";
9
- function createAudioPlayer() {
10
- const stream = AudioIO({
11
- outOptions: {
12
- channelCount: CHANNELS,
13
- sampleFormat: SampleFormat16Bit,
14
- sampleRate: SAMPLE_RATE,
15
- closeOnError: true
16
- }
17
- });
18
- let closed = false;
19
- return {
20
- write(pcm16) {
21
- return stream.write(pcm16);
22
- },
23
- start() {
24
- stream.start();
25
- },
26
- drain() {
27
- if (closed) return Promise.resolve();
28
- closed = true;
29
- return new Promise((resolve) => {
30
- stream.quit(() => resolve());
31
- });
32
- },
33
- close() {
34
- if (closed) return;
35
- closed = true;
36
- stream.quit();
37
- }
38
- };
39
- }
40
- function createAudioRecorder() {
41
- const stream = AudioIO({
42
- inOptions: {
43
- channelCount: CHANNELS,
44
- sampleFormat: SampleFormat16Bit,
45
- sampleRate: SAMPLE_RATE,
46
- closeOnError: true
47
- }
48
- });
49
- let stopped = false;
50
- return {
51
- onData(cb) {
52
- stream.on("data", cb);
53
- },
54
- start() {
55
- stream.start();
56
- },
57
- stop() {
58
- if (stopped) return;
59
- stopped = true;
60
- stream.quit();
61
- },
62
- close() {
63
- if (stopped) return;
64
- stopped = true;
65
- stream.quit();
66
- }
67
- };
68
- }
69
2
 
70
3
  // src/realtime.ts
71
4
  import { OpenAIRealtimeWS } from "openai/beta/realtime/ws";
@@ -87,7 +20,7 @@ function createRealtimeSession(options) {
87
20
  let rt;
88
21
  let responseCount = 0;
89
22
  function configureSession() {
90
- const turnDetection = options.mode === "say" ? null : {
23
+ const turnDetection = options.mode === "say" ? void 0 : {
91
24
  type: "semantic_vad",
92
25
  eagerness: "medium",
93
26
  create_response: options.ack,
@@ -177,7 +110,5 @@ ${text}`
177
110
  }
178
111
 
179
112
  export {
180
- createAudioPlayer,
181
- createAudioRecorder,
182
113
  createRealtimeSession
183
114
  };
package/dist/cli.js CHANGED
@@ -3,26 +3,37 @@ import {
3
3
  resolveAuth,
4
4
  resolveVoice,
5
5
  writeVoiceConfig
6
- } from "./chunk-7ERYR6ZY.js";
6
+ } from "./chunk-RGYWLATZ.js";
7
7
  import {
8
+ BIT_DEPTH,
9
+ CHANNELS,
10
+ SAMPLE_RATE,
8
11
  VOICES
9
- } from "./chunk-D3AGL5JD.js";
12
+ } from "./chunk-AHLLYIEW.js";
10
13
 
11
14
  // src/cli.ts
12
- import { closeSync, openSync, writeSync } from "fs";
15
+ import { closeSync, mkdirSync, openSync, writeFileSync, writeSync } from "fs";
16
+ import { join } from "path";
13
17
  import { Command } from "commander";
14
- async function withSuppressedStdout() {
15
- const savedFd = openSync("/dev/fd/1", "w");
18
+ async function withSuppressedNativeOutput() {
19
+ const savedStdout = openSync("/dev/fd/1", "w");
20
+ const savedStderr = openSync("/dev/fd/2", "w");
16
21
  closeSync(1);
17
22
  openSync("/dev/null", "w");
18
- const { ask } = await import("./ask-NW4PBKFP.js");
19
- const { say } = await import("./say-HPM3WIE2.js");
23
+ closeSync(2);
24
+ openSync("/dev/null", "w");
25
+ const { ask } = await import("./ask-GUSXGYSY.js");
26
+ const { say } = await import("./say-W56HCNK4.js");
20
27
  function writeResult(text) {
21
- writeSync(savedFd, `${text}
28
+ writeSync(savedStdout, `${text}
29
+ `);
30
+ closeSync(savedStdout);
31
+ }
32
+ function writeError(text) {
33
+ writeSync(savedStderr, `${text}
22
34
  `);
23
- closeSync(savedFd);
24
35
  }
25
- return { ask, say, writeResult };
36
+ return { ask, say, writeResult, writeError };
26
37
  }
27
38
  async function readStdin() {
28
39
  if (process.stdin.isTTY) return "";
@@ -38,11 +49,47 @@ async function getMessage(flag) {
38
49
  if (stdin) return stdin;
39
50
  throw new Error("No message provided. Use -m or pipe via stdin.");
40
51
  }
52
/**
 * Wrap raw PCM bytes in a 44-byte RIFF/WAVE header (format tag 1).
 *
 * @param {Buffer|Uint8Array} pcm16 - Raw sample bytes; copied after the header.
 * @param {object} [format] - Audio format. Each field defaults to the
 *   corresponding module constant, so existing single-argument calls behave
 *   exactly as before.
 * @param {number} [format.sampleRate=SAMPLE_RATE] - Samples per second.
 * @param {number} [format.channels=CHANNELS] - Channel count.
 * @param {number} [format.bitDepth=BIT_DEPTH] - Bits per sample.
 * @returns {Buffer} A new buffer holding the header followed by the samples.
 */
function createWavBuffer(
  pcm16,
  { sampleRate = SAMPLE_RATE, channels = CHANNELS, bitDepth = BIT_DEPTH } = {}
) {
  const HEADER_SIZE = 44;
  const bytesPerSample = bitDepth / 8;
  const blockAlign = channels * bytesPerSample;
  const byteRate = sampleRate * blockAlign;
  const dataSize = pcm16.length;

  const header = Buffer.alloc(HEADER_SIZE);
  // RIFF chunk descriptor; the size field excludes the 8 bytes of
  // "RIFF" + size itself, hence 36 + data.
  header.write("RIFF", 0);
  header.writeUInt32LE(36 + dataSize, 4);
  header.write("WAVE", 8);
  // "fmt " sub-chunk: 16 bytes of format data, format tag 1.
  header.write("fmt ", 12);
  header.writeUInt32LE(16, 16);
  header.writeUInt16LE(1, 20);
  header.writeUInt16LE(channels, 22);
  header.writeUInt32LE(sampleRate, 24);
  header.writeUInt32LE(byteRate, 28);
  header.writeUInt16LE(blockAlign, 32);
  header.writeUInt16LE(bitDepth, 34);
  // "data" sub-chunk header; the samples follow immediately.
  header.write("data", 36);
  header.writeUInt32LE(dataSize, 40);
  return Buffer.concat([header, pcm16]);
}
73
/**
 * Write the three captured audio streams of an `ask` run as WAV files into
 * `dir`, creating the directory (and any missing parents) first. All three
 * files share one timestamp taken from the current time in ISO format, with
 * ":" and "." replaced by "-".
 *
 * @param {string} dir - Target directory.
 * @param {Buffer[]} assistantChunks - Chunks written to the assistant-output file.
 * @param {Buffer[]} micChunks - Chunks written to the mic-input file.
 * @param {Buffer[]} modelInputChunks - Chunks written to the model-input file.
 * @returns {{assistantFile: string, micFile: string, modelInputFile: string}}
 *   Paths of the files that were written.
 */
function writeDebugAudio(dir, assistantChunks, micChunks, modelInputChunks) {
  mkdirSync(dir, { recursive: true });

  const stamp = new Date().toISOString().replace(/[:.]/g, "-");
  const pathFor = (label) => join(dir, `ask-${stamp}-${label}.wav`);

  const files = {
    assistantFile: pathFor("assistant-output"),
    micFile: pathFor("mic-input"),
    modelInputFile: pathFor("model-input")
  };

  const recordings = [
    [files.assistantFile, assistantChunks],
    [files.micFile, micChunks],
    [files.modelInputFile, modelInputChunks]
  ];
  for (const [file, chunks] of recordings) {
    writeFileSync(file, createWavBuffer(Buffer.concat(chunks)));
  }

  return files;
}
41
84
  var program = new Command().name("agent-voice").description("AI agent voice interaction CLI");
42
- program.command("auth").description("Configure API key and base URL").action(async () => {
85
+ program.command("auth").description("Configure API key and base URL").option("--api-url <url>", "Base URL for the API").option("--api-key <key>", "API key").option("--no-verify", "Skip API key verification").action(async (opts) => {
43
86
  try {
44
- const { auth } = await import("./auth-42XIU3B7.js");
45
- await auth();
87
+ const { auth } = await import("./auth-KET5DNSE.js");
88
+ await auth({
89
+ apiUrl: opts.apiUrl,
90
+ apiKey: opts.apiKey,
91
+ noVerify: !opts.verify
92
+ });
46
93
  process.exit(0);
47
94
  } catch (err) {
48
95
  process.stderr.write(`${err instanceof Error ? err.message : err}
@@ -73,35 +120,70 @@ voicesCmd.command("set <voice>").description("Set the default voice").action((vo
73
120
  `);
74
121
  process.exit(0);
75
122
  });
76
- program.command("ask").description("Speak a message and listen for a response").option("-m, --message <text>", "Text message to speak").option("--voice <name>", "OpenAI voice", defaultVoice).option("--timeout <seconds>", "Seconds to wait for user speech", "30").option("--ack", "Speak an acknowledgment after the user responds").action(async (opts) => {
123
+ program.command("ask").description("Speak a message and listen for a response").option("-m, --message <text>", "Text message to speak").option("--voice <name>", "OpenAI voice", defaultVoice).option("--timeout <seconds>", "Seconds to wait for user speech", "120").option("--ack", "Speak an acknowledgment after the user responds").option("--debug-audio-dir <dir>", "Write ask audio debug WAVs to this directory").action(async (opts) => {
124
+ const { ask, writeResult, writeError } = await withSuppressedNativeOutput();
125
+ const assistantChunks = [];
126
+ const micChunks = [];
127
+ const modelInputChunks = [];
77
128
  try {
78
- const { ask, writeResult } = await withSuppressedStdout();
79
129
  const auth = resolveAuth();
80
130
  const message = await getMessage(opts.message);
81
131
  const transcript = await ask(message, {
82
132
  voice: opts.voice,
83
133
  timeout: Number.parseInt(opts.timeout, 10),
84
134
  ack: opts.ack ?? false,
85
- auth
135
+ auth,
136
+ onAssistantAudio: opts.debugAudioDir ? (pcm16) => assistantChunks.push(Buffer.from(pcm16)) : void 0,
137
+ onMicAudio: opts.debugAudioDir ? (pcm16) => micChunks.push(Buffer.from(pcm16)) : void 0,
138
+ onAudioFrameSent: opts.debugAudioDir ? (pcm16) => modelInputChunks.push(Buffer.from(pcm16)) : void 0
86
139
  });
140
+ if (opts.debugAudioDir) {
141
+ const files = writeDebugAudio(
142
+ opts.debugAudioDir,
143
+ assistantChunks,
144
+ micChunks,
145
+ modelInputChunks
146
+ );
147
+ writeError(
148
+ `debug audio written:
149
+ ${files.assistantFile}
150
+ ${files.micFile}
151
+ ${files.modelInputFile}`
152
+ );
153
+ }
87
154
  writeResult(transcript);
88
155
  process.exit(0);
89
156
  } catch (err) {
90
- process.stderr.write(`${err instanceof Error ? err.message : err}
91
- `);
157
+ if (opts.debugAudioDir) {
158
+ try {
159
+ const files = writeDebugAudio(
160
+ opts.debugAudioDir,
161
+ assistantChunks,
162
+ micChunks,
163
+ modelInputChunks
164
+ );
165
+ writeError(
166
+ `debug audio written:
167
+ ${files.assistantFile}
168
+ ${files.micFile}
169
+ ${files.modelInputFile}`
170
+ );
171
+ } catch {
172
+ }
173
+ }
174
+ writeError(`${err instanceof Error ? err.message : err}`);
92
175
  process.exit(1);
93
176
  }
94
177
  });
95
178
  program.command("say").description("Speak a message without listening for a response").option("-m, --message <text>", "Text message to speak").option("--voice <name>", "OpenAI voice", defaultVoice).action(async (opts) => {
179
+ const { say, writeError } = await withSuppressedNativeOutput();
96
180
  try {
97
- const { say } = await withSuppressedStdout();
98
181
  const auth = resolveAuth();
99
182
  const message = await getMessage(opts.message);
100
183
  await say(message, { voice: opts.voice, auth });
101
184
  process.exit(0);
102
185
  } catch (err) {
103
- process.stderr.write(`${err instanceof Error ? err.message : err}
104
- `);
186
+ writeError(`${err instanceof Error ? err.message : err}`);
105
187
  process.exit(1);
106
188
  }
107
189
  });
package/dist/index.d.ts CHANGED
@@ -1,16 +1,3 @@
1
- type AudioPlayer = {
2
- write(pcm16: Buffer): boolean;
3
- start(): void;
4
- drain(): Promise<void>;
5
- close(): void;
6
- };
7
- type AudioRecorder = {
8
- onData(cb: (pcm16: Buffer) => void): void;
9
- start(): void;
10
- stop(): void;
11
- close(): void;
12
- };
13
-
14
1
  type AuthConfig = {
15
2
  apiKey: string;
16
3
  baseUrl?: string;
@@ -23,15 +10,18 @@ type AskOptions = {
23
10
  timeout?: number;
24
11
  ack?: boolean;
25
12
  auth?: AuthConfig;
26
- createPlayer?: () => AudioPlayer;
27
- createRecorder?: () => AudioRecorder;
13
+ createPlayer?: unknown;
14
+ createRecorder?: unknown;
15
+ onAudioFrameSent?: (pcm16: Buffer) => void;
16
+ onAssistantAudio?: (pcm16: Buffer) => void;
17
+ onMicAudio?: (pcm16: Buffer) => void;
28
18
  };
29
19
  declare function ask(message: string, options?: AskOptions): Promise<string>;
30
20
 
31
21
  type SayOptions = {
32
22
  voice?: string;
33
23
  auth?: AuthConfig;
34
- createPlayer?: () => AudioPlayer;
24
+ createPlayer?: unknown;
35
25
  };
36
26
  declare function say(message: string, options?: SayOptions): Promise<void>;
37
27
 
package/dist/index.js CHANGED
@@ -1,85 +1,5 @@
1
- // src/audio.ts
2
- import { AudioIO, SampleFormat16Bit } from "naudiodon2";
3
-
4
- // src/types.ts
5
- var SAMPLE_RATE = 24e3;
6
- var CHANNELS = 1;
7
- var VOICES = [
8
- "alloy",
9
- "ash",
10
- "ballad",
11
- "coral",
12
- "echo",
13
- "fable",
14
- "nova",
15
- "onyx",
16
- "sage",
17
- "shimmer",
18
- "verse"
19
- ];
20
- var DEFAULT_VOICE = "ash";
21
-
22
- // src/audio.ts
23
- function createAudioPlayer() {
24
- const stream = AudioIO({
25
- outOptions: {
26
- channelCount: CHANNELS,
27
- sampleFormat: SampleFormat16Bit,
28
- sampleRate: SAMPLE_RATE,
29
- closeOnError: true
30
- }
31
- });
32
- let closed = false;
33
- return {
34
- write(pcm16) {
35
- return stream.write(pcm16);
36
- },
37
- start() {
38
- stream.start();
39
- },
40
- drain() {
41
- if (closed) return Promise.resolve();
42
- closed = true;
43
- return new Promise((resolve) => {
44
- stream.quit(() => resolve());
45
- });
46
- },
47
- close() {
48
- if (closed) return;
49
- closed = true;
50
- stream.quit();
51
- }
52
- };
53
- }
54
- function createAudioRecorder() {
55
- const stream = AudioIO({
56
- inOptions: {
57
- channelCount: CHANNELS,
58
- sampleFormat: SampleFormat16Bit,
59
- sampleRate: SAMPLE_RATE,
60
- closeOnError: true
61
- }
62
- });
63
- let stopped = false;
64
- return {
65
- onData(cb) {
66
- stream.on("data", cb);
67
- },
68
- start() {
69
- stream.start();
70
- },
71
- stop() {
72
- if (stopped) return;
73
- stopped = true;
74
- stream.quit();
75
- },
76
- close() {
77
- if (stopped) return;
78
- stopped = true;
79
- stream.quit();
80
- }
81
- };
82
- }
1
+ // src/ask.ts
2
+ import { createRequire } from "module";
83
3
 
84
4
  // src/realtime.ts
85
5
  import { OpenAIRealtimeWS } from "openai/beta/realtime/ws";
@@ -101,7 +21,7 @@ function createRealtimeSession(options) {
101
21
  let rt;
102
22
  let responseCount = 0;
103
23
  function configureSession() {
104
- const turnDetection = options.mode === "say" ? null : {
24
+ const turnDetection = options.mode === "say" ? void 0 : {
105
25
  type: "semantic_vad",
106
26
  eagerness: "medium",
107
27
  create_response: options.ack,
@@ -190,84 +110,216 @@ ${text}`
190
110
  };
191
111
  }
192
112
 
113
+ // src/types.ts
114
+ var SAMPLE_RATE = 24e3;
115
+ var VOICES = [
116
+ "alloy",
117
+ "ash",
118
+ "ballad",
119
+ "coral",
120
+ "echo",
121
+ "fable",
122
+ "nova",
123
+ "onyx",
124
+ "sage",
125
+ "shimmer",
126
+ "verse"
127
+ ];
128
+ var DEFAULT_VOICE = "ash";
129
+
193
130
  // src/ask.ts
131
+ var require2 = createRequire(import.meta.url);
194
132
  async function ask(message, options = {}) {
195
133
  const {
196
134
  voice = DEFAULT_VOICE,
197
135
  timeout = 30,
198
136
  ack = false,
199
137
  auth,
200
- createPlayer = createAudioPlayer,
201
- createRecorder = createAudioRecorder
138
+ onAudioFrameSent,
139
+ onAssistantAudio,
140
+ onMicAudio
202
141
  } = options;
203
- const player = createPlayer();
204
- player.start();
142
+ const { AudioEngine } = require2("agent-voice-audio");
143
+ const streamDelayMs = Number.parseInt(
144
+ process.env.AGENT_VOICE_AEC_STREAM_DELAY_MS ?? "30",
145
+ 10
146
+ );
147
+ const engine = new AudioEngine({
148
+ sampleRate: SAMPLE_RATE,
149
+ channels: 1,
150
+ enableAec: true,
151
+ streamDelayMs
152
+ });
153
+ engine.start();
154
+ const debug = process.env.AGENT_VOICE_DEBUG_ASK_EVENTS === "1";
155
+ const startMs = Date.now();
156
+ function logEvent(event, detail) {
157
+ if (!debug) return;
158
+ const elapsed = Date.now() - startMs;
159
+ const suffix = detail ? ` ${detail}` : "";
160
+ process.stderr.write(`[ask ${elapsed}ms] ${event}${suffix}
161
+ `);
162
+ }
163
+ logEvent("start");
205
164
  return new Promise((resolve, reject) => {
206
- let recorder = null;
207
165
  let transcript = "";
208
166
  let timeoutTimer = null;
167
+ let responseStartTimer = null;
168
+ let transcriptTimer = null;
169
+ let capturePollTimer = null;
209
170
  let speechDetected = false;
171
+ let initialResponseDone = false;
172
+ let heardAssistantAudio = false;
173
+ let lastAssistantAudioAt = 0;
210
174
  let cleaned = false;
211
- let resolved = false;
175
+ let settled = false;
212
176
  async function cleanup() {
213
177
  if (cleaned) return;
214
178
  cleaned = true;
179
+ logEvent("cleanup:start");
215
180
  if (timeoutTimer) clearTimeout(timeoutTimer);
216
- recorder?.stop();
217
- recorder?.close();
218
- await player.drain();
181
+ if (responseStartTimer) clearTimeout(responseStartTimer);
182
+ if (transcriptTimer) clearTimeout(transcriptTimer);
183
+ if (capturePollTimer) clearInterval(capturePollTimer);
184
+ try {
185
+ engine.stop();
186
+ engine.close();
187
+ } catch {
188
+ }
219
189
  session.close();
190
+ logEvent("cleanup:done");
191
+ }
192
+ function resolveOnce(value) {
193
+ if (settled) return;
194
+ settled = true;
195
+ cleanup().then(() => resolve(value));
220
196
  }
221
- function finish() {
222
- if (resolved) return;
223
- resolved = true;
224
- cleanup().then(() => resolve(transcript));
197
+ function rejectOnce(error) {
198
+ if (settled) return;
199
+ settled = true;
200
+ cleanup().then(() => reject(error));
225
201
  }
202
+ capturePollTimer = setInterval(() => {
203
+ if (settled) return;
204
+ let rawFrames = [];
205
+ let processedFrames = [];
206
+ try {
207
+ rawFrames = engine.readRawCapture(64);
208
+ processedFrames = engine.readProcessedCapture(64);
209
+ } catch (err) {
210
+ rejectOnce(
211
+ new Error(
212
+ `audio engine capture read failed: ${err instanceof Error ? err.message : String(err)}`
213
+ )
214
+ );
215
+ return;
216
+ }
217
+ for (const frame of rawFrames) onMicAudio?.(frame);
218
+ if (!heardAssistantAudio) return;
219
+ for (const frame of processedFrames) {
220
+ onAudioFrameSent?.(frame);
221
+ session.sendAudio(frame);
222
+ }
223
+ }, 10);
226
224
  const session = createRealtimeSession({
227
225
  voice,
228
226
  mode: "default",
229
227
  ack,
230
228
  auth,
231
229
  onAudioDelta(pcm16) {
232
- player.write(pcm16);
230
+ logEvent("realtime:audio_delta", `bytes=${pcm16.length}`);
231
+ heardAssistantAudio = true;
232
+ lastAssistantAudioAt = Date.now();
233
+ onAssistantAudio?.(pcm16);
234
+ engine.play(pcm16);
233
235
  },
234
236
  onTranscript(text) {
237
+ const echoGuardMs = Number.parseInt(
238
+ process.env.AGENT_VOICE_ECHO_GUARD_MS ?? "1500",
239
+ 10
240
+ );
241
+ const sinceAssistantMs = Date.now() - lastAssistantAudioAt;
242
+ if (heardAssistantAudio && sinceAssistantMs < echoGuardMs) {
243
+ logEvent(
244
+ "realtime:transcript_ignored_echo_guard",
245
+ `since_assistant_ms=${sinceAssistantMs} text="${text}"`
246
+ );
247
+ return;
248
+ }
249
+ logEvent("realtime:transcript", `text="${text}"`);
250
+ if (transcriptTimer) {
251
+ clearTimeout(transcriptTimer);
252
+ transcriptTimer = null;
253
+ }
235
254
  transcript = text;
236
- if (!ack) finish();
255
+ if (!ack) resolveOnce(transcript);
237
256
  },
238
257
  onSpeechStarted() {
258
+ logEvent("realtime:speech_started");
239
259
  speechDetected = true;
240
260
  if (timeoutTimer) {
241
261
  clearTimeout(timeoutTimer);
242
262
  timeoutTimer = null;
243
263
  }
264
+ if (transcriptTimer) clearTimeout(transcriptTimer);
265
+ transcriptTimer = setTimeout(() => {
266
+ logEvent("timeout:no_transcript_after_speech");
267
+ rejectOnce(
268
+ new Error(
269
+ `No transcript received within ${timeout}s after speech started`
270
+ )
271
+ );
272
+ }, timeout * 1e3);
273
+ if (!initialResponseDone && heardAssistantAudio) {
274
+ try {
275
+ engine.play(Buffer.alloc(0));
276
+ } catch {
277
+ }
278
+ }
244
279
  },
245
280
  onInitialResponseDone() {
246
- setTimeout(() => {
247
- recorder = createRecorder();
248
- recorder.onData((pcm16) => {
249
- session.sendAudio(pcm16);
250
- });
251
- recorder.start();
252
- }, 500);
281
+ logEvent("realtime:initial_response_done");
282
+ initialResponseDone = true;
253
283
  timeoutTimer = setTimeout(() => {
254
284
  if (!speechDetected) {
255
- cleanup();
256
- reject(new Error(`No speech detected within ${timeout}s timeout`));
285
+ logEvent("timeout:no_speech");
286
+ rejectOnce(
287
+ new Error(`No speech detected within ${timeout}s timeout`)
288
+ );
257
289
  }
258
290
  }, timeout * 1e3);
259
291
  },
260
292
  onDone() {
261
- if (ack) finish();
293
+ logEvent("realtime:done");
294
+ if (ack) resolveOnce(transcript);
262
295
  },
263
- async onError(error) {
264
- await cleanup();
265
- reject(new Error(error));
296
+ onError(error) {
297
+ logEvent("realtime:error", error);
298
+ rejectOnce(new Error(error));
266
299
  }
267
300
  });
268
- session.connect().then(() => {
269
- session.sendMessage(message);
270
- }, reject);
301
+ session.connect().then(
302
+ () => {
303
+ logEvent("realtime:connected");
304
+ logEvent("realtime:send_message");
305
+ session.sendMessage(message);
306
+ responseStartTimer = setTimeout(() => {
307
+ if (!heardAssistantAudio) {
308
+ logEvent("timeout:no_assistant_audio");
309
+ rejectOnce(
310
+ new Error("No assistant audio received after sending message")
311
+ );
312
+ }
313
+ }, 1e4);
314
+ },
315
+ (err) => {
316
+ logEvent(
317
+ "realtime:connect_error",
318
+ err instanceof Error ? err.message : String(err)
319
+ );
320
+ rejectOnce(err instanceof Error ? err : new Error(String(err)));
321
+ }
322
+ );
271
323
  });
272
324
  }
273
325
 
@@ -302,19 +354,27 @@ function resolveVoice() {
302
354
  }
303
355
 
304
356
  // src/say.ts
357
+ import { createRequire as createRequire2 } from "module";
358
+ var require3 = createRequire2(import.meta.url);
305
359
  async function say(message, options = {}) {
306
- const {
307
- voice = DEFAULT_VOICE,
308
- auth,
309
- createPlayer = createAudioPlayer
310
- } = options;
311
- const player = createPlayer();
312
- player.start();
360
+ const { voice = DEFAULT_VOICE, auth } = options;
361
+ const { AudioEngine } = require3("agent-voice-audio");
362
+ const engine = new AudioEngine({
363
+ sampleRate: SAMPLE_RATE,
364
+ channels: 1,
365
+ enableAec: false
366
+ });
367
+ engine.start();
313
368
  return new Promise((resolve, reject) => {
314
369
  let cleaned = false;
315
370
  function cleanup() {
316
371
  if (cleaned) return;
317
372
  cleaned = true;
373
+ try {
374
+ engine.stop();
375
+ engine.close();
376
+ } catch {
377
+ }
318
378
  session.close();
319
379
  }
320
380
  const session = createRealtimeSession({
@@ -323,25 +383,19 @@ async function say(message, options = {}) {
323
383
  ack: false,
324
384
  auth,
325
385
  onAudioDelta(pcm16) {
326
- player.write(pcm16);
386
+ engine.play(pcm16);
327
387
  },
328
388
  onTranscript() {
329
389
  },
330
390
  onSpeechStarted() {
331
391
  },
332
- async onInitialResponseDone() {
333
- try {
334
- await player.drain();
335
- } catch {
336
- player.close();
337
- }
392
+ onInitialResponseDone() {
338
393
  cleanup();
339
394
  resolve();
340
395
  },
341
396
  onDone() {
342
397
  },
343
398
  onError(error) {
344
- player.close();
345
399
  cleanup();
346
400
  reject(new Error(error));
347
401
  }
@@ -1,26 +1,34 @@
1
1
  #!/usr/bin/env node
2
2
  import {
3
- createAudioPlayer,
4
3
  createRealtimeSession
5
- } from "./chunk-EBYXFYS5.js";
4
+ } from "./chunk-VV2VNOC4.js";
6
5
  import {
7
- DEFAULT_VOICE
8
- } from "./chunk-D3AGL5JD.js";
6
+ DEFAULT_VOICE,
7
+ SAMPLE_RATE
8
+ } from "./chunk-AHLLYIEW.js";
9
9
 
10
10
  // src/say.ts
11
+ import { createRequire } from "module";
12
+ var require2 = createRequire(import.meta.url);
11
13
  async function say(message, options = {}) {
12
- const {
13
- voice = DEFAULT_VOICE,
14
- auth,
15
- createPlayer = createAudioPlayer
16
- } = options;
17
- const player = createPlayer();
18
- player.start();
14
+ const { voice = DEFAULT_VOICE, auth } = options;
15
+ const { AudioEngine } = require2("agent-voice-audio");
16
+ const engine = new AudioEngine({
17
+ sampleRate: SAMPLE_RATE,
18
+ channels: 1,
19
+ enableAec: false
20
+ });
21
+ engine.start();
19
22
  return new Promise((resolve, reject) => {
20
23
  let cleaned = false;
21
24
  function cleanup() {
22
25
  if (cleaned) return;
23
26
  cleaned = true;
27
+ try {
28
+ engine.stop();
29
+ engine.close();
30
+ } catch {
31
+ }
24
32
  session.close();
25
33
  }
26
34
  const session = createRealtimeSession({
@@ -29,25 +37,19 @@ async function say(message, options = {}) {
29
37
  ack: false,
30
38
  auth,
31
39
  onAudioDelta(pcm16) {
32
- player.write(pcm16);
40
+ engine.play(pcm16);
33
41
  },
34
42
  onTranscript() {
35
43
  },
36
44
  onSpeechStarted() {
37
45
  },
38
- async onInitialResponseDone() {
39
- try {
40
- await player.drain();
41
- } catch {
42
- player.close();
43
- }
46
+ onInitialResponseDone() {
44
47
  cleanup();
45
48
  resolve();
46
49
  },
47
50
  onDone() {
48
51
  },
49
52
  onError(error) {
50
- player.close();
51
53
  cleanup();
52
54
  reject(new Error(error));
53
55
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-voice",
3
- "version": "0.1.2",
3
+ "version": "0.2.0",
4
4
  "description": "CLI for AI agents to interact with humans via voice",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -18,32 +18,26 @@
18
18
  "dist"
19
19
  ],
20
20
  "dependencies": {
21
+ "agent-voice-audio": "^0.2.0",
21
22
  "@inquirer/prompts": "^8.2.0",
22
23
  "commander": "^13.1.0",
23
- "naudiodon2": "^2.1.0",
24
24
  "openai": "^4.96.0",
25
25
  "ws": "^8.18.0"
26
26
  },
27
27
  "devDependencies": {
28
- "@biomejs/biome": "^1.9.4",
29
- "@changesets/cli": "^2.29.8",
30
28
  "@types/node": "^22.12.0",
31
29
  "@types/ws": "^8.5.14",
32
- "dotenv-cli": "^11.0.0",
33
- "lefthook": "^2.1.0",
34
30
  "tsup": "^8.3.6",
35
31
  "tsx": "^4.19.2",
36
32
  "typescript": "^5.7.3",
37
33
  "vitest": "^4.0.18"
38
34
  },
39
35
  "scripts": {
40
- "dev": "dotenv -e .env.local -- tsx src/cli.ts",
41
- "agent-voice": "dotenv -e .env.local -- tsx src/cli.ts",
36
+ "dev": "tsx src/cli.ts",
42
37
  "build": "tsup",
43
38
  "check": "biome check --write .",
44
39
  "typecheck": "tsc --noEmit",
45
- "test": "dotenv -e .env.local -- vitest run",
46
- "test:watch": "dotenv -e .env.local -- vitest",
47
- "release": "pnpm build && changeset publish"
40
+ "test": "vitest run",
41
+ "test:watch": "vitest"
48
42
  }
49
43
  }
@@ -1,93 +0,0 @@
1
- #!/usr/bin/env node
2
- import {
3
- createAudioPlayer,
4
- createAudioRecorder,
5
- createRealtimeSession
6
- } from "./chunk-EBYXFYS5.js";
7
- import {
8
- DEFAULT_VOICE
9
- } from "./chunk-D3AGL5JD.js";
10
-
11
- // src/ask.ts
12
- async function ask(message, options = {}) {
13
- const {
14
- voice = DEFAULT_VOICE,
15
- timeout = 30,
16
- ack = false,
17
- auth,
18
- createPlayer = createAudioPlayer,
19
- createRecorder = createAudioRecorder
20
- } = options;
21
- const player = createPlayer();
22
- player.start();
23
- return new Promise((resolve, reject) => {
24
- let recorder = null;
25
- let transcript = "";
26
- let timeoutTimer = null;
27
- let speechDetected = false;
28
- let cleaned = false;
29
- let resolved = false;
30
- async function cleanup() {
31
- if (cleaned) return;
32
- cleaned = true;
33
- if (timeoutTimer) clearTimeout(timeoutTimer);
34
- recorder?.stop();
35
- recorder?.close();
36
- await player.drain();
37
- session.close();
38
- }
39
- function finish() {
40
- if (resolved) return;
41
- resolved = true;
42
- cleanup().then(() => resolve(transcript));
43
- }
44
- const session = createRealtimeSession({
45
- voice,
46
- mode: "default",
47
- ack,
48
- auth,
49
- onAudioDelta(pcm16) {
50
- player.write(pcm16);
51
- },
52
- onTranscript(text) {
53
- transcript = text;
54
- if (!ack) finish();
55
- },
56
- onSpeechStarted() {
57
- speechDetected = true;
58
- if (timeoutTimer) {
59
- clearTimeout(timeoutTimer);
60
- timeoutTimer = null;
61
- }
62
- },
63
- onInitialResponseDone() {
64
- setTimeout(() => {
65
- recorder = createRecorder();
66
- recorder.onData((pcm16) => {
67
- session.sendAudio(pcm16);
68
- });
69
- recorder.start();
70
- }, 500);
71
- timeoutTimer = setTimeout(() => {
72
- if (!speechDetected) {
73
- cleanup();
74
- reject(new Error(`No speech detected within ${timeout}s timeout`));
75
- }
76
- }, timeout * 1e3);
77
- },
78
- onDone() {
79
- if (ack) finish();
80
- },
81
- async onError(error) {
82
- await cleanup();
83
- reject(new Error(error));
84
- }
85
- });
86
- session.connect().then(() => {
87
- session.sendMessage(message);
88
- }, reject);
89
- });
90
- }
91
- export {
92
- ask
93
- };
@@ -1,37 +0,0 @@
1
- #!/usr/bin/env node
2
- import {
3
- writeAuthConfig
4
- } from "./chunk-7ERYR6ZY.js";
5
- import "./chunk-D3AGL5JD.js";
6
-
7
- // src/auth.ts
8
- import { input, password } from "@inquirer/prompts";
9
- import OpenAI from "openai";
10
- var DEFAULT_BASE_URL = "https://api.openai.com/v1";
11
- async function verifyAuth(apiKey, baseURL) {
12
- const client = new OpenAI({ apiKey, baseURL });
13
- await client.models.list();
14
- }
15
- async function auth() {
16
- const baseUrl = await input({
17
- message: "Base URL",
18
- default: DEFAULT_BASE_URL
19
- });
20
- const apiKey = await password({
21
- message: "API key"
22
- });
23
- if (!apiKey) {
24
- throw new Error("API key is required.");
25
- }
26
- process.stderr.write("Verifying...\n");
27
- await verifyAuth(apiKey, baseUrl);
28
- const config = { apiKey };
29
- if (baseUrl !== DEFAULT_BASE_URL) {
30
- config.baseUrl = baseUrl;
31
- }
32
- writeAuthConfig(config);
33
- process.stderr.write("Auth config saved to ~/.agent-voice/config.json\n");
34
- }
35
- export {
36
- auth
37
- };