agent-voice 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ask-GUSXGYSY.js +208 -0
- package/dist/auth-KET5DNSE.js +63 -0
- package/dist/{chunk-D3AGL5JD.js → chunk-AHLLYIEW.js} +2 -0
- package/dist/{chunk-7ERYR6ZY.js → chunk-RGYWLATZ.js} +1 -1
- package/dist/{chunk-EBYXFYS5.js → chunk-VV2VNOC4.js} +1 -70
- package/dist/cli.js +103 -21
- package/dist/index.d.ts +6 -16
- package/dist/index.js +183 -129
- package/dist/{say-HPM3WIE2.js → say-W56HCNK4.js} +21 -19
- package/package.json +5 -11
- package/dist/ask-NW4PBKFP.js +0 -93
- package/dist/auth-42XIU3B7.js +0 -37
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import {
|
|
3
|
+
createRealtimeSession
|
|
4
|
+
} from "./chunk-VV2VNOC4.js";
|
|
5
|
+
import {
|
|
6
|
+
DEFAULT_VOICE,
|
|
7
|
+
SAMPLE_RATE
|
|
8
|
+
} from "./chunk-AHLLYIEW.js";
|
|
9
|
+
|
|
10
|
+
// src/ask.ts
|
|
11
|
+
import { createRequire } from "module";
|
|
12
|
+
var require2 = createRequire(import.meta.url);
|
|
13
|
+
/**
 * Speak `message` through a realtime session and resolve with the transcript
 * of what was said in reply.
 *
 * Flow, as implemented below:
 *  - an `AudioEngine` from "agent-voice-audio" is started with `enableAec: true`;
 *  - every 10 ms captured frames are polled: raw frames go to `onMicAudio`,
 *    processed frames are sent to the session, but only once assistant audio
 *    has been heard;
 *  - assistant audio deltas are played through the engine;
 *  - with `ack === false` the promise resolves on the first accepted
 *    transcript; with `ack === true` it resolves when the session reports done.
 *
 * Environment overrides (integers; invalid values fall back to the default):
 *  - AGENT_VOICE_AEC_STREAM_DELAY_MS (default 30) -> engine `streamDelayMs`
 *  - AGENT_VOICE_ECHO_GUARD_MS (default 1500) -> transcripts arriving sooner
 *    than this after the last assistant audio delta are ignored
 *  - AGENT_VOICE_DEBUG_ASK_EVENTS === "1" -> event log on stderr
 *
 * @param {string} message - Text handed to `session.sendMessage`.
 * @param {object} [options]
 * @param {string} [options.voice=DEFAULT_VOICE] - Voice passed to the session.
 * @param {number} [options.timeout=30] - Seconds to wait for speech after the
 *   initial response is done, and for a transcript after speech starts.
 * @param {boolean} [options.ack=false] - Passed to the session; also selects
 *   when the promise resolves (see above).
 * @param {object} [options.auth] - Passed through to the session.
 * @param {(pcm16: Buffer) => void} [options.onAudioFrameSent] - Called with
 *   each processed frame just before it is sent to the session.
 * @param {(pcm16: Buffer) => void} [options.onAssistantAudio] - Called with
 *   each assistant audio delta.
 * @param {(pcm16: Buffer) => void} [options.onMicAudio] - Called with each raw
 *   captured frame.
 * @returns {Promise<string>} The last accepted transcript.
 * @throws {Error} Rejects when capture reads fail, the session reports an
 *   error, connecting fails, no assistant audio arrives within 10 s of sending
 *   the message, no speech is detected within `timeout` seconds, or no
 *   transcript follows detected speech within `timeout` seconds.
 */
async function ask(message, options = {}) {
  const {
    voice = DEFAULT_VOICE,
    timeout = 30,
    ack = false,
    auth,
    onAudioFrameSent,
    onAssistantAudio,
    onMicAudio
  } = options;

  // Read an integer override from the environment; unset, empty or
  // non-numeric values yield the fallback instead of NaN.
  function readIntEnv(name, fallback) {
    const parsed = Number.parseInt(process.env[name] ?? "", 10);
    return Number.isNaN(parsed) ? fallback : parsed;
  }

  const describeError = (err) => (err instanceof Error ? err.message : String(err));
  const toError = (err) => (err instanceof Error ? err : new Error(String(err)));

  const { AudioEngine } = require2("agent-voice-audio");
  const streamDelayMs = readIntEnv("AGENT_VOICE_AEC_STREAM_DELAY_MS", 30);
  const echoGuardMs = readIntEnv("AGENT_VOICE_ECHO_GUARD_MS", 1500);
  const engine = new AudioEngine({
    sampleRate: SAMPLE_RATE,
    channels: 1,
    enableAec: true,
    streamDelayMs
  });
  engine.start();

  const debug = process.env.AGENT_VOICE_DEBUG_ASK_EVENTS === "1";
  const startMs = Date.now();
  function logEvent(event, detail) {
    if (!debug) return;
    const elapsed = Date.now() - startMs;
    const suffix = detail ? ` ${detail}` : "";
    process.stderr.write(`[ask ${elapsed}ms] ${event}${suffix}\n`);
  }
  logEvent("start");

  return new Promise((resolve, reject) => {
    let transcript = "";
    let timeoutTimer = null;
    let responseStartTimer = null;
    let transcriptTimer = null;
    let capturePollTimer = null;
    let speechDetected = false;
    let initialResponseDone = false;
    let heardAssistantAudio = false;
    let lastAssistantAudioAt = 0;
    let cleaned = false;
    let settled = false;
    // Declared up front (not `const` at creation) so cleanup() can run safely
    // even when creating the session itself fails.
    let session = null;

    // Release timers, the audio engine and the session. Each step is
    // best-effort so one failure cannot prevent the others or leave the
    // returned promise unsettled.
    async function cleanup() {
      if (cleaned) return;
      cleaned = true;
      logEvent("cleanup:start");
      if (timeoutTimer) clearTimeout(timeoutTimer);
      if (responseStartTimer) clearTimeout(responseStartTimer);
      if (transcriptTimer) clearTimeout(transcriptTimer);
      if (capturePollTimer) clearInterval(capturePollTimer);
      try {
        engine.stop();
      } catch (err) {
        logEvent("cleanup:engine_stop_error", describeError(err));
      }
      try {
        engine.close();
      } catch (err) {
        logEvent("cleanup:engine_close_error", describeError(err));
      }
      try {
        session?.close();
      } catch (err) {
        logEvent("cleanup:session_close_error", describeError(err));
      }
      logEvent("cleanup:done");
    }

    function resolveOnce(value) {
      if (settled) return;
      settled = true;
      const finish = () => resolve(value);
      cleanup().then(finish, finish);
    }

    function rejectOnce(error) {
      if (settled) return;
      settled = true;
      const finish = () => reject(error);
      cleanup().then(finish, finish);
    }

    capturePollTimer = setInterval(() => {
      if (settled) return;
      let rawFrames = [];
      let processedFrames = [];
      try {
        rawFrames = engine.readRawCapture(64);
        processedFrames = engine.readProcessedCapture(64);
      } catch (err) {
        rejectOnce(
          new Error(`audio engine capture read failed: ${describeError(err)}`)
        );
        return;
      }
      for (const frame of rawFrames) onMicAudio?.(frame);
      // Nothing is forwarded to the session until the assistant has started
      // speaking.
      if (!heardAssistantAudio) return;
      for (const frame of processedFrames) {
        onAudioFrameSent?.(frame);
        session.sendAudio(frame);
      }
    }, 10);

    let connecting;
    try {
      session = createRealtimeSession({
        voice,
        mode: "default",
        ack,
        auth,
        onAudioDelta(pcm16) {
          logEvent("realtime:audio_delta", `bytes=${pcm16.length}`);
          heardAssistantAudio = true;
          lastAssistantAudioAt = Date.now();
          onAssistantAudio?.(pcm16);
          // After cleanup the engine has been stopped and closed; skip playback.
          if (!cleaned) engine.play(pcm16);
        },
        onTranscript(text) {
          const sinceAssistantMs = Date.now() - lastAssistantAudioAt;
          if (heardAssistantAudio && sinceAssistantMs < echoGuardMs) {
            logEvent(
              "realtime:transcript_ignored_echo_guard",
              `since_assistant_ms=${sinceAssistantMs} text="${text}"`
            );
            return;
          }
          logEvent("realtime:transcript", `text="${text}"`);
          if (transcriptTimer) {
            clearTimeout(transcriptTimer);
            transcriptTimer = null;
          }
          transcript = text;
          if (!ack) resolveOnce(transcript);
        },
        onSpeechStarted() {
          logEvent("realtime:speech_started");
          // A timer armed after cleanup would never be cleared.
          if (settled) return;
          speechDetected = true;
          if (timeoutTimer) {
            clearTimeout(timeoutTimer);
            timeoutTimer = null;
          }
          if (transcriptTimer) clearTimeout(transcriptTimer);
          transcriptTimer = setTimeout(() => {
            logEvent("timeout:no_transcript_after_speech");
            rejectOnce(
              new Error(
                `No transcript received within ${timeout}s after speech started`
              )
            );
          }, timeout * 1e3);
          if (!initialResponseDone && heardAssistantAudio) {
            // NOTE(review): playing an empty buffer when speech starts during
            // the initial response presumably interrupts or flushes assistant
            // playback — confirm against agent-voice-audio.
            try {
              engine.play(Buffer.alloc(0));
            } catch (err) {
              logEvent("engine:empty_play_error", describeError(err));
            }
          }
        },
        onInitialResponseDone() {
          logEvent("realtime:initial_response_done");
          // A timer armed after cleanup would never be cleared.
          if (settled) return;
          initialResponseDone = true;
          timeoutTimer = setTimeout(() => {
            if (!speechDetected) {
              logEvent("timeout:no_speech");
              rejectOnce(
                new Error(`No speech detected within ${timeout}s timeout`)
              );
            }
          }, timeout * 1e3);
        },
        onDone() {
          logEvent("realtime:done");
          if (ack) resolveOnce(transcript);
        },
        onError(error) {
          logEvent("realtime:error", error);
          rejectOnce(new Error(error));
        }
      });
      connecting = session.connect();
    } catch (err) {
      // Without this the poll interval and engine would outlive the rejection.
      logEvent("realtime:setup_error", describeError(err));
      rejectOnce(toError(err));
      return;
    }

    connecting.then(
      () => {
        logEvent("realtime:connected");
        if (settled) return;
        logEvent("realtime:send_message");
        try {
          session.sendMessage(message);
        } catch (err) {
          rejectOnce(toError(err));
          return;
        }
        responseStartTimer = setTimeout(() => {
          if (!heardAssistantAudio) {
            logEvent("timeout:no_assistant_audio");
            rejectOnce(
              new Error("No assistant audio received after sending message")
            );
          }
        }, 1e4);
      },
      (err) => {
        logEvent("realtime:connect_error", describeError(err));
        rejectOnce(toError(err));
      }
    );
  });
}
|
|
206
|
+
export {
|
|
207
|
+
ask
|
|
208
|
+
};
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import {
|
|
3
|
+
writeAuthConfig
|
|
4
|
+
} from "./chunk-RGYWLATZ.js";
|
|
5
|
+
import "./chunk-AHLLYIEW.js";
|
|
6
|
+
|
|
7
|
+
// src/auth.ts
|
|
8
|
+
import { input, password } from "@inquirer/prompts";
|
|
9
|
+
import OpenAI from "openai";
|
|
10
|
+
var DEFAULT_BASE_URL = "https://api.openai.com/v1";
|
|
11
|
+
/**
 * Check the credentials by listing models through an OpenAI client.
 * Resolves with no value; any error raised by the request propagates.
 *
 * @param {string} apiKey - API key handed to the OpenAI client.
 * @param {string} baseURL - Base URL handed to the OpenAI client.
 * @returns {Promise<void>}
 */
async function verifyAuth(apiKey, baseURL) {
  const credentials = { apiKey, baseURL };
  await new OpenAI(credentials).models.list();
}
|
|
15
|
+
/**
 * Read an API key piped on standard input.
 *
 * @returns {Promise<string>} Everything read from stdin, decoded as UTF-8 and
 *   trimmed; an empty string when stdin is an interactive terminal or nothing
 *   was piped.
 */
async function readKeyFromStdin() {
  // Nothing was piped: iterating an interactive terminal would block until
  // EOF, so report "no key" and let the caller raise its usage error.
  if (process.stdin.isTTY) return "";
  const chunks = [];
  for await (const chunk of process.stdin) {
    // Chunks arrive as strings if an encoding was set on the stream;
    // Buffer.concat only accepts binary chunks.
    chunks.push(typeof chunk === "string" ? Buffer.from(chunk, "utf-8") : chunk);
  }
  return Buffer.concat(chunks).toString("utf-8").trim();
}
|
|
20
|
+
/**
 * Collect an API key and base URL, verify them when required, and persist
 * them through `writeAuthConfig`.
 *
 * Prompts are used only when none of `apiUrl`, `apiKey` or `noVerify: true`
 * is supplied; prompted credentials are always verified. Otherwise the key
 * comes from `flags.apiKey` or, failing that, from stdin, and verification is
 * skipped when `flags.noVerify` is truthy. The base URL is stored only when
 * it differs from the default.
 *
 * @param {object} [flags]
 * @param {string} [flags.apiUrl] - Base URL; defaults to DEFAULT_BASE_URL.
 * @param {string} [flags.apiKey] - API key.
 * @param {boolean} [flags.noVerify] - Skip the verification request.
 * @returns {Promise<void>}
 * @throws {Error} When no API key is obtained; verification errors propagate.
 */
async function auth(flags = {}) {
  const { apiUrl, apiKey: keyFlag, noVerify } = flags;
  const usePrompts = apiUrl == null && keyFlag == null && noVerify !== true;

  let baseUrl;
  let apiKey;
  let shouldVerify;
  if (usePrompts) {
    baseUrl = await input({
      message: "Base URL",
      default: DEFAULT_BASE_URL
    });
    apiKey = await password({
      message: "API key"
    });
    if (!apiKey) {
      throw new Error("API key is required.");
    }
    shouldVerify = true;
  } else {
    baseUrl = apiUrl ?? DEFAULT_BASE_URL;
    apiKey = keyFlag ? keyFlag : await readKeyFromStdin();
    if (!apiKey) {
      throw new Error(
        "No API key provided. Pass --api-key or pipe via stdin."
      );
    }
    shouldVerify = !noVerify;
  }

  if (shouldVerify) {
    process.stderr.write("Verifying...\n");
    await verifyAuth(apiKey, baseUrl);
  }

  // The default base URL is left out of the stored config.
  const config = baseUrl === DEFAULT_BASE_URL ? { apiKey } : { apiKey, baseUrl };
  writeAuthConfig(config);
  process.stderr.write("Auth config saved to ~/.agent-voice/config.json\n");
}
|
|
61
|
+
export {
|
|
62
|
+
auth
|
|
63
|
+
};
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
// src/types.ts
|
|
4
4
|
var SAMPLE_RATE = 24e3;
|
|
5
5
|
var CHANNELS = 1;
|
|
6
|
+
var BIT_DEPTH = 16;
|
|
6
7
|
var VOICES = [
|
|
7
8
|
"alloy",
|
|
8
9
|
"ash",
|
|
@@ -21,6 +22,7 @@ var DEFAULT_VOICE = "ash";
|
|
|
21
22
|
export {
|
|
22
23
|
SAMPLE_RATE,
|
|
23
24
|
CHANNELS,
|
|
25
|
+
BIT_DEPTH,
|
|
24
26
|
VOICES,
|
|
25
27
|
DEFAULT_VOICE
|
|
26
28
|
};
|
|
@@ -1,71 +1,4 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
import {
|
|
3
|
-
CHANNELS,
|
|
4
|
-
SAMPLE_RATE
|
|
5
|
-
} from "./chunk-D3AGL5JD.js";
|
|
6
|
-
|
|
7
|
-
// src/audio.ts
|
|
8
|
-
import { AudioIO, SampleFormat16Bit } from "naudiodon2";
|
|
9
|
-
function createAudioPlayer() {
|
|
10
|
-
const stream = AudioIO({
|
|
11
|
-
outOptions: {
|
|
12
|
-
channelCount: CHANNELS,
|
|
13
|
-
sampleFormat: SampleFormat16Bit,
|
|
14
|
-
sampleRate: SAMPLE_RATE,
|
|
15
|
-
closeOnError: true
|
|
16
|
-
}
|
|
17
|
-
});
|
|
18
|
-
let closed = false;
|
|
19
|
-
return {
|
|
20
|
-
write(pcm16) {
|
|
21
|
-
return stream.write(pcm16);
|
|
22
|
-
},
|
|
23
|
-
start() {
|
|
24
|
-
stream.start();
|
|
25
|
-
},
|
|
26
|
-
drain() {
|
|
27
|
-
if (closed) return Promise.resolve();
|
|
28
|
-
closed = true;
|
|
29
|
-
return new Promise((resolve) => {
|
|
30
|
-
stream.quit(() => resolve());
|
|
31
|
-
});
|
|
32
|
-
},
|
|
33
|
-
close() {
|
|
34
|
-
if (closed) return;
|
|
35
|
-
closed = true;
|
|
36
|
-
stream.quit();
|
|
37
|
-
}
|
|
38
|
-
};
|
|
39
|
-
}
|
|
40
|
-
function createAudioRecorder() {
|
|
41
|
-
const stream = AudioIO({
|
|
42
|
-
inOptions: {
|
|
43
|
-
channelCount: CHANNELS,
|
|
44
|
-
sampleFormat: SampleFormat16Bit,
|
|
45
|
-
sampleRate: SAMPLE_RATE,
|
|
46
|
-
closeOnError: true
|
|
47
|
-
}
|
|
48
|
-
});
|
|
49
|
-
let stopped = false;
|
|
50
|
-
return {
|
|
51
|
-
onData(cb) {
|
|
52
|
-
stream.on("data", cb);
|
|
53
|
-
},
|
|
54
|
-
start() {
|
|
55
|
-
stream.start();
|
|
56
|
-
},
|
|
57
|
-
stop() {
|
|
58
|
-
if (stopped) return;
|
|
59
|
-
stopped = true;
|
|
60
|
-
stream.quit();
|
|
61
|
-
},
|
|
62
|
-
close() {
|
|
63
|
-
if (stopped) return;
|
|
64
|
-
stopped = true;
|
|
65
|
-
stream.quit();
|
|
66
|
-
}
|
|
67
|
-
};
|
|
68
|
-
}
|
|
69
2
|
|
|
70
3
|
// src/realtime.ts
|
|
71
4
|
import { OpenAIRealtimeWS } from "openai/beta/realtime/ws";
|
|
@@ -87,7 +20,7 @@ function createRealtimeSession(options) {
|
|
|
87
20
|
let rt;
|
|
88
21
|
let responseCount = 0;
|
|
89
22
|
function configureSession() {
|
|
90
|
-
const turnDetection = options.mode === "say" ?
|
|
23
|
+
const turnDetection = options.mode === "say" ? void 0 : {
|
|
91
24
|
type: "semantic_vad",
|
|
92
25
|
eagerness: "medium",
|
|
93
26
|
create_response: options.ack,
|
|
@@ -177,7 +110,5 @@ ${text}`
|
|
|
177
110
|
}
|
|
178
111
|
|
|
179
112
|
export {
|
|
180
|
-
createAudioPlayer,
|
|
181
|
-
createAudioRecorder,
|
|
182
113
|
createRealtimeSession
|
|
183
114
|
};
|
package/dist/cli.js
CHANGED
|
@@ -3,26 +3,37 @@ import {
|
|
|
3
3
|
resolveAuth,
|
|
4
4
|
resolveVoice,
|
|
5
5
|
writeVoiceConfig
|
|
6
|
-
} from "./chunk-
|
|
6
|
+
} from "./chunk-RGYWLATZ.js";
|
|
7
7
|
import {
|
|
8
|
+
BIT_DEPTH,
|
|
9
|
+
CHANNELS,
|
|
10
|
+
SAMPLE_RATE,
|
|
8
11
|
VOICES
|
|
9
|
-
} from "./chunk-
|
|
12
|
+
} from "./chunk-AHLLYIEW.js";
|
|
10
13
|
|
|
11
14
|
// src/cli.ts
|
|
12
|
-
import { closeSync, openSync, writeSync } from "fs";
|
|
15
|
+
import { closeSync, mkdirSync, openSync, writeFileSync, writeSync } from "fs";
|
|
16
|
+
import { join } from "path";
|
|
13
17
|
import { Command } from "commander";
|
|
14
|
-
async function
|
|
15
|
-
const
|
|
18
|
+
async function withSuppressedNativeOutput() {
|
|
19
|
+
const savedStdout = openSync("/dev/fd/1", "w");
|
|
20
|
+
const savedStderr = openSync("/dev/fd/2", "w");
|
|
16
21
|
closeSync(1);
|
|
17
22
|
openSync("/dev/null", "w");
|
|
18
|
-
|
|
19
|
-
|
|
23
|
+
closeSync(2);
|
|
24
|
+
openSync("/dev/null", "w");
|
|
25
|
+
const { ask } = await import("./ask-GUSXGYSY.js");
|
|
26
|
+
const { say } = await import("./say-W56HCNK4.js");
|
|
20
27
|
function writeResult(text) {
|
|
21
|
-
writeSync(
|
|
28
|
+
writeSync(savedStdout, `${text}
|
|
29
|
+
`);
|
|
30
|
+
closeSync(savedStdout);
|
|
31
|
+
}
|
|
32
|
+
function writeError(text) {
|
|
33
|
+
writeSync(savedStderr, `${text}
|
|
22
34
|
`);
|
|
23
|
-
closeSync(savedFd);
|
|
24
35
|
}
|
|
25
|
-
return { ask, say, writeResult };
|
|
36
|
+
return { ask, say, writeResult, writeError };
|
|
26
37
|
}
|
|
27
38
|
async function readStdin() {
|
|
28
39
|
if (process.stdin.isTTY) return "";
|
|
@@ -38,11 +49,47 @@ async function getMessage(flag) {
|
|
|
38
49
|
if (stdin) return stdin;
|
|
39
50
|
throw new Error("No message provided. Use -m or pipe via stdin.");
|
|
40
51
|
}
|
|
52
|
+
/**
 * Wrap raw PCM bytes in a 44-byte RIFF/WAVE header (format tag 1).
 *
 * @param {Buffer} pcm16 - Raw sample bytes; `pcm16.length` is written as the
 *   data chunk size.
 * @param {object} [format] - Optional overrides; each field defaults to the
 *   module constant of the same meaning.
 * @param {number} [format.sampleRate=SAMPLE_RATE] - Samples per second.
 * @param {number} [format.channels=CHANNELS] - Channel count.
 * @param {number} [format.bitDepth=BIT_DEPTH] - Bits per sample.
 * @returns {Buffer} Header followed by the sample bytes.
 */
function createWavBuffer(pcm16, format = {}) {
  const {
    sampleRate = SAMPLE_RATE,
    channels = CHANNELS,
    bitDepth = BIT_DEPTH
  } = format;
  const bytesPerSample = bitDepth / 8;
  const blockAlign = channels * bytesPerSample;
  const byteRate = sampleRate * blockAlign;
  const dataSize = pcm16.length;

  const header = Buffer.alloc(44);
  header.write("RIFF", 0);
  // RIFF chunk size: everything after this field (36 header bytes + data).
  header.writeUInt32LE(36 + dataSize, 4);
  header.write("WAVE", 8);
  header.write("fmt ", 12);
  header.writeUInt32LE(16, 16); // size of the fmt chunk body
  header.writeUInt16LE(1, 20); // format tag 1
  header.writeUInt16LE(channels, 22);
  header.writeUInt32LE(sampleRate, 24);
  header.writeUInt32LE(byteRate, 28);
  header.writeUInt16LE(blockAlign, 32);
  header.writeUInt16LE(bitDepth, 34);
  header.write("data", 36);
  header.writeUInt32LE(dataSize, 40);
  return Buffer.concat([header, pcm16]);
}
|
|
73
|
+
/**
 * Write the three captured audio streams of one `ask` run as WAV files in
 * `dir`, creating the directory (recursively) first.
 *
 * File names share one timestamp: the current ISO time with ":" and "."
 * replaced by "-".
 *
 * @param {string} dir - Target directory.
 * @param {Buffer[]} assistantChunks - Chunks for the "assistant-output" file.
 * @param {Buffer[]} micChunks - Chunks for the "mic-input" file.
 * @param {Buffer[]} modelInputChunks - Chunks for the "model-input" file.
 * @returns {{assistantFile: string, micFile: string, modelInputFile: string}}
 *   Paths of the written files.
 */
function writeDebugAudio(dir, assistantChunks, micChunks, modelInputChunks) {
  mkdirSync(dir, { recursive: true });
  const stamp = new Date().toISOString().replace(/[:.]/g, "-");
  const outputs = [
    ["assistantFile", "assistant-output", assistantChunks],
    ["micFile", "mic-input", micChunks],
    ["modelInputFile", "model-input", modelInputChunks]
  ];
  const files = {};
  for (const [key, label, chunks] of outputs) {
    const target = join(dir, `ask-${stamp}-${label}.wav`);
    writeFileSync(target, createWavBuffer(Buffer.concat(chunks)));
    files[key] = target;
  }
  return files;
}
|
|
41
84
|
var program = new Command().name("agent-voice").description("AI agent voice interaction CLI");
|
|
42
|
-
program.command("auth").description("Configure API key and base URL").action(async () => {
|
|
85
|
+
program.command("auth").description("Configure API key and base URL").option("--api-url <url>", "Base URL for the API").option("--api-key <key>", "API key").option("--no-verify", "Skip API key verification").action(async (opts) => {
|
|
43
86
|
try {
|
|
44
|
-
const { auth } = await import("./auth-
|
|
45
|
-
await auth(
|
|
87
|
+
const { auth } = await import("./auth-KET5DNSE.js");
|
|
88
|
+
await auth({
|
|
89
|
+
apiUrl: opts.apiUrl,
|
|
90
|
+
apiKey: opts.apiKey,
|
|
91
|
+
noVerify: !opts.verify
|
|
92
|
+
});
|
|
46
93
|
process.exit(0);
|
|
47
94
|
} catch (err) {
|
|
48
95
|
process.stderr.write(`${err instanceof Error ? err.message : err}
|
|
@@ -73,35 +120,70 @@ voicesCmd.command("set <voice>").description("Set the default voice").action((vo
|
|
|
73
120
|
`);
|
|
74
121
|
process.exit(0);
|
|
75
122
|
});
|
|
76
|
-
program.command("ask").description("Speak a message and listen for a response").option("-m, --message <text>", "Text message to speak").option("--voice <name>", "OpenAI voice", defaultVoice).option("--timeout <seconds>", "Seconds to wait for user speech", "
|
|
123
|
+
program.command("ask").description("Speak a message and listen for a response").option("-m, --message <text>", "Text message to speak").option("--voice <name>", "OpenAI voice", defaultVoice).option("--timeout <seconds>", "Seconds to wait for user speech", "120").option("--ack", "Speak an acknowledgment after the user responds").option("--debug-audio-dir <dir>", "Write ask audio debug WAVs to this directory").action(async (opts) => {
|
|
124
|
+
const { ask, writeResult, writeError } = await withSuppressedNativeOutput();
|
|
125
|
+
const assistantChunks = [];
|
|
126
|
+
const micChunks = [];
|
|
127
|
+
const modelInputChunks = [];
|
|
77
128
|
try {
|
|
78
|
-
const { ask, writeResult } = await withSuppressedStdout();
|
|
79
129
|
const auth = resolveAuth();
|
|
80
130
|
const message = await getMessage(opts.message);
|
|
81
131
|
const transcript = await ask(message, {
|
|
82
132
|
voice: opts.voice,
|
|
83
133
|
timeout: Number.parseInt(opts.timeout, 10),
|
|
84
134
|
ack: opts.ack ?? false,
|
|
85
|
-
auth
|
|
135
|
+
auth,
|
|
136
|
+
onAssistantAudio: opts.debugAudioDir ? (pcm16) => assistantChunks.push(Buffer.from(pcm16)) : void 0,
|
|
137
|
+
onMicAudio: opts.debugAudioDir ? (pcm16) => micChunks.push(Buffer.from(pcm16)) : void 0,
|
|
138
|
+
onAudioFrameSent: opts.debugAudioDir ? (pcm16) => modelInputChunks.push(Buffer.from(pcm16)) : void 0
|
|
86
139
|
});
|
|
140
|
+
if (opts.debugAudioDir) {
|
|
141
|
+
const files = writeDebugAudio(
|
|
142
|
+
opts.debugAudioDir,
|
|
143
|
+
assistantChunks,
|
|
144
|
+
micChunks,
|
|
145
|
+
modelInputChunks
|
|
146
|
+
);
|
|
147
|
+
writeError(
|
|
148
|
+
`debug audio written:
|
|
149
|
+
${files.assistantFile}
|
|
150
|
+
${files.micFile}
|
|
151
|
+
${files.modelInputFile}`
|
|
152
|
+
);
|
|
153
|
+
}
|
|
87
154
|
writeResult(transcript);
|
|
88
155
|
process.exit(0);
|
|
89
156
|
} catch (err) {
|
|
90
|
-
|
|
91
|
-
|
|
157
|
+
if (opts.debugAudioDir) {
|
|
158
|
+
try {
|
|
159
|
+
const files = writeDebugAudio(
|
|
160
|
+
opts.debugAudioDir,
|
|
161
|
+
assistantChunks,
|
|
162
|
+
micChunks,
|
|
163
|
+
modelInputChunks
|
|
164
|
+
);
|
|
165
|
+
writeError(
|
|
166
|
+
`debug audio written:
|
|
167
|
+
${files.assistantFile}
|
|
168
|
+
${files.micFile}
|
|
169
|
+
${files.modelInputFile}`
|
|
170
|
+
);
|
|
171
|
+
} catch {
|
|
172
|
+
}
|
|
173
|
+
}
|
|
174
|
+
writeError(`${err instanceof Error ? err.message : err}`);
|
|
92
175
|
process.exit(1);
|
|
93
176
|
}
|
|
94
177
|
});
|
|
95
178
|
program.command("say").description("Speak a message without listening for a response").option("-m, --message <text>", "Text message to speak").option("--voice <name>", "OpenAI voice", defaultVoice).action(async (opts) => {
|
|
179
|
+
const { say, writeError } = await withSuppressedNativeOutput();
|
|
96
180
|
try {
|
|
97
|
-
const { say } = await withSuppressedStdout();
|
|
98
181
|
const auth = resolveAuth();
|
|
99
182
|
const message = await getMessage(opts.message);
|
|
100
183
|
await say(message, { voice: opts.voice, auth });
|
|
101
184
|
process.exit(0);
|
|
102
185
|
} catch (err) {
|
|
103
|
-
|
|
104
|
-
`);
|
|
186
|
+
writeError(`${err instanceof Error ? err.message : err}`);
|
|
105
187
|
process.exit(1);
|
|
106
188
|
}
|
|
107
189
|
});
|
package/dist/index.d.ts
CHANGED
|
@@ -1,16 +1,3 @@
|
|
|
1
|
-
type AudioPlayer = {
|
|
2
|
-
write(pcm16: Buffer): boolean;
|
|
3
|
-
start(): void;
|
|
4
|
-
drain(): Promise<void>;
|
|
5
|
-
close(): void;
|
|
6
|
-
};
|
|
7
|
-
type AudioRecorder = {
|
|
8
|
-
onData(cb: (pcm16: Buffer) => void): void;
|
|
9
|
-
start(): void;
|
|
10
|
-
stop(): void;
|
|
11
|
-
close(): void;
|
|
12
|
-
};
|
|
13
|
-
|
|
14
1
|
type AuthConfig = {
|
|
15
2
|
apiKey: string;
|
|
16
3
|
baseUrl?: string;
|
|
@@ -23,15 +10,18 @@ type AskOptions = {
|
|
|
23
10
|
timeout?: number;
|
|
24
11
|
ack?: boolean;
|
|
25
12
|
auth?: AuthConfig;
|
|
26
|
-
createPlayer?:
|
|
27
|
-
createRecorder?:
|
|
13
|
+
createPlayer?: unknown;
|
|
14
|
+
createRecorder?: unknown;
|
|
15
|
+
onAudioFrameSent?: (pcm16: Buffer) => void;
|
|
16
|
+
onAssistantAudio?: (pcm16: Buffer) => void;
|
|
17
|
+
onMicAudio?: (pcm16: Buffer) => void;
|
|
28
18
|
};
|
|
29
19
|
declare function ask(message: string, options?: AskOptions): Promise<string>;
|
|
30
20
|
|
|
31
21
|
type SayOptions = {
|
|
32
22
|
voice?: string;
|
|
33
23
|
auth?: AuthConfig;
|
|
34
|
-
createPlayer?:
|
|
24
|
+
createPlayer?: unknown;
|
|
35
25
|
};
|
|
36
26
|
declare function say(message: string, options?: SayOptions): Promise<void>;
|
|
37
27
|
|
package/dist/index.js
CHANGED
|
@@ -1,85 +1,5 @@
|
|
|
1
|
-
// src/
|
|
2
|
-
import {
|
|
3
|
-
|
|
4
|
-
// src/types.ts
|
|
5
|
-
var SAMPLE_RATE = 24e3;
|
|
6
|
-
var CHANNELS = 1;
|
|
7
|
-
var VOICES = [
|
|
8
|
-
"alloy",
|
|
9
|
-
"ash",
|
|
10
|
-
"ballad",
|
|
11
|
-
"coral",
|
|
12
|
-
"echo",
|
|
13
|
-
"fable",
|
|
14
|
-
"nova",
|
|
15
|
-
"onyx",
|
|
16
|
-
"sage",
|
|
17
|
-
"shimmer",
|
|
18
|
-
"verse"
|
|
19
|
-
];
|
|
20
|
-
var DEFAULT_VOICE = "ash";
|
|
21
|
-
|
|
22
|
-
// src/audio.ts
|
|
23
|
-
function createAudioPlayer() {
|
|
24
|
-
const stream = AudioIO({
|
|
25
|
-
outOptions: {
|
|
26
|
-
channelCount: CHANNELS,
|
|
27
|
-
sampleFormat: SampleFormat16Bit,
|
|
28
|
-
sampleRate: SAMPLE_RATE,
|
|
29
|
-
closeOnError: true
|
|
30
|
-
}
|
|
31
|
-
});
|
|
32
|
-
let closed = false;
|
|
33
|
-
return {
|
|
34
|
-
write(pcm16) {
|
|
35
|
-
return stream.write(pcm16);
|
|
36
|
-
},
|
|
37
|
-
start() {
|
|
38
|
-
stream.start();
|
|
39
|
-
},
|
|
40
|
-
drain() {
|
|
41
|
-
if (closed) return Promise.resolve();
|
|
42
|
-
closed = true;
|
|
43
|
-
return new Promise((resolve) => {
|
|
44
|
-
stream.quit(() => resolve());
|
|
45
|
-
});
|
|
46
|
-
},
|
|
47
|
-
close() {
|
|
48
|
-
if (closed) return;
|
|
49
|
-
closed = true;
|
|
50
|
-
stream.quit();
|
|
51
|
-
}
|
|
52
|
-
};
|
|
53
|
-
}
|
|
54
|
-
function createAudioRecorder() {
|
|
55
|
-
const stream = AudioIO({
|
|
56
|
-
inOptions: {
|
|
57
|
-
channelCount: CHANNELS,
|
|
58
|
-
sampleFormat: SampleFormat16Bit,
|
|
59
|
-
sampleRate: SAMPLE_RATE,
|
|
60
|
-
closeOnError: true
|
|
61
|
-
}
|
|
62
|
-
});
|
|
63
|
-
let stopped = false;
|
|
64
|
-
return {
|
|
65
|
-
onData(cb) {
|
|
66
|
-
stream.on("data", cb);
|
|
67
|
-
},
|
|
68
|
-
start() {
|
|
69
|
-
stream.start();
|
|
70
|
-
},
|
|
71
|
-
stop() {
|
|
72
|
-
if (stopped) return;
|
|
73
|
-
stopped = true;
|
|
74
|
-
stream.quit();
|
|
75
|
-
},
|
|
76
|
-
close() {
|
|
77
|
-
if (stopped) return;
|
|
78
|
-
stopped = true;
|
|
79
|
-
stream.quit();
|
|
80
|
-
}
|
|
81
|
-
};
|
|
82
|
-
}
|
|
1
|
+
// src/ask.ts
|
|
2
|
+
import { createRequire } from "module";
|
|
83
3
|
|
|
84
4
|
// src/realtime.ts
|
|
85
5
|
import { OpenAIRealtimeWS } from "openai/beta/realtime/ws";
|
|
@@ -101,7 +21,7 @@ function createRealtimeSession(options) {
|
|
|
101
21
|
let rt;
|
|
102
22
|
let responseCount = 0;
|
|
103
23
|
function configureSession() {
|
|
104
|
-
const turnDetection = options.mode === "say" ?
|
|
24
|
+
const turnDetection = options.mode === "say" ? void 0 : {
|
|
105
25
|
type: "semantic_vad",
|
|
106
26
|
eagerness: "medium",
|
|
107
27
|
create_response: options.ack,
|
|
@@ -190,84 +110,216 @@ ${text}`
|
|
|
190
110
|
};
|
|
191
111
|
}
|
|
192
112
|
|
|
113
|
+
// src/types.ts
|
|
114
|
+
var SAMPLE_RATE = 24e3;
|
|
115
|
+
var VOICES = [
|
|
116
|
+
"alloy",
|
|
117
|
+
"ash",
|
|
118
|
+
"ballad",
|
|
119
|
+
"coral",
|
|
120
|
+
"echo",
|
|
121
|
+
"fable",
|
|
122
|
+
"nova",
|
|
123
|
+
"onyx",
|
|
124
|
+
"sage",
|
|
125
|
+
"shimmer",
|
|
126
|
+
"verse"
|
|
127
|
+
];
|
|
128
|
+
var DEFAULT_VOICE = "ash";
|
|
129
|
+
|
|
193
130
|
// src/ask.ts
|
|
131
|
+
var require2 = createRequire(import.meta.url);
|
|
194
132
|
async function ask(message, options = {}) {
|
|
195
133
|
const {
|
|
196
134
|
voice = DEFAULT_VOICE,
|
|
197
135
|
timeout = 30,
|
|
198
136
|
ack = false,
|
|
199
137
|
auth,
|
|
200
|
-
|
|
201
|
-
|
|
138
|
+
onAudioFrameSent,
|
|
139
|
+
onAssistantAudio,
|
|
140
|
+
onMicAudio
|
|
202
141
|
} = options;
|
|
203
|
-
const
|
|
204
|
-
|
|
142
|
+
const { AudioEngine } = require2("agent-voice-audio");
|
|
143
|
+
const streamDelayMs = Number.parseInt(
|
|
144
|
+
process.env.AGENT_VOICE_AEC_STREAM_DELAY_MS ?? "30",
|
|
145
|
+
10
|
|
146
|
+
);
|
|
147
|
+
const engine = new AudioEngine({
|
|
148
|
+
sampleRate: SAMPLE_RATE,
|
|
149
|
+
channels: 1,
|
|
150
|
+
enableAec: true,
|
|
151
|
+
streamDelayMs
|
|
152
|
+
});
|
|
153
|
+
engine.start();
|
|
154
|
+
const debug = process.env.AGENT_VOICE_DEBUG_ASK_EVENTS === "1";
|
|
155
|
+
const startMs = Date.now();
|
|
156
|
+
function logEvent(event, detail) {
|
|
157
|
+
if (!debug) return;
|
|
158
|
+
const elapsed = Date.now() - startMs;
|
|
159
|
+
const suffix = detail ? ` ${detail}` : "";
|
|
160
|
+
process.stderr.write(`[ask ${elapsed}ms] ${event}${suffix}
|
|
161
|
+
`);
|
|
162
|
+
}
|
|
163
|
+
logEvent("start");
|
|
205
164
|
return new Promise((resolve, reject) => {
|
|
206
|
-
let recorder = null;
|
|
207
165
|
let transcript = "";
|
|
208
166
|
let timeoutTimer = null;
|
|
167
|
+
let responseStartTimer = null;
|
|
168
|
+
let transcriptTimer = null;
|
|
169
|
+
let capturePollTimer = null;
|
|
209
170
|
let speechDetected = false;
|
|
171
|
+
let initialResponseDone = false;
|
|
172
|
+
let heardAssistantAudio = false;
|
|
173
|
+
let lastAssistantAudioAt = 0;
|
|
210
174
|
let cleaned = false;
|
|
211
|
-
let
|
|
175
|
+
let settled = false;
|
|
212
176
|
async function cleanup() {
|
|
213
177
|
if (cleaned) return;
|
|
214
178
|
cleaned = true;
|
|
179
|
+
logEvent("cleanup:start");
|
|
215
180
|
if (timeoutTimer) clearTimeout(timeoutTimer);
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
181
|
+
if (responseStartTimer) clearTimeout(responseStartTimer);
|
|
182
|
+
if (transcriptTimer) clearTimeout(transcriptTimer);
|
|
183
|
+
if (capturePollTimer) clearInterval(capturePollTimer);
|
|
184
|
+
try {
|
|
185
|
+
engine.stop();
|
|
186
|
+
engine.close();
|
|
187
|
+
} catch {
|
|
188
|
+
}
|
|
219
189
|
session.close();
|
|
190
|
+
logEvent("cleanup:done");
|
|
191
|
+
}
|
|
192
|
+
function resolveOnce(value) {
|
|
193
|
+
if (settled) return;
|
|
194
|
+
settled = true;
|
|
195
|
+
cleanup().then(() => resolve(value));
|
|
220
196
|
}
|
|
221
|
-
function
|
|
222
|
-
if (
|
|
223
|
-
|
|
224
|
-
cleanup().then(() =>
|
|
197
|
+
function rejectOnce(error) {
|
|
198
|
+
if (settled) return;
|
|
199
|
+
settled = true;
|
|
200
|
+
cleanup().then(() => reject(error));
|
|
225
201
|
}
|
|
202
|
+
capturePollTimer = setInterval(() => {
|
|
203
|
+
if (settled) return;
|
|
204
|
+
let rawFrames = [];
|
|
205
|
+
let processedFrames = [];
|
|
206
|
+
try {
|
|
207
|
+
rawFrames = engine.readRawCapture(64);
|
|
208
|
+
processedFrames = engine.readProcessedCapture(64);
|
|
209
|
+
} catch (err) {
|
|
210
|
+
rejectOnce(
|
|
211
|
+
new Error(
|
|
212
|
+
`audio engine capture read failed: ${err instanceof Error ? err.message : String(err)}`
|
|
213
|
+
)
|
|
214
|
+
);
|
|
215
|
+
return;
|
|
216
|
+
}
|
|
217
|
+
for (const frame of rawFrames) onMicAudio?.(frame);
|
|
218
|
+
if (!heardAssistantAudio) return;
|
|
219
|
+
for (const frame of processedFrames) {
|
|
220
|
+
onAudioFrameSent?.(frame);
|
|
221
|
+
session.sendAudio(frame);
|
|
222
|
+
}
|
|
223
|
+
}, 10);
|
|
226
224
|
const session = createRealtimeSession({
|
|
227
225
|
voice,
|
|
228
226
|
mode: "default",
|
|
229
227
|
ack,
|
|
230
228
|
auth,
|
|
231
229
|
onAudioDelta(pcm16) {
|
|
232
|
-
|
|
230
|
+
logEvent("realtime:audio_delta", `bytes=${pcm16.length}`);
|
|
231
|
+
heardAssistantAudio = true;
|
|
232
|
+
lastAssistantAudioAt = Date.now();
|
|
233
|
+
onAssistantAudio?.(pcm16);
|
|
234
|
+
engine.play(pcm16);
|
|
233
235
|
},
|
|
234
236
|
onTranscript(text) {
|
|
237
|
+
const echoGuardMs = Number.parseInt(
|
|
238
|
+
process.env.AGENT_VOICE_ECHO_GUARD_MS ?? "1500",
|
|
239
|
+
10
|
|
240
|
+
);
|
|
241
|
+
const sinceAssistantMs = Date.now() - lastAssistantAudioAt;
|
|
242
|
+
if (heardAssistantAudio && sinceAssistantMs < echoGuardMs) {
|
|
243
|
+
logEvent(
|
|
244
|
+
"realtime:transcript_ignored_echo_guard",
|
|
245
|
+
`since_assistant_ms=${sinceAssistantMs} text="${text}"`
|
|
246
|
+
);
|
|
247
|
+
return;
|
|
248
|
+
}
|
|
249
|
+
logEvent("realtime:transcript", `text="${text}"`);
|
|
250
|
+
if (transcriptTimer) {
|
|
251
|
+
clearTimeout(transcriptTimer);
|
|
252
|
+
transcriptTimer = null;
|
|
253
|
+
}
|
|
235
254
|
transcript = text;
|
|
236
|
-
if (!ack)
|
|
255
|
+
if (!ack) resolveOnce(transcript);
|
|
237
256
|
},
|
|
238
257
|
onSpeechStarted() {
|
|
258
|
+
logEvent("realtime:speech_started");
|
|
239
259
|
speechDetected = true;
|
|
240
260
|
if (timeoutTimer) {
|
|
241
261
|
clearTimeout(timeoutTimer);
|
|
242
262
|
timeoutTimer = null;
|
|
243
263
|
}
|
|
264
|
+
if (transcriptTimer) clearTimeout(transcriptTimer);
|
|
265
|
+
transcriptTimer = setTimeout(() => {
|
|
266
|
+
logEvent("timeout:no_transcript_after_speech");
|
|
267
|
+
rejectOnce(
|
|
268
|
+
new Error(
|
|
269
|
+
`No transcript received within ${timeout}s after speech started`
|
|
270
|
+
)
|
|
271
|
+
);
|
|
272
|
+
}, timeout * 1e3);
|
|
273
|
+
if (!initialResponseDone && heardAssistantAudio) {
|
|
274
|
+
try {
|
|
275
|
+
engine.play(Buffer.alloc(0));
|
|
276
|
+
} catch {
|
|
277
|
+
}
|
|
278
|
+
}
|
|
244
279
|
},
|
|
245
280
|
onInitialResponseDone() {
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
recorder.onData((pcm16) => {
|
|
249
|
-
session.sendAudio(pcm16);
|
|
250
|
-
});
|
|
251
|
-
recorder.start();
|
|
252
|
-
}, 500);
|
|
281
|
+
logEvent("realtime:initial_response_done");
|
|
282
|
+
initialResponseDone = true;
|
|
253
283
|
timeoutTimer = setTimeout(() => {
|
|
254
284
|
if (!speechDetected) {
|
|
255
|
-
|
|
256
|
-
|
|
285
|
+
logEvent("timeout:no_speech");
|
|
286
|
+
rejectOnce(
|
|
287
|
+
new Error(`No speech detected within ${timeout}s timeout`)
|
|
288
|
+
);
|
|
257
289
|
}
|
|
258
290
|
}, timeout * 1e3);
|
|
259
291
|
},
|
|
260
292
|
onDone() {
|
|
261
|
-
|
|
293
|
+
logEvent("realtime:done");
|
|
294
|
+
if (ack) resolveOnce(transcript);
|
|
262
295
|
},
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
296
|
+
onError(error) {
|
|
297
|
+
logEvent("realtime:error", error);
|
|
298
|
+
rejectOnce(new Error(error));
|
|
266
299
|
}
|
|
267
300
|
});
|
|
268
|
-
session.connect().then(
|
|
269
|
-
|
|
270
|
-
|
|
301
|
+
session.connect().then(
|
|
302
|
+
() => {
|
|
303
|
+
logEvent("realtime:connected");
|
|
304
|
+
logEvent("realtime:send_message");
|
|
305
|
+
session.sendMessage(message);
|
|
306
|
+
responseStartTimer = setTimeout(() => {
|
|
307
|
+
if (!heardAssistantAudio) {
|
|
308
|
+
logEvent("timeout:no_assistant_audio");
|
|
309
|
+
rejectOnce(
|
|
310
|
+
new Error("No assistant audio received after sending message")
|
|
311
|
+
);
|
|
312
|
+
}
|
|
313
|
+
}, 1e4);
|
|
314
|
+
},
|
|
315
|
+
(err) => {
|
|
316
|
+
logEvent(
|
|
317
|
+
"realtime:connect_error",
|
|
318
|
+
err instanceof Error ? err.message : String(err)
|
|
319
|
+
);
|
|
320
|
+
rejectOnce(err instanceof Error ? err : new Error(String(err)));
|
|
321
|
+
}
|
|
322
|
+
);
|
|
271
323
|
});
|
|
272
324
|
}
|
|
273
325
|
|
|
@@ -302,19 +354,27 @@ function resolveVoice() {
|
|
|
302
354
|
}
|
|
303
355
|
|
|
304
356
|
// src/say.ts
|
|
357
|
+
import { createRequire as createRequire2 } from "module";
|
|
358
|
+
var require3 = createRequire2(import.meta.url);
|
|
305
359
|
async function say(message, options = {}) {
|
|
306
|
-
const {
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
360
|
+
const { voice = DEFAULT_VOICE, auth } = options;
|
|
361
|
+
const { AudioEngine } = require3("agent-voice-audio");
|
|
362
|
+
const engine = new AudioEngine({
|
|
363
|
+
sampleRate: SAMPLE_RATE,
|
|
364
|
+
channels: 1,
|
|
365
|
+
enableAec: false
|
|
366
|
+
});
|
|
367
|
+
engine.start();
|
|
313
368
|
return new Promise((resolve, reject) => {
|
|
314
369
|
let cleaned = false;
|
|
315
370
|
function cleanup() {
|
|
316
371
|
if (cleaned) return;
|
|
317
372
|
cleaned = true;
|
|
373
|
+
try {
|
|
374
|
+
engine.stop();
|
|
375
|
+
engine.close();
|
|
376
|
+
} catch {
|
|
377
|
+
}
|
|
318
378
|
session.close();
|
|
319
379
|
}
|
|
320
380
|
const session = createRealtimeSession({
|
|
@@ -323,25 +383,19 @@ async function say(message, options = {}) {
|
|
|
323
383
|
ack: false,
|
|
324
384
|
auth,
|
|
325
385
|
onAudioDelta(pcm16) {
|
|
326
|
-
|
|
386
|
+
engine.play(pcm16);
|
|
327
387
|
},
|
|
328
388
|
onTranscript() {
|
|
329
389
|
},
|
|
330
390
|
onSpeechStarted() {
|
|
331
391
|
},
|
|
332
|
-
|
|
333
|
-
try {
|
|
334
|
-
await player.drain();
|
|
335
|
-
} catch {
|
|
336
|
-
player.close();
|
|
337
|
-
}
|
|
392
|
+
onInitialResponseDone() {
|
|
338
393
|
cleanup();
|
|
339
394
|
resolve();
|
|
340
395
|
},
|
|
341
396
|
onDone() {
|
|
342
397
|
},
|
|
343
398
|
onError(error) {
|
|
344
|
-
player.close();
|
|
345
399
|
cleanup();
|
|
346
400
|
reject(new Error(error));
|
|
347
401
|
}
|
|
@@ -1,26 +1,34 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import {
|
|
3
|
-
createAudioPlayer,
|
|
4
3
|
createRealtimeSession
|
|
5
|
-
} from "./chunk-EBYXFYS5.js";
|
|
4
|
+
} from "./chunk-VV2VNOC4.js";
|
|
6
5
|
import {
|
|
7
|
-
DEFAULT_VOICE
|
|
8
|
-
|
|
6
|
+
DEFAULT_VOICE,
|
|
7
|
+
SAMPLE_RATE
|
|
8
|
+
} from "./chunk-AHLLYIEW.js";
|
|
9
9
|
|
|
10
10
|
// src/say.ts
|
|
11
|
+
import { createRequire } from "module";
|
|
12
|
+
var require2 = createRequire(import.meta.url);
|
|
11
13
|
async function say(message, options = {}) {
|
|
12
|
-
const {
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
14
|
+
const { voice = DEFAULT_VOICE, auth } = options;
|
|
15
|
+
const { AudioEngine } = require2("agent-voice-audio");
|
|
16
|
+
const engine = new AudioEngine({
|
|
17
|
+
sampleRate: SAMPLE_RATE,
|
|
18
|
+
channels: 1,
|
|
19
|
+
enableAec: false
|
|
20
|
+
});
|
|
21
|
+
engine.start();
|
|
19
22
|
return new Promise((resolve, reject) => {
|
|
20
23
|
let cleaned = false;
|
|
21
24
|
function cleanup() {
|
|
22
25
|
if (cleaned) return;
|
|
23
26
|
cleaned = true;
|
|
27
|
+
try {
|
|
28
|
+
engine.stop();
|
|
29
|
+
engine.close();
|
|
30
|
+
} catch {
|
|
31
|
+
}
|
|
24
32
|
session.close();
|
|
25
33
|
}
|
|
26
34
|
const session = createRealtimeSession({
|
|
@@ -29,25 +37,19 @@ async function say(message, options = {}) {
|
|
|
29
37
|
ack: false,
|
|
30
38
|
auth,
|
|
31
39
|
onAudioDelta(pcm16) {
|
|
32
|
-
|
|
40
|
+
engine.play(pcm16);
|
|
33
41
|
},
|
|
34
42
|
onTranscript() {
|
|
35
43
|
},
|
|
36
44
|
onSpeechStarted() {
|
|
37
45
|
},
|
|
38
|
-
|
|
39
|
-
try {
|
|
40
|
-
await player.drain();
|
|
41
|
-
} catch {
|
|
42
|
-
player.close();
|
|
43
|
-
}
|
|
46
|
+
onInitialResponseDone() {
|
|
44
47
|
cleanup();
|
|
45
48
|
resolve();
|
|
46
49
|
},
|
|
47
50
|
onDone() {
|
|
48
51
|
},
|
|
49
52
|
onError(error) {
|
|
50
|
-
player.close();
|
|
51
53
|
cleanup();
|
|
52
54
|
reject(new Error(error));
|
|
53
55
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agent-voice",
|
|
3
|
-
"version": "0.1.2",
|
|
3
|
+
"version": "0.2.0",
|
|
4
4
|
"description": "CLI for AI agents to interact with humans via voice",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -18,32 +18,26 @@
|
|
|
18
18
|
"dist"
|
|
19
19
|
],
|
|
20
20
|
"dependencies": {
|
|
21
|
+
"agent-voice-audio": "^0.2.0",
|
|
21
22
|
"@inquirer/prompts": "^8.2.0",
|
|
22
23
|
"commander": "^13.1.0",
|
|
23
|
-
"naudiodon2": "^2.1.0",
|
|
24
24
|
"openai": "^4.96.0",
|
|
25
25
|
"ws": "^8.18.0"
|
|
26
26
|
},
|
|
27
27
|
"devDependencies": {
|
|
28
|
-
"@biomejs/biome": "^1.9.4",
|
|
29
|
-
"@changesets/cli": "^2.29.8",
|
|
30
28
|
"@types/node": "^22.12.0",
|
|
31
29
|
"@types/ws": "^8.5.14",
|
|
32
|
-
"dotenv-cli": "^11.0.0",
|
|
33
|
-
"lefthook": "^2.1.0",
|
|
34
30
|
"tsup": "^8.3.6",
|
|
35
31
|
"tsx": "^4.19.2",
|
|
36
32
|
"typescript": "^5.7.3",
|
|
37
33
|
"vitest": "^4.0.18"
|
|
38
34
|
},
|
|
39
35
|
"scripts": {
|
|
40
|
-
"dev": "
|
|
41
|
-
"agent-voice": "dotenv -e .env.local -- tsx src/cli.ts",
|
|
36
|
+
"dev": "tsx src/cli.ts",
|
|
42
37
|
"build": "tsup",
|
|
43
38
|
"check": "biome check --write .",
|
|
44
39
|
"typecheck": "tsc --noEmit",
|
|
45
|
-
"test": "
|
|
46
|
-
"test:watch": "
|
|
47
|
-
"release": "pnpm build && changeset publish"
|
|
40
|
+
"test": "vitest run",
|
|
41
|
+
"test:watch": "vitest"
|
|
48
42
|
}
|
|
49
43
|
}
|
package/dist/ask-NW4PBKFP.js
DELETED
|
@@ -1,93 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
import {
|
|
3
|
-
createAudioPlayer,
|
|
4
|
-
createAudioRecorder,
|
|
5
|
-
createRealtimeSession
|
|
6
|
-
} from "./chunk-EBYXFYS5.js";
|
|
7
|
-
import {
|
|
8
|
-
DEFAULT_VOICE
|
|
9
|
-
} from "./chunk-D3AGL5JD.js";
|
|
10
|
-
|
|
11
|
-
// src/ask.ts
|
|
12
|
-
async function ask(message, options = {}) {
|
|
13
|
-
const {
|
|
14
|
-
voice = DEFAULT_VOICE,
|
|
15
|
-
timeout = 30,
|
|
16
|
-
ack = false,
|
|
17
|
-
auth,
|
|
18
|
-
createPlayer = createAudioPlayer,
|
|
19
|
-
createRecorder = createAudioRecorder
|
|
20
|
-
} = options;
|
|
21
|
-
const player = createPlayer();
|
|
22
|
-
player.start();
|
|
23
|
-
return new Promise((resolve, reject) => {
|
|
24
|
-
let recorder = null;
|
|
25
|
-
let transcript = "";
|
|
26
|
-
let timeoutTimer = null;
|
|
27
|
-
let speechDetected = false;
|
|
28
|
-
let cleaned = false;
|
|
29
|
-
let resolved = false;
|
|
30
|
-
async function cleanup() {
|
|
31
|
-
if (cleaned) return;
|
|
32
|
-
cleaned = true;
|
|
33
|
-
if (timeoutTimer) clearTimeout(timeoutTimer);
|
|
34
|
-
recorder?.stop();
|
|
35
|
-
recorder?.close();
|
|
36
|
-
await player.drain();
|
|
37
|
-
session.close();
|
|
38
|
-
}
|
|
39
|
-
function finish() {
|
|
40
|
-
if (resolved) return;
|
|
41
|
-
resolved = true;
|
|
42
|
-
cleanup().then(() => resolve(transcript));
|
|
43
|
-
}
|
|
44
|
-
const session = createRealtimeSession({
|
|
45
|
-
voice,
|
|
46
|
-
mode: "default",
|
|
47
|
-
ack,
|
|
48
|
-
auth,
|
|
49
|
-
onAudioDelta(pcm16) {
|
|
50
|
-
player.write(pcm16);
|
|
51
|
-
},
|
|
52
|
-
onTranscript(text) {
|
|
53
|
-
transcript = text;
|
|
54
|
-
if (!ack) finish();
|
|
55
|
-
},
|
|
56
|
-
onSpeechStarted() {
|
|
57
|
-
speechDetected = true;
|
|
58
|
-
if (timeoutTimer) {
|
|
59
|
-
clearTimeout(timeoutTimer);
|
|
60
|
-
timeoutTimer = null;
|
|
61
|
-
}
|
|
62
|
-
},
|
|
63
|
-
onInitialResponseDone() {
|
|
64
|
-
setTimeout(() => {
|
|
65
|
-
recorder = createRecorder();
|
|
66
|
-
recorder.onData((pcm16) => {
|
|
67
|
-
session.sendAudio(pcm16);
|
|
68
|
-
});
|
|
69
|
-
recorder.start();
|
|
70
|
-
}, 500);
|
|
71
|
-
timeoutTimer = setTimeout(() => {
|
|
72
|
-
if (!speechDetected) {
|
|
73
|
-
cleanup();
|
|
74
|
-
reject(new Error(`No speech detected within ${timeout}s timeout`));
|
|
75
|
-
}
|
|
76
|
-
}, timeout * 1e3);
|
|
77
|
-
},
|
|
78
|
-
onDone() {
|
|
79
|
-
if (ack) finish();
|
|
80
|
-
},
|
|
81
|
-
async onError(error) {
|
|
82
|
-
await cleanup();
|
|
83
|
-
reject(new Error(error));
|
|
84
|
-
}
|
|
85
|
-
});
|
|
86
|
-
session.connect().then(() => {
|
|
87
|
-
session.sendMessage(message);
|
|
88
|
-
}, reject);
|
|
89
|
-
});
|
|
90
|
-
}
|
|
91
|
-
export {
|
|
92
|
-
ask
|
|
93
|
-
};
|
package/dist/auth-42XIU3B7.js
DELETED
|
@@ -1,37 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
import {
|
|
3
|
-
writeAuthConfig
|
|
4
|
-
} from "./chunk-7ERYR6ZY.js";
|
|
5
|
-
import "./chunk-D3AGL5JD.js";
|
|
6
|
-
|
|
7
|
-
// src/auth.ts
|
|
8
|
-
import { input, password } from "@inquirer/prompts";
|
|
9
|
-
import OpenAI from "openai";
|
|
10
|
-
var DEFAULT_BASE_URL = "https://api.openai.com/v1";
|
|
11
|
-
async function verifyAuth(apiKey, baseURL) {
|
|
12
|
-
const client = new OpenAI({ apiKey, baseURL });
|
|
13
|
-
await client.models.list();
|
|
14
|
-
}
|
|
15
|
-
async function auth() {
|
|
16
|
-
const baseUrl = await input({
|
|
17
|
-
message: "Base URL",
|
|
18
|
-
default: DEFAULT_BASE_URL
|
|
19
|
-
});
|
|
20
|
-
const apiKey = await password({
|
|
21
|
-
message: "API key"
|
|
22
|
-
});
|
|
23
|
-
if (!apiKey) {
|
|
24
|
-
throw new Error("API key is required.");
|
|
25
|
-
}
|
|
26
|
-
process.stderr.write("Verifying...\n");
|
|
27
|
-
await verifyAuth(apiKey, baseUrl);
|
|
28
|
-
const config = { apiKey };
|
|
29
|
-
if (baseUrl !== DEFAULT_BASE_URL) {
|
|
30
|
-
config.baseUrl = baseUrl;
|
|
31
|
-
}
|
|
32
|
-
writeAuthConfig(config);
|
|
33
|
-
process.stderr.write("Auth config saved to ~/.agent-voice/config.json\n");
|
|
34
|
-
}
|
|
35
|
-
export {
|
|
36
|
-
auth
|
|
37
|
-
};
|