agent-voice 0.1.3 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ask-A32EH5QX.js +208 -0
- package/dist/{auth-BRJKBMOE.js → auth-KET5DNSE.js} +2 -2
- package/dist/{chunk-D3AGL5JD.js → chunk-AHLLYIEW.js} +2 -0
- package/dist/{chunk-7ERYR6ZY.js → chunk-RGYWLATZ.js} +1 -1
- package/dist/{chunk-AQ5LP2XD.js → chunk-UYBFONQE.js} +3 -69
- package/dist/cli.js +81 -8
- package/dist/index.d.ts +6 -16
- package/dist/index.js +213 -179
- package/dist/say-ELJAIWUM.js +92 -0
- package/package.json +3 -4
- package/dist/ask-6HS5WYJU.js +0 -145
- package/dist/say-PKBQ2ZDL.js +0 -62
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import {
|
|
3
|
+
createRealtimeSession
|
|
4
|
+
} from "./chunk-UYBFONQE.js";
|
|
5
|
+
import {
|
|
6
|
+
DEFAULT_VOICE,
|
|
7
|
+
SAMPLE_RATE
|
|
8
|
+
} from "./chunk-AHLLYIEW.js";
|
|
9
|
+
|
|
10
|
+
// src/ask.ts
|
|
11
|
+
import { createRequire } from "module";
|
|
12
|
+
var require2 = createRequire(import.meta.url);
|
|
13
|
+
/**
 * Read a base-10 integer from an environment variable.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is unset, empty, or not numeric.
 * @returns {number} The parsed integer, or `fallback`.
 */
function readIntEnv(name, fallback) {
  const parsed = Number.parseInt(process.env[name] ?? "", 10);
  return Number.isNaN(parsed) ? fallback : parsed;
}

/**
 * Normalize an arbitrary thrown/emitted value into an Error instance.
 *
 * @param {unknown} value - Anything that was thrown or reported as an error.
 * @returns {Error} `value` itself when it already is an Error, otherwise a new Error wrapping it.
 */
function toError(value) {
  return value instanceof Error ? value : new Error(String(value));
}

/**
 * Speak `message` through a realtime session, then listen on the microphone
 * and resolve with the transcript of what the user said.
 *
 * Flow, as implemented below:
 *  1. Start the native audio engine (echo cancellation enabled).
 *  2. Connect the realtime session and send `message`.
 *  3. Play assistant audio deltas through the engine; once any assistant
 *     audio has been heard, forward processed capture frames to the session.
 *  4. Resolve on the first accepted transcript (or, with `ack`, when the
 *     session reports done).
 *
 * Environment variables:
 *  - AGENT_VOICE_AEC_STREAM_DELAY_MS: passed to the engine as `streamDelayMs` (default 30).
 *  - AGENT_VOICE_ECHO_GUARD_MS: transcripts arriving sooner than this after the
 *    last assistant audio delta are ignored (default 1500).
 *  - AGENT_VOICE_DEBUG_ASK_EVENTS=1: write timestamped event logs to stderr.
 *
 * @param {string} message - Text handed to `session.sendMessage`.
 * @param {object} [options]
 * @param {string} [options.voice] - Voice name; defaults to DEFAULT_VOICE.
 * @param {number} [options.timeout=30] - Seconds to wait for speech, and again for a transcript once speech started.
 * @param {boolean} [options.ack=false] - When true, resolve on the session's done event instead of on the transcript.
 * @param {object} [options.auth] - Passed through to the realtime session.
 * @param {(pcm16: Buffer) => void} [options.onAudioFrameSent] - Called with each processed capture frame sent to the session.
 * @param {(pcm16: Buffer) => void} [options.onAssistantAudio] - Called with each assistant audio delta.
 * @param {(pcm16: Buffer) => void} [options.onMicAudio] - Called with each raw capture frame.
 * @returns {Promise<string>} The transcript (may be empty in `ack` mode if none was accepted).
 * @throws {Error} Rejects on timeouts, capture/playback failures, session errors, or connection failure.
 */
async function ask(message, options = {}) {
  const {
    voice = DEFAULT_VOICE,
    timeout = 30,
    ack = false,
    auth,
    onAudioFrameSent,
    onAssistantAudio,
    onMicAudio
  } = options;
  const { AudioEngine } = require2("agent-voice-audio");
  // Parsed once up front; an empty or non-numeric value falls back to the
  // default instead of propagating NaN into the engine / echo guard.
  const streamDelayMs = readIntEnv("AGENT_VOICE_AEC_STREAM_DELAY_MS", 30);
  const echoGuardMs = readIntEnv("AGENT_VOICE_ECHO_GUARD_MS", 1500);
  const engine = new AudioEngine({
    sampleRate: SAMPLE_RATE,
    channels: 1,
    enableAec: true,
    streamDelayMs
  });
  engine.start();
  const debug = process.env.AGENT_VOICE_DEBUG_ASK_EVENTS === "1";
  const startMs = Date.now();
  function logEvent(event, detail) {
    if (!debug) return;
    const elapsed = Date.now() - startMs;
    const suffix = detail ? ` ${detail}` : "";
    process.stderr.write(`[ask ${elapsed}ms] ${event}${suffix}\n`);
  }
  logEvent("start");
  return new Promise((resolve, reject) => {
    // Assigned below; kept as `let` so cleanup() is safe even when session
    // construction itself throws.
    let session = null;
    let transcript = "";
    let timeoutTimer = null;
    let responseStartTimer = null;
    let transcriptTimer = null;
    let capturePollTimer = null;
    let speechDetected = false;
    let initialResponseDone = false;
    let heardAssistantAudio = false;
    let lastAssistantAudioAt = 0;
    let cleaned = false;
    let settled = false;

    // Best-effort teardown: never throws, so the promise always settles.
    async function cleanup() {
      if (cleaned) return;
      cleaned = true;
      logEvent("cleanup:start");
      if (timeoutTimer) clearTimeout(timeoutTimer);
      if (responseStartTimer) clearTimeout(responseStartTimer);
      if (transcriptTimer) clearTimeout(transcriptTimer);
      if (capturePollTimer) clearInterval(capturePollTimer);
      try {
        engine.stop();
        engine.close();
      } catch (err) {
        logEvent("cleanup:engine_error", toError(err).message);
      }
      try {
        session?.close();
      } catch (err) {
        logEvent("cleanup:session_error", toError(err).message);
      }
      logEvent("cleanup:done");
    }
    function resolveOnce(value) {
      if (settled) return;
      settled = true;
      cleanup().then(() => resolve(value));
    }
    function rejectOnce(error) {
      if (settled) return;
      settled = true;
      cleanup().then(() => reject(error));
    }

    // Poll the engine for captured audio every 10 ms.
    capturePollTimer = setInterval(() => {
      if (settled) return;
      let rawFrames = [];
      let processedFrames = [];
      try {
        // NOTE(review): 64 is presumably a max frame count per read — confirm against agent-voice-audio.
        rawFrames = engine.readRawCapture(64);
        processedFrames = engine.readProcessedCapture(64);
      } catch (err) {
        rejectOnce(
          new Error(
            `audio engine capture read failed: ${err instanceof Error ? err.message : String(err)}`
          )
        );
        return;
      }
      for (const frame of rawFrames) onMicAudio?.(frame);
      // Nothing is sent to the model until the assistant has started speaking.
      if (!heardAssistantAudio) return;
      for (const frame of processedFrames) {
        onAudioFrameSent?.(frame);
        session.sendAudio(frame);
      }
    }, 10);

    try {
      session = createRealtimeSession({
        voice,
        mode: "default",
        ack,
        auth,
        onAudioDelta(pcm16) {
          logEvent("realtime:audio_delta", `bytes=${pcm16.length}`);
          heardAssistantAudio = true;
          lastAssistantAudioAt = Date.now();
          onAssistantAudio?.(pcm16);
          try {
            engine.play(pcm16);
          } catch (err) {
            // Surface playback failures through the promise (no-op if already settled)
            // instead of throwing inside the session's event handler.
            rejectOnce(
              new Error(`audio engine playback failed: ${toError(err).message}`)
            );
          }
        },
        onTranscript(text) {
          const sinceAssistantMs = Date.now() - lastAssistantAudioAt;
          if (heardAssistantAudio && sinceAssistantMs < echoGuardMs) {
            logEvent(
              "realtime:transcript_ignored_echo_guard",
              `since_assistant_ms=${sinceAssistantMs} text="${text}"`
            );
            return;
          }
          logEvent("realtime:transcript", `text="${text}"`);
          if (transcriptTimer) {
            clearTimeout(transcriptTimer);
            transcriptTimer = null;
          }
          transcript = text;
          if (!ack) resolveOnce(transcript);
        },
        onSpeechStarted() {
          logEvent("realtime:speech_started");
          // A late event after settlement must not arm new timers.
          if (settled) return;
          speechDetected = true;
          if (timeoutTimer) {
            clearTimeout(timeoutTimer);
            timeoutTimer = null;
          }
          if (transcriptTimer) clearTimeout(transcriptTimer);
          transcriptTimer = setTimeout(() => {
            logEvent("timeout:no_transcript_after_speech");
            rejectOnce(
              new Error(
                `No transcript received within ${timeout}s after speech started`
              )
            );
          }, timeout * 1e3);
          if (!initialResponseDone && heardAssistantAudio) {
            // NOTE(review): an empty buffer is played when the user starts speaking
            // over the assistant — presumably to nudge/flush playback; confirm intent.
            try {
              engine.play(Buffer.alloc(0));
            } catch {
              // Deliberately best-effort.
            }
          }
        },
        onInitialResponseDone() {
          logEvent("realtime:initial_response_done");
          initialResponseDone = true;
          // No speech timeout is needed once settled or once speech was already detected.
          if (settled || speechDetected) return;
          if (timeoutTimer) clearTimeout(timeoutTimer);
          timeoutTimer = setTimeout(() => {
            if (!speechDetected) {
              logEvent("timeout:no_speech");
              rejectOnce(
                new Error(`No speech detected within ${timeout}s timeout`)
              );
            }
          }, timeout * 1e3);
        },
        onDone() {
          logEvent("realtime:done");
          if (ack) resolveOnce(transcript);
        },
        onError(error) {
          logEvent("realtime:error", error);
          rejectOnce(toError(error));
        }
      });
      session.connect().then(
        () => {
          logEvent("realtime:connected");
          // The call may already have failed (e.g. capture error) while connecting.
          if (settled) return;
          logEvent("realtime:send_message");
          try {
            session.sendMessage(message);
          } catch (err) {
            rejectOnce(toError(err));
            return;
          }
          // Give the assistant 10 s to start producing audio.
          responseStartTimer = setTimeout(() => {
            if (!heardAssistantAudio) {
              logEvent("timeout:no_assistant_audio");
              rejectOnce(
                new Error("No assistant audio received after sending message")
              );
            }
          }, 1e4);
        },
        (err) => {
          logEvent(
            "realtime:connect_error",
            err instanceof Error ? err.message : String(err)
          );
          rejectOnce(toError(err));
        }
      );
    } catch (err) {
      // Synchronous failure while creating/connecting the session: tear down
      // the engine and poll timer rather than leaking them.
      logEvent("realtime:setup_error", toError(err).message);
      rejectOnce(toError(err));
    }
  });
}
|
|
206
|
+
export {
|
|
207
|
+
ask
|
|
208
|
+
};
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
// src/types.ts
|
|
4
4
|
var SAMPLE_RATE = 24e3;
|
|
5
5
|
var CHANNELS = 1;
|
|
6
|
+
var BIT_DEPTH = 16;
|
|
6
7
|
var VOICES = [
|
|
7
8
|
"alloy",
|
|
8
9
|
"ash",
|
|
@@ -21,6 +22,7 @@ var DEFAULT_VOICE = "ash";
|
|
|
21
22
|
export {
|
|
22
23
|
SAMPLE_RATE,
|
|
23
24
|
CHANNELS,
|
|
25
|
+
BIT_DEPTH,
|
|
24
26
|
VOICES,
|
|
25
27
|
DEFAULT_VOICE
|
|
26
28
|
};
|
|
@@ -1,71 +1,4 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
import {
|
|
3
|
-
CHANNELS,
|
|
4
|
-
SAMPLE_RATE
|
|
5
|
-
} from "./chunk-D3AGL5JD.js";
|
|
6
|
-
|
|
7
|
-
// src/audio.ts
|
|
8
|
-
import { AudioIO, SampleFormat16Bit } from "naudiodon2";
|
|
9
|
-
function createAudioPlayer() {
|
|
10
|
-
const stream = AudioIO({
|
|
11
|
-
outOptions: {
|
|
12
|
-
channelCount: CHANNELS,
|
|
13
|
-
sampleFormat: SampleFormat16Bit,
|
|
14
|
-
sampleRate: SAMPLE_RATE,
|
|
15
|
-
closeOnError: true
|
|
16
|
-
}
|
|
17
|
-
});
|
|
18
|
-
let closed = false;
|
|
19
|
-
return {
|
|
20
|
-
write(pcm16) {
|
|
21
|
-
return stream.write(pcm16);
|
|
22
|
-
},
|
|
23
|
-
start() {
|
|
24
|
-
stream.start();
|
|
25
|
-
},
|
|
26
|
-
drain() {
|
|
27
|
-
if (closed) return Promise.resolve();
|
|
28
|
-
closed = true;
|
|
29
|
-
return new Promise((resolve) => {
|
|
30
|
-
stream.quit(() => resolve());
|
|
31
|
-
});
|
|
32
|
-
},
|
|
33
|
-
close() {
|
|
34
|
-
if (closed) return;
|
|
35
|
-
closed = true;
|
|
36
|
-
stream.quit();
|
|
37
|
-
}
|
|
38
|
-
};
|
|
39
|
-
}
|
|
40
|
-
function createAudioRecorder() {
|
|
41
|
-
const stream = AudioIO({
|
|
42
|
-
inOptions: {
|
|
43
|
-
channelCount: CHANNELS,
|
|
44
|
-
sampleFormat: SampleFormat16Bit,
|
|
45
|
-
sampleRate: SAMPLE_RATE,
|
|
46
|
-
closeOnError: true
|
|
47
|
-
}
|
|
48
|
-
});
|
|
49
|
-
let stopped = false;
|
|
50
|
-
return {
|
|
51
|
-
onData(cb) {
|
|
52
|
-
stream.on("data", cb);
|
|
53
|
-
},
|
|
54
|
-
start() {
|
|
55
|
-
stream.start();
|
|
56
|
-
},
|
|
57
|
-
stop() {
|
|
58
|
-
if (stopped) return;
|
|
59
|
-
stopped = true;
|
|
60
|
-
stream.quit();
|
|
61
|
-
},
|
|
62
|
-
close() {
|
|
63
|
-
if (stopped) return;
|
|
64
|
-
stopped = true;
|
|
65
|
-
stream.quit();
|
|
66
|
-
}
|
|
67
|
-
};
|
|
68
|
-
}
|
|
69
2
|
|
|
70
3
|
// src/realtime.ts
|
|
71
4
|
import { OpenAIRealtimeWS } from "openai/beta/realtime/ws";
|
|
@@ -110,6 +43,9 @@ function createRealtimeSession(options) {
|
|
|
110
43
|
const pcm16 = Buffer.from(event.delta, "base64");
|
|
111
44
|
options.onAudioDelta(pcm16);
|
|
112
45
|
});
|
|
46
|
+
rt.on("response.audio.done", () => {
|
|
47
|
+
options.onAudioDone?.();
|
|
48
|
+
});
|
|
113
49
|
rt.on("conversation.item.input_audio_transcription.completed", (event) => {
|
|
114
50
|
options.onTranscript(event.transcript);
|
|
115
51
|
});
|
|
@@ -177,7 +113,5 @@ ${text}`
|
|
|
177
113
|
}
|
|
178
114
|
|
|
179
115
|
export {
|
|
180
|
-
createAudioPlayer,
|
|
181
|
-
createAudioRecorder,
|
|
182
116
|
createRealtimeSession
|
|
183
117
|
};
|
package/dist/cli.js
CHANGED
|
@@ -3,13 +3,17 @@ import {
|
|
|
3
3
|
resolveAuth,
|
|
4
4
|
resolveVoice,
|
|
5
5
|
writeVoiceConfig
|
|
6
|
-
} from "./chunk-7ERYR6ZY.js";
|
|
6
|
+
} from "./chunk-RGYWLATZ.js";
|
|
7
7
|
import {
|
|
8
|
+
BIT_DEPTH,
|
|
9
|
+
CHANNELS,
|
|
10
|
+
SAMPLE_RATE,
|
|
8
11
|
VOICES
|
|
9
|
-
} from "./chunk-D3AGL5JD.js";
|
|
12
|
+
} from "./chunk-AHLLYIEW.js";
|
|
10
13
|
|
|
11
14
|
// src/cli.ts
|
|
12
|
-
import { closeSync, openSync, writeSync } from "fs";
|
|
15
|
+
import { closeSync, mkdirSync, openSync, writeFileSync, writeSync } from "fs";
|
|
16
|
+
import { join } from "path";
|
|
13
17
|
import { Command } from "commander";
|
|
14
18
|
async function withSuppressedNativeOutput() {
|
|
15
19
|
const savedStdout = openSync("/dev/fd/1", "w");
|
|
@@ -18,8 +22,8 @@ async function withSuppressedNativeOutput() {
|
|
|
18
22
|
openSync("/dev/null", "w");
|
|
19
23
|
closeSync(2);
|
|
20
24
|
openSync("/dev/null", "w");
|
|
21
|
-
const { ask } = await import("./ask-6HS5WYJU.js");
|
|
22
|
-
const { say } = await import("./say-PKBQ2ZDL.js");
|
|
25
|
+
const { ask } = await import("./ask-A32EH5QX.js");
|
|
26
|
+
const { say } = await import("./say-ELJAIWUM.js");
|
|
23
27
|
function writeResult(text) {
|
|
24
28
|
writeSync(savedStdout, `${text}
|
|
25
29
|
`);
|
|
@@ -45,10 +49,42 @@ async function getMessage(flag) {
|
|
|
45
49
|
if (stdin) return stdin;
|
|
46
50
|
throw new Error("No message provided. Use -m or pipe via stdin.");
|
|
47
51
|
}
|
|
52
|
+
/**
 * Wrap raw PCM bytes in a canonical 44-byte RIFF/WAVE header.
 *
 * The format fields come from the module constants SAMPLE_RATE, CHANNELS and
 * BIT_DEPTH; the audio format tag written is 1 (uncompressed PCM).
 *
 * @param {Buffer} pcm16 - Raw sample bytes to place in the "data" chunk.
 * @returns {Buffer} A new buffer: header followed by `pcm16`.
 */
function createWavBuffer(pcm16) {
  const bytesPerSample = BIT_DEPTH / 8;
  const frameBytes = CHANNELS * bytesPerSample;
  const payloadBytes = pcm16.length;
  const wavHeader = Buffer.alloc(44);

  // Sequential writers: each advances the cursor past what it wrote.
  let cursor = 0;
  const tag = (text) => {
    cursor += wavHeader.write(text, cursor);
  };
  const u32 = (value) => {
    cursor = wavHeader.writeUInt32LE(value, cursor);
  };
  const u16 = (value) => {
    cursor = wavHeader.writeUInt16LE(value, cursor);
  };

  // RIFF container
  tag("RIFF");
  u32(36 + payloadBytes); // size of everything after this field
  tag("WAVE");

  // "fmt " sub-chunk
  tag("fmt ");
  u32(16); // sub-chunk size
  u16(1); // audio format tag
  u16(CHANNELS);
  u32(SAMPLE_RATE);
  u32(SAMPLE_RATE * frameBytes); // byte rate
  u16(frameBytes); // block align
  u16(BIT_DEPTH);

  // "data" sub-chunk
  tag("data");
  u32(payloadBytes);

  return Buffer.concat([wavHeader, pcm16]);
}
|
|
73
|
+
/**
 * Dump the three captured audio streams of an `ask` run as WAV files.
 *
 * Creates `dir` (recursively) if needed, then writes one file per stream,
 * all sharing a timestamp derived from the current ISO time with ":" and "."
 * replaced by "-".
 *
 * @param {string} dir - Output directory.
 * @param {Buffer[]} assistantChunks - Assistant audio chunks.
 * @param {Buffer[]} micChunks - Raw microphone chunks.
 * @param {Buffer[]} modelInputChunks - Chunks that were sent to the model.
 * @returns {{assistantFile: string, micFile: string, modelInputFile: string}} Paths of the written files.
 */
function writeDebugAudio(dir, assistantChunks, micChunks, modelInputChunks) {
  mkdirSync(dir, { recursive: true });
  const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");

  // [result key, file-name label, chunks] — written in this order.
  const streams = [
    ["assistantFile", "assistant-output", assistantChunks],
    ["micFile", "mic-input", micChunks],
    ["modelInputFile", "model-input", modelInputChunks]
  ];

  const written = {};
  for (const [key, label, chunks] of streams) {
    const target = join(dir, `ask-${stamp}-${label}.wav`);
    writeFileSync(target, createWavBuffer(Buffer.concat(chunks)));
    written[key] = target;
  }
  return written;
}
|
|
48
84
|
var program = new Command().name("agent-voice").description("AI agent voice interaction CLI");
|
|
49
85
|
program.command("auth").description("Configure API key and base URL").option("--api-url <url>", "Base URL for the API").option("--api-key <key>", "API key").option("--no-verify", "Skip API key verification").action(async (opts) => {
|
|
50
86
|
try {
|
|
51
|
-
const { auth } = await import("./auth-BRJKBMOE.js");
|
|
87
|
+
const { auth } = await import("./auth-KET5DNSE.js");
|
|
52
88
|
await auth({
|
|
53
89
|
apiUrl: opts.apiUrl,
|
|
54
90
|
apiKey: opts.apiKey,
|
|
@@ -84,8 +120,11 @@ voicesCmd.command("set <voice>").description("Set the default voice").action((vo
|
|
|
84
120
|
`);
|
|
85
121
|
process.exit(0);
|
|
86
122
|
});
|
|
87
|
-
program.command("ask").description("Speak a message and listen for a response").option("-m, --message <text>", "Text message to speak").option("--voice <name>", "OpenAI voice", defaultVoice).option("--timeout <seconds>", "Seconds to wait for user speech", "120").option("--ack", "Speak an acknowledgment after the user responds").action(async (opts) => {
|
|
123
|
+
program.command("ask").description("Speak a message and listen for a response").option("-m, --message <text>", "Text message to speak").option("--voice <name>", "OpenAI voice", defaultVoice).option("--timeout <seconds>", "Seconds to wait for user speech", "120").option("--ack", "Speak an acknowledgment after the user responds").option("--debug-audio-dir <dir>", "Write ask audio debug WAVs to this directory").action(async (opts) => {
|
|
88
124
|
const { ask, writeResult, writeError } = await withSuppressedNativeOutput();
|
|
125
|
+
const assistantChunks = [];
|
|
126
|
+
const micChunks = [];
|
|
127
|
+
const modelInputChunks = [];
|
|
89
128
|
try {
|
|
90
129
|
const auth = resolveAuth();
|
|
91
130
|
const message = await getMessage(opts.message);
|
|
@@ -93,11 +132,45 @@ program.command("ask").description("Speak a message and listen for a response").
|
|
|
93
132
|
voice: opts.voice,
|
|
94
133
|
timeout: Number.parseInt(opts.timeout, 10),
|
|
95
134
|
ack: opts.ack ?? false,
|
|
96
|
-
auth
|
|
135
|
+
auth,
|
|
136
|
+
onAssistantAudio: opts.debugAudioDir ? (pcm16) => assistantChunks.push(Buffer.from(pcm16)) : void 0,
|
|
137
|
+
onMicAudio: opts.debugAudioDir ? (pcm16) => micChunks.push(Buffer.from(pcm16)) : void 0,
|
|
138
|
+
onAudioFrameSent: opts.debugAudioDir ? (pcm16) => modelInputChunks.push(Buffer.from(pcm16)) : void 0
|
|
97
139
|
});
|
|
140
|
+
if (opts.debugAudioDir) {
|
|
141
|
+
const files = writeDebugAudio(
|
|
142
|
+
opts.debugAudioDir,
|
|
143
|
+
assistantChunks,
|
|
144
|
+
micChunks,
|
|
145
|
+
modelInputChunks
|
|
146
|
+
);
|
|
147
|
+
writeError(
|
|
148
|
+
`debug audio written:
|
|
149
|
+
${files.assistantFile}
|
|
150
|
+
${files.micFile}
|
|
151
|
+
${files.modelInputFile}`
|
|
152
|
+
);
|
|
153
|
+
}
|
|
98
154
|
writeResult(transcript);
|
|
99
155
|
process.exit(0);
|
|
100
156
|
} catch (err) {
|
|
157
|
+
if (opts.debugAudioDir) {
|
|
158
|
+
try {
|
|
159
|
+
const files = writeDebugAudio(
|
|
160
|
+
opts.debugAudioDir,
|
|
161
|
+
assistantChunks,
|
|
162
|
+
micChunks,
|
|
163
|
+
modelInputChunks
|
|
164
|
+
);
|
|
165
|
+
writeError(
|
|
166
|
+
`debug audio written:
|
|
167
|
+
${files.assistantFile}
|
|
168
|
+
${files.micFile}
|
|
169
|
+
${files.modelInputFile}`
|
|
170
|
+
);
|
|
171
|
+
} catch {
|
|
172
|
+
}
|
|
173
|
+
}
|
|
101
174
|
writeError(`${err instanceof Error ? err.message : err}`);
|
|
102
175
|
process.exit(1);
|
|
103
176
|
}
|
package/dist/index.d.ts
CHANGED
|
@@ -1,16 +1,3 @@
|
|
|
1
|
-
type AudioPlayer = {
|
|
2
|
-
write(pcm16: Buffer): boolean;
|
|
3
|
-
start(): void;
|
|
4
|
-
drain(): Promise<void>;
|
|
5
|
-
close(): void;
|
|
6
|
-
};
|
|
7
|
-
type AudioRecorder = {
|
|
8
|
-
onData(cb: (pcm16: Buffer) => void): void;
|
|
9
|
-
start(): void;
|
|
10
|
-
stop(): void;
|
|
11
|
-
close(): void;
|
|
12
|
-
};
|
|
13
|
-
|
|
14
1
|
type AuthConfig = {
|
|
15
2
|
apiKey: string;
|
|
16
3
|
baseUrl?: string;
|
|
@@ -23,15 +10,18 @@ type AskOptions = {
|
|
|
23
10
|
timeout?: number;
|
|
24
11
|
ack?: boolean;
|
|
25
12
|
auth?: AuthConfig;
|
|
26
|
-
createPlayer?:
|
|
27
|
-
createRecorder?:
|
|
13
|
+
createPlayer?: unknown;
|
|
14
|
+
createRecorder?: unknown;
|
|
15
|
+
onAudioFrameSent?: (pcm16: Buffer) => void;
|
|
16
|
+
onAssistantAudio?: (pcm16: Buffer) => void;
|
|
17
|
+
onMicAudio?: (pcm16: Buffer) => void;
|
|
28
18
|
};
|
|
29
19
|
declare function ask(message: string, options?: AskOptions): Promise<string>;
|
|
30
20
|
|
|
31
21
|
type SayOptions = {
|
|
32
22
|
voice?: string;
|
|
33
23
|
auth?: AuthConfig;
|
|
34
|
-
createPlayer?:
|
|
24
|
+
createPlayer?: unknown;
|
|
35
25
|
};
|
|
36
26
|
declare function say(message: string, options?: SayOptions): Promise<void>;
|
|
37
27
|
|
package/dist/index.js
CHANGED
|
@@ -1,120 +1,5 @@
|
|
|
1
|
-
// src/
|
|
2
|
-
import {
|
|
3
|
-
|
|
4
|
-
// src/types.ts
|
|
5
|
-
var SAMPLE_RATE = 24e3;
|
|
6
|
-
var CHANNELS = 1;
|
|
7
|
-
var VOICES = [
|
|
8
|
-
"alloy",
|
|
9
|
-
"ash",
|
|
10
|
-
"ballad",
|
|
11
|
-
"coral",
|
|
12
|
-
"echo",
|
|
13
|
-
"fable",
|
|
14
|
-
"nova",
|
|
15
|
-
"onyx",
|
|
16
|
-
"sage",
|
|
17
|
-
"shimmer",
|
|
18
|
-
"verse"
|
|
19
|
-
];
|
|
20
|
-
var DEFAULT_VOICE = "ash";
|
|
21
|
-
|
|
22
|
-
// src/audio.ts
|
|
23
|
-
function createAudioPlayer() {
|
|
24
|
-
const stream = AudioIO({
|
|
25
|
-
outOptions: {
|
|
26
|
-
channelCount: CHANNELS,
|
|
27
|
-
sampleFormat: SampleFormat16Bit,
|
|
28
|
-
sampleRate: SAMPLE_RATE,
|
|
29
|
-
closeOnError: true
|
|
30
|
-
}
|
|
31
|
-
});
|
|
32
|
-
let closed = false;
|
|
33
|
-
return {
|
|
34
|
-
write(pcm16) {
|
|
35
|
-
return stream.write(pcm16);
|
|
36
|
-
},
|
|
37
|
-
start() {
|
|
38
|
-
stream.start();
|
|
39
|
-
},
|
|
40
|
-
drain() {
|
|
41
|
-
if (closed) return Promise.resolve();
|
|
42
|
-
closed = true;
|
|
43
|
-
return new Promise((resolve) => {
|
|
44
|
-
stream.quit(() => resolve());
|
|
45
|
-
});
|
|
46
|
-
},
|
|
47
|
-
close() {
|
|
48
|
-
if (closed) return;
|
|
49
|
-
closed = true;
|
|
50
|
-
stream.quit();
|
|
51
|
-
}
|
|
52
|
-
};
|
|
53
|
-
}
|
|
54
|
-
function createAudioRecorder() {
|
|
55
|
-
const stream = AudioIO({
|
|
56
|
-
inOptions: {
|
|
57
|
-
channelCount: CHANNELS,
|
|
58
|
-
sampleFormat: SampleFormat16Bit,
|
|
59
|
-
sampleRate: SAMPLE_RATE,
|
|
60
|
-
closeOnError: true
|
|
61
|
-
}
|
|
62
|
-
});
|
|
63
|
-
let stopped = false;
|
|
64
|
-
return {
|
|
65
|
-
onData(cb) {
|
|
66
|
-
stream.on("data", cb);
|
|
67
|
-
},
|
|
68
|
-
start() {
|
|
69
|
-
stream.start();
|
|
70
|
-
},
|
|
71
|
-
stop() {
|
|
72
|
-
if (stopped) return;
|
|
73
|
-
stopped = true;
|
|
74
|
-
stream.quit();
|
|
75
|
-
},
|
|
76
|
-
close() {
|
|
77
|
-
if (stopped) return;
|
|
78
|
-
stopped = true;
|
|
79
|
-
stream.quit();
|
|
80
|
-
}
|
|
81
|
-
};
|
|
82
|
-
}
|
|
83
|
-
|
|
84
|
-
// src/echo-canceller.ts
|
|
85
|
-
import { EchoCanceller } from "agent-voice-aec";
|
|
86
|
-
var FRAME_SIZE = 480;
|
|
87
|
-
var FILTER_LENGTH = 4800;
|
|
88
|
-
var FRAME_BYTES = FRAME_SIZE * 2;
|
|
89
|
-
function createEchoCanceller() {
|
|
90
|
-
const aec = new EchoCanceller(FRAME_SIZE, FILTER_LENGTH, SAMPLE_RATE);
|
|
91
|
-
let playbackBuffer = Buffer.alloc(0);
|
|
92
|
-
let captureBuffer = Buffer.alloc(0);
|
|
93
|
-
return {
|
|
94
|
-
playback(pcm16) {
|
|
95
|
-
playbackBuffer = Buffer.concat([playbackBuffer, pcm16]);
|
|
96
|
-
while (playbackBuffer.length >= FRAME_BYTES) {
|
|
97
|
-
aec.playback(playbackBuffer.subarray(0, FRAME_BYTES));
|
|
98
|
-
playbackBuffer = playbackBuffer.subarray(FRAME_BYTES);
|
|
99
|
-
}
|
|
100
|
-
},
|
|
101
|
-
capture(pcm16) {
|
|
102
|
-
captureBuffer = Buffer.concat([captureBuffer, pcm16]);
|
|
103
|
-
const frames = [];
|
|
104
|
-
while (captureBuffer.length >= FRAME_BYTES) {
|
|
105
|
-
const out = aec.capture(captureBuffer.subarray(0, FRAME_BYTES));
|
|
106
|
-
frames.push(out);
|
|
107
|
-
captureBuffer = captureBuffer.subarray(FRAME_BYTES);
|
|
108
|
-
}
|
|
109
|
-
return frames;
|
|
110
|
-
},
|
|
111
|
-
reset() {
|
|
112
|
-
aec.reset();
|
|
113
|
-
playbackBuffer = Buffer.alloc(0);
|
|
114
|
-
captureBuffer = Buffer.alloc(0);
|
|
115
|
-
}
|
|
116
|
-
};
|
|
117
|
-
}
|
|
1
|
+
// src/ask.ts
|
|
2
|
+
import { createRequire } from "module";
|
|
118
3
|
|
|
119
4
|
// src/realtime.ts
|
|
120
5
|
import { OpenAIRealtimeWS } from "openai/beta/realtime/ws";
|
|
@@ -159,6 +44,9 @@ function createRealtimeSession(options) {
|
|
|
159
44
|
const pcm16 = Buffer.from(event.delta, "base64");
|
|
160
45
|
options.onAudioDelta(pcm16);
|
|
161
46
|
});
|
|
47
|
+
rt.on("response.audio.done", () => {
|
|
48
|
+
options.onAudioDone?.();
|
|
49
|
+
});
|
|
162
50
|
rt.on("conversation.item.input_audio_transcription.completed", (event) => {
|
|
163
51
|
options.onTranscript(event.transcript);
|
|
164
52
|
});
|
|
@@ -225,100 +113,216 @@ ${text}`
|
|
|
225
113
|
};
|
|
226
114
|
}
|
|
227
115
|
|
|
116
|
+
// src/types.ts
|
|
117
|
+
var SAMPLE_RATE = 24e3;
|
|
118
|
+
var VOICES = [
|
|
119
|
+
"alloy",
|
|
120
|
+
"ash",
|
|
121
|
+
"ballad",
|
|
122
|
+
"coral",
|
|
123
|
+
"echo",
|
|
124
|
+
"fable",
|
|
125
|
+
"nova",
|
|
126
|
+
"onyx",
|
|
127
|
+
"sage",
|
|
128
|
+
"shimmer",
|
|
129
|
+
"verse"
|
|
130
|
+
];
|
|
131
|
+
var DEFAULT_VOICE = "ash";
|
|
132
|
+
|
|
228
133
|
// src/ask.ts
|
|
134
|
+
var require2 = createRequire(import.meta.url);
|
|
229
135
|
async function ask(message, options = {}) {
|
|
230
136
|
const {
|
|
231
137
|
voice = DEFAULT_VOICE,
|
|
232
138
|
timeout = 30,
|
|
233
139
|
ack = false,
|
|
234
140
|
auth,
|
|
235
|
-
|
|
236
|
-
|
|
141
|
+
onAudioFrameSent,
|
|
142
|
+
onAssistantAudio,
|
|
143
|
+
onMicAudio
|
|
237
144
|
} = options;
|
|
238
|
-
const
|
|
239
|
-
|
|
240
|
-
|
|
145
|
+
const { AudioEngine } = require2("agent-voice-audio");
|
|
146
|
+
const streamDelayMs = Number.parseInt(
|
|
147
|
+
process.env.AGENT_VOICE_AEC_STREAM_DELAY_MS ?? "30",
|
|
148
|
+
10
|
|
149
|
+
);
|
|
150
|
+
const engine = new AudioEngine({
|
|
151
|
+
sampleRate: SAMPLE_RATE,
|
|
152
|
+
channels: 1,
|
|
153
|
+
enableAec: true,
|
|
154
|
+
streamDelayMs
|
|
155
|
+
});
|
|
156
|
+
engine.start();
|
|
157
|
+
const debug = process.env.AGENT_VOICE_DEBUG_ASK_EVENTS === "1";
|
|
158
|
+
const startMs = Date.now();
|
|
159
|
+
function logEvent(event, detail) {
|
|
160
|
+
if (!debug) return;
|
|
161
|
+
const elapsed = Date.now() - startMs;
|
|
162
|
+
const suffix = detail ? ` ${detail}` : "";
|
|
163
|
+
process.stderr.write(`[ask ${elapsed}ms] ${event}${suffix}
|
|
164
|
+
`);
|
|
165
|
+
}
|
|
166
|
+
logEvent("start");
|
|
241
167
|
return new Promise((resolve, reject) => {
|
|
242
|
-
let recorder = null;
|
|
243
|
-
let recorderStarted = false;
|
|
244
168
|
let transcript = "";
|
|
245
169
|
let timeoutTimer = null;
|
|
170
|
+
let responseStartTimer = null;
|
|
171
|
+
let transcriptTimer = null;
|
|
172
|
+
let capturePollTimer = null;
|
|
246
173
|
let speechDetected = false;
|
|
247
174
|
let initialResponseDone = false;
|
|
248
|
-
let
|
|
175
|
+
let heardAssistantAudio = false;
|
|
176
|
+
let lastAssistantAudioAt = 0;
|
|
249
177
|
let cleaned = false;
|
|
250
|
-
let
|
|
178
|
+
let settled = false;
|
|
251
179
|
async function cleanup() {
|
|
252
180
|
if (cleaned) return;
|
|
253
181
|
cleaned = true;
|
|
182
|
+
logEvent("cleanup:start");
|
|
254
183
|
if (timeoutTimer) clearTimeout(timeoutTimer);
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
184
|
+
if (responseStartTimer) clearTimeout(responseStartTimer);
|
|
185
|
+
if (transcriptTimer) clearTimeout(transcriptTimer);
|
|
186
|
+
if (capturePollTimer) clearInterval(capturePollTimer);
|
|
187
|
+
try {
|
|
188
|
+
engine.stop();
|
|
189
|
+
engine.close();
|
|
190
|
+
} catch {
|
|
191
|
+
}
|
|
258
192
|
session.close();
|
|
193
|
+
logEvent("cleanup:done");
|
|
259
194
|
}
|
|
260
|
-
function
|
|
261
|
-
if (
|
|
262
|
-
|
|
263
|
-
cleanup().then(() => resolve(
|
|
195
|
+
function resolveOnce(value) {
|
|
196
|
+
if (settled) return;
|
|
197
|
+
settled = true;
|
|
198
|
+
cleanup().then(() => resolve(value));
|
|
264
199
|
}
|
|
265
|
-
function
|
|
266
|
-
if (
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
recorder.onData((pcm16) => {
|
|
270
|
-
const cleaned2 = echoCanceller.capture(pcm16);
|
|
271
|
-
for (const frame of cleaned2) {
|
|
272
|
-
session.sendAudio(frame);
|
|
273
|
-
}
|
|
274
|
-
});
|
|
275
|
-
recorder.start();
|
|
200
|
+
function rejectOnce(error) {
|
|
201
|
+
if (settled) return;
|
|
202
|
+
settled = true;
|
|
203
|
+
cleanup().then(() => reject(error));
|
|
276
204
|
}
|
|
205
|
+
capturePollTimer = setInterval(() => {
|
|
206
|
+
if (settled) return;
|
|
207
|
+
let rawFrames = [];
|
|
208
|
+
let processedFrames = [];
|
|
209
|
+
try {
|
|
210
|
+
rawFrames = engine.readRawCapture(64);
|
|
211
|
+
processedFrames = engine.readProcessedCapture(64);
|
|
212
|
+
} catch (err) {
|
|
213
|
+
rejectOnce(
|
|
214
|
+
new Error(
|
|
215
|
+
`audio engine capture read failed: ${err instanceof Error ? err.message : String(err)}`
|
|
216
|
+
)
|
|
217
|
+
);
|
|
218
|
+
return;
|
|
219
|
+
}
|
|
220
|
+
for (const frame of rawFrames) onMicAudio?.(frame);
|
|
221
|
+
if (!heardAssistantAudio) return;
|
|
222
|
+
for (const frame of processedFrames) {
|
|
223
|
+
onAudioFrameSent?.(frame);
|
|
224
|
+
session.sendAudio(frame);
|
|
225
|
+
}
|
|
226
|
+
}, 10);
|
|
277
227
|
const session = createRealtimeSession({
|
|
278
228
|
voice,
|
|
279
229
|
mode: "default",
|
|
280
230
|
ack,
|
|
281
231
|
auth,
|
|
282
232
|
onAudioDelta(pcm16) {
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
233
|
+
logEvent("realtime:audio_delta", `bytes=${pcm16.length}`);
|
|
234
|
+
heardAssistantAudio = true;
|
|
235
|
+
lastAssistantAudioAt = Date.now();
|
|
236
|
+
onAssistantAudio?.(pcm16);
|
|
237
|
+
engine.play(pcm16);
|
|
286
238
|
},
|
|
287
239
|
onTranscript(text) {
|
|
240
|
+
const echoGuardMs = Number.parseInt(
|
|
241
|
+
process.env.AGENT_VOICE_ECHO_GUARD_MS ?? "1500",
|
|
242
|
+
10
|
|
243
|
+
);
|
|
244
|
+
const sinceAssistantMs = Date.now() - lastAssistantAudioAt;
|
|
245
|
+
if (heardAssistantAudio && sinceAssistantMs < echoGuardMs) {
|
|
246
|
+
logEvent(
|
|
247
|
+
"realtime:transcript_ignored_echo_guard",
|
|
248
|
+
`since_assistant_ms=${sinceAssistantMs} text="${text}"`
|
|
249
|
+
);
|
|
250
|
+
return;
|
|
251
|
+
}
|
|
252
|
+
logEvent("realtime:transcript", `text="${text}"`);
|
|
253
|
+
if (transcriptTimer) {
|
|
254
|
+
clearTimeout(transcriptTimer);
|
|
255
|
+
transcriptTimer = null;
|
|
256
|
+
}
|
|
288
257
|
transcript = text;
|
|
289
|
-
if (!ack)
|
|
258
|
+
if (!ack) resolveOnce(transcript);
|
|
290
259
|
},
|
|
291
260
|
onSpeechStarted() {
|
|
261
|
+
logEvent("realtime:speech_started");
|
|
292
262
|
speechDetected = true;
|
|
293
263
|
if (timeoutTimer) {
|
|
294
264
|
clearTimeout(timeoutTimer);
|
|
295
265
|
timeoutTimer = null;
|
|
296
266
|
}
|
|
297
|
-
if (
|
|
298
|
-
|
|
299
|
-
|
|
267
|
+
if (transcriptTimer) clearTimeout(transcriptTimer);
|
|
268
|
+
transcriptTimer = setTimeout(() => {
|
|
269
|
+
logEvent("timeout:no_transcript_after_speech");
|
|
270
|
+
rejectOnce(
|
|
271
|
+
new Error(
|
|
272
|
+
`No transcript received within ${timeout}s after speech started`
|
|
273
|
+
)
|
|
274
|
+
);
|
|
275
|
+
}, timeout * 1e3);
|
|
276
|
+
if (!initialResponseDone && heardAssistantAudio) {
|
|
277
|
+
try {
|
|
278
|
+
engine.play(Buffer.alloc(0));
|
|
279
|
+
} catch {
|
|
280
|
+
}
|
|
300
281
|
}
|
|
301
282
|
},
|
|
302
283
|
onInitialResponseDone() {
|
|
284
|
+
logEvent("realtime:initial_response_done");
|
|
303
285
|
initialResponseDone = true;
|
|
304
286
|
timeoutTimer = setTimeout(() => {
|
|
305
287
|
if (!speechDetected) {
|
|
306
|
-
|
|
307
|
-
|
|
288
|
+
logEvent("timeout:no_speech");
|
|
289
|
+
rejectOnce(
|
|
290
|
+
new Error(`No speech detected within ${timeout}s timeout`)
|
|
291
|
+
);
|
|
308
292
|
}
|
|
309
293
|
}, timeout * 1e3);
|
|
310
294
|
},
|
|
311
295
|
onDone() {
|
|
312
|
-
|
|
296
|
+
logEvent("realtime:done");
|
|
297
|
+
if (ack) resolveOnce(transcript);
|
|
313
298
|
},
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
299
|
+
onError(error) {
|
|
300
|
+
logEvent("realtime:error", error);
|
|
301
|
+
rejectOnce(new Error(error));
|
|
317
302
|
}
|
|
318
303
|
});
|
|
319
|
-
session.connect().then(
|
|
320
|
-
|
|
321
|
-
|
|
304
|
+
session.connect().then(
|
|
305
|
+
() => {
|
|
306
|
+
logEvent("realtime:connected");
|
|
307
|
+
logEvent("realtime:send_message");
|
|
308
|
+
session.sendMessage(message);
|
|
309
|
+
responseStartTimer = setTimeout(() => {
|
|
310
|
+
if (!heardAssistantAudio) {
|
|
311
|
+
logEvent("timeout:no_assistant_audio");
|
|
312
|
+
rejectOnce(
|
|
313
|
+
new Error("No assistant audio received after sending message")
|
|
314
|
+
);
|
|
315
|
+
}
|
|
316
|
+
}, 1e4);
|
|
317
|
+
},
|
|
318
|
+
(err) => {
|
|
319
|
+
logEvent(
|
|
320
|
+
"realtime:connect_error",
|
|
321
|
+
err instanceof Error ? err.message : String(err)
|
|
322
|
+
);
|
|
323
|
+
rejectOnce(err instanceof Error ? err : new Error(String(err)));
|
|
324
|
+
}
|
|
325
|
+
);
|
|
322
326
|
});
|
|
323
327
|
}
|
|
324
328
|
|
|
@@ -353,48 +357,78 @@ function resolveVoice() {
|
|
|
353
357
|
}
|
|
354
358
|
|
|
355
359
|
// src/say.ts
|
|
360
|
+
import { createRequire as createRequire2 } from "module";
|
|
361
|
+
var require3 = createRequire2(import.meta.url);
|
|
356
362
|
async function say(message, options = {}) {
|
|
357
|
-
const {
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
363
|
+
const { voice = DEFAULT_VOICE, auth } = options;
|
|
364
|
+
const { AudioEngine } = require3("agent-voice-audio");
|
|
365
|
+
const engine = new AudioEngine({
|
|
366
|
+
sampleRate: SAMPLE_RATE,
|
|
367
|
+
channels: 1,
|
|
368
|
+
enableAec: false
|
|
369
|
+
});
|
|
370
|
+
engine.start();
|
|
364
371
|
return new Promise((resolve, reject) => {
|
|
365
372
|
let cleaned = false;
|
|
373
|
+
let settled = false;
|
|
374
|
+
let responseDoneFallbackTimer = null;
|
|
375
|
+
let completionTailTimer = null;
|
|
366
376
|
function cleanup() {
|
|
367
377
|
if (cleaned) return;
|
|
368
378
|
cleaned = true;
|
|
379
|
+
if (responseDoneFallbackTimer) clearTimeout(responseDoneFallbackTimer);
|
|
380
|
+
if (completionTailTimer) clearTimeout(completionTailTimer);
|
|
381
|
+
try {
|
|
382
|
+
engine.stop();
|
|
383
|
+
engine.close();
|
|
384
|
+
} catch {
|
|
385
|
+
}
|
|
369
386
|
session.close();
|
|
370
387
|
}
|
|
388
|
+
function resolveOnce() {
|
|
389
|
+
if (settled) return;
|
|
390
|
+
settled = true;
|
|
391
|
+
cleanup();
|
|
392
|
+
resolve();
|
|
393
|
+
}
|
|
394
|
+
function rejectOnce(error) {
|
|
395
|
+
if (settled) return;
|
|
396
|
+
settled = true;
|
|
397
|
+
cleanup();
|
|
398
|
+
reject(error);
|
|
399
|
+
}
|
|
400
|
+
function scheduleTailResolve(delayMs) {
|
|
401
|
+
if (settled) return;
|
|
402
|
+
if (completionTailTimer) clearTimeout(completionTailTimer);
|
|
403
|
+
completionTailTimer = setTimeout(() => {
|
|
404
|
+
resolveOnce();
|
|
405
|
+
}, delayMs);
|
|
406
|
+
}
|
|
371
407
|
const session = createRealtimeSession({
|
|
372
408
|
voice,
|
|
373
409
|
mode: "say",
|
|
374
410
|
ack: false,
|
|
375
411
|
auth,
|
|
376
412
|
onAudioDelta(pcm16) {
|
|
377
|
-
|
|
413
|
+
engine.play(pcm16);
|
|
414
|
+
},
|
|
415
|
+
onAudioDone() {
|
|
416
|
+
scheduleTailResolve(140);
|
|
378
417
|
},
|
|
379
418
|
onTranscript() {
|
|
380
419
|
},
|
|
381
420
|
onSpeechStarted() {
|
|
382
421
|
},
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
}
|
|
389
|
-
cleanup();
|
|
390
|
-
resolve();
|
|
422
|
+
onInitialResponseDone() {
|
|
423
|
+
if (responseDoneFallbackTimer) clearTimeout(responseDoneFallbackTimer);
|
|
424
|
+
responseDoneFallbackTimer = setTimeout(() => {
|
|
425
|
+
scheduleTailResolve(220);
|
|
426
|
+
}, 700);
|
|
391
427
|
},
|
|
392
428
|
onDone() {
|
|
393
429
|
},
|
|
394
430
|
onError(error) {
|
|
395
|
-
|
|
396
|
-
cleanup();
|
|
397
|
-
reject(new Error(error));
|
|
431
|
+
rejectOnce(new Error(error));
|
|
398
432
|
}
|
|
399
433
|
});
|
|
400
434
|
session.connect().then(() => {
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import {
|
|
3
|
+
createRealtimeSession
|
|
4
|
+
} from "./chunk-UYBFONQE.js";
|
|
5
|
+
import {
|
|
6
|
+
DEFAULT_VOICE,
|
|
7
|
+
SAMPLE_RATE
|
|
8
|
+
} from "./chunk-AHLLYIEW.js";
|
|
9
|
+
|
|
10
|
+
// src/say.ts
|
|
11
|
+
import { createRequire } from "module";
|
|
12
|
+
var require2 = createRequire(import.meta.url);
|
|
13
|
+
/**
 * Sends `message` through a realtime session created in "say" mode and plays
 * every audio delta the session delivers on an `AudioEngine` (echo
 * cancellation disabled).
 *
 * Settlement rules:
 * - resolves (no value) 140 ms after the session invokes `onAudioDone`;
 * - as a fallback, 700 ms after `onInitialResponseDone` a 220 ms tail timer is
 *   armed that resolves the promise;
 * - rejects when the session invokes `onError`, when `connect()` fails, or
 *   when `sendMessage()` throws.
 *
 * Whichever way the promise settles, the timers are cleared and the engine and
 * session are shut down exactly once.
 *
 * @param message Value forwarded to `session.sendMessage()` after connecting.
 * @param {object} [options]
 * @param [options.voice=DEFAULT_VOICE] Passed to `createRealtimeSession`.
 * @param [options.auth] Passed through unchanged to `createRealtimeSession`.
 * @returns {Promise<void>}
 */
async function say(message, options = {}) {
  const { voice = DEFAULT_VOICE, auth } = options;
  const { AudioEngine } = require2("agent-voice-audio");
  const engine = new AudioEngine({
    sampleRate: SAMPLE_RATE,
    channels: 1,
    enableAec: false
  });
  engine.start();
  return new Promise((resolve, reject) => {
    let cleaned = false;
    let settled = false;
    let responseDoneFallbackTimer = null;
    let completionTailTimer = null;

    // Idempotent teardown: timers first, then the audio engine, then the session.
    function cleanup() {
      if (cleaned) return;
      cleaned = true;
      if (responseDoneFallbackTimer) clearTimeout(responseDoneFallbackTimer);
      if (completionTailTimer) clearTimeout(completionTailTimer);
      try {
        engine.stop();
        engine.close();
      } catch {
        // Deliberately best-effort: an engine shutdown failure must not keep
        // the session from being closed or the promise from settling.
      }
      session.close();
    }

    // Settle helpers: only the first call has any effect.
    function resolveOnce() {
      if (settled) return;
      settled = true;
      cleanup();
      resolve();
    }
    function rejectOnce(error) {
      if (settled) return;
      settled = true;
      cleanup();
      reject(error);
    }

    // (Re)arms the single tail timer; a later call replaces an earlier one.
    function scheduleTailResolve(delayMs) {
      if (settled) return;
      if (completionTailTimer) clearTimeout(completionTailTimer);
      completionTailTimer = setTimeout(() => {
        resolveOnce();
      }, delayMs);
    }

    const session = createRealtimeSession({
      voice,
      mode: "say",
      ack: false,
      auth,
      onAudioDelta(pcm16) {
        engine.play(pcm16);
      },
      onAudioDone() {
        scheduleTailResolve(140);
      },
      onTranscript() {
      },
      onSpeechStarted() {
      },
      onInitialResponseDone() {
        // Fallback in case onAudioDone never arrives.
        if (responseDoneFallbackTimer) clearTimeout(responseDoneFallbackTimer);
        responseDoneFallbackTimer = setTimeout(() => {
          scheduleTailResolve(220);
        }, 700);
      },
      onDone() {
      },
      onError(error) {
        rejectOnce(new Error(error));
      }
    });

    // Route connect failures AND a throwing sendMessage() through rejectOnce so
    // the engine and session are always released before the promise rejects.
    session
      .connect()
      .then(() => {
        session.sendMessage(message);
      })
      .catch((err) => {
        rejectOnce(err instanceof Error ? err : new Error(String(err)));
      });
  });
}
|
|
90
|
+
export {
|
|
91
|
+
say
|
|
92
|
+
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agent-voice",
|
|
3
|
-
"version": "0.1.3",
|
|
3
|
+
"version": "0.2.1",
|
|
4
4
|
"description": "CLI for AI agents to interact with humans via voice",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -18,12 +18,11 @@
|
|
|
18
18
|
"dist"
|
|
19
19
|
],
|
|
20
20
|
"dependencies": {
|
|
21
|
+
"agent-voice-audio": "^0.2.1",
|
|
21
22
|
"@inquirer/prompts": "^8.2.0",
|
|
22
23
|
"commander": "^13.1.0",
|
|
23
|
-
"naudiodon2": "^2.1.0",
|
|
24
24
|
"openai": "^4.96.0",
|
|
25
|
-
"ws": "^8.18.0",
|
|
26
|
-
"agent-voice-aec": "0.1.1"
|
|
25
|
+
"ws": "^8.18.0"
|
|
27
26
|
},
|
|
28
27
|
"devDependencies": {
|
|
29
28
|
"@types/node": "^22.12.0",
|
package/dist/ask-6HS5WYJU.js
DELETED
|
@@ -1,145 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
import {
|
|
3
|
-
createAudioPlayer,
|
|
4
|
-
createAudioRecorder,
|
|
5
|
-
createRealtimeSession
|
|
6
|
-
} from "./chunk-AQ5LP2XD.js";
|
|
7
|
-
import {
|
|
8
|
-
DEFAULT_VOICE,
|
|
9
|
-
SAMPLE_RATE
|
|
10
|
-
} from "./chunk-D3AGL5JD.js";
|
|
11
|
-
|
|
12
|
-
// src/echo-canceller.ts
|
|
13
|
-
import { EchoCanceller } from "agent-voice-aec";
|
|
14
|
-
var FRAME_SIZE = 480;
|
|
15
|
-
var FILTER_LENGTH = 4800;
|
|
16
|
-
var FRAME_BYTES = FRAME_SIZE * 2;
|
|
17
|
-
/**
 * Wraps an `EchoCanceller` with two accumulation buffers so callers can feed
 * PCM chunks of any size; the canceller itself is only ever handed slices of
 * exactly FRAME_BYTES bytes. Bytes that do not fill a whole frame stay
 * buffered until a later call supplies the rest.
 *
 * @returns {{playback: Function, capture: Function, reset: Function}}
 */
function createEchoCanceller() {
  const canceller = new EchoCanceller(FRAME_SIZE, FILTER_LENGTH, SAMPLE_RATE);
  let pendingPlayback = Buffer.alloc(0);
  let pendingCapture = Buffer.alloc(0);

  // Feed playback audio to the canceller, one whole frame at a time.
  function playback(pcm16) {
    pendingPlayback = Buffer.concat([pendingPlayback, pcm16]);
    for (
      ;
      pendingPlayback.length >= FRAME_BYTES;
      pendingPlayback = pendingPlayback.subarray(FRAME_BYTES)
    ) {
      canceller.playback(pendingPlayback.subarray(0, FRAME_BYTES));
    }
  }

  // Run captured audio through the canceller; returns the frames it produced
  // (an empty array when less than one full frame is buffered).
  function capture(pcm16) {
    pendingCapture = Buffer.concat([pendingCapture, pcm16]);
    const processed = [];
    for (
      ;
      pendingCapture.length >= FRAME_BYTES;
      pendingCapture = pendingCapture.subarray(FRAME_BYTES)
    ) {
      processed.push(canceller.capture(pendingCapture.subarray(0, FRAME_BYTES)));
    }
    return processed;
  }

  // Reset the canceller and discard any partially buffered audio.
  function reset() {
    canceller.reset();
    pendingPlayback = Buffer.alloc(0);
    pendingCapture = Buffer.alloc(0);
  }

  return { playback, capture, reset };
}
|
|
46
|
-
|
|
47
|
-
// src/ask.ts
|
|
48
|
-
/**
 * Sends `message` through a realtime session, plays the session's audio on a
 * player, starts recording on the first audio delta (captured audio is passed
 * through the echo canceller before being sent to the session), and settles
 * with the latest transcript text.
 *
 * Settlement rules:
 * - `ack` false: resolves with the text as soon as `onTranscript` fires;
 * - `ack` true: resolves with the last transcript when `onDone` fires;
 * - rejects with "No speech detected within <timeout>s timeout" when no
 *   `onSpeechStarted` arrives within `timeout` seconds of
 *   `onInitialResponseDone`;
 * - rejects when the session reports `onError` or `connect()` fails.
 *
 * @param message Value forwarded to `session.sendMessage()` after connecting.
 * @param {object} [options]
 * @param [options.voice=DEFAULT_VOICE] Passed to `createRealtimeSession`.
 * @param {number} [options.timeout=30] Seconds to wait for speech.
 * @param {boolean} [options.ack=false] Selects the resolve rule above; also
 *   passed to `createRealtimeSession`.
 * @param [options.auth] Passed through unchanged to `createRealtimeSession`.
 * @param {Function} [options.createPlayer=createAudioPlayer] Player factory.
 * @param {Function} [options.createRecorder=createAudioRecorder] Recorder factory.
 * @returns {Promise<string>} The transcript text.
 */
async function ask(message, options = {}) {
  const {
    voice = DEFAULT_VOICE,
    timeout = 30,
    ack = false,
    auth,
    createPlayer = createAudioPlayer,
    createRecorder = createAudioRecorder
  } = options;
  const player = createPlayer();
  player.start();
  const echoCanceller = createEchoCanceller();
  return new Promise((resolve, reject) => {
    let recorder = null;
    let recorderStarted = false;
    let transcript = "";
    let timeoutTimer = null;
    let speechDetected = false;
    let initialResponseDone = false;
    let cleaned = false;
    let resolved = false;

    // Idempotent teardown. Recorder shutdown and player drain are best-effort
    // so that session.close() always runs and this promise does not reject
    // just because the player was already closed by an interruption.
    async function cleanup() {
      if (cleaned) return;
      cleaned = true;
      if (timeoutTimer) clearTimeout(timeoutTimer);
      try {
        recorder?.stop();
        recorder?.close();
      } catch {
        // Best-effort: keep tearing down.
      }
      try {
        await player.drain();
      } catch {
        // Best-effort: drain may fail after player.close() in onSpeechStarted.
      }
      session.close();
    }

    // Resolve with the current transcript once teardown has finished.
    function finish() {
      if (resolved) return;
      resolved = true;
      const settle = () => resolve(transcript);
      cleanup().then(settle, settle);
    }

    // Lazily create and start the recorder the first time audio arrives.
    function startRecorder() {
      if (recorderStarted) return;
      recorderStarted = true;
      recorder = createRecorder();
      recorder.onData((pcm16) => {
        const cleaned2 = echoCanceller.capture(pcm16);
        for (const frame of cleaned2) {
          session.sendAudio(frame);
        }
      });
      recorder.start();
    }

    const session = createRealtimeSession({
      voice,
      mode: "default",
      ack,
      auth,
      onAudioDelta(pcm16) {
        // Feed the canceller the same audio that is being played.
        echoCanceller.playback(pcm16);
        player.write(pcm16);
        startRecorder();
      },
      onTranscript(text) {
        transcript = text;
        if (!ack) finish();
      },
      onSpeechStarted() {
        speechDetected = true;
        if (timeoutTimer) {
          clearTimeout(timeoutTimer);
          timeoutTimer = null;
        }
        // Speech before the initial response finished: stop playback now.
        if (!initialResponseDone) {
          player.close();
        }
      },
      onInitialResponseDone() {
        initialResponseDone = true;
        timeoutTimer = setTimeout(() => {
          if (!speechDetected) {
            // Teardown continues in the background; cleanup() handles its own
            // drain/recorder failures.
            void cleanup();
            reject(new Error(`No speech detected within ${timeout}s timeout`));
          }
        }, timeout * 1e3);
      },
      onDone() {
        if (ack) finish();
      },
      async onError(error) {
        try {
          await cleanup();
        } finally {
          reject(new Error(error));
        }
      }
    });

    session.connect().then(
      () => {
        session.sendMessage(message);
      },
      (err) => {
        // Release the player/session instead of leaving them open on a
        // failed connection; the original rejection value is preserved.
        void cleanup().catch(() => {});
        reject(err);
      }
    );
  });
}
|
|
143
|
-
export {
|
|
144
|
-
ask
|
|
145
|
-
};
|
package/dist/say-PKBQ2ZDL.js
DELETED
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
import {
|
|
3
|
-
createAudioPlayer,
|
|
4
|
-
createRealtimeSession
|
|
5
|
-
} from "./chunk-AQ5LP2XD.js";
|
|
6
|
-
import {
|
|
7
|
-
DEFAULT_VOICE
|
|
8
|
-
} from "./chunk-D3AGL5JD.js";
|
|
9
|
-
|
|
10
|
-
// src/say.ts
|
|
11
|
-
/**
 * Sends `message` through a realtime session created in "say" mode and writes
 * every audio delta to a player.
 *
 * Resolves (no value) after `onInitialResponseDone` once the player has
 * drained (the player is closed instead if draining fails). Rejects when the
 * session reports `onError` or `connect()` fails; in both failure cases the
 * player is closed and the session is closed first.
 *
 * @param message Value forwarded to `session.sendMessage()` after connecting.
 * @param {object} [options]
 * @param [options.voice=DEFAULT_VOICE] Passed to `createRealtimeSession`.
 * @param [options.auth] Passed through unchanged to `createRealtimeSession`.
 * @param {Function} [options.createPlayer=createAudioPlayer] Player factory.
 * @returns {Promise<void>}
 */
async function say(message, options = {}) {
  const {
    voice = DEFAULT_VOICE,
    auth,
    createPlayer = createAudioPlayer
  } = options;
  const player = createPlayer();
  player.start();
  return new Promise((resolve, reject) => {
    let cleaned = false;

    // Closes the session at most once.
    function cleanup() {
      if (cleaned) return;
      cleaned = true;
      session.close();
    }

    // Shared failure path: stop playback, close the session, then reject.
    function fail(error) {
      player.close();
      cleanup();
      reject(error);
    }

    const session = createRealtimeSession({
      voice,
      mode: "say",
      ack: false,
      auth,
      onAudioDelta(pcm16) {
        player.write(pcm16);
      },
      onTranscript() {
      },
      onSpeechStarted() {
      },
      async onInitialResponseDone() {
        try {
          await player.drain();
        } catch {
          // Draining failed: fall back to closing the player outright.
          player.close();
        }
        cleanup();
        resolve();
      },
      onDone() {
      },
      onError(error) {
        fail(new Error(error));
      }
    });

    // A failed connect must release the player and session too, not just
    // reject; the original rejection value is passed through unchanged.
    session.connect().then(() => {
      session.sendMessage(message);
    }, fail);
  });
}
|
|
60
|
-
export {
|
|
61
|
-
say
|
|
62
|
-
};
|