agent-voice 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{ask-GUSXGYSY.js → ask-OIE6HL2H.js} +85 -11
- package/dist/{chunk-VV2VNOC4.js → chunk-UYBFONQE.js} +3 -0
- package/dist/cli.js +17 -5
- package/dist/index.d.ts +75 -4
- package/dist/index.js +194 -17
- package/dist/say-ZVF6EX52.js +164 -0
- package/package.json +2 -2
- package/dist/say-W56HCNK4.js +0 -64
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import {
|
|
3
3
|
createRealtimeSession
|
|
4
|
-
} from "./chunk-VV2VNOC4.js";
|
|
4
|
+
} from "./chunk-UYBFONQE.js";
|
|
5
5
|
import {
|
|
6
6
|
DEFAULT_VOICE,
|
|
7
7
|
SAMPLE_RATE
|
|
@@ -10,22 +10,38 @@ import {
|
|
|
10
10
|
// src/ask.ts
|
|
11
11
|
import { createRequire } from "module";
|
|
12
12
|
var require2 = createRequire(import.meta.url);
|
|
13
|
+
/**
 * Computes the root-mean-square amplitude of signed 16-bit little-endian PCM audio.
 *
 * Accepts a Node `Buffer` or any other byte-oriented typed array / ArrayBuffer
 * view; samples are read through a `DataView`, so the input does not need the
 * Node-specific `readInt16LE` method and may sit at any offset in its backing
 * `ArrayBuffer`.
 *
 * @param {Uint8Array} pcm16 - Raw PCM bytes, two bytes per sample, little-endian.
 * @returns {number} RMS of the sample values, or 0 when there is no complete sample.
 */
function pcm16Rms(pcm16) {
  // A trailing odd byte cannot form a sample and is ignored.
  const sampleCount = Math.floor(pcm16.byteLength / 2);
  if (sampleCount === 0) return 0;
  // Respect byteOffset: Buffers are frequently slices of a shared pool.
  const view = new DataView(pcm16.buffer, pcm16.byteOffset, sampleCount * 2);
  let sumSquares = 0;
  for (let i = 0; i < sampleCount; i++) {
    const value = view.getInt16(i * 2, true);
    sumSquares += value * value;
  }
  return Math.sqrt(sumSquares / sampleCount);
}
|
|
23
|
+
/**
 * Reads an integer override from an environment variable.
 *
 * The whole (trimmed) value must be a signed decimal integer. Values that only
 * start with digits ("30ms", "1e3", "12.9") are rejected rather than being
 * truncated to their numeric prefix, so a malformed override falls back to the
 * default instead of silently becoming a different number.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value returned when the variable is unset or invalid.
 * @returns {number} The parsed integer, or `fallback`.
 */
function readEnvInt(name, fallback) {
  const raw = process.env[name];
  if (raw == null) return fallback;
  const text = raw.trim();
  if (!/^[+-]?\d+$/.test(text)) return fallback;
  const parsed = Number.parseInt(text, 10);
  // Absurdly long digit strings overflow to Infinity; treat those as invalid too.
  return Number.isFinite(parsed) ? parsed : fallback;
}
|
|
13
29
|
async function ask(message, options = {}) {
|
|
14
30
|
const {
|
|
15
31
|
voice = DEFAULT_VOICE,
|
|
16
32
|
timeout = 30,
|
|
17
33
|
ack = false,
|
|
18
34
|
auth,
|
|
35
|
+
createSession,
|
|
36
|
+
createAudioEngine,
|
|
37
|
+
onTrace,
|
|
19
38
|
onAudioFrameSent,
|
|
20
39
|
onAssistantAudio,
|
|
21
40
|
onMicAudio
|
|
22
41
|
} = options;
|
|
23
42
|
const { AudioEngine } = require2("agent-voice-audio");
|
|
24
|
-
const streamDelayMs =
|
|
25
|
-
|
|
26
|
-
10
|
|
27
|
-
);
|
|
28
|
-
const engine = new AudioEngine({
|
|
43
|
+
const streamDelayMs = readEnvInt("AGENT_VOICE_AEC_STREAM_DELAY_MS", 30);
|
|
44
|
+
const engine = (createAudioEngine ?? ((engineOptions) => new AudioEngine(engineOptions)))({
|
|
29
45
|
sampleRate: SAMPLE_RATE,
|
|
30
46
|
channels: 1,
|
|
31
47
|
enableAec: true,
|
|
@@ -41,7 +57,11 @@ async function ask(message, options = {}) {
|
|
|
41
57
|
process.stderr.write(`[ask ${elapsed}ms] ${event}${suffix}
|
|
42
58
|
`);
|
|
43
59
|
}
|
|
60
|
+
function trace(event, detail) {
|
|
61
|
+
onTrace?.({ atMs: Date.now() - startMs, event, detail });
|
|
62
|
+
}
|
|
44
63
|
logEvent("start");
|
|
64
|
+
trace("start");
|
|
45
65
|
return new Promise((resolve, reject) => {
|
|
46
66
|
let transcript = "";
|
|
47
67
|
let timeoutTimer = null;
|
|
@@ -49,15 +69,19 @@ async function ask(message, options = {}) {
|
|
|
49
69
|
let transcriptTimer = null;
|
|
50
70
|
let capturePollTimer = null;
|
|
51
71
|
let speechDetected = false;
|
|
72
|
+
let speechStartedAtMs = 0;
|
|
52
73
|
let initialResponseDone = false;
|
|
53
74
|
let heardAssistantAudio = false;
|
|
54
75
|
let lastAssistantAudioAt = 0;
|
|
76
|
+
let nearEndEvidenceSeen = false;
|
|
77
|
+
let nearEndEvidenceAtMs = 0;
|
|
55
78
|
let cleaned = false;
|
|
56
79
|
let settled = false;
|
|
57
80
|
async function cleanup() {
|
|
58
81
|
if (cleaned) return;
|
|
59
82
|
cleaned = true;
|
|
60
83
|
logEvent("cleanup:start");
|
|
84
|
+
trace("cleanup:start");
|
|
61
85
|
if (timeoutTimer) clearTimeout(timeoutTimer);
|
|
62
86
|
if (responseStartTimer) clearTimeout(responseStartTimer);
|
|
63
87
|
if (transcriptTimer) clearTimeout(transcriptTimer);
|
|
@@ -69,6 +93,7 @@ async function ask(message, options = {}) {
|
|
|
69
93
|
}
|
|
70
94
|
session.close();
|
|
71
95
|
logEvent("cleanup:done");
|
|
96
|
+
trace("cleanup:done");
|
|
72
97
|
}
|
|
73
98
|
function resolveOnce(value) {
|
|
74
99
|
if (settled) return;
|
|
@@ -93,41 +118,75 @@ async function ask(message, options = {}) {
|
|
|
93
118
|
`audio engine capture read failed: ${err instanceof Error ? err.message : String(err)}`
|
|
94
119
|
)
|
|
95
120
|
);
|
|
121
|
+
trace("audio:capture_read_error", {
|
|
122
|
+
error: err instanceof Error ? err.message : String(err)
|
|
123
|
+
});
|
|
96
124
|
return;
|
|
97
125
|
}
|
|
98
126
|
for (const frame of rawFrames) onMicAudio?.(frame);
|
|
99
127
|
if (!heardAssistantAudio) return;
|
|
100
128
|
for (const frame of processedFrames) {
|
|
129
|
+
const rms = pcm16Rms(frame);
|
|
130
|
+
const minSpeechRms = readEnvInt("AGENT_VOICE_MIN_SPEECH_RMS", 550);
|
|
131
|
+
if (rms >= minSpeechRms) {
|
|
132
|
+
nearEndEvidenceSeen = true;
|
|
133
|
+
nearEndEvidenceAtMs = Date.now();
|
|
134
|
+
trace("audio:near_end_evidence", { rms, minSpeechRms });
|
|
135
|
+
}
|
|
101
136
|
onAudioFrameSent?.(frame);
|
|
102
137
|
session.sendAudio(frame);
|
|
103
138
|
}
|
|
139
|
+
if (processedFrames.length > 0) {
|
|
140
|
+
trace("audio:sent_capture", { frames: processedFrames.length });
|
|
141
|
+
}
|
|
104
142
|
}, 10);
|
|
105
|
-
const session = createRealtimeSession({
|
|
143
|
+
const session = (createSession ?? createRealtimeSession)({
|
|
106
144
|
voice,
|
|
107
145
|
mode: "default",
|
|
108
146
|
ack,
|
|
109
147
|
auth,
|
|
110
148
|
onAudioDelta(pcm16) {
|
|
111
149
|
logEvent("realtime:audio_delta", `bytes=${pcm16.length}`);
|
|
150
|
+
trace("realtime:audio_delta", { bytes: pcm16.length });
|
|
112
151
|
heardAssistantAudio = true;
|
|
113
152
|
lastAssistantAudioAt = Date.now();
|
|
114
153
|
onAssistantAudio?.(pcm16);
|
|
115
154
|
engine.play(pcm16);
|
|
116
155
|
},
|
|
117
156
|
onTranscript(text) {
|
|
118
|
-
const echoGuardMs =
|
|
119
|
-
process.env.AGENT_VOICE_ECHO_GUARD_MS ?? "1500",
|
|
120
|
-
10
|
|
121
|
-
);
|
|
157
|
+
const echoGuardMs = readEnvInt("AGENT_VOICE_ECHO_GUARD_MS", 1500);
|
|
122
158
|
const sinceAssistantMs = Date.now() - lastAssistantAudioAt;
|
|
123
159
|
if (heardAssistantAudio && sinceAssistantMs < echoGuardMs) {
|
|
124
160
|
logEvent(
|
|
125
161
|
"realtime:transcript_ignored_echo_guard",
|
|
126
162
|
`since_assistant_ms=${sinceAssistantMs} text="${text}"`
|
|
127
163
|
);
|
|
164
|
+
trace("realtime:transcript_ignored_echo_guard", {
|
|
165
|
+
sinceAssistantMs,
|
|
166
|
+
text
|
|
167
|
+
});
|
|
128
168
|
return;
|
|
129
169
|
}
|
|
130
170
|
logEvent("realtime:transcript", `text="${text}"`);
|
|
171
|
+
trace("realtime:transcript", { text });
|
|
172
|
+
if (speechDetected) {
|
|
173
|
+
const evidenceWindowMs = readEnvInt(
|
|
174
|
+
"AGENT_VOICE_SPEECH_EVIDENCE_WINDOW_MS",
|
|
175
|
+
1200
|
|
176
|
+
);
|
|
177
|
+
const evidenceAgeMs = nearEndEvidenceSeen ? Math.abs(nearEndEvidenceAtMs - speechStartedAtMs) : Number.POSITIVE_INFINITY;
|
|
178
|
+
if (!nearEndEvidenceSeen || evidenceAgeMs > evidenceWindowMs) {
|
|
179
|
+
trace("realtime:transcript_ignored_no_near_end_evidence", {
|
|
180
|
+
text,
|
|
181
|
+
speechStartedAtMs,
|
|
182
|
+
nearEndEvidenceSeen,
|
|
183
|
+
nearEndEvidenceAtMs,
|
|
184
|
+
evidenceAgeMs,
|
|
185
|
+
evidenceWindowMs
|
|
186
|
+
});
|
|
187
|
+
return;
|
|
188
|
+
}
|
|
189
|
+
}
|
|
131
190
|
if (transcriptTimer) {
|
|
132
191
|
clearTimeout(transcriptTimer);
|
|
133
192
|
transcriptTimer = null;
|
|
@@ -137,7 +196,9 @@ async function ask(message, options = {}) {
|
|
|
137
196
|
},
|
|
138
197
|
onSpeechStarted() {
|
|
139
198
|
logEvent("realtime:speech_started");
|
|
199
|
+
trace("realtime:speech_started");
|
|
140
200
|
speechDetected = true;
|
|
201
|
+
speechStartedAtMs = Date.now();
|
|
141
202
|
if (timeoutTimer) {
|
|
142
203
|
clearTimeout(timeoutTimer);
|
|
143
204
|
timeoutTimer = null;
|
|
@@ -145,6 +206,9 @@ async function ask(message, options = {}) {
|
|
|
145
206
|
if (transcriptTimer) clearTimeout(transcriptTimer);
|
|
146
207
|
transcriptTimer = setTimeout(() => {
|
|
147
208
|
logEvent("timeout:no_transcript_after_speech");
|
|
209
|
+
trace("timeout:no_transcript_after_speech", {
|
|
210
|
+
timeoutSeconds: timeout
|
|
211
|
+
});
|
|
148
212
|
rejectOnce(
|
|
149
213
|
new Error(
|
|
150
214
|
`No transcript received within ${timeout}s after speech started`
|
|
@@ -160,10 +224,12 @@ async function ask(message, options = {}) {
|
|
|
160
224
|
},
|
|
161
225
|
onInitialResponseDone() {
|
|
162
226
|
logEvent("realtime:initial_response_done");
|
|
227
|
+
trace("realtime:initial_response_done");
|
|
163
228
|
initialResponseDone = true;
|
|
164
229
|
timeoutTimer = setTimeout(() => {
|
|
165
230
|
if (!speechDetected) {
|
|
166
231
|
logEvent("timeout:no_speech");
|
|
232
|
+
trace("timeout:no_speech", { timeoutSeconds: timeout });
|
|
167
233
|
rejectOnce(
|
|
168
234
|
new Error(`No speech detected within ${timeout}s timeout`)
|
|
169
235
|
);
|
|
@@ -172,21 +238,26 @@ async function ask(message, options = {}) {
|
|
|
172
238
|
},
|
|
173
239
|
onDone() {
|
|
174
240
|
logEvent("realtime:done");
|
|
241
|
+
trace("realtime:done");
|
|
175
242
|
if (ack) resolveOnce(transcript);
|
|
176
243
|
},
|
|
177
244
|
onError(error) {
|
|
178
245
|
logEvent("realtime:error", error);
|
|
246
|
+
trace("realtime:error", { error });
|
|
179
247
|
rejectOnce(new Error(error));
|
|
180
248
|
}
|
|
181
249
|
});
|
|
182
250
|
session.connect().then(
|
|
183
251
|
() => {
|
|
184
252
|
logEvent("realtime:connected");
|
|
253
|
+
trace("realtime:connected");
|
|
185
254
|
logEvent("realtime:send_message");
|
|
255
|
+
trace("realtime:send_message");
|
|
186
256
|
session.sendMessage(message);
|
|
187
257
|
responseStartTimer = setTimeout(() => {
|
|
188
258
|
if (!heardAssistantAudio) {
|
|
189
259
|
logEvent("timeout:no_assistant_audio");
|
|
260
|
+
trace("timeout:no_assistant_audio");
|
|
190
261
|
rejectOnce(
|
|
191
262
|
new Error("No assistant audio received after sending message")
|
|
192
263
|
);
|
|
@@ -198,6 +269,9 @@ async function ask(message, options = {}) {
|
|
|
198
269
|
"realtime:connect_error",
|
|
199
270
|
err instanceof Error ? err.message : String(err)
|
|
200
271
|
);
|
|
272
|
+
trace("realtime:connect_error", {
|
|
273
|
+
error: err instanceof Error ? err.message : String(err)
|
|
274
|
+
});
|
|
201
275
|
rejectOnce(err instanceof Error ? err : new Error(String(err)));
|
|
202
276
|
}
|
|
203
277
|
);
|
|
@@ -43,6 +43,9 @@ function createRealtimeSession(options) {
|
|
|
43
43
|
const pcm16 = Buffer.from(event.delta, "base64");
|
|
44
44
|
options.onAudioDelta(pcm16);
|
|
45
45
|
});
|
|
46
|
+
rt.on("response.audio.done", () => {
|
|
47
|
+
options.onAudioDone?.();
|
|
48
|
+
});
|
|
46
49
|
rt.on("conversation.item.input_audio_transcription.completed", (event) => {
|
|
47
50
|
options.onTranscript(event.transcript);
|
|
48
51
|
});
|
package/dist/cli.js
CHANGED
|
@@ -12,7 +12,13 @@ import {
|
|
|
12
12
|
} from "./chunk-AHLLYIEW.js";
|
|
13
13
|
|
|
14
14
|
// src/cli.ts
|
|
15
|
-
import {
|
|
15
|
+
import {
|
|
16
|
+
closeSync,
|
|
17
|
+
mkdirSync,
|
|
18
|
+
openSync,
|
|
19
|
+
writeFileSync,
|
|
20
|
+
writeSync
|
|
21
|
+
} from "fs";
|
|
16
22
|
import { join } from "path";
|
|
17
23
|
import { Command } from "commander";
|
|
18
24
|
async function withSuppressedNativeOutput() {
|
|
@@ -22,8 +28,8 @@ async function withSuppressedNativeOutput() {
|
|
|
22
28
|
openSync("/dev/null", "w");
|
|
23
29
|
closeSync(2);
|
|
24
30
|
openSync("/dev/null", "w");
|
|
25
|
-
const { ask } = await import("./ask-GUSXGYSY.js");
|
|
26
|
-
const { say } = await import("./say-W56HCNK4.js");
|
|
31
|
+
const { ask } = await import("./ask-OIE6HL2H.js");
|
|
32
|
+
const { say } = await import("./say-ZVF6EX52.js");
|
|
27
33
|
function writeResult(text) {
|
|
28
34
|
writeSync(savedStdout, `${text}
|
|
29
35
|
`);
|
|
@@ -78,7 +84,10 @@ function writeDebugAudio(dir, assistantChunks, micChunks, modelInputChunks) {
|
|
|
78
84
|
const modelInputFile = join(dir, `ask-${stamp}-model-input.wav`);
|
|
79
85
|
writeFileSync(assistantFile, createWavBuffer(Buffer.concat(assistantChunks)));
|
|
80
86
|
writeFileSync(micFile, createWavBuffer(Buffer.concat(micChunks)));
|
|
81
|
-
writeFileSync(
|
|
87
|
+
writeFileSync(
|
|
88
|
+
modelInputFile,
|
|
89
|
+
createWavBuffer(Buffer.concat(modelInputChunks))
|
|
90
|
+
);
|
|
82
91
|
return { assistantFile, micFile, modelInputFile };
|
|
83
92
|
}
|
|
84
93
|
var program = new Command().name("agent-voice").description("AI agent voice interaction CLI");
|
|
@@ -120,7 +129,10 @@ voicesCmd.command("set <voice>").description("Set the default voice").action((vo
|
|
|
120
129
|
`);
|
|
121
130
|
process.exit(0);
|
|
122
131
|
});
|
|
123
|
-
program.command("ask").description("Speak a message and listen for a response").option("-m, --message <text>", "Text message to speak").option("--voice <name>", "OpenAI voice", defaultVoice).option("--timeout <seconds>", "Seconds to wait for user speech", "120").option("--ack", "Speak an acknowledgment after the user responds").option(
|
|
132
|
+
program.command("ask").description("Speak a message and listen for a response").option("-m, --message <text>", "Text message to speak").option("--voice <name>", "OpenAI voice", defaultVoice).option("--timeout <seconds>", "Seconds to wait for user speech", "120").option("--ack", "Speak an acknowledgment after the user responds").option(
|
|
133
|
+
"--debug-audio-dir <dir>",
|
|
134
|
+
"Write ask audio debug WAVs to this directory"
|
|
135
|
+
).action(async (opts) => {
|
|
124
136
|
const { ask, writeResult, writeError } = await withSuppressedNativeOutput();
|
|
125
137
|
const assistantChunks = [];
|
|
126
138
|
const micChunks = [];
|
package/dist/index.d.ts
CHANGED
|
@@ -5,11 +5,65 @@ type AuthConfig = {
|
|
|
5
5
|
declare function resolveAuth(): AuthConfig;
|
|
6
6
|
declare function resolveVoice(): string;
|
|
7
7
|
|
|
8
|
+
declare const VOICES: readonly ["alloy", "ash", "ballad", "coral", "echo", "fable", "nova", "onyx", "sage", "shimmer", "verse"];
|
|
9
|
+
type Voice = (typeof VOICES)[number];
|
|
10
|
+
declare const DEFAULT_VOICE: Voice;
|
|
11
|
+
type Mode = "default" | "say";
|
|
12
|
+
|
|
13
|
+
type RealtimeSessionOptions = {
|
|
14
|
+
voice: string;
|
|
15
|
+
mode: Mode;
|
|
16
|
+
ack: boolean;
|
|
17
|
+
auth?: AuthConfig;
|
|
18
|
+
onAudioDelta: (pcm16: Buffer) => void;
|
|
19
|
+
onAudioDone?: () => void;
|
|
20
|
+
onTranscript: (text: string) => void;
|
|
21
|
+
onSpeechStarted: () => void;
|
|
22
|
+
onInitialResponseDone: () => void;
|
|
23
|
+
onDone: () => void;
|
|
24
|
+
onError: (error: string) => void;
|
|
25
|
+
};
|
|
26
|
+
type RealtimeSession = {
|
|
27
|
+
connect(): Promise<void>;
|
|
28
|
+
sendMessage(text: string): void;
|
|
29
|
+
sendAudio(pcm16: Buffer): void;
|
|
30
|
+
close(): void;
|
|
31
|
+
};
|
|
32
|
+
|
|
33
|
+
type RustAudioEngine$1 = {
|
|
34
|
+
start(): void;
|
|
35
|
+
stop(): void;
|
|
36
|
+
close(): void;
|
|
37
|
+
play(pcm16: Buffer): void;
|
|
38
|
+
readProcessedCapture(maxFrames?: number): Buffer[];
|
|
39
|
+
readRawCapture(maxFrames?: number): Buffer[];
|
|
40
|
+
setStreamDelayMs(delayMs: number): void;
|
|
41
|
+
getStats(): {
|
|
42
|
+
captureFrames: number;
|
|
43
|
+
processedFrames: number;
|
|
44
|
+
playbackUnderruns: number;
|
|
45
|
+
droppedRawFrames: number;
|
|
46
|
+
droppedProcessedFrames: number;
|
|
47
|
+
};
|
|
48
|
+
};
|
|
8
49
|
type AskOptions = {
|
|
9
50
|
voice?: string;
|
|
10
51
|
timeout?: number;
|
|
11
52
|
ack?: boolean;
|
|
12
53
|
auth?: AuthConfig;
|
|
54
|
+
createSession?: (options: RealtimeSessionOptions) => RealtimeSession;
|
|
55
|
+
createAudioEngine?: (options: {
|
|
56
|
+
sampleRate?: number;
|
|
57
|
+
channels?: number;
|
|
58
|
+
enableAec?: boolean;
|
|
59
|
+
streamDelayMs?: number;
|
|
60
|
+
maxCaptureFrames?: number;
|
|
61
|
+
}) => RustAudioEngine$1;
|
|
62
|
+
onTrace?: (event: {
|
|
63
|
+
atMs: number;
|
|
64
|
+
event: string;
|
|
65
|
+
detail?: Record<string, unknown>;
|
|
66
|
+
}) => void;
|
|
13
67
|
createPlayer?: unknown;
|
|
14
68
|
createRecorder?: unknown;
|
|
15
69
|
onAudioFrameSent?: (pcm16: Buffer) => void;
|
|
@@ -18,15 +72,32 @@ type AskOptions = {
|
|
|
18
72
|
};
|
|
19
73
|
declare function ask(message: string, options?: AskOptions): Promise<string>;
|
|
20
74
|
|
|
75
|
+
type RustAudioEngine = {
|
|
76
|
+
start(): void;
|
|
77
|
+
stop(): void;
|
|
78
|
+
close(): void;
|
|
79
|
+
play(pcm16: Buffer): void;
|
|
80
|
+
getStats?(): {
|
|
81
|
+
pendingPlaybackSamples?: number;
|
|
82
|
+
};
|
|
83
|
+
};
|
|
21
84
|
type SayOptions = {
|
|
22
85
|
voice?: string;
|
|
23
86
|
auth?: AuthConfig;
|
|
87
|
+
createSession?: (options: RealtimeSessionOptions) => RealtimeSession;
|
|
88
|
+
createAudioEngine?: (options: {
|
|
89
|
+
sampleRate?: number;
|
|
90
|
+
channels?: number;
|
|
91
|
+
enableAec?: boolean;
|
|
92
|
+
streamDelayMs?: number;
|
|
93
|
+
}) => RustAudioEngine;
|
|
94
|
+
onTrace?: (event: {
|
|
95
|
+
atMs: number;
|
|
96
|
+
event: string;
|
|
97
|
+
detail?: Record<string, unknown>;
|
|
98
|
+
}) => void;
|
|
24
99
|
createPlayer?: unknown;
|
|
25
100
|
};
|
|
26
101
|
declare function say(message: string, options?: SayOptions): Promise<void>;
|
|
27
102
|
|
|
28
|
-
declare const VOICES: readonly ["alloy", "ash", "ballad", "coral", "echo", "fable", "nova", "onyx", "sage", "shimmer", "verse"];
|
|
29
|
-
type Voice = (typeof VOICES)[number];
|
|
30
|
-
declare const DEFAULT_VOICE: Voice;
|
|
31
|
-
|
|
32
103
|
export { type AskOptions, type AuthConfig, DEFAULT_VOICE, type SayOptions, VOICES, type Voice, ask, resolveAuth, resolveVoice, say };
|
package/dist/index.js
CHANGED
|
@@ -44,6 +44,9 @@ function createRealtimeSession(options) {
|
|
|
44
44
|
const pcm16 = Buffer.from(event.delta, "base64");
|
|
45
45
|
options.onAudioDelta(pcm16);
|
|
46
46
|
});
|
|
47
|
+
rt.on("response.audio.done", () => {
|
|
48
|
+
options.onAudioDone?.();
|
|
49
|
+
});
|
|
47
50
|
rt.on("conversation.item.input_audio_transcription.completed", (event) => {
|
|
48
51
|
options.onTranscript(event.transcript);
|
|
49
52
|
});
|
|
@@ -129,22 +132,38 @@ var DEFAULT_VOICE = "ash";
|
|
|
129
132
|
|
|
130
133
|
// src/ask.ts
|
|
131
134
|
var require2 = createRequire(import.meta.url);
|
|
135
|
+
/**
 * Computes the root-mean-square amplitude of signed 16-bit little-endian PCM audio.
 *
 * Accepts a Node `Buffer` or any other byte-oriented typed array / ArrayBuffer
 * view; samples are read through a `DataView`, so the input does not need the
 * Node-specific `readInt16LE` method and may sit at any offset in its backing
 * `ArrayBuffer`.
 *
 * @param {Uint8Array} pcm16 - Raw PCM bytes, two bytes per sample, little-endian.
 * @returns {number} RMS of the sample values, or 0 when there is no complete sample.
 */
function pcm16Rms(pcm16) {
  // A trailing odd byte cannot form a sample and is ignored.
  const sampleCount = Math.floor(pcm16.byteLength / 2);
  if (sampleCount === 0) return 0;
  // Respect byteOffset: Buffers are frequently slices of a shared pool.
  const view = new DataView(pcm16.buffer, pcm16.byteOffset, sampleCount * 2);
  let sumSquares = 0;
  for (let i = 0; i < sampleCount; i++) {
    const value = view.getInt16(i * 2, true);
    sumSquares += value * value;
  }
  return Math.sqrt(sumSquares / sampleCount);
}
|
|
145
|
+
/**
 * Reads an integer override from an environment variable.
 *
 * The whole (trimmed) value must be a signed decimal integer. Values that only
 * start with digits ("30ms", "1e3", "12.9") are rejected rather than being
 * truncated to their numeric prefix, so a malformed override falls back to the
 * default instead of silently becoming a different number.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value returned when the variable is unset or invalid.
 * @returns {number} The parsed integer, or `fallback`.
 */
function readEnvInt(name, fallback) {
  const raw = process.env[name];
  if (raw == null) return fallback;
  const text = raw.trim();
  if (!/^[+-]?\d+$/.test(text)) return fallback;
  const parsed = Number.parseInt(text, 10);
  // Absurdly long digit strings overflow to Infinity; treat those as invalid too.
  return Number.isFinite(parsed) ? parsed : fallback;
}
|
|
132
151
|
async function ask(message, options = {}) {
|
|
133
152
|
const {
|
|
134
153
|
voice = DEFAULT_VOICE,
|
|
135
154
|
timeout = 30,
|
|
136
155
|
ack = false,
|
|
137
156
|
auth,
|
|
157
|
+
createSession,
|
|
158
|
+
createAudioEngine,
|
|
159
|
+
onTrace,
|
|
138
160
|
onAudioFrameSent,
|
|
139
161
|
onAssistantAudio,
|
|
140
162
|
onMicAudio
|
|
141
163
|
} = options;
|
|
142
164
|
const { AudioEngine } = require2("agent-voice-audio");
|
|
143
|
-
const streamDelayMs =
|
|
144
|
-
|
|
145
|
-
10
|
|
146
|
-
);
|
|
147
|
-
const engine = new AudioEngine({
|
|
165
|
+
const streamDelayMs = readEnvInt("AGENT_VOICE_AEC_STREAM_DELAY_MS", 30);
|
|
166
|
+
const engine = (createAudioEngine ?? ((engineOptions) => new AudioEngine(engineOptions)))({
|
|
148
167
|
sampleRate: SAMPLE_RATE,
|
|
149
168
|
channels: 1,
|
|
150
169
|
enableAec: true,
|
|
@@ -160,7 +179,11 @@ async function ask(message, options = {}) {
|
|
|
160
179
|
process.stderr.write(`[ask ${elapsed}ms] ${event}${suffix}
|
|
161
180
|
`);
|
|
162
181
|
}
|
|
182
|
+
function trace(event, detail) {
|
|
183
|
+
onTrace?.({ atMs: Date.now() - startMs, event, detail });
|
|
184
|
+
}
|
|
163
185
|
logEvent("start");
|
|
186
|
+
trace("start");
|
|
164
187
|
return new Promise((resolve, reject) => {
|
|
165
188
|
let transcript = "";
|
|
166
189
|
let timeoutTimer = null;
|
|
@@ -168,15 +191,19 @@ async function ask(message, options = {}) {
|
|
|
168
191
|
let transcriptTimer = null;
|
|
169
192
|
let capturePollTimer = null;
|
|
170
193
|
let speechDetected = false;
|
|
194
|
+
let speechStartedAtMs = 0;
|
|
171
195
|
let initialResponseDone = false;
|
|
172
196
|
let heardAssistantAudio = false;
|
|
173
197
|
let lastAssistantAudioAt = 0;
|
|
198
|
+
let nearEndEvidenceSeen = false;
|
|
199
|
+
let nearEndEvidenceAtMs = 0;
|
|
174
200
|
let cleaned = false;
|
|
175
201
|
let settled = false;
|
|
176
202
|
async function cleanup() {
|
|
177
203
|
if (cleaned) return;
|
|
178
204
|
cleaned = true;
|
|
179
205
|
logEvent("cleanup:start");
|
|
206
|
+
trace("cleanup:start");
|
|
180
207
|
if (timeoutTimer) clearTimeout(timeoutTimer);
|
|
181
208
|
if (responseStartTimer) clearTimeout(responseStartTimer);
|
|
182
209
|
if (transcriptTimer) clearTimeout(transcriptTimer);
|
|
@@ -188,6 +215,7 @@ async function ask(message, options = {}) {
|
|
|
188
215
|
}
|
|
189
216
|
session.close();
|
|
190
217
|
logEvent("cleanup:done");
|
|
218
|
+
trace("cleanup:done");
|
|
191
219
|
}
|
|
192
220
|
function resolveOnce(value) {
|
|
193
221
|
if (settled) return;
|
|
@@ -212,41 +240,75 @@ async function ask(message, options = {}) {
|
|
|
212
240
|
`audio engine capture read failed: ${err instanceof Error ? err.message : String(err)}`
|
|
213
241
|
)
|
|
214
242
|
);
|
|
243
|
+
trace("audio:capture_read_error", {
|
|
244
|
+
error: err instanceof Error ? err.message : String(err)
|
|
245
|
+
});
|
|
215
246
|
return;
|
|
216
247
|
}
|
|
217
248
|
for (const frame of rawFrames) onMicAudio?.(frame);
|
|
218
249
|
if (!heardAssistantAudio) return;
|
|
219
250
|
for (const frame of processedFrames) {
|
|
251
|
+
const rms = pcm16Rms(frame);
|
|
252
|
+
const minSpeechRms = readEnvInt("AGENT_VOICE_MIN_SPEECH_RMS", 550);
|
|
253
|
+
if (rms >= minSpeechRms) {
|
|
254
|
+
nearEndEvidenceSeen = true;
|
|
255
|
+
nearEndEvidenceAtMs = Date.now();
|
|
256
|
+
trace("audio:near_end_evidence", { rms, minSpeechRms });
|
|
257
|
+
}
|
|
220
258
|
onAudioFrameSent?.(frame);
|
|
221
259
|
session.sendAudio(frame);
|
|
222
260
|
}
|
|
261
|
+
if (processedFrames.length > 0) {
|
|
262
|
+
trace("audio:sent_capture", { frames: processedFrames.length });
|
|
263
|
+
}
|
|
223
264
|
}, 10);
|
|
224
|
-
const session = createRealtimeSession({
|
|
265
|
+
const session = (createSession ?? createRealtimeSession)({
|
|
225
266
|
voice,
|
|
226
267
|
mode: "default",
|
|
227
268
|
ack,
|
|
228
269
|
auth,
|
|
229
270
|
onAudioDelta(pcm16) {
|
|
230
271
|
logEvent("realtime:audio_delta", `bytes=${pcm16.length}`);
|
|
272
|
+
trace("realtime:audio_delta", { bytes: pcm16.length });
|
|
231
273
|
heardAssistantAudio = true;
|
|
232
274
|
lastAssistantAudioAt = Date.now();
|
|
233
275
|
onAssistantAudio?.(pcm16);
|
|
234
276
|
engine.play(pcm16);
|
|
235
277
|
},
|
|
236
278
|
onTranscript(text) {
|
|
237
|
-
const echoGuardMs =
|
|
238
|
-
process.env.AGENT_VOICE_ECHO_GUARD_MS ?? "1500",
|
|
239
|
-
10
|
|
240
|
-
);
|
|
279
|
+
const echoGuardMs = readEnvInt("AGENT_VOICE_ECHO_GUARD_MS", 1500);
|
|
241
280
|
const sinceAssistantMs = Date.now() - lastAssistantAudioAt;
|
|
242
281
|
if (heardAssistantAudio && sinceAssistantMs < echoGuardMs) {
|
|
243
282
|
logEvent(
|
|
244
283
|
"realtime:transcript_ignored_echo_guard",
|
|
245
284
|
`since_assistant_ms=${sinceAssistantMs} text="${text}"`
|
|
246
285
|
);
|
|
286
|
+
trace("realtime:transcript_ignored_echo_guard", {
|
|
287
|
+
sinceAssistantMs,
|
|
288
|
+
text
|
|
289
|
+
});
|
|
247
290
|
return;
|
|
248
291
|
}
|
|
249
292
|
logEvent("realtime:transcript", `text="${text}"`);
|
|
293
|
+
trace("realtime:transcript", { text });
|
|
294
|
+
if (speechDetected) {
|
|
295
|
+
const evidenceWindowMs = readEnvInt(
|
|
296
|
+
"AGENT_VOICE_SPEECH_EVIDENCE_WINDOW_MS",
|
|
297
|
+
1200
|
|
298
|
+
);
|
|
299
|
+
const evidenceAgeMs = nearEndEvidenceSeen ? Math.abs(nearEndEvidenceAtMs - speechStartedAtMs) : Number.POSITIVE_INFINITY;
|
|
300
|
+
if (!nearEndEvidenceSeen || evidenceAgeMs > evidenceWindowMs) {
|
|
301
|
+
trace("realtime:transcript_ignored_no_near_end_evidence", {
|
|
302
|
+
text,
|
|
303
|
+
speechStartedAtMs,
|
|
304
|
+
nearEndEvidenceSeen,
|
|
305
|
+
nearEndEvidenceAtMs,
|
|
306
|
+
evidenceAgeMs,
|
|
307
|
+
evidenceWindowMs
|
|
308
|
+
});
|
|
309
|
+
return;
|
|
310
|
+
}
|
|
311
|
+
}
|
|
250
312
|
if (transcriptTimer) {
|
|
251
313
|
clearTimeout(transcriptTimer);
|
|
252
314
|
transcriptTimer = null;
|
|
@@ -256,7 +318,9 @@ async function ask(message, options = {}) {
|
|
|
256
318
|
},
|
|
257
319
|
onSpeechStarted() {
|
|
258
320
|
logEvent("realtime:speech_started");
|
|
321
|
+
trace("realtime:speech_started");
|
|
259
322
|
speechDetected = true;
|
|
323
|
+
speechStartedAtMs = Date.now();
|
|
260
324
|
if (timeoutTimer) {
|
|
261
325
|
clearTimeout(timeoutTimer);
|
|
262
326
|
timeoutTimer = null;
|
|
@@ -264,6 +328,9 @@ async function ask(message, options = {}) {
|
|
|
264
328
|
if (transcriptTimer) clearTimeout(transcriptTimer);
|
|
265
329
|
transcriptTimer = setTimeout(() => {
|
|
266
330
|
logEvent("timeout:no_transcript_after_speech");
|
|
331
|
+
trace("timeout:no_transcript_after_speech", {
|
|
332
|
+
timeoutSeconds: timeout
|
|
333
|
+
});
|
|
267
334
|
rejectOnce(
|
|
268
335
|
new Error(
|
|
269
336
|
`No transcript received within ${timeout}s after speech started`
|
|
@@ -279,10 +346,12 @@ async function ask(message, options = {}) {
|
|
|
279
346
|
},
|
|
280
347
|
onInitialResponseDone() {
|
|
281
348
|
logEvent("realtime:initial_response_done");
|
|
349
|
+
trace("realtime:initial_response_done");
|
|
282
350
|
initialResponseDone = true;
|
|
283
351
|
timeoutTimer = setTimeout(() => {
|
|
284
352
|
if (!speechDetected) {
|
|
285
353
|
logEvent("timeout:no_speech");
|
|
354
|
+
trace("timeout:no_speech", { timeoutSeconds: timeout });
|
|
286
355
|
rejectOnce(
|
|
287
356
|
new Error(`No speech detected within ${timeout}s timeout`)
|
|
288
357
|
);
|
|
@@ -291,21 +360,26 @@ async function ask(message, options = {}) {
|
|
|
291
360
|
},
|
|
292
361
|
onDone() {
|
|
293
362
|
logEvent("realtime:done");
|
|
363
|
+
trace("realtime:done");
|
|
294
364
|
if (ack) resolveOnce(transcript);
|
|
295
365
|
},
|
|
296
366
|
onError(error) {
|
|
297
367
|
logEvent("realtime:error", error);
|
|
368
|
+
trace("realtime:error", { error });
|
|
298
369
|
rejectOnce(new Error(error));
|
|
299
370
|
}
|
|
300
371
|
});
|
|
301
372
|
session.connect().then(
|
|
302
373
|
() => {
|
|
303
374
|
logEvent("realtime:connected");
|
|
375
|
+
trace("realtime:connected");
|
|
304
376
|
logEvent("realtime:send_message");
|
|
377
|
+
trace("realtime:send_message");
|
|
305
378
|
session.sendMessage(message);
|
|
306
379
|
responseStartTimer = setTimeout(() => {
|
|
307
380
|
if (!heardAssistantAudio) {
|
|
308
381
|
logEvent("timeout:no_assistant_audio");
|
|
382
|
+
trace("timeout:no_assistant_audio");
|
|
309
383
|
rejectOnce(
|
|
310
384
|
new Error("No assistant audio received after sending message")
|
|
311
385
|
);
|
|
@@ -317,6 +391,9 @@ async function ask(message, options = {}) {
|
|
|
317
391
|
"realtime:connect_error",
|
|
318
392
|
err instanceof Error ? err.message : String(err)
|
|
319
393
|
);
|
|
394
|
+
trace("realtime:connect_error", {
|
|
395
|
+
error: err instanceof Error ? err.message : String(err)
|
|
396
|
+
});
|
|
320
397
|
rejectOnce(err instanceof Error ? err : new Error(String(err)));
|
|
321
398
|
}
|
|
322
399
|
);
|
|
@@ -357,50 +434,150 @@ function resolveVoice() {
|
|
|
357
434
|
import { createRequire as createRequire2 } from "module";
|
|
358
435
|
var require3 = createRequire2(import.meta.url);
|
|
359
436
|
async function say(message, options = {}) {
|
|
360
|
-
const {
|
|
437
|
+
const {
|
|
438
|
+
voice = DEFAULT_VOICE,
|
|
439
|
+
auth,
|
|
440
|
+
createSession,
|
|
441
|
+
createAudioEngine,
|
|
442
|
+
onTrace
|
|
443
|
+
} = options;
|
|
361
444
|
const { AudioEngine } = require3("agent-voice-audio");
|
|
362
|
-
const
|
|
445
|
+
const startMs = Date.now();
|
|
446
|
+
function trace(event, detail) {
|
|
447
|
+
onTrace?.({ atMs: Date.now() - startMs, event, detail });
|
|
448
|
+
}
|
|
449
|
+
const engine = (createAudioEngine ?? ((engineOptions) => new AudioEngine(engineOptions)))({
|
|
363
450
|
sampleRate: SAMPLE_RATE,
|
|
364
451
|
channels: 1,
|
|
365
452
|
enableAec: false
|
|
366
453
|
});
|
|
367
454
|
engine.start();
|
|
455
|
+
trace("start");
|
|
368
456
|
return new Promise((resolve, reject) => {
|
|
369
457
|
let cleaned = false;
|
|
458
|
+
let settled = false;
|
|
459
|
+
let responseDoneFallbackTimer = null;
|
|
460
|
+
let completionTailTimer = null;
|
|
461
|
+
let drainPollTimer = null;
|
|
462
|
+
let drainDeadlineTimer = null;
|
|
370
463
|
function cleanup() {
|
|
371
464
|
if (cleaned) return;
|
|
372
465
|
cleaned = true;
|
|
466
|
+
if (responseDoneFallbackTimer) clearTimeout(responseDoneFallbackTimer);
|
|
467
|
+
if (completionTailTimer) clearTimeout(completionTailTimer);
|
|
468
|
+
if (drainPollTimer) clearInterval(drainPollTimer);
|
|
469
|
+
if (drainDeadlineTimer) clearTimeout(drainDeadlineTimer);
|
|
373
470
|
try {
|
|
374
471
|
engine.stop();
|
|
375
472
|
engine.close();
|
|
376
473
|
} catch {
|
|
377
474
|
}
|
|
378
475
|
session.close();
|
|
476
|
+
trace("cleanup");
|
|
379
477
|
}
|
|
380
|
-
|
|
478
|
+
function resolveOnce() {
|
|
479
|
+
if (settled) return;
|
|
480
|
+
settled = true;
|
|
481
|
+
cleanup();
|
|
482
|
+
resolve();
|
|
483
|
+
}
|
|
484
|
+
function rejectOnce(error) {
|
|
485
|
+
if (settled) return;
|
|
486
|
+
settled = true;
|
|
487
|
+
cleanup();
|
|
488
|
+
reject(error);
|
|
489
|
+
}
|
|
490
|
+
function waitForPlaybackDrain() {
|
|
491
|
+
if (settled) return;
|
|
492
|
+
if (!engine.getStats) {
|
|
493
|
+
trace("drain:no_stats");
|
|
494
|
+
resolveOnce();
|
|
495
|
+
return;
|
|
496
|
+
}
|
|
497
|
+
const absoluteDeadlineMs = 2e4;
|
|
498
|
+
const maxNoProgressMs = 1200;
|
|
499
|
+
const drainStartMs = Date.now();
|
|
500
|
+
let lastProgressAtMs = drainStartMs;
|
|
501
|
+
let lastPending = Number.POSITIVE_INFINITY;
|
|
502
|
+
trace("drain:deadline_scheduled", {
|
|
503
|
+
absoluteDeadlineMs,
|
|
504
|
+
maxNoProgressMs
|
|
505
|
+
});
|
|
506
|
+
let zeroStreak = 0;
|
|
507
|
+
drainPollTimer = setInterval(() => {
|
|
508
|
+
if (settled) return;
|
|
509
|
+
let pending = 0;
|
|
510
|
+
try {
|
|
511
|
+
pending = Number(engine.getStats?.().pendingPlaybackSamples ?? 0);
|
|
512
|
+
} catch {
|
|
513
|
+
pending = 0;
|
|
514
|
+
}
|
|
515
|
+
trace("drain:poll", { pendingPlaybackSamples: pending });
|
|
516
|
+
if (pending < lastPending) {
|
|
517
|
+
lastPending = pending;
|
|
518
|
+
lastProgressAtMs = Date.now();
|
|
519
|
+
}
|
|
520
|
+
if (pending <= 0) {
|
|
521
|
+
zeroStreak += 1;
|
|
522
|
+
if (zeroStreak >= 3) {
|
|
523
|
+
resolveOnce();
|
|
524
|
+
}
|
|
525
|
+
return;
|
|
526
|
+
}
|
|
527
|
+
zeroStreak = 0;
|
|
528
|
+
if (Date.now() - lastProgressAtMs > maxNoProgressMs) {
|
|
529
|
+
trace("drain:no_progress_timeout", {
|
|
530
|
+
pendingPlaybackSamples: pending
|
|
531
|
+
});
|
|
532
|
+
resolveOnce();
|
|
533
|
+
}
|
|
534
|
+
}, 20);
|
|
535
|
+
drainDeadlineTimer = setTimeout(() => {
|
|
536
|
+
trace("drain:deadline");
|
|
537
|
+
resolveOnce();
|
|
538
|
+
}, absoluteDeadlineMs);
|
|
539
|
+
}
|
|
540
|
+
function scheduleTailResolve(delayMs) {
|
|
541
|
+
if (settled) return;
|
|
542
|
+
if (completionTailTimer) clearTimeout(completionTailTimer);
|
|
543
|
+
completionTailTimer = setTimeout(() => {
|
|
544
|
+
waitForPlaybackDrain();
|
|
545
|
+
}, delayMs);
|
|
546
|
+
trace("tail_scheduled", { delayMs });
|
|
547
|
+
}
|
|
548
|
+
// Build the session in "say" mode using the injected factory when one is
// provided, otherwise the bundled realtime implementation.
const openSession = createSession ?? createRealtimeSession;
const session = openSession({
  voice,
  mode: "say",
  ack: false,
  auth,
  // Feed each received PCM16 chunk straight to the audio engine.
  onAudioDelta: (pcm16) => {
    engine.play(pcm16);
    trace("realtime:audio_delta", { bytes: pcm16.length });
  },
  // Audio stream finished: give playback a short tail, then start draining.
  onAudioDone: () => {
    scheduleTailResolve(140);
    trace("realtime:audio_done");
  },
  onTranscript: () => {
  },
  onSpeechStarted: () => {
  },
  // Fallback path: 700 ms after the response completes, schedule the tail
  // with a 220 ms delay (re-arming any tail that is still pending).
  onInitialResponseDone: () => {
    if (responseDoneFallbackTimer) {
      clearTimeout(responseDoneFallbackTimer);
    }
    responseDoneFallbackTimer = setTimeout(() => scheduleTailResolve(220), 700);
    trace("realtime:initial_response_done");
  },
  onDone: () => {
  },
  onError: (error) => {
    trace("realtime:error", { error });
    rejectOnce(new Error(error));
  }
});
|
|
403
579
|
// Connect, then hand the message to the session.
// Both a failed connect() and an exception thrown while sending are routed
// through rejectOnce instead of the raw `reject`, so the failure settles the
// promise exactly once via the same path onError uses.
// NOTE(review): rejectOnce is defined earlier in this function (outside this
// view); presumably it also runs cleanup, as in the standalone say module.
session.connect().then(() => {
  trace("realtime:connected");
  session.sendMessage(message);
}).catch(rejectOnce);
|
|
406
583
|
});
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import {
|
|
3
|
+
createRealtimeSession
|
|
4
|
+
} from "./chunk-UYBFONQE.js";
|
|
5
|
+
import {
|
|
6
|
+
DEFAULT_VOICE,
|
|
7
|
+
SAMPLE_RATE
|
|
8
|
+
} from "./chunk-AHLLYIEW.js";
|
|
9
|
+
|
|
10
|
+
// src/say.ts
|
|
11
|
+
import { createRequire } from "module";
|
|
12
|
+
var require2 = createRequire(import.meta.url);
|
|
13
|
+
/**
 * Speaks `message` through a realtime session and resolves once the audio that
 * was received has finished playing (or a drain timeout is hit).
 *
 * @param {*} message - Passed to `session.sendMessage` after the session connects.
 * @param {object} [options]
 * @param {*} [options.voice=DEFAULT_VOICE] - Forwarded to the session factory.
 * @param {*} [options.auth] - Forwarded unchanged to the session factory.
 * @param {Function} [options.createSession] - Session factory; defaults to
 *   `createRealtimeSession`.
 * @param {Function} [options.createAudioEngine] - Audio engine factory; defaults
 *   to constructing `AudioEngine` from "agent-voice-audio".
 * @param {Function} [options.onTrace] - Called with `{ atMs, event, detail }`
 *   for every trace point; `atMs` is relative to the start of this call.
 * @returns {Promise<void>} Resolves after playback drains; rejects with the
 *   session error, a connect failure, or an exception thrown while sending.
 */
async function say(message, options = {}) {
  const {
    voice = DEFAULT_VOICE,
    auth,
    createSession,
    createAudioEngine,
    onTrace
  } = options;
  const startMs = Date.now();
  function trace(event, detail) {
    onTrace?.({ atMs: Date.now() - startMs, event, detail });
  }
  // Load the audio package only when it is actually needed, so callers that
  // inject their own engine factory do not require it to be installed.
  const makeEngine = createAudioEngine ?? ((engineOptions) => {
    const { AudioEngine } = require2("agent-voice-audio");
    return new AudioEngine(engineOptions);
  });
  const engine = makeEngine({
    sampleRate: SAMPLE_RATE,
    channels: 1,
    enableAec: false
  });
  engine.start();
  trace("start");
  return new Promise((resolve, reject) => {
    let cleaned = false;
    let settled = false;
    let responseDoneFallbackTimer = null;
    let completionTailTimer = null;
    let drainPollTimer = null;
    let drainDeadlineTimer = null;

    // Releases every timer, the audio engine and the session. Idempotent.
    function cleanup() {
      if (cleaned) return;
      cleaned = true;
      if (responseDoneFallbackTimer) clearTimeout(responseDoneFallbackTimer);
      if (completionTailTimer) clearTimeout(completionTailTimer);
      if (drainPollTimer) clearInterval(drainPollTimer);
      if (drainDeadlineTimer) clearTimeout(drainDeadlineTimer);
      // Best-effort teardown: stop and close are attempted independently so a
      // failing stop() cannot prevent close() from releasing the engine.
      try {
        engine.stop();
      } catch {
      }
      try {
        engine.close();
      } catch {
      }
      session.close();
      trace("cleanup");
    }

    // Settle helpers: only the first call has any effect.
    function resolveOnce() {
      if (settled) return;
      settled = true;
      cleanup();
      resolve();
    }
    function rejectOnce(error) {
      if (settled) return;
      settled = true;
      cleanup();
      reject(error);
    }

    // Polls the engine until queued audio has played out, then resolves.
    // Ends when pending samples are <= 0 on three consecutive 20 ms polls,
    // when no progress is seen for maxNoProgressMs, or at absoluteDeadlineMs.
    function waitForPlaybackDrain() {
      if (settled) return;
      // scheduleTailResolve() can get here twice (onAudioDone and the
      // onInitialResponseDone fallback). A second drain would overwrite the
      // handles below and orphan the first interval, which cleanup() could
      // then never clear, keeping the event loop alive.
      if (drainPollTimer || drainDeadlineTimer) return;
      if (!engine.getStats) {
        trace("drain:no_stats");
        resolveOnce();
        return;
      }
      const absoluteDeadlineMs = 2e4;
      const maxNoProgressMs = 1200;
      const drainStartMs = Date.now();
      let lastProgressAtMs = drainStartMs;
      // Infinity guarantees the first finite reading counts as progress.
      let lastPending = Number.POSITIVE_INFINITY;
      trace("drain:deadline_scheduled", {
        absoluteDeadlineMs,
        maxNoProgressMs
      });
      let zeroStreak = 0;
      drainPollTimer = setInterval(() => {
        if (settled) return;
        let pending = 0;
        try {
          pending = Number(engine.getStats?.().pendingPlaybackSamples ?? 0);
        } catch {
          // A failing stats call is treated as "nothing pending".
          pending = 0;
        }
        trace("drain:poll", { pendingPlaybackSamples: pending });
        if (pending < lastPending) {
          lastPending = pending;
          lastProgressAtMs = Date.now();
        }
        if (pending <= 0) {
          zeroStreak += 1;
          if (zeroStreak >= 3) {
            resolveOnce();
          }
          return;
        }
        zeroStreak = 0;
        if (Date.now() - lastProgressAtMs > maxNoProgressMs) {
          trace("drain:no_progress_timeout", {
            pendingPlaybackSamples: pending
          });
          resolveOnce();
        }
      }, 20);
      drainDeadlineTimer = setTimeout(() => {
        trace("drain:deadline");
        resolveOnce();
      }, absoluteDeadlineMs);
    }

    // (Re)arms the tail timer; the drain starts delayMs after the latest request.
    function scheduleTailResolve(delayMs) {
      if (settled) return;
      if (completionTailTimer) clearTimeout(completionTailTimer);
      completionTailTimer = setTimeout(() => {
        waitForPlaybackDrain();
      }, delayMs);
      trace("tail_scheduled", { delayMs });
    }

    const session = (createSession ?? createRealtimeSession)({
      voice,
      mode: "say",
      ack: false,
      auth,
      onAudioDelta(pcm16) {
        engine.play(pcm16);
        trace("realtime:audio_delta", { bytes: pcm16.length });
      },
      onAudioDone() {
        scheduleTailResolve(140);
        trace("realtime:audio_done");
      },
      onTranscript() {
      },
      onSpeechStarted() {
      },
      onInitialResponseDone() {
        // Fallback in case onAudioDone never arrives: schedule the tail 700 ms
        // after the response completes.
        if (responseDoneFallbackTimer) clearTimeout(responseDoneFallbackTimer);
        responseDoneFallbackTimer = setTimeout(() => {
          scheduleTailResolve(220);
        }, 700);
        trace("realtime:initial_response_done");
      },
      onDone() {
      },
      onError(error) {
        trace("realtime:error", { error });
        rejectOnce(new Error(error));
      }
    });

    // Connect failures and exceptions thrown while sending both go through
    // rejectOnce, so the engine, timers and session are always released and
    // the returned promise can never be left pending.
    session.connect().then(() => {
      trace("realtime:connected");
      session.sendMessage(message);
    }).catch(rejectOnce);
  });
}
|
|
162
|
+
export {
|
|
163
|
+
say
|
|
164
|
+
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agent-voice",
|
|
3
|
-
"version": "0.2.
|
|
3
|
+
"version": "0.2.2",
|
|
4
4
|
"description": "CLI for AI agents to interact with humans via voice",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
"dist"
|
|
19
19
|
],
|
|
20
20
|
"dependencies": {
|
|
21
|
-
"agent-voice-audio": "^0.2.
|
|
21
|
+
"agent-voice-audio": "^0.2.1",
|
|
22
22
|
"@inquirer/prompts": "^8.2.0",
|
|
23
23
|
"commander": "^13.1.0",
|
|
24
24
|
"openai": "^4.96.0",
|
package/dist/say-W56HCNK4.js
DELETED
|
@@ -1,64 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
import {
|
|
3
|
-
createRealtimeSession
|
|
4
|
-
} from "./chunk-VV2VNOC4.js";
|
|
5
|
-
import {
|
|
6
|
-
DEFAULT_VOICE,
|
|
7
|
-
SAMPLE_RATE
|
|
8
|
-
} from "./chunk-AHLLYIEW.js";
|
|
9
|
-
|
|
10
|
-
// src/say.ts
|
|
11
|
-
import { createRequire } from "module";
|
|
12
|
-
var require2 = createRequire(import.meta.url);
|
|
13
|
-
async function say(message, options = {}) {
|
|
14
|
-
const { voice = DEFAULT_VOICE, auth } = options;
|
|
15
|
-
const { AudioEngine } = require2("agent-voice-audio");
|
|
16
|
-
const engine = new AudioEngine({
|
|
17
|
-
sampleRate: SAMPLE_RATE,
|
|
18
|
-
channels: 1,
|
|
19
|
-
enableAec: false
|
|
20
|
-
});
|
|
21
|
-
engine.start();
|
|
22
|
-
return new Promise((resolve, reject) => {
|
|
23
|
-
let cleaned = false;
|
|
24
|
-
function cleanup() {
|
|
25
|
-
if (cleaned) return;
|
|
26
|
-
cleaned = true;
|
|
27
|
-
try {
|
|
28
|
-
engine.stop();
|
|
29
|
-
engine.close();
|
|
30
|
-
} catch {
|
|
31
|
-
}
|
|
32
|
-
session.close();
|
|
33
|
-
}
|
|
34
|
-
const session = createRealtimeSession({
|
|
35
|
-
voice,
|
|
36
|
-
mode: "say",
|
|
37
|
-
ack: false,
|
|
38
|
-
auth,
|
|
39
|
-
onAudioDelta(pcm16) {
|
|
40
|
-
engine.play(pcm16);
|
|
41
|
-
},
|
|
42
|
-
onTranscript() {
|
|
43
|
-
},
|
|
44
|
-
onSpeechStarted() {
|
|
45
|
-
},
|
|
46
|
-
onInitialResponseDone() {
|
|
47
|
-
cleanup();
|
|
48
|
-
resolve();
|
|
49
|
-
},
|
|
50
|
-
onDone() {
|
|
51
|
-
},
|
|
52
|
-
onError(error) {
|
|
53
|
-
cleanup();
|
|
54
|
-
reject(new Error(error));
|
|
55
|
-
}
|
|
56
|
-
});
|
|
57
|
-
session.connect().then(() => {
|
|
58
|
-
session.sendMessage(message);
|
|
59
|
-
}, reject);
|
|
60
|
-
});
|
|
61
|
-
}
|
|
62
|
-
export {
|
|
63
|
-
say
|
|
64
|
-
};
|