agent-voice 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{ask-A32EH5QX.js → ask-OIE6HL2H.js} +84 -10
- package/dist/cli.js +17 -5
- package/dist/index.d.ts +75 -4
- package/dist/index.js +160 -14
- package/dist/say-ZVF6EX52.js +164 -0
- package/package.json +1 -1
- package/dist/say-ELJAIWUM.js +0 -92
|
@@ -10,22 +10,38 @@ import {
|
|
|
10
10
|
// src/ask.ts
|
|
11
11
|
import { createRequire } from "module";
|
|
12
12
|
var require2 = createRequire(import.meta.url);
|
|
13
|
+
function pcm16Rms(pcm16) {
|
|
14
|
+
const samples = Math.floor(pcm16.length / 2);
|
|
15
|
+
if (samples === 0) return 0;
|
|
16
|
+
let sumSquares = 0;
|
|
17
|
+
for (let i = 0; i < samples; i++) {
|
|
18
|
+
const value = pcm16.readInt16LE(i * 2);
|
|
19
|
+
sumSquares += value * value;
|
|
20
|
+
}
|
|
21
|
+
return Math.sqrt(sumSquares / samples);
|
|
22
|
+
}
|
|
23
|
+
function readEnvInt(name, fallback) {
|
|
24
|
+
const raw = process.env[name];
|
|
25
|
+
if (raw == null) return fallback;
|
|
26
|
+
const parsed = Number.parseInt(raw, 10);
|
|
27
|
+
return Number.isFinite(parsed) ? parsed : fallback;
|
|
28
|
+
}
|
|
13
29
|
async function ask(message, options = {}) {
|
|
14
30
|
const {
|
|
15
31
|
voice = DEFAULT_VOICE,
|
|
16
32
|
timeout = 30,
|
|
17
33
|
ack = false,
|
|
18
34
|
auth,
|
|
35
|
+
createSession,
|
|
36
|
+
createAudioEngine,
|
|
37
|
+
onTrace,
|
|
19
38
|
onAudioFrameSent,
|
|
20
39
|
onAssistantAudio,
|
|
21
40
|
onMicAudio
|
|
22
41
|
} = options;
|
|
23
42
|
const { AudioEngine } = require2("agent-voice-audio");
|
|
24
|
-
const streamDelayMs =
|
|
25
|
-
|
|
26
|
-
10
|
|
27
|
-
);
|
|
28
|
-
const engine = new AudioEngine({
|
|
43
|
+
const streamDelayMs = readEnvInt("AGENT_VOICE_AEC_STREAM_DELAY_MS", 30);
|
|
44
|
+
const engine = (createAudioEngine ?? ((engineOptions) => new AudioEngine(engineOptions)))({
|
|
29
45
|
sampleRate: SAMPLE_RATE,
|
|
30
46
|
channels: 1,
|
|
31
47
|
enableAec: true,
|
|
@@ -41,7 +57,11 @@ async function ask(message, options = {}) {
|
|
|
41
57
|
process.stderr.write(`[ask ${elapsed}ms] ${event}${suffix}
|
|
42
58
|
`);
|
|
43
59
|
}
|
|
60
|
+
function trace(event, detail) {
|
|
61
|
+
onTrace?.({ atMs: Date.now() - startMs, event, detail });
|
|
62
|
+
}
|
|
44
63
|
logEvent("start");
|
|
64
|
+
trace("start");
|
|
45
65
|
return new Promise((resolve, reject) => {
|
|
46
66
|
let transcript = "";
|
|
47
67
|
let timeoutTimer = null;
|
|
@@ -49,15 +69,19 @@ async function ask(message, options = {}) {
|
|
|
49
69
|
let transcriptTimer = null;
|
|
50
70
|
let capturePollTimer = null;
|
|
51
71
|
let speechDetected = false;
|
|
72
|
+
let speechStartedAtMs = 0;
|
|
52
73
|
let initialResponseDone = false;
|
|
53
74
|
let heardAssistantAudio = false;
|
|
54
75
|
let lastAssistantAudioAt = 0;
|
|
76
|
+
let nearEndEvidenceSeen = false;
|
|
77
|
+
let nearEndEvidenceAtMs = 0;
|
|
55
78
|
let cleaned = false;
|
|
56
79
|
let settled = false;
|
|
57
80
|
async function cleanup() {
|
|
58
81
|
if (cleaned) return;
|
|
59
82
|
cleaned = true;
|
|
60
83
|
logEvent("cleanup:start");
|
|
84
|
+
trace("cleanup:start");
|
|
61
85
|
if (timeoutTimer) clearTimeout(timeoutTimer);
|
|
62
86
|
if (responseStartTimer) clearTimeout(responseStartTimer);
|
|
63
87
|
if (transcriptTimer) clearTimeout(transcriptTimer);
|
|
@@ -69,6 +93,7 @@ async function ask(message, options = {}) {
|
|
|
69
93
|
}
|
|
70
94
|
session.close();
|
|
71
95
|
logEvent("cleanup:done");
|
|
96
|
+
trace("cleanup:done");
|
|
72
97
|
}
|
|
73
98
|
function resolveOnce(value) {
|
|
74
99
|
if (settled) return;
|
|
@@ -93,41 +118,75 @@ async function ask(message, options = {}) {
|
|
|
93
118
|
`audio engine capture read failed: ${err instanceof Error ? err.message : String(err)}`
|
|
94
119
|
)
|
|
95
120
|
);
|
|
121
|
+
trace("audio:capture_read_error", {
|
|
122
|
+
error: err instanceof Error ? err.message : String(err)
|
|
123
|
+
});
|
|
96
124
|
return;
|
|
97
125
|
}
|
|
98
126
|
for (const frame of rawFrames) onMicAudio?.(frame);
|
|
99
127
|
if (!heardAssistantAudio) return;
|
|
100
128
|
for (const frame of processedFrames) {
|
|
129
|
+
const rms = pcm16Rms(frame);
|
|
130
|
+
const minSpeechRms = readEnvInt("AGENT_VOICE_MIN_SPEECH_RMS", 550);
|
|
131
|
+
if (rms >= minSpeechRms) {
|
|
132
|
+
nearEndEvidenceSeen = true;
|
|
133
|
+
nearEndEvidenceAtMs = Date.now();
|
|
134
|
+
trace("audio:near_end_evidence", { rms, minSpeechRms });
|
|
135
|
+
}
|
|
101
136
|
onAudioFrameSent?.(frame);
|
|
102
137
|
session.sendAudio(frame);
|
|
103
138
|
}
|
|
139
|
+
if (processedFrames.length > 0) {
|
|
140
|
+
trace("audio:sent_capture", { frames: processedFrames.length });
|
|
141
|
+
}
|
|
104
142
|
}, 10);
|
|
105
|
-
const session = createRealtimeSession({
|
|
143
|
+
const session = (createSession ?? createRealtimeSession)({
|
|
106
144
|
voice,
|
|
107
145
|
mode: "default",
|
|
108
146
|
ack,
|
|
109
147
|
auth,
|
|
110
148
|
onAudioDelta(pcm16) {
|
|
111
149
|
logEvent("realtime:audio_delta", `bytes=${pcm16.length}`);
|
|
150
|
+
trace("realtime:audio_delta", { bytes: pcm16.length });
|
|
112
151
|
heardAssistantAudio = true;
|
|
113
152
|
lastAssistantAudioAt = Date.now();
|
|
114
153
|
onAssistantAudio?.(pcm16);
|
|
115
154
|
engine.play(pcm16);
|
|
116
155
|
},
|
|
117
156
|
onTranscript(text) {
|
|
118
|
-
const echoGuardMs =
|
|
119
|
-
process.env.AGENT_VOICE_ECHO_GUARD_MS ?? "1500",
|
|
120
|
-
10
|
|
121
|
-
);
|
|
157
|
+
const echoGuardMs = readEnvInt("AGENT_VOICE_ECHO_GUARD_MS", 1500);
|
|
122
158
|
const sinceAssistantMs = Date.now() - lastAssistantAudioAt;
|
|
123
159
|
if (heardAssistantAudio && sinceAssistantMs < echoGuardMs) {
|
|
124
160
|
logEvent(
|
|
125
161
|
"realtime:transcript_ignored_echo_guard",
|
|
126
162
|
`since_assistant_ms=${sinceAssistantMs} text="${text}"`
|
|
127
163
|
);
|
|
164
|
+
trace("realtime:transcript_ignored_echo_guard", {
|
|
165
|
+
sinceAssistantMs,
|
|
166
|
+
text
|
|
167
|
+
});
|
|
128
168
|
return;
|
|
129
169
|
}
|
|
130
170
|
logEvent("realtime:transcript", `text="${text}"`);
|
|
171
|
+
trace("realtime:transcript", { text });
|
|
172
|
+
if (speechDetected) {
|
|
173
|
+
const evidenceWindowMs = readEnvInt(
|
|
174
|
+
"AGENT_VOICE_SPEECH_EVIDENCE_WINDOW_MS",
|
|
175
|
+
1200
|
|
176
|
+
);
|
|
177
|
+
const evidenceAgeMs = nearEndEvidenceSeen ? Math.abs(nearEndEvidenceAtMs - speechStartedAtMs) : Number.POSITIVE_INFINITY;
|
|
178
|
+
if (!nearEndEvidenceSeen || evidenceAgeMs > evidenceWindowMs) {
|
|
179
|
+
trace("realtime:transcript_ignored_no_near_end_evidence", {
|
|
180
|
+
text,
|
|
181
|
+
speechStartedAtMs,
|
|
182
|
+
nearEndEvidenceSeen,
|
|
183
|
+
nearEndEvidenceAtMs,
|
|
184
|
+
evidenceAgeMs,
|
|
185
|
+
evidenceWindowMs
|
|
186
|
+
});
|
|
187
|
+
return;
|
|
188
|
+
}
|
|
189
|
+
}
|
|
131
190
|
if (transcriptTimer) {
|
|
132
191
|
clearTimeout(transcriptTimer);
|
|
133
192
|
transcriptTimer = null;
|
|
@@ -137,7 +196,9 @@ async function ask(message, options = {}) {
|
|
|
137
196
|
},
|
|
138
197
|
onSpeechStarted() {
|
|
139
198
|
logEvent("realtime:speech_started");
|
|
199
|
+
trace("realtime:speech_started");
|
|
140
200
|
speechDetected = true;
|
|
201
|
+
speechStartedAtMs = Date.now();
|
|
141
202
|
if (timeoutTimer) {
|
|
142
203
|
clearTimeout(timeoutTimer);
|
|
143
204
|
timeoutTimer = null;
|
|
@@ -145,6 +206,9 @@ async function ask(message, options = {}) {
|
|
|
145
206
|
if (transcriptTimer) clearTimeout(transcriptTimer);
|
|
146
207
|
transcriptTimer = setTimeout(() => {
|
|
147
208
|
logEvent("timeout:no_transcript_after_speech");
|
|
209
|
+
trace("timeout:no_transcript_after_speech", {
|
|
210
|
+
timeoutSeconds: timeout
|
|
211
|
+
});
|
|
148
212
|
rejectOnce(
|
|
149
213
|
new Error(
|
|
150
214
|
`No transcript received within ${timeout}s after speech started`
|
|
@@ -160,10 +224,12 @@ async function ask(message, options = {}) {
|
|
|
160
224
|
},
|
|
161
225
|
onInitialResponseDone() {
|
|
162
226
|
logEvent("realtime:initial_response_done");
|
|
227
|
+
trace("realtime:initial_response_done");
|
|
163
228
|
initialResponseDone = true;
|
|
164
229
|
timeoutTimer = setTimeout(() => {
|
|
165
230
|
if (!speechDetected) {
|
|
166
231
|
logEvent("timeout:no_speech");
|
|
232
|
+
trace("timeout:no_speech", { timeoutSeconds: timeout });
|
|
167
233
|
rejectOnce(
|
|
168
234
|
new Error(`No speech detected within ${timeout}s timeout`)
|
|
169
235
|
);
|
|
@@ -172,21 +238,26 @@ async function ask(message, options = {}) {
|
|
|
172
238
|
},
|
|
173
239
|
onDone() {
|
|
174
240
|
logEvent("realtime:done");
|
|
241
|
+
trace("realtime:done");
|
|
175
242
|
if (ack) resolveOnce(transcript);
|
|
176
243
|
},
|
|
177
244
|
onError(error) {
|
|
178
245
|
logEvent("realtime:error", error);
|
|
246
|
+
trace("realtime:error", { error });
|
|
179
247
|
rejectOnce(new Error(error));
|
|
180
248
|
}
|
|
181
249
|
});
|
|
182
250
|
session.connect().then(
|
|
183
251
|
() => {
|
|
184
252
|
logEvent("realtime:connected");
|
|
253
|
+
trace("realtime:connected");
|
|
185
254
|
logEvent("realtime:send_message");
|
|
255
|
+
trace("realtime:send_message");
|
|
186
256
|
session.sendMessage(message);
|
|
187
257
|
responseStartTimer = setTimeout(() => {
|
|
188
258
|
if (!heardAssistantAudio) {
|
|
189
259
|
logEvent("timeout:no_assistant_audio");
|
|
260
|
+
trace("timeout:no_assistant_audio");
|
|
190
261
|
rejectOnce(
|
|
191
262
|
new Error("No assistant audio received after sending message")
|
|
192
263
|
);
|
|
@@ -198,6 +269,9 @@ async function ask(message, options = {}) {
|
|
|
198
269
|
"realtime:connect_error",
|
|
199
270
|
err instanceof Error ? err.message : String(err)
|
|
200
271
|
);
|
|
272
|
+
trace("realtime:connect_error", {
|
|
273
|
+
error: err instanceof Error ? err.message : String(err)
|
|
274
|
+
});
|
|
201
275
|
rejectOnce(err instanceof Error ? err : new Error(String(err)));
|
|
202
276
|
}
|
|
203
277
|
);
|
package/dist/cli.js
CHANGED
|
@@ -12,7 +12,13 @@ import {
|
|
|
12
12
|
} from "./chunk-AHLLYIEW.js";
|
|
13
13
|
|
|
14
14
|
// src/cli.ts
|
|
15
|
-
import {
|
|
15
|
+
import {
|
|
16
|
+
closeSync,
|
|
17
|
+
mkdirSync,
|
|
18
|
+
openSync,
|
|
19
|
+
writeFileSync,
|
|
20
|
+
writeSync
|
|
21
|
+
} from "fs";
|
|
16
22
|
import { join } from "path";
|
|
17
23
|
import { Command } from "commander";
|
|
18
24
|
async function withSuppressedNativeOutput() {
|
|
@@ -22,8 +28,8 @@ async function withSuppressedNativeOutput() {
|
|
|
22
28
|
openSync("/dev/null", "w");
|
|
23
29
|
closeSync(2);
|
|
24
30
|
openSync("/dev/null", "w");
|
|
25
|
-
const { ask } = await import("./ask-
|
|
26
|
-
const { say } = await import("./say-
|
|
31
|
+
const { ask } = await import("./ask-OIE6HL2H.js");
|
|
32
|
+
const { say } = await import("./say-ZVF6EX52.js");
|
|
27
33
|
function writeResult(text) {
|
|
28
34
|
writeSync(savedStdout, `${text}
|
|
29
35
|
`);
|
|
@@ -78,7 +84,10 @@ function writeDebugAudio(dir, assistantChunks, micChunks, modelInputChunks) {
|
|
|
78
84
|
const modelInputFile = join(dir, `ask-${stamp}-model-input.wav`);
|
|
79
85
|
writeFileSync(assistantFile, createWavBuffer(Buffer.concat(assistantChunks)));
|
|
80
86
|
writeFileSync(micFile, createWavBuffer(Buffer.concat(micChunks)));
|
|
81
|
-
writeFileSync(
|
|
87
|
+
writeFileSync(
|
|
88
|
+
modelInputFile,
|
|
89
|
+
createWavBuffer(Buffer.concat(modelInputChunks))
|
|
90
|
+
);
|
|
82
91
|
return { assistantFile, micFile, modelInputFile };
|
|
83
92
|
}
|
|
84
93
|
var program = new Command().name("agent-voice").description("AI agent voice interaction CLI");
|
|
@@ -120,7 +129,10 @@ voicesCmd.command("set <voice>").description("Set the default voice").action((vo
|
|
|
120
129
|
`);
|
|
121
130
|
process.exit(0);
|
|
122
131
|
});
|
|
123
|
-
program.command("ask").description("Speak a message and listen for a response").option("-m, --message <text>", "Text message to speak").option("--voice <name>", "OpenAI voice", defaultVoice).option("--timeout <seconds>", "Seconds to wait for user speech", "120").option("--ack", "Speak an acknowledgment after the user responds").option(
|
|
132
|
+
program.command("ask").description("Speak a message and listen for a response").option("-m, --message <text>", "Text message to speak").option("--voice <name>", "OpenAI voice", defaultVoice).option("--timeout <seconds>", "Seconds to wait for user speech", "120").option("--ack", "Speak an acknowledgment after the user responds").option(
|
|
133
|
+
"--debug-audio-dir <dir>",
|
|
134
|
+
"Write ask audio debug WAVs to this directory"
|
|
135
|
+
).action(async (opts) => {
|
|
124
136
|
const { ask, writeResult, writeError } = await withSuppressedNativeOutput();
|
|
125
137
|
const assistantChunks = [];
|
|
126
138
|
const micChunks = [];
|
package/dist/index.d.ts
CHANGED
|
@@ -5,11 +5,65 @@ type AuthConfig = {
|
|
|
5
5
|
declare function resolveAuth(): AuthConfig;
|
|
6
6
|
declare function resolveVoice(): string;
|
|
7
7
|
|
|
8
|
+
declare const VOICES: readonly ["alloy", "ash", "ballad", "coral", "echo", "fable", "nova", "onyx", "sage", "shimmer", "verse"];
|
|
9
|
+
type Voice = (typeof VOICES)[number];
|
|
10
|
+
declare const DEFAULT_VOICE: Voice;
|
|
11
|
+
type Mode = "default" | "say";
|
|
12
|
+
|
|
13
|
+
type RealtimeSessionOptions = {
|
|
14
|
+
voice: string;
|
|
15
|
+
mode: Mode;
|
|
16
|
+
ack: boolean;
|
|
17
|
+
auth?: AuthConfig;
|
|
18
|
+
onAudioDelta: (pcm16: Buffer) => void;
|
|
19
|
+
onAudioDone?: () => void;
|
|
20
|
+
onTranscript: (text: string) => void;
|
|
21
|
+
onSpeechStarted: () => void;
|
|
22
|
+
onInitialResponseDone: () => void;
|
|
23
|
+
onDone: () => void;
|
|
24
|
+
onError: (error: string) => void;
|
|
25
|
+
};
|
|
26
|
+
type RealtimeSession = {
|
|
27
|
+
connect(): Promise<void>;
|
|
28
|
+
sendMessage(text: string): void;
|
|
29
|
+
sendAudio(pcm16: Buffer): void;
|
|
30
|
+
close(): void;
|
|
31
|
+
};
|
|
32
|
+
|
|
33
|
+
type RustAudioEngine$1 = {
|
|
34
|
+
start(): void;
|
|
35
|
+
stop(): void;
|
|
36
|
+
close(): void;
|
|
37
|
+
play(pcm16: Buffer): void;
|
|
38
|
+
readProcessedCapture(maxFrames?: number): Buffer[];
|
|
39
|
+
readRawCapture(maxFrames?: number): Buffer[];
|
|
40
|
+
setStreamDelayMs(delayMs: number): void;
|
|
41
|
+
getStats(): {
|
|
42
|
+
captureFrames: number;
|
|
43
|
+
processedFrames: number;
|
|
44
|
+
playbackUnderruns: number;
|
|
45
|
+
droppedRawFrames: number;
|
|
46
|
+
droppedProcessedFrames: number;
|
|
47
|
+
};
|
|
48
|
+
};
|
|
8
49
|
type AskOptions = {
|
|
9
50
|
voice?: string;
|
|
10
51
|
timeout?: number;
|
|
11
52
|
ack?: boolean;
|
|
12
53
|
auth?: AuthConfig;
|
|
54
|
+
createSession?: (options: RealtimeSessionOptions) => RealtimeSession;
|
|
55
|
+
createAudioEngine?: (options: {
|
|
56
|
+
sampleRate?: number;
|
|
57
|
+
channels?: number;
|
|
58
|
+
enableAec?: boolean;
|
|
59
|
+
streamDelayMs?: number;
|
|
60
|
+
maxCaptureFrames?: number;
|
|
61
|
+
}) => RustAudioEngine$1;
|
|
62
|
+
onTrace?: (event: {
|
|
63
|
+
atMs: number;
|
|
64
|
+
event: string;
|
|
65
|
+
detail?: Record<string, unknown>;
|
|
66
|
+
}) => void;
|
|
13
67
|
createPlayer?: unknown;
|
|
14
68
|
createRecorder?: unknown;
|
|
15
69
|
onAudioFrameSent?: (pcm16: Buffer) => void;
|
|
@@ -18,15 +72,32 @@ type AskOptions = {
|
|
|
18
72
|
};
|
|
19
73
|
declare function ask(message: string, options?: AskOptions): Promise<string>;
|
|
20
74
|
|
|
75
|
+
type RustAudioEngine = {
|
|
76
|
+
start(): void;
|
|
77
|
+
stop(): void;
|
|
78
|
+
close(): void;
|
|
79
|
+
play(pcm16: Buffer): void;
|
|
80
|
+
getStats?(): {
|
|
81
|
+
pendingPlaybackSamples?: number;
|
|
82
|
+
};
|
|
83
|
+
};
|
|
21
84
|
type SayOptions = {
|
|
22
85
|
voice?: string;
|
|
23
86
|
auth?: AuthConfig;
|
|
87
|
+
createSession?: (options: RealtimeSessionOptions) => RealtimeSession;
|
|
88
|
+
createAudioEngine?: (options: {
|
|
89
|
+
sampleRate?: number;
|
|
90
|
+
channels?: number;
|
|
91
|
+
enableAec?: boolean;
|
|
92
|
+
streamDelayMs?: number;
|
|
93
|
+
}) => RustAudioEngine;
|
|
94
|
+
onTrace?: (event: {
|
|
95
|
+
atMs: number;
|
|
96
|
+
event: string;
|
|
97
|
+
detail?: Record<string, unknown>;
|
|
98
|
+
}) => void;
|
|
24
99
|
createPlayer?: unknown;
|
|
25
100
|
};
|
|
26
101
|
declare function say(message: string, options?: SayOptions): Promise<void>;
|
|
27
102
|
|
|
28
|
-
declare const VOICES: readonly ["alloy", "ash", "ballad", "coral", "echo", "fable", "nova", "onyx", "sage", "shimmer", "verse"];
|
|
29
|
-
type Voice = (typeof VOICES)[number];
|
|
30
|
-
declare const DEFAULT_VOICE: Voice;
|
|
31
|
-
|
|
32
103
|
export { type AskOptions, type AuthConfig, DEFAULT_VOICE, type SayOptions, VOICES, type Voice, ask, resolveAuth, resolveVoice, say };
|
package/dist/index.js
CHANGED
|
@@ -132,22 +132,38 @@ var DEFAULT_VOICE = "ash";
|
|
|
132
132
|
|
|
133
133
|
// src/ask.ts
|
|
134
134
|
var require2 = createRequire(import.meta.url);
|
|
135
|
+
function pcm16Rms(pcm16) {
|
|
136
|
+
const samples = Math.floor(pcm16.length / 2);
|
|
137
|
+
if (samples === 0) return 0;
|
|
138
|
+
let sumSquares = 0;
|
|
139
|
+
for (let i = 0; i < samples; i++) {
|
|
140
|
+
const value = pcm16.readInt16LE(i * 2);
|
|
141
|
+
sumSquares += value * value;
|
|
142
|
+
}
|
|
143
|
+
return Math.sqrt(sumSquares / samples);
|
|
144
|
+
}
|
|
145
|
+
function readEnvInt(name, fallback) {
|
|
146
|
+
const raw = process.env[name];
|
|
147
|
+
if (raw == null) return fallback;
|
|
148
|
+
const parsed = Number.parseInt(raw, 10);
|
|
149
|
+
return Number.isFinite(parsed) ? parsed : fallback;
|
|
150
|
+
}
|
|
135
151
|
async function ask(message, options = {}) {
|
|
136
152
|
const {
|
|
137
153
|
voice = DEFAULT_VOICE,
|
|
138
154
|
timeout = 30,
|
|
139
155
|
ack = false,
|
|
140
156
|
auth,
|
|
157
|
+
createSession,
|
|
158
|
+
createAudioEngine,
|
|
159
|
+
onTrace,
|
|
141
160
|
onAudioFrameSent,
|
|
142
161
|
onAssistantAudio,
|
|
143
162
|
onMicAudio
|
|
144
163
|
} = options;
|
|
145
164
|
const { AudioEngine } = require2("agent-voice-audio");
|
|
146
|
-
const streamDelayMs =
|
|
147
|
-
|
|
148
|
-
10
|
|
149
|
-
);
|
|
150
|
-
const engine = new AudioEngine({
|
|
165
|
+
const streamDelayMs = readEnvInt("AGENT_VOICE_AEC_STREAM_DELAY_MS", 30);
|
|
166
|
+
const engine = (createAudioEngine ?? ((engineOptions) => new AudioEngine(engineOptions)))({
|
|
151
167
|
sampleRate: SAMPLE_RATE,
|
|
152
168
|
channels: 1,
|
|
153
169
|
enableAec: true,
|
|
@@ -163,7 +179,11 @@ async function ask(message, options = {}) {
|
|
|
163
179
|
process.stderr.write(`[ask ${elapsed}ms] ${event}${suffix}
|
|
164
180
|
`);
|
|
165
181
|
}
|
|
182
|
+
function trace(event, detail) {
|
|
183
|
+
onTrace?.({ atMs: Date.now() - startMs, event, detail });
|
|
184
|
+
}
|
|
166
185
|
logEvent("start");
|
|
186
|
+
trace("start");
|
|
167
187
|
return new Promise((resolve, reject) => {
|
|
168
188
|
let transcript = "";
|
|
169
189
|
let timeoutTimer = null;
|
|
@@ -171,15 +191,19 @@ async function ask(message, options = {}) {
|
|
|
171
191
|
let transcriptTimer = null;
|
|
172
192
|
let capturePollTimer = null;
|
|
173
193
|
let speechDetected = false;
|
|
194
|
+
let speechStartedAtMs = 0;
|
|
174
195
|
let initialResponseDone = false;
|
|
175
196
|
let heardAssistantAudio = false;
|
|
176
197
|
let lastAssistantAudioAt = 0;
|
|
198
|
+
let nearEndEvidenceSeen = false;
|
|
199
|
+
let nearEndEvidenceAtMs = 0;
|
|
177
200
|
let cleaned = false;
|
|
178
201
|
let settled = false;
|
|
179
202
|
async function cleanup() {
|
|
180
203
|
if (cleaned) return;
|
|
181
204
|
cleaned = true;
|
|
182
205
|
logEvent("cleanup:start");
|
|
206
|
+
trace("cleanup:start");
|
|
183
207
|
if (timeoutTimer) clearTimeout(timeoutTimer);
|
|
184
208
|
if (responseStartTimer) clearTimeout(responseStartTimer);
|
|
185
209
|
if (transcriptTimer) clearTimeout(transcriptTimer);
|
|
@@ -191,6 +215,7 @@ async function ask(message, options = {}) {
|
|
|
191
215
|
}
|
|
192
216
|
session.close();
|
|
193
217
|
logEvent("cleanup:done");
|
|
218
|
+
trace("cleanup:done");
|
|
194
219
|
}
|
|
195
220
|
function resolveOnce(value) {
|
|
196
221
|
if (settled) return;
|
|
@@ -215,41 +240,75 @@ async function ask(message, options = {}) {
|
|
|
215
240
|
`audio engine capture read failed: ${err instanceof Error ? err.message : String(err)}`
|
|
216
241
|
)
|
|
217
242
|
);
|
|
243
|
+
trace("audio:capture_read_error", {
|
|
244
|
+
error: err instanceof Error ? err.message : String(err)
|
|
245
|
+
});
|
|
218
246
|
return;
|
|
219
247
|
}
|
|
220
248
|
for (const frame of rawFrames) onMicAudio?.(frame);
|
|
221
249
|
if (!heardAssistantAudio) return;
|
|
222
250
|
for (const frame of processedFrames) {
|
|
251
|
+
const rms = pcm16Rms(frame);
|
|
252
|
+
const minSpeechRms = readEnvInt("AGENT_VOICE_MIN_SPEECH_RMS", 550);
|
|
253
|
+
if (rms >= minSpeechRms) {
|
|
254
|
+
nearEndEvidenceSeen = true;
|
|
255
|
+
nearEndEvidenceAtMs = Date.now();
|
|
256
|
+
trace("audio:near_end_evidence", { rms, minSpeechRms });
|
|
257
|
+
}
|
|
223
258
|
onAudioFrameSent?.(frame);
|
|
224
259
|
session.sendAudio(frame);
|
|
225
260
|
}
|
|
261
|
+
if (processedFrames.length > 0) {
|
|
262
|
+
trace("audio:sent_capture", { frames: processedFrames.length });
|
|
263
|
+
}
|
|
226
264
|
}, 10);
|
|
227
|
-
const session = createRealtimeSession({
|
|
265
|
+
const session = (createSession ?? createRealtimeSession)({
|
|
228
266
|
voice,
|
|
229
267
|
mode: "default",
|
|
230
268
|
ack,
|
|
231
269
|
auth,
|
|
232
270
|
onAudioDelta(pcm16) {
|
|
233
271
|
logEvent("realtime:audio_delta", `bytes=${pcm16.length}`);
|
|
272
|
+
trace("realtime:audio_delta", { bytes: pcm16.length });
|
|
234
273
|
heardAssistantAudio = true;
|
|
235
274
|
lastAssistantAudioAt = Date.now();
|
|
236
275
|
onAssistantAudio?.(pcm16);
|
|
237
276
|
engine.play(pcm16);
|
|
238
277
|
},
|
|
239
278
|
onTranscript(text) {
|
|
240
|
-
const echoGuardMs =
|
|
241
|
-
process.env.AGENT_VOICE_ECHO_GUARD_MS ?? "1500",
|
|
242
|
-
10
|
|
243
|
-
);
|
|
279
|
+
const echoGuardMs = readEnvInt("AGENT_VOICE_ECHO_GUARD_MS", 1500);
|
|
244
280
|
const sinceAssistantMs = Date.now() - lastAssistantAudioAt;
|
|
245
281
|
if (heardAssistantAudio && sinceAssistantMs < echoGuardMs) {
|
|
246
282
|
logEvent(
|
|
247
283
|
"realtime:transcript_ignored_echo_guard",
|
|
248
284
|
`since_assistant_ms=${sinceAssistantMs} text="${text}"`
|
|
249
285
|
);
|
|
286
|
+
trace("realtime:transcript_ignored_echo_guard", {
|
|
287
|
+
sinceAssistantMs,
|
|
288
|
+
text
|
|
289
|
+
});
|
|
250
290
|
return;
|
|
251
291
|
}
|
|
252
292
|
logEvent("realtime:transcript", `text="${text}"`);
|
|
293
|
+
trace("realtime:transcript", { text });
|
|
294
|
+
if (speechDetected) {
|
|
295
|
+
const evidenceWindowMs = readEnvInt(
|
|
296
|
+
"AGENT_VOICE_SPEECH_EVIDENCE_WINDOW_MS",
|
|
297
|
+
1200
|
|
298
|
+
);
|
|
299
|
+
const evidenceAgeMs = nearEndEvidenceSeen ? Math.abs(nearEndEvidenceAtMs - speechStartedAtMs) : Number.POSITIVE_INFINITY;
|
|
300
|
+
if (!nearEndEvidenceSeen || evidenceAgeMs > evidenceWindowMs) {
|
|
301
|
+
trace("realtime:transcript_ignored_no_near_end_evidence", {
|
|
302
|
+
text,
|
|
303
|
+
speechStartedAtMs,
|
|
304
|
+
nearEndEvidenceSeen,
|
|
305
|
+
nearEndEvidenceAtMs,
|
|
306
|
+
evidenceAgeMs,
|
|
307
|
+
evidenceWindowMs
|
|
308
|
+
});
|
|
309
|
+
return;
|
|
310
|
+
}
|
|
311
|
+
}
|
|
253
312
|
if (transcriptTimer) {
|
|
254
313
|
clearTimeout(transcriptTimer);
|
|
255
314
|
transcriptTimer = null;
|
|
@@ -259,7 +318,9 @@ async function ask(message, options = {}) {
|
|
|
259
318
|
},
|
|
260
319
|
onSpeechStarted() {
|
|
261
320
|
logEvent("realtime:speech_started");
|
|
321
|
+
trace("realtime:speech_started");
|
|
262
322
|
speechDetected = true;
|
|
323
|
+
speechStartedAtMs = Date.now();
|
|
263
324
|
if (timeoutTimer) {
|
|
264
325
|
clearTimeout(timeoutTimer);
|
|
265
326
|
timeoutTimer = null;
|
|
@@ -267,6 +328,9 @@ async function ask(message, options = {}) {
|
|
|
267
328
|
if (transcriptTimer) clearTimeout(transcriptTimer);
|
|
268
329
|
transcriptTimer = setTimeout(() => {
|
|
269
330
|
logEvent("timeout:no_transcript_after_speech");
|
|
331
|
+
trace("timeout:no_transcript_after_speech", {
|
|
332
|
+
timeoutSeconds: timeout
|
|
333
|
+
});
|
|
270
334
|
rejectOnce(
|
|
271
335
|
new Error(
|
|
272
336
|
`No transcript received within ${timeout}s after speech started`
|
|
@@ -282,10 +346,12 @@ async function ask(message, options = {}) {
|
|
|
282
346
|
},
|
|
283
347
|
onInitialResponseDone() {
|
|
284
348
|
logEvent("realtime:initial_response_done");
|
|
349
|
+
trace("realtime:initial_response_done");
|
|
285
350
|
initialResponseDone = true;
|
|
286
351
|
timeoutTimer = setTimeout(() => {
|
|
287
352
|
if (!speechDetected) {
|
|
288
353
|
logEvent("timeout:no_speech");
|
|
354
|
+
trace("timeout:no_speech", { timeoutSeconds: timeout });
|
|
289
355
|
rejectOnce(
|
|
290
356
|
new Error(`No speech detected within ${timeout}s timeout`)
|
|
291
357
|
);
|
|
@@ -294,21 +360,26 @@ async function ask(message, options = {}) {
|
|
|
294
360
|
},
|
|
295
361
|
onDone() {
|
|
296
362
|
logEvent("realtime:done");
|
|
363
|
+
trace("realtime:done");
|
|
297
364
|
if (ack) resolveOnce(transcript);
|
|
298
365
|
},
|
|
299
366
|
onError(error) {
|
|
300
367
|
logEvent("realtime:error", error);
|
|
368
|
+
trace("realtime:error", { error });
|
|
301
369
|
rejectOnce(new Error(error));
|
|
302
370
|
}
|
|
303
371
|
});
|
|
304
372
|
session.connect().then(
|
|
305
373
|
() => {
|
|
306
374
|
logEvent("realtime:connected");
|
|
375
|
+
trace("realtime:connected");
|
|
307
376
|
logEvent("realtime:send_message");
|
|
377
|
+
trace("realtime:send_message");
|
|
308
378
|
session.sendMessage(message);
|
|
309
379
|
responseStartTimer = setTimeout(() => {
|
|
310
380
|
if (!heardAssistantAudio) {
|
|
311
381
|
logEvent("timeout:no_assistant_audio");
|
|
382
|
+
trace("timeout:no_assistant_audio");
|
|
312
383
|
rejectOnce(
|
|
313
384
|
new Error("No assistant audio received after sending message")
|
|
314
385
|
);
|
|
@@ -320,6 +391,9 @@ async function ask(message, options = {}) {
|
|
|
320
391
|
"realtime:connect_error",
|
|
321
392
|
err instanceof Error ? err.message : String(err)
|
|
322
393
|
);
|
|
394
|
+
trace("realtime:connect_error", {
|
|
395
|
+
error: err instanceof Error ? err.message : String(err)
|
|
396
|
+
});
|
|
323
397
|
rejectOnce(err instanceof Error ? err : new Error(String(err)));
|
|
324
398
|
}
|
|
325
399
|
);
|
|
@@ -360,30 +434,46 @@ function resolveVoice() {
|
|
|
360
434
|
import { createRequire as createRequire2 } from "module";
|
|
361
435
|
var require3 = createRequire2(import.meta.url);
|
|
362
436
|
async function say(message, options = {}) {
|
|
363
|
-
const {
|
|
437
|
+
const {
|
|
438
|
+
voice = DEFAULT_VOICE,
|
|
439
|
+
auth,
|
|
440
|
+
createSession,
|
|
441
|
+
createAudioEngine,
|
|
442
|
+
onTrace
|
|
443
|
+
} = options;
|
|
364
444
|
const { AudioEngine } = require3("agent-voice-audio");
|
|
365
|
-
const
|
|
445
|
+
const startMs = Date.now();
|
|
446
|
+
function trace(event, detail) {
|
|
447
|
+
onTrace?.({ atMs: Date.now() - startMs, event, detail });
|
|
448
|
+
}
|
|
449
|
+
const engine = (createAudioEngine ?? ((engineOptions) => new AudioEngine(engineOptions)))({
|
|
366
450
|
sampleRate: SAMPLE_RATE,
|
|
367
451
|
channels: 1,
|
|
368
452
|
enableAec: false
|
|
369
453
|
});
|
|
370
454
|
engine.start();
|
|
455
|
+
trace("start");
|
|
371
456
|
return new Promise((resolve, reject) => {
|
|
372
457
|
let cleaned = false;
|
|
373
458
|
let settled = false;
|
|
374
459
|
let responseDoneFallbackTimer = null;
|
|
375
460
|
let completionTailTimer = null;
|
|
461
|
+
let drainPollTimer = null;
|
|
462
|
+
let drainDeadlineTimer = null;
|
|
376
463
|
function cleanup() {
|
|
377
464
|
if (cleaned) return;
|
|
378
465
|
cleaned = true;
|
|
379
466
|
if (responseDoneFallbackTimer) clearTimeout(responseDoneFallbackTimer);
|
|
380
467
|
if (completionTailTimer) clearTimeout(completionTailTimer);
|
|
468
|
+
if (drainPollTimer) clearInterval(drainPollTimer);
|
|
469
|
+
if (drainDeadlineTimer) clearTimeout(drainDeadlineTimer);
|
|
381
470
|
try {
|
|
382
471
|
engine.stop();
|
|
383
472
|
engine.close();
|
|
384
473
|
} catch {
|
|
385
474
|
}
|
|
386
475
|
session.close();
|
|
476
|
+
trace("cleanup");
|
|
387
477
|
}
|
|
388
478
|
function resolveOnce() {
|
|
389
479
|
if (settled) return;
|
|
@@ -397,23 +487,76 @@ async function say(message, options = {}) {
|
|
|
397
487
|
cleanup();
|
|
398
488
|
reject(error);
|
|
399
489
|
}
|
|
490
|
+
function waitForPlaybackDrain() {
|
|
491
|
+
if (settled) return;
|
|
492
|
+
if (!engine.getStats) {
|
|
493
|
+
trace("drain:no_stats");
|
|
494
|
+
resolveOnce();
|
|
495
|
+
return;
|
|
496
|
+
}
|
|
497
|
+
const absoluteDeadlineMs = 2e4;
|
|
498
|
+
const maxNoProgressMs = 1200;
|
|
499
|
+
const drainStartMs = Date.now();
|
|
500
|
+
let lastProgressAtMs = drainStartMs;
|
|
501
|
+
let lastPending = Number.POSITIVE_INFINITY;
|
|
502
|
+
trace("drain:deadline_scheduled", {
|
|
503
|
+
absoluteDeadlineMs,
|
|
504
|
+
maxNoProgressMs
|
|
505
|
+
});
|
|
506
|
+
let zeroStreak = 0;
|
|
507
|
+
drainPollTimer = setInterval(() => {
|
|
508
|
+
if (settled) return;
|
|
509
|
+
let pending = 0;
|
|
510
|
+
try {
|
|
511
|
+
pending = Number(engine.getStats?.().pendingPlaybackSamples ?? 0);
|
|
512
|
+
} catch {
|
|
513
|
+
pending = 0;
|
|
514
|
+
}
|
|
515
|
+
trace("drain:poll", { pendingPlaybackSamples: pending });
|
|
516
|
+
if (pending < lastPending) {
|
|
517
|
+
lastPending = pending;
|
|
518
|
+
lastProgressAtMs = Date.now();
|
|
519
|
+
}
|
|
520
|
+
if (pending <= 0) {
|
|
521
|
+
zeroStreak += 1;
|
|
522
|
+
if (zeroStreak >= 3) {
|
|
523
|
+
resolveOnce();
|
|
524
|
+
}
|
|
525
|
+
return;
|
|
526
|
+
}
|
|
527
|
+
zeroStreak = 0;
|
|
528
|
+
if (Date.now() - lastProgressAtMs > maxNoProgressMs) {
|
|
529
|
+
trace("drain:no_progress_timeout", {
|
|
530
|
+
pendingPlaybackSamples: pending
|
|
531
|
+
});
|
|
532
|
+
resolveOnce();
|
|
533
|
+
}
|
|
534
|
+
}, 20);
|
|
535
|
+
drainDeadlineTimer = setTimeout(() => {
|
|
536
|
+
trace("drain:deadline");
|
|
537
|
+
resolveOnce();
|
|
538
|
+
}, absoluteDeadlineMs);
|
|
539
|
+
}
|
|
400
540
|
function scheduleTailResolve(delayMs) {
|
|
401
541
|
if (settled) return;
|
|
402
542
|
if (completionTailTimer) clearTimeout(completionTailTimer);
|
|
403
543
|
completionTailTimer = setTimeout(() => {
|
|
404
|
-
|
|
544
|
+
waitForPlaybackDrain();
|
|
405
545
|
}, delayMs);
|
|
546
|
+
trace("tail_scheduled", { delayMs });
|
|
406
547
|
}
|
|
407
|
-
const session = createRealtimeSession({
|
|
548
|
+
const session = (createSession ?? createRealtimeSession)({
|
|
408
549
|
voice,
|
|
409
550
|
mode: "say",
|
|
410
551
|
ack: false,
|
|
411
552
|
auth,
|
|
412
553
|
onAudioDelta(pcm16) {
|
|
413
554
|
engine.play(pcm16);
|
|
555
|
+
trace("realtime:audio_delta", { bytes: pcm16.length });
|
|
414
556
|
},
|
|
415
557
|
onAudioDone() {
|
|
416
558
|
scheduleTailResolve(140);
|
|
559
|
+
trace("realtime:audio_done");
|
|
417
560
|
},
|
|
418
561
|
onTranscript() {
|
|
419
562
|
},
|
|
@@ -424,14 +567,17 @@ async function say(message, options = {}) {
|
|
|
424
567
|
responseDoneFallbackTimer = setTimeout(() => {
|
|
425
568
|
scheduleTailResolve(220);
|
|
426
569
|
}, 700);
|
|
570
|
+
trace("realtime:initial_response_done");
|
|
427
571
|
},
|
|
428
572
|
onDone() {
|
|
429
573
|
},
|
|
430
574
|
onError(error) {
|
|
575
|
+
trace("realtime:error", { error });
|
|
431
576
|
rejectOnce(new Error(error));
|
|
432
577
|
}
|
|
433
578
|
});
|
|
434
579
|
session.connect().then(() => {
|
|
580
|
+
trace("realtime:connected");
|
|
435
581
|
session.sendMessage(message);
|
|
436
582
|
}, reject);
|
|
437
583
|
});
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import {
|
|
3
|
+
createRealtimeSession
|
|
4
|
+
} from "./chunk-UYBFONQE.js";
|
|
5
|
+
import {
|
|
6
|
+
DEFAULT_VOICE,
|
|
7
|
+
SAMPLE_RATE
|
|
8
|
+
} from "./chunk-AHLLYIEW.js";
|
|
9
|
+
|
|
10
|
+
// src/say.ts
|
|
11
|
+
import { createRequire } from "module";
|
|
12
|
+
var require2 = createRequire(import.meta.url);
|
|
13
|
+
/**
 * Speak `message` aloud through a realtime voice session and resolve once
 * audio playback has fully drained (or a drain deadline fires).
 *
 * @param {string} message - Text to speak.
 * @param {object} [options]
 * @param {string} [options.voice] - Voice id; defaults to DEFAULT_VOICE.
 * @param {*} [options.auth] - Auth material forwarded to the session factory.
 * @param {Function} [options.createSession] - Injectable session factory
 *   (defaults to createRealtimeSession); used for testing/overrides.
 * @param {Function} [options.createAudioEngine] - Injectable audio-engine
 *   factory; when supplied, the native "agent-voice-audio" module is NOT loaded.
 * @param {Function} [options.onTrace] - Optional trace sink receiving
 *   `{ atMs, event, detail }` diagnostic events.
 * @returns {Promise<void>} Resolves after playback completes; rejects on
 *   session error or connection failure (with resources cleaned up).
 */
async function say(message, options = {}) {
  const {
    voice = DEFAULT_VOICE,
    auth,
    createSession,
    createAudioEngine,
    onTrace
  } = options;
  const startMs = Date.now();
  // Emit a diagnostic event stamped with milliseconds since this call began.
  function trace(event, detail) {
    onTrace?.({ atMs: Date.now() - startMs, event, detail });
  }
  const engineOptions = {
    sampleRate: SAMPLE_RATE,
    channels: 1,
    enableAec: false
  };
  // Load the native audio module only when no engine factory is injected, so
  // callers supplying `createAudioEngine` (e.g. tests) do not require the
  // optional "agent-voice-audio" package to be installed. Previously the
  // module was required unconditionally, defeating the injection point.
  const engine = createAudioEngine
    ? createAudioEngine(engineOptions)
    : new (require2("agent-voice-audio").AudioEngine)(engineOptions);
  engine.start();
  trace("start");
  return new Promise((resolve, reject) => {
    let cleaned = false;
    let settled = false;
    let responseDoneFallbackTimer = null;
    let completionTailTimer = null;
    let drainPollTimer = null;
    let drainDeadlineTimer = null;
    // Tear down all timers, the audio engine, and the session. Idempotent.
    function cleanup() {
      if (cleaned) return;
      cleaned = true;
      if (responseDoneFallbackTimer) clearTimeout(responseDoneFallbackTimer);
      if (completionTailTimer) clearTimeout(completionTailTimer);
      if (drainPollTimer) clearInterval(drainPollTimer);
      if (drainDeadlineTimer) clearTimeout(drainDeadlineTimer);
      try {
        engine.stop();
        engine.close();
      } catch {
        // Best-effort teardown: engine shutdown failures must not mask the
        // promise outcome.
      }
      session.close();
      trace("cleanup");
    }
    function resolveOnce() {
      if (settled) return;
      settled = true;
      cleanup();
      resolve();
    }
    function rejectOnce(error) {
      if (settled) return;
      settled = true;
      cleanup();
      reject(error);
    }
    // Poll the engine's playback queue until it empties (three consecutive
    // zero readings), stalls (no progress for maxNoProgressMs), or an
    // absolute deadline fires — then resolve.
    function waitForPlaybackDrain() {
      if (settled) return;
      if (!engine.getStats) {
        // Engine exposes no stats; we cannot observe the queue, so finish now.
        trace("drain:no_stats");
        resolveOnce();
        return;
      }
      const absoluteDeadlineMs = 2e4;
      const maxNoProgressMs = 1200;
      let lastProgressAtMs = Date.now();
      let lastPending = Number.POSITIVE_INFINITY;
      trace("drain:deadline_scheduled", {
        absoluteDeadlineMs,
        maxNoProgressMs
      });
      let zeroStreak = 0;
      drainPollTimer = setInterval(() => {
        if (settled) return;
        let pending = 0;
        try {
          pending = Number(engine.getStats?.().pendingPlaybackSamples ?? 0);
        } catch {
          // Treat a failing stats call as "drained" rather than hanging.
          pending = 0;
        }
        trace("drain:poll", { pendingPlaybackSamples: pending });
        if (pending < lastPending) {
          lastPending = pending;
          lastProgressAtMs = Date.now();
        }
        if (pending <= 0) {
          // Require three consecutive empty readings to ride out jitter.
          zeroStreak += 1;
          if (zeroStreak >= 3) {
            resolveOnce();
          }
          return;
        }
        zeroStreak = 0;
        if (Date.now() - lastProgressAtMs > maxNoProgressMs) {
          trace("drain:no_progress_timeout", {
            pendingPlaybackSamples: pending
          });
          resolveOnce();
        }
      }, 20);
      drainDeadlineTimer = setTimeout(() => {
        trace("drain:deadline");
        resolveOnce();
      }, absoluteDeadlineMs);
    }
    // Debounced tail: restart the delay on each call, then hand off to the
    // drain poller once the tail timer fires.
    function scheduleTailResolve(delayMs) {
      if (settled) return;
      if (completionTailTimer) clearTimeout(completionTailTimer);
      completionTailTimer = setTimeout(() => {
        waitForPlaybackDrain();
      }, delayMs);
      trace("tail_scheduled", { delayMs });
    }
    const session = (createSession ?? createRealtimeSession)({
      voice,
      mode: "say",
      ack: false,
      auth,
      onAudioDelta(pcm16) {
        engine.play(pcm16);
        trace("realtime:audio_delta", { bytes: pcm16.length });
      },
      onAudioDone() {
        scheduleTailResolve(140);
        trace("realtime:audio_done");
      },
      onTranscript() {
      },
      onSpeechStarted() {
      },
      onInitialResponseDone() {
        if (responseDoneFallbackTimer) clearTimeout(responseDoneFallbackTimer);
        // Fallback in case onAudioDone never arrives after the response ends.
        responseDoneFallbackTimer = setTimeout(() => {
          scheduleTailResolve(220);
        }, 700);
        trace("realtime:initial_response_done");
      },
      onDone() {
      },
      onError(error) {
        trace("realtime:error", { error });
        rejectOnce(new Error(error));
      }
    });
    session.connect().then(() => {
      trace("realtime:connected");
      session.sendMessage(message);
    }, (error) => {
      // Fix: previously passed the raw `reject` here, which skipped cleanup()
      // and leaked the already-started audio engine (and left the session
      // open) whenever the connection failed.
      trace("realtime:connect_error", { error });
      rejectOnce(error);
    });
  });
}
|
|
162
|
+
export {
|
|
163
|
+
say
|
|
164
|
+
};
|
package/package.json
CHANGED
package/dist/say-ELJAIWUM.js
DELETED
|
@@ -1,92 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
import {
|
|
3
|
-
createRealtimeSession
|
|
4
|
-
} from "./chunk-UYBFONQE.js";
|
|
5
|
-
import {
|
|
6
|
-
DEFAULT_VOICE,
|
|
7
|
-
SAMPLE_RATE
|
|
8
|
-
} from "./chunk-AHLLYIEW.js";
|
|
9
|
-
|
|
10
|
-
// src/say.ts
|
|
11
|
-
import { createRequire } from "module";
|
|
12
|
-
var require2 = createRequire(import.meta.url);
|
|
13
|
-
async function say(message, options = {}) {
|
|
14
|
-
const { voice = DEFAULT_VOICE, auth } = options;
|
|
15
|
-
const { AudioEngine } = require2("agent-voice-audio");
|
|
16
|
-
const engine = new AudioEngine({
|
|
17
|
-
sampleRate: SAMPLE_RATE,
|
|
18
|
-
channels: 1,
|
|
19
|
-
enableAec: false
|
|
20
|
-
});
|
|
21
|
-
engine.start();
|
|
22
|
-
return new Promise((resolve, reject) => {
|
|
23
|
-
let cleaned = false;
|
|
24
|
-
let settled = false;
|
|
25
|
-
let responseDoneFallbackTimer = null;
|
|
26
|
-
let completionTailTimer = null;
|
|
27
|
-
function cleanup() {
|
|
28
|
-
if (cleaned) return;
|
|
29
|
-
cleaned = true;
|
|
30
|
-
if (responseDoneFallbackTimer) clearTimeout(responseDoneFallbackTimer);
|
|
31
|
-
if (completionTailTimer) clearTimeout(completionTailTimer);
|
|
32
|
-
try {
|
|
33
|
-
engine.stop();
|
|
34
|
-
engine.close();
|
|
35
|
-
} catch {
|
|
36
|
-
}
|
|
37
|
-
session.close();
|
|
38
|
-
}
|
|
39
|
-
function resolveOnce() {
|
|
40
|
-
if (settled) return;
|
|
41
|
-
settled = true;
|
|
42
|
-
cleanup();
|
|
43
|
-
resolve();
|
|
44
|
-
}
|
|
45
|
-
function rejectOnce(error) {
|
|
46
|
-
if (settled) return;
|
|
47
|
-
settled = true;
|
|
48
|
-
cleanup();
|
|
49
|
-
reject(error);
|
|
50
|
-
}
|
|
51
|
-
function scheduleTailResolve(delayMs) {
|
|
52
|
-
if (settled) return;
|
|
53
|
-
if (completionTailTimer) clearTimeout(completionTailTimer);
|
|
54
|
-
completionTailTimer = setTimeout(() => {
|
|
55
|
-
resolveOnce();
|
|
56
|
-
}, delayMs);
|
|
57
|
-
}
|
|
58
|
-
const session = createRealtimeSession({
|
|
59
|
-
voice,
|
|
60
|
-
mode: "say",
|
|
61
|
-
ack: false,
|
|
62
|
-
auth,
|
|
63
|
-
onAudioDelta(pcm16) {
|
|
64
|
-
engine.play(pcm16);
|
|
65
|
-
},
|
|
66
|
-
onAudioDone() {
|
|
67
|
-
scheduleTailResolve(140);
|
|
68
|
-
},
|
|
69
|
-
onTranscript() {
|
|
70
|
-
},
|
|
71
|
-
onSpeechStarted() {
|
|
72
|
-
},
|
|
73
|
-
onInitialResponseDone() {
|
|
74
|
-
if (responseDoneFallbackTimer) clearTimeout(responseDoneFallbackTimer);
|
|
75
|
-
responseDoneFallbackTimer = setTimeout(() => {
|
|
76
|
-
scheduleTailResolve(220);
|
|
77
|
-
}, 700);
|
|
78
|
-
},
|
|
79
|
-
onDone() {
|
|
80
|
-
},
|
|
81
|
-
onError(error) {
|
|
82
|
-
rejectOnce(new Error(error));
|
|
83
|
-
}
|
|
84
|
-
});
|
|
85
|
-
session.connect().then(() => {
|
|
86
|
-
session.sendMessage(message);
|
|
87
|
-
}, reject);
|
|
88
|
-
});
|
|
89
|
-
}
|
|
90
|
-
export {
|
|
91
|
-
say
|
|
92
|
-
};
|