agent-voice 0.2.2 → 0.2.4
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/dist/{ask-OIE6HL2H.js → ask-KM3JPI36.js} +26 -8
- package/dist/cli.js +40 -4
- package/dist/index.d.ts +1 -0
- package/dist/index.js +61 -11
- package/dist/{say-ZVF6EX52.js → say-OEQQFOCC.js} +35 -3
- package/package.json +1 -1
package/dist/{ask-OIE6HL2H.js → ask-KM3JPI36.js}
CHANGED
|
@@ -127,7 +127,19 @@ async function ask(message, options = {}) {
|
|
|
127
127
|
if (!heardAssistantAudio) return;
|
|
128
128
|
for (const frame of processedFrames) {
|
|
129
129
|
const rms = pcm16Rms(frame);
|
|
130
|
-
const
|
|
130
|
+
const configuredMinSpeechRms = readEnvInt(
|
|
131
|
+
"AGENT_VOICE_MIN_SPEECH_RMS",
|
|
132
|
+
220
|
|
133
|
+
);
|
|
134
|
+
const relaxAfterMs = readEnvInt(
|
|
135
|
+
"AGENT_VOICE_MIN_SPEECH_RMS_RELAX_AFTER_MS",
|
|
136
|
+
500
|
|
137
|
+
);
|
|
138
|
+
const relaxedMinSpeechRms = readEnvInt(
|
|
139
|
+
"AGENT_VOICE_MIN_SPEECH_RMS_RELAXED",
|
|
140
|
+
120
|
|
141
|
+
);
|
|
142
|
+
const minSpeechRms = speechDetected && speechStartedAtMs > 0 && Date.now() - speechStartedAtMs >= relaxAfterMs ? relaxedMinSpeechRms : configuredMinSpeechRms;
|
|
131
143
|
if (rms >= minSpeechRms) {
|
|
132
144
|
nearEndEvidenceSeen = true;
|
|
133
145
|
nearEndEvidenceAtMs = Date.now();
|
|
@@ -170,19 +182,25 @@ async function ask(message, options = {}) {
|
|
|
170
182
|
logEvent("realtime:transcript", `text="${text}"`);
|
|
171
183
|
trace("realtime:transcript", { text });
|
|
172
184
|
if (speechDetected) {
|
|
173
|
-
const
|
|
174
|
-
"
|
|
175
|
-
|
|
185
|
+
const evidencePreRollMs = readEnvInt(
|
|
186
|
+
"AGENT_VOICE_SPEECH_EVIDENCE_PREROLL_MS",
|
|
187
|
+
200
|
|
188
|
+
);
|
|
189
|
+
const evidencePostRollMs = readEnvInt(
|
|
190
|
+
"AGENT_VOICE_SPEECH_EVIDENCE_POSTROLL_MS",
|
|
191
|
+
1500
|
|
176
192
|
);
|
|
177
|
-
const
|
|
178
|
-
|
|
193
|
+
const evidenceEarliestMs = speechStartedAtMs - evidencePreRollMs;
|
|
194
|
+
const evidenceLatestMs = speechStartedAtMs + evidencePostRollMs;
|
|
195
|
+
const hasTimelyNearEndEvidence = nearEndEvidenceSeen && nearEndEvidenceAtMs >= evidenceEarliestMs && nearEndEvidenceAtMs <= evidenceLatestMs;
|
|
196
|
+
if (!hasTimelyNearEndEvidence) {
|
|
179
197
|
trace("realtime:transcript_ignored_no_near_end_evidence", {
|
|
180
198
|
text,
|
|
181
199
|
speechStartedAtMs,
|
|
182
200
|
nearEndEvidenceSeen,
|
|
183
201
|
nearEndEvidenceAtMs,
|
|
184
|
-
|
|
185
|
-
|
|
202
|
+
evidenceEarliestMs,
|
|
203
|
+
evidenceLatestMs
|
|
186
204
|
});
|
|
187
205
|
return;
|
|
188
206
|
}
|
package/dist/cli.js
CHANGED
|
@@ -28,8 +28,8 @@ async function withSuppressedNativeOutput() {
|
|
|
28
28
|
openSync("/dev/null", "w");
|
|
29
29
|
closeSync(2);
|
|
30
30
|
openSync("/dev/null", "w");
|
|
31
|
-
const { ask } = await import("./ask-OIE6HL2H.js");
|
|
32
|
-
const { say } = await import("./say-ZVF6EX52.js");
|
|
31
|
+
const { ask } = await import("./ask-KM3JPI36.js");
|
|
32
|
+
const { say } = await import("./say-OEQQFOCC.js");
|
|
33
33
|
function writeResult(text) {
|
|
34
34
|
writeSync(savedStdout, `${text}
|
|
35
35
|
`);
|
|
@@ -187,14 +187,50 @@ ${files.modelInputFile}`
|
|
|
187
187
|
process.exit(1);
|
|
188
188
|
}
|
|
189
189
|
});
|
|
190
|
-
program.command("say").description("Speak a message without listening for a response").option("-m, --message <text>", "Text message to speak").option("--voice <name>", "OpenAI voice", defaultVoice).action(async (opts) => {
|
|
190
|
+
program.command("say").description("Speak a message without listening for a response").option("-m, --message <text>", "Text message to speak").option("--voice <name>", "OpenAI voice", defaultVoice).option(
|
|
191
|
+
"--debug-audio-dir <dir>",
|
|
192
|
+
"Write say audio debug WAV to this directory"
|
|
193
|
+
).action(async (opts) => {
|
|
191
194
|
const { say, writeError } = await withSuppressedNativeOutput();
|
|
195
|
+
const assistantChunks = [];
|
|
192
196
|
try {
|
|
193
197
|
const auth = resolveAuth();
|
|
194
198
|
const message = await getMessage(opts.message);
|
|
195
|
-
await say(message, {
|
|
199
|
+
await say(message, {
|
|
200
|
+
voice: opts.voice,
|
|
201
|
+
auth,
|
|
202
|
+
onAssistantAudio: opts.debugAudioDir ? (pcm16) => assistantChunks.push(Buffer.from(pcm16)) : void 0
|
|
203
|
+
});
|
|
204
|
+
if (opts.debugAudioDir) {
|
|
205
|
+
mkdirSync(opts.debugAudioDir, { recursive: true });
|
|
206
|
+
const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
207
|
+
const file = join(
|
|
208
|
+
opts.debugAudioDir,
|
|
209
|
+
`say-${stamp}-assistant-output.wav`
|
|
210
|
+
);
|
|
211
|
+
writeFileSync(file, createWavBuffer(Buffer.concat(assistantChunks)));
|
|
212
|
+
writeError(`debug audio written:
|
|
213
|
+
${file}`);
|
|
214
|
+
}
|
|
196
215
|
process.exit(0);
|
|
197
216
|
} catch (err) {
|
|
217
|
+
if (opts.debugAudioDir && assistantChunks.length > 0) {
|
|
218
|
+
try {
|
|
219
|
+
mkdirSync(opts.debugAudioDir, { recursive: true });
|
|
220
|
+
const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
221
|
+
const file = join(
|
|
222
|
+
opts.debugAudioDir,
|
|
223
|
+
`say-${stamp}-assistant-output.wav`
|
|
224
|
+
);
|
|
225
|
+
writeFileSync(
|
|
226
|
+
file,
|
|
227
|
+
createWavBuffer(Buffer.concat(assistantChunks))
|
|
228
|
+
);
|
|
229
|
+
writeError(`debug audio written:
|
|
230
|
+
${file}`);
|
|
231
|
+
} catch {
|
|
232
|
+
}
|
|
233
|
+
}
|
|
198
234
|
writeError(`${err instanceof Error ? err.message : err}`);
|
|
199
235
|
process.exit(1);
|
|
200
236
|
}
|
package/dist/index.d.ts
CHANGED
package/dist/index.js
CHANGED
|
@@ -249,7 +249,19 @@ async function ask(message, options = {}) {
|
|
|
249
249
|
if (!heardAssistantAudio) return;
|
|
250
250
|
for (const frame of processedFrames) {
|
|
251
251
|
const rms = pcm16Rms(frame);
|
|
252
|
-
const
|
|
252
|
+
const configuredMinSpeechRms = readEnvInt(
|
|
253
|
+
"AGENT_VOICE_MIN_SPEECH_RMS",
|
|
254
|
+
220
|
|
255
|
+
);
|
|
256
|
+
const relaxAfterMs = readEnvInt(
|
|
257
|
+
"AGENT_VOICE_MIN_SPEECH_RMS_RELAX_AFTER_MS",
|
|
258
|
+
500
|
|
259
|
+
);
|
|
260
|
+
const relaxedMinSpeechRms = readEnvInt(
|
|
261
|
+
"AGENT_VOICE_MIN_SPEECH_RMS_RELAXED",
|
|
262
|
+
120
|
|
263
|
+
);
|
|
264
|
+
const minSpeechRms = speechDetected && speechStartedAtMs > 0 && Date.now() - speechStartedAtMs >= relaxAfterMs ? relaxedMinSpeechRms : configuredMinSpeechRms;
|
|
253
265
|
if (rms >= minSpeechRms) {
|
|
254
266
|
nearEndEvidenceSeen = true;
|
|
255
267
|
nearEndEvidenceAtMs = Date.now();
|
|
@@ -292,19 +304,25 @@ async function ask(message, options = {}) {
|
|
|
292
304
|
logEvent("realtime:transcript", `text="${text}"`);
|
|
293
305
|
trace("realtime:transcript", { text });
|
|
294
306
|
if (speechDetected) {
|
|
295
|
-
const
|
|
296
|
-
"
|
|
297
|
-
|
|
307
|
+
const evidencePreRollMs = readEnvInt(
|
|
308
|
+
"AGENT_VOICE_SPEECH_EVIDENCE_PREROLL_MS",
|
|
309
|
+
200
|
|
310
|
+
);
|
|
311
|
+
const evidencePostRollMs = readEnvInt(
|
|
312
|
+
"AGENT_VOICE_SPEECH_EVIDENCE_POSTROLL_MS",
|
|
313
|
+
1500
|
|
298
314
|
);
|
|
299
|
-
const
|
|
300
|
-
|
|
315
|
+
const evidenceEarliestMs = speechStartedAtMs - evidencePreRollMs;
|
|
316
|
+
const evidenceLatestMs = speechStartedAtMs + evidencePostRollMs;
|
|
317
|
+
const hasTimelyNearEndEvidence = nearEndEvidenceSeen && nearEndEvidenceAtMs >= evidenceEarliestMs && nearEndEvidenceAtMs <= evidenceLatestMs;
|
|
318
|
+
if (!hasTimelyNearEndEvidence) {
|
|
301
319
|
trace("realtime:transcript_ignored_no_near_end_evidence", {
|
|
302
320
|
text,
|
|
303
321
|
speechStartedAtMs,
|
|
304
322
|
nearEndEvidenceSeen,
|
|
305
323
|
nearEndEvidenceAtMs,
|
|
306
|
-
|
|
307
|
-
|
|
324
|
+
evidenceEarliestMs,
|
|
325
|
+
evidenceLatestMs
|
|
308
326
|
});
|
|
309
327
|
return;
|
|
310
328
|
}
|
|
@@ -439,6 +457,7 @@ async function say(message, options = {}) {
|
|
|
439
457
|
auth,
|
|
440
458
|
createSession,
|
|
441
459
|
createAudioEngine,
|
|
460
|
+
onAssistantAudio,
|
|
442
461
|
onTrace
|
|
443
462
|
} = options;
|
|
444
463
|
const { AudioEngine } = require3("agent-voice-audio");
|
|
@@ -460,6 +479,9 @@ async function say(message, options = {}) {
|
|
|
460
479
|
let completionTailTimer = null;
|
|
461
480
|
let drainPollTimer = null;
|
|
462
481
|
let drainDeadlineTimer = null;
|
|
482
|
+
let playoutDeadlineTimer = null;
|
|
483
|
+
let firstAudioAtMs = 0;
|
|
484
|
+
let totalReceivedSamples = 0;
|
|
463
485
|
function cleanup() {
|
|
464
486
|
if (cleaned) return;
|
|
465
487
|
cleaned = true;
|
|
@@ -467,6 +489,7 @@ async function say(message, options = {}) {
|
|
|
467
489
|
if (completionTailTimer) clearTimeout(completionTailTimer);
|
|
468
490
|
if (drainPollTimer) clearInterval(drainPollTimer);
|
|
469
491
|
if (drainDeadlineTimer) clearTimeout(drainDeadlineTimer);
|
|
492
|
+
if (playoutDeadlineTimer) clearTimeout(playoutDeadlineTimer);
|
|
470
493
|
try {
|
|
471
494
|
engine.stop();
|
|
472
495
|
engine.close();
|
|
@@ -487,6 +510,28 @@ async function say(message, options = {}) {
|
|
|
487
510
|
cleanup();
|
|
488
511
|
reject(error);
|
|
489
512
|
}
|
|
513
|
+
function waitForWallClockPlayout() {
|
|
514
|
+
if (settled) return;
|
|
515
|
+
if (firstAudioAtMs <= 0 || totalReceivedSamples <= 0) {
|
|
516
|
+
resolveOnce();
|
|
517
|
+
return;
|
|
518
|
+
}
|
|
519
|
+
const expectedPlayoutMs = Math.ceil(
|
|
520
|
+
totalReceivedSamples / SAMPLE_RATE * 1e3
|
|
521
|
+
);
|
|
522
|
+
const playoutTailMs = 140;
|
|
523
|
+
const dueAtMs = firstAudioAtMs + expectedPlayoutMs + playoutTailMs;
|
|
524
|
+
const waitMs = Math.max(0, dueAtMs - Date.now());
|
|
525
|
+
trace("playout:wall_clock_wait", {
|
|
526
|
+
totalReceivedSamples,
|
|
527
|
+
expectedPlayoutMs,
|
|
528
|
+
playoutTailMs,
|
|
529
|
+
waitMs
|
|
530
|
+
});
|
|
531
|
+
playoutDeadlineTimer = setTimeout(() => {
|
|
532
|
+
resolveOnce();
|
|
533
|
+
}, waitMs);
|
|
534
|
+
}
|
|
490
535
|
function waitForPlaybackDrain() {
|
|
491
536
|
if (settled) return;
|
|
492
537
|
if (!engine.getStats) {
|
|
@@ -520,7 +565,7 @@ async function say(message, options = {}) {
|
|
|
520
565
|
if (pending <= 0) {
|
|
521
566
|
zeroStreak += 1;
|
|
522
567
|
if (zeroStreak >= 3) {
|
|
523
|
-
|
|
568
|
+
waitForWallClockPlayout();
|
|
524
569
|
}
|
|
525
570
|
return;
|
|
526
571
|
}
|
|
@@ -529,12 +574,12 @@ async function say(message, options = {}) {
|
|
|
529
574
|
trace("drain:no_progress_timeout", {
|
|
530
575
|
pendingPlaybackSamples: pending
|
|
531
576
|
});
|
|
532
|
-
|
|
577
|
+
waitForWallClockPlayout();
|
|
533
578
|
}
|
|
534
579
|
}, 20);
|
|
535
580
|
drainDeadlineTimer = setTimeout(() => {
|
|
536
581
|
trace("drain:deadline");
|
|
537
|
-
|
|
582
|
+
waitForWallClockPlayout();
|
|
538
583
|
}, absoluteDeadlineMs);
|
|
539
584
|
}
|
|
540
585
|
function scheduleTailResolve(delayMs) {
|
|
@@ -551,7 +596,12 @@ async function say(message, options = {}) {
|
|
|
551
596
|
ack: false,
|
|
552
597
|
auth,
|
|
553
598
|
onAudioDelta(pcm16) {
|
|
599
|
+
if (firstAudioAtMs <= 0) {
|
|
600
|
+
firstAudioAtMs = Date.now();
|
|
601
|
+
}
|
|
602
|
+
totalReceivedSamples += Math.floor(pcm16.length / 2);
|
|
554
603
|
engine.play(pcm16);
|
|
604
|
+
onAssistantAudio?.(pcm16);
|
|
555
605
|
trace("realtime:audio_delta", { bytes: pcm16.length });
|
|
556
606
|
},
|
|
557
607
|
onAudioDone() {
|
package/dist/{say-ZVF6EX52.js → say-OEQQFOCC.js}
CHANGED
|
@@ -16,6 +16,7 @@ async function say(message, options = {}) {
|
|
|
16
16
|
auth,
|
|
17
17
|
createSession,
|
|
18
18
|
createAudioEngine,
|
|
19
|
+
onAssistantAudio,
|
|
19
20
|
onTrace
|
|
20
21
|
} = options;
|
|
21
22
|
const { AudioEngine } = require2("agent-voice-audio");
|
|
@@ -37,6 +38,9 @@ async function say(message, options = {}) {
|
|
|
37
38
|
let completionTailTimer = null;
|
|
38
39
|
let drainPollTimer = null;
|
|
39
40
|
let drainDeadlineTimer = null;
|
|
41
|
+
let playoutDeadlineTimer = null;
|
|
42
|
+
let firstAudioAtMs = 0;
|
|
43
|
+
let totalReceivedSamples = 0;
|
|
40
44
|
function cleanup() {
|
|
41
45
|
if (cleaned) return;
|
|
42
46
|
cleaned = true;
|
|
@@ -44,6 +48,7 @@ async function say(message, options = {}) {
|
|
|
44
48
|
if (completionTailTimer) clearTimeout(completionTailTimer);
|
|
45
49
|
if (drainPollTimer) clearInterval(drainPollTimer);
|
|
46
50
|
if (drainDeadlineTimer) clearTimeout(drainDeadlineTimer);
|
|
51
|
+
if (playoutDeadlineTimer) clearTimeout(playoutDeadlineTimer);
|
|
47
52
|
try {
|
|
48
53
|
engine.stop();
|
|
49
54
|
engine.close();
|
|
@@ -64,6 +69,28 @@ async function say(message, options = {}) {
|
|
|
64
69
|
cleanup();
|
|
65
70
|
reject(error);
|
|
66
71
|
}
|
|
72
|
+
function waitForWallClockPlayout() {
|
|
73
|
+
if (settled) return;
|
|
74
|
+
if (firstAudioAtMs <= 0 || totalReceivedSamples <= 0) {
|
|
75
|
+
resolveOnce();
|
|
76
|
+
return;
|
|
77
|
+
}
|
|
78
|
+
const expectedPlayoutMs = Math.ceil(
|
|
79
|
+
totalReceivedSamples / SAMPLE_RATE * 1e3
|
|
80
|
+
);
|
|
81
|
+
const playoutTailMs = 140;
|
|
82
|
+
const dueAtMs = firstAudioAtMs + expectedPlayoutMs + playoutTailMs;
|
|
83
|
+
const waitMs = Math.max(0, dueAtMs - Date.now());
|
|
84
|
+
trace("playout:wall_clock_wait", {
|
|
85
|
+
totalReceivedSamples,
|
|
86
|
+
expectedPlayoutMs,
|
|
87
|
+
playoutTailMs,
|
|
88
|
+
waitMs
|
|
89
|
+
});
|
|
90
|
+
playoutDeadlineTimer = setTimeout(() => {
|
|
91
|
+
resolveOnce();
|
|
92
|
+
}, waitMs);
|
|
93
|
+
}
|
|
67
94
|
function waitForPlaybackDrain() {
|
|
68
95
|
if (settled) return;
|
|
69
96
|
if (!engine.getStats) {
|
|
@@ -97,7 +124,7 @@ async function say(message, options = {}) {
|
|
|
97
124
|
if (pending <= 0) {
|
|
98
125
|
zeroStreak += 1;
|
|
99
126
|
if (zeroStreak >= 3) {
|
|
100
|
-
|
|
127
|
+
waitForWallClockPlayout();
|
|
101
128
|
}
|
|
102
129
|
return;
|
|
103
130
|
}
|
|
@@ -106,12 +133,12 @@ async function say(message, options = {}) {
|
|
|
106
133
|
trace("drain:no_progress_timeout", {
|
|
107
134
|
pendingPlaybackSamples: pending
|
|
108
135
|
});
|
|
109
|
-
|
|
136
|
+
waitForWallClockPlayout();
|
|
110
137
|
}
|
|
111
138
|
}, 20);
|
|
112
139
|
drainDeadlineTimer = setTimeout(() => {
|
|
113
140
|
trace("drain:deadline");
|
|
114
|
-
|
|
141
|
+
waitForWallClockPlayout();
|
|
115
142
|
}, absoluteDeadlineMs);
|
|
116
143
|
}
|
|
117
144
|
function scheduleTailResolve(delayMs) {
|
|
@@ -128,7 +155,12 @@ async function say(message, options = {}) {
|
|
|
128
155
|
ack: false,
|
|
129
156
|
auth,
|
|
130
157
|
onAudioDelta(pcm16) {
|
|
158
|
+
if (firstAudioAtMs <= 0) {
|
|
159
|
+
firstAudioAtMs = Date.now();
|
|
160
|
+
}
|
|
161
|
+
totalReceivedSamples += Math.floor(pcm16.length / 2);
|
|
131
162
|
engine.play(pcm16);
|
|
163
|
+
onAssistantAudio?.(pcm16);
|
|
132
164
|
trace("realtime:audio_delta", { bytes: pcm16.length });
|
|
133
165
|
},
|
|
134
166
|
onAudioDone() {
|