agent-voice 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,307 @@
1
+ import {
2
+ createRealtimeSession
3
+ } from "./chunk-3YEHGYHI.js";
4
+ import {
5
+ DEFAULT_VOICE,
6
+ SAMPLE_RATE
7
+ } from "./chunk-YU5FF2L7.js";
8
+
9
+ // src/ask.ts
10
+ import { createRequire } from "module";
11
+ var require2 = createRequire(import.meta.url);
12
/**
 * Compute the root-mean-square amplitude of a buffer of signed 16-bit
 * little-endian PCM samples.
 *
 * @param {Buffer} pcm16 - Raw PCM bytes; a trailing odd byte is ignored.
 * @returns {number} The RMS of the samples, or 0 when the buffer holds
 *   fewer than two bytes (i.e. no complete sample).
 */
function pcm16Rms(pcm16) {
  const sampleCount = Math.floor(pcm16.length / 2);
  if (sampleCount === 0) return 0;

  const byteLimit = sampleCount * 2;
  let energy = 0;
  // Step through the buffer two bytes (one sample) at a time.
  for (let offset = 0; offset < byteLimit; offset += 2) {
    const amplitude = pcm16.readInt16LE(offset);
    energy += amplitude * amplitude;
  }
  return Math.sqrt(energy / sampleCount);
}
22
/**
 * Read an integer setting from the process environment.
 *
 * The value is parsed with base-10 `Number.parseInt`, so leading whitespace
 * and trailing non-digit characters are tolerated ("12abc" yields 12).
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value returned when the variable is unset or
 *   does not start with a parseable integer.
 * @returns {number} The parsed integer, or `fallback`.
 */
function readEnvInt(name, fallback) {
  const text = process.env[name];
  if (text === undefined || text === null) {
    return fallback;
  }
  const value = Number.parseInt(text, 10);
  if (Number.isFinite(value)) {
    return value;
  }
  return fallback;
}
28
/**
 * Speak `message` to the user through a realtime voice session and resolve
 * with the transcript of their spoken reply.
 *
 * The function starts an audio engine, connects a realtime session, sends
 * `message`, plays the assistant audio it receives, and (once assistant audio
 * has been heard) streams processed microphone frames to the session.
 *
 * @param {string} message - Text handed to `session.sendMessage`.
 * @param {object} [options]
 * @param {string} [options.voice=DEFAULT_VOICE] - Voice passed to the session factory.
 * @param {number} [options.timeout=30] - Seconds to wait for speech after the
 *   initial response finishes, and for a transcript after speech starts.
 * @param {boolean} [options.ack=false] - When false the promise resolves as
 *   soon as a transcript is accepted; when true it resolves when the session
 *   reports `onDone`, with the last accepted transcript (possibly "").
 * @param {object} [options.auth] - Forwarded unchanged to the session factory.
 * @param {Function} [options.createSession] - Session factory; defaults to
 *   `createRealtimeSession`.
 * @param {Function} [options.createAudioEngine] - Engine factory; when omitted
 *   the `agent-voice-audio` addon is loaded and its `AudioEngine` is used.
 * @param {Function} [options.onTrace] - Receives `{ atMs, event, detail }` records.
 * @param {Function} [options.onAudioFrameSent] - Called with each frame just
 *   before it is sent to the session.
 * @param {Function} [options.onAssistantAudio] - Called with each assistant
 *   audio chunk before it is played.
 * @param {Function} [options.onMicAudio] - Called with each raw capture frame.
 * @returns {Promise<string>} The accepted transcript.
 * @throws {Error} (as rejection) when reading capture audio fails, when the
 *   session cannot be created/connected or reports an error, when no
 *   assistant audio arrives within 10 s of sending the message, when no
 *   speech is detected within `timeout` seconds, or when no transcript
 *   arrives within `timeout` seconds after speech started.
 */
async function ask(message, options = {}) {
  const {
    voice = DEFAULT_VOICE,
    timeout = 30,
    ack = false,
    auth,
    createSession,
    createAudioEngine,
    onTrace,
    onAudioFrameSent,
    onAssistantAudio,
    onMicAudio
  } = options;
  const streamDelayMs = readEnvInt("AGENT_VOICE_AEC_STREAM_DELAY_MS", 30);
  // Load the native addon lazily: callers that inject their own engine
  // factory must not fail just because the addon is not installed.
  const buildEngine = createAudioEngine ?? ((engineOptions) => {
    const { AudioEngine } = require2("agent-voice-audio");
    return new AudioEngine(engineOptions);
  });
  const engine = buildEngine({
    sampleRate: SAMPLE_RATE,
    channels: 1,
    enableAec: true,
    streamDelayMs
  });
  engine.start();
  const debug = process.env.AGENT_VOICE_DEBUG_ASK_EVENTS === "1";
  const startMs = Date.now();
  // Debug log to stderr, enabled by AGENT_VOICE_DEBUG_ASK_EVENTS=1.
  function logEvent(event, detail) {
    if (!debug) return;
    const elapsed = Date.now() - startMs;
    const suffix = detail ? ` ${detail}` : "";
    process.stderr.write(`[ask ${elapsed}ms] ${event}${suffix}\n`);
  }
  // Structured trace record for the optional onTrace observer.
  function trace(event, detail) {
    onTrace?.({ atMs: Date.now() - startMs, event, detail });
  }
  // Normalise any thrown value into an Error instance.
  function toError(err) {
    return err instanceof Error ? err : new Error(String(err));
  }
  logEvent("start");
  trace("start");
  return new Promise((resolve, reject) => {
    // Assigned further down; kept nullable so cleanup is safe at any point.
    let session = null;
    let transcript = "";
    let timeoutTimer = null;
    let responseStartTimer = null;
    let transcriptTimer = null;
    let capturePollTimer = null;
    let speechDetected = false;
    let speechStartedAtMs = 0;
    let initialResponseDone = false;
    let heardAssistantAudio = false;
    let lastAssistantAudioAt = 0;
    let nearEndEvidenceSeen = false;
    let nearEndEvidenceAtMs = 0;
    let nearEndEvidenceConfirmed = false;
    let cleaned = false;
    let settled = false;
    // Release timers, the audio engine and the session exactly once.
    async function cleanup() {
      if (cleaned) return;
      cleaned = true;
      logEvent("cleanup:start");
      trace("cleanup:start");
      if (timeoutTimer) clearTimeout(timeoutTimer);
      if (responseStartTimer) clearTimeout(responseStartTimer);
      if (transcriptTimer) clearTimeout(transcriptTimer);
      if (capturePollTimer) clearInterval(capturePollTimer);
      try {
        engine.stop();
        engine.close();
      } catch {
        // Best-effort teardown: an engine failure must not block settlement.
      }
      try {
        session?.close();
      } catch {
        // Best-effort teardown: a close failure must not block settlement.
      }
      logEvent("cleanup:done");
      trace("cleanup:done");
    }
    function resolveOnce(value) {
      if (settled) return;
      settled = true;
      // Settle whether or not cleanup itself failed (e.g. a throwing onTrace).
      const settle = () => resolve(value);
      cleanup().then(settle, settle);
    }
    function rejectOnce(error) {
      if (settled) return;
      settled = true;
      const settle = () => reject(error);
      cleanup().then(settle, settle);
    }
    // Every 10 ms: drain captured audio, surface raw frames to onMicAudio and,
    // once assistant audio has been heard, forward processed frames.
    capturePollTimer = setInterval(() => {
      if (settled) return;
      let rawFrames = [];
      let processedFrames = [];
      try {
        rawFrames = engine.readRawCapture(64);
        processedFrames = engine.readProcessedCapture(64);
      } catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        rejectOnce(new Error(`audio engine capture read failed: ${reason}`));
        trace("audio:capture_read_error", { error: reason });
        return;
      }
      for (const frame of rawFrames) onMicAudio?.(frame);
      if (!heardAssistantAudio || processedFrames.length === 0) return;
      // Thresholds are the same for every frame of this tick, so read once.
      const configuredMinSpeechRms = readEnvInt("AGENT_VOICE_MIN_SPEECH_RMS", 220);
      const relaxAfterMs = readEnvInt("AGENT_VOICE_MIN_SPEECH_RMS_RELAX_AFTER_MS", 500);
      const relaxedMinSpeechRms = readEnvInt("AGENT_VOICE_MIN_SPEECH_RMS_RELAXED", 120);
      for (const frame of processedFrames) {
        const rms = pcm16Rms(frame);
        // Use the relaxed threshold once speech has been going for a while.
        const useRelaxed = speechDetected && speechStartedAtMs > 0 && Date.now() - speechStartedAtMs >= relaxAfterMs;
        const minSpeechRms = useRelaxed ? relaxedMinSpeechRms : configuredMinSpeechRms;
        if (rms >= minSpeechRms) {
          nearEndEvidenceSeen = true;
          nearEndEvidenceAtMs = Date.now();
          if (!nearEndEvidenceConfirmed && speechStartedAtMs > 0) {
            const evidencePreRollMs = readEnvInt("AGENT_VOICE_SPEECH_EVIDENCE_PREROLL_MS", 200);
            const evidencePostRollMs = readEnvInt("AGENT_VOICE_SPEECH_EVIDENCE_POSTROLL_MS", 1500);
            // Confirm only when the loud frame falls inside the window
            // around the reported speech start.
            if (nearEndEvidenceAtMs >= speechStartedAtMs - evidencePreRollMs && nearEndEvidenceAtMs <= speechStartedAtMs + evidencePostRollMs) {
              nearEndEvidenceConfirmed = true;
            }
          }
          trace("audio:near_end_evidence", { rms, minSpeechRms });
        }
        onAudioFrameSent?.(frame);
        session.sendAudio(frame);
      }
      trace("audio:sent_capture", { frames: processedFrames.length });
    }, 10);
    const handlers = {
      voice,
      mode: "default",
      ack,
      auth,
      onAudioDelta(pcm16) {
        logEvent("realtime:audio_delta", `bytes=${pcm16.length}`);
        trace("realtime:audio_delta", { bytes: pcm16.length });
        heardAssistantAudio = true;
        lastAssistantAudioAt = Date.now();
        onAssistantAudio?.(pcm16);
        engine.play(pcm16);
      },
      onTranscript(text) {
        // Drop transcripts that arrive too soon after assistant audio.
        const echoGuardMs = readEnvInt("AGENT_VOICE_ECHO_GUARD_MS", 1500);
        const sinceAssistantMs = Date.now() - lastAssistantAudioAt;
        if (heardAssistantAudio && sinceAssistantMs < echoGuardMs) {
          logEvent(
            "realtime:transcript_ignored_echo_guard",
            `since_assistant_ms=${sinceAssistantMs} text="${text}"`
          );
          trace("realtime:transcript_ignored_echo_guard", {
            sinceAssistantMs,
            text
          });
          return;
        }
        logEvent("realtime:transcript", `text="${text}"`);
        trace("realtime:transcript", { text });
        // Speech was reported but never confirmed by a loud-enough local frame.
        if (speechDetected && !nearEndEvidenceConfirmed) {
          trace("realtime:transcript_ignored_no_near_end_evidence", {
            text,
            speechStartedAtMs,
            nearEndEvidenceSeen,
            nearEndEvidenceAtMs
          });
          return;
        }
        if (transcriptTimer) {
          clearTimeout(transcriptTimer);
          transcriptTimer = null;
        }
        transcript = text;
        if (!ack) resolveOnce(transcript);
      },
      onSpeechStarted() {
        logEvent("realtime:speech_started");
        trace("realtime:speech_started");
        speechDetected = true;
        speechStartedAtMs = Date.now();
        if (nearEndEvidenceSeen && !nearEndEvidenceConfirmed) {
          const evidencePreRollMs = readEnvInt("AGENT_VOICE_SPEECH_EVIDENCE_PREROLL_MS", 200);
          // Evidence seen shortly before this event also counts.
          if (nearEndEvidenceAtMs >= speechStartedAtMs - evidencePreRollMs) {
            nearEndEvidenceConfirmed = true;
          }
        }
        if (timeoutTimer) {
          clearTimeout(timeoutTimer);
          timeoutTimer = null;
        }
        if (transcriptTimer) clearTimeout(transcriptTimer);
        // A late event after settlement must not arm a timer nobody clears.
        if (settled) return;
        transcriptTimer = setTimeout(() => {
          logEvent("timeout:no_transcript_after_speech");
          trace("timeout:no_transcript_after_speech", {
            timeoutSeconds: timeout
          });
          rejectOnce(
            new Error(
              `No transcript received within ${timeout}s after speech started`
            )
          );
        }, timeout * 1e3);
        if (!initialResponseDone && heardAssistantAudio) {
          try {
            // NOTE(review): an empty buffer presumably tells the engine to cut
            // off playback when the user talks over the assistant — confirm
            // against the engine's documentation.
            engine.play(Buffer.alloc(0));
          } catch {
            // Best-effort; playback control failures are not fatal.
          }
        }
      },
      onInitialResponseDone() {
        logEvent("realtime:initial_response_done");
        trace("realtime:initial_response_done");
        initialResponseDone = true;
        // A late event after settlement must not arm a timer nobody clears.
        if (settled) return;
        timeoutTimer = setTimeout(() => {
          if (!speechDetected) {
            logEvent("timeout:no_speech");
            trace("timeout:no_speech", { timeoutSeconds: timeout });
            rejectOnce(
              new Error(`No speech detected within ${timeout}s timeout`)
            );
          }
        }, timeout * 1e3);
      },
      onDone() {
        logEvent("realtime:done");
        trace("realtime:done");
        if (ack) resolveOnce(transcript);
      },
      onError(error) {
        logEvent("realtime:error", error);
        trace("realtime:error", { error });
        rejectOnce(new Error(error));
      }
    };
    let connecting;
    try {
      session = (createSession ?? createRealtimeSession)(handlers);
      connecting = session.connect();
    } catch (err) {
      // A synchronous failure here would otherwise leak the running engine
      // and the capture poll interval.
      rejectOnce(toError(err));
      return;
    }
    connecting.then(
      () => {
        logEvent("realtime:connected");
        trace("realtime:connected");
        // Already settled (and cleaned up) while connecting: nothing to send.
        if (settled) return;
        logEvent("realtime:send_message");
        trace("realtime:send_message");
        try {
          session.sendMessage(message);
        } catch (err) {
          rejectOnce(toError(err));
          return;
        }
        if (settled) return;
        responseStartTimer = setTimeout(() => {
          if (!heardAssistantAudio) {
            logEvent("timeout:no_assistant_audio");
            trace("timeout:no_assistant_audio");
            rejectOnce(
              new Error("No assistant audio received after sending message")
            );
          }
        }, 1e4);
      },
      (err) => {
        const reason = err instanceof Error ? err.message : String(err);
        logEvent("realtime:connect_error", reason);
        trace("realtime:connect_error", { error: reason });
        rejectOnce(toError(err));
      }
    );
  });
}
305
+ export {
306
+ ask
307
+ };
@@ -75,6 +75,7 @@ async function ask(message, options = {}) {
75
75
  let lastAssistantAudioAt = 0;
76
76
  let nearEndEvidenceSeen = false;
77
77
  let nearEndEvidenceAtMs = 0;
78
+ let nearEndEvidenceConfirmed = false;
78
79
  let cleaned = false;
79
80
  let settled = false;
80
81
  async function cleanup() {
@@ -143,6 +144,19 @@ async function ask(message, options = {}) {
143
144
  if (rms >= minSpeechRms) {
144
145
  nearEndEvidenceSeen = true;
145
146
  nearEndEvidenceAtMs = Date.now();
147
+ if (!nearEndEvidenceConfirmed && speechStartedAtMs > 0) {
148
+ const evidencePreRollMs = readEnvInt(
149
+ "AGENT_VOICE_SPEECH_EVIDENCE_PREROLL_MS",
150
+ 200
151
+ );
152
+ const evidencePostRollMs = readEnvInt(
153
+ "AGENT_VOICE_SPEECH_EVIDENCE_POSTROLL_MS",
154
+ 1500
155
+ );
156
+ if (nearEndEvidenceAtMs >= speechStartedAtMs - evidencePreRollMs && nearEndEvidenceAtMs <= speechStartedAtMs + evidencePostRollMs) {
157
+ nearEndEvidenceConfirmed = true;
158
+ }
159
+ }
146
160
  trace("audio:near_end_evidence", { rms, minSpeechRms });
147
161
  }
148
162
  onAudioFrameSent?.(frame);
@@ -181,29 +195,14 @@ async function ask(message, options = {}) {
181
195
  }
182
196
  logEvent("realtime:transcript", `text="${text}"`);
183
197
  trace("realtime:transcript", { text });
184
- if (speechDetected) {
185
- const evidencePreRollMs = readEnvInt(
186
- "AGENT_VOICE_SPEECH_EVIDENCE_PREROLL_MS",
187
- 200
188
- );
189
- const evidencePostRollMs = readEnvInt(
190
- "AGENT_VOICE_SPEECH_EVIDENCE_POSTROLL_MS",
191
- 1500
192
- );
193
- const evidenceEarliestMs = speechStartedAtMs - evidencePreRollMs;
194
- const evidenceLatestMs = speechStartedAtMs + evidencePostRollMs;
195
- const hasTimelyNearEndEvidence = nearEndEvidenceSeen && nearEndEvidenceAtMs >= evidenceEarliestMs && nearEndEvidenceAtMs <= evidenceLatestMs;
196
- if (!hasTimelyNearEndEvidence) {
197
- trace("realtime:transcript_ignored_no_near_end_evidence", {
198
- text,
199
- speechStartedAtMs,
200
- nearEndEvidenceSeen,
201
- nearEndEvidenceAtMs,
202
- evidenceEarliestMs,
203
- evidenceLatestMs
204
- });
205
- return;
206
- }
198
+ if (speechDetected && !nearEndEvidenceConfirmed) {
199
+ trace("realtime:transcript_ignored_no_near_end_evidence", {
200
+ text,
201
+ speechStartedAtMs,
202
+ nearEndEvidenceSeen,
203
+ nearEndEvidenceAtMs
204
+ });
205
+ return;
207
206
  }
208
207
  if (transcriptTimer) {
209
208
  clearTimeout(transcriptTimer);
@@ -217,6 +216,15 @@ async function ask(message, options = {}) {
217
216
  trace("realtime:speech_started");
218
217
  speechDetected = true;
219
218
  speechStartedAtMs = Date.now();
219
+ if (nearEndEvidenceSeen && !nearEndEvidenceConfirmed) {
220
+ const evidencePreRollMs = readEnvInt(
221
+ "AGENT_VOICE_SPEECH_EVIDENCE_PREROLL_MS",
222
+ 200
223
+ );
224
+ if (nearEndEvidenceAtMs >= speechStartedAtMs - evidencePreRollMs) {
225
+ nearEndEvidenceConfirmed = true;
226
+ }
227
+ }
220
228
  if (timeoutTimer) {
221
229
  clearTimeout(timeoutTimer);
222
230
  timeoutTimer = null;
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env node
2
2
  import {
3
3
  writeAuthConfig
4
- } from "./chunk-RGYWLATZ.js";
4
+ } from "./chunk-ZNUQXGGO.js";
5
5
  import "./chunk-AHLLYIEW.js";
6
6
 
7
7
  // src/auth.ts
@@ -0,0 +1,115 @@
1
+ // src/realtime.ts
2
+ import { OpenAIRealtimeWS } from "openai/beta/realtime/ws";
3
// Prompt text supplied as the session `instructions` when the realtime
// session is configured. Kept as one entry per line of the prompt.
const SYSTEM_INSTRUCTIONS = [
  "# Role",
  "Voice relay between an AI agent and a human.",
  "",
  "# Instructions",
  "- When given a text message, read it aloud EXACTLY as written. Do not add, remove, or rephrase anything.",
  "- After the human responds, acknowledge briefly \u2014 a few words only. Vary your phrasing.",
  "- NEVER repeat back what the user said verbatim.",
  "- NEVER ask follow-up questions.",
  "- Keep every response under one sentence.",
  "",
  "# Tone",
  "- Calm, neutral, concise."
].join("\n");
17
/**
 * Build a small controller around an OpenAI realtime WebSocket connection.
 *
 * Nothing is opened until `connect()` is called. Once the socket opens the
 * session is configured (instructions, voice, pcm16 audio formats, input
 * transcription and, unless `options.mode === "say"`, semantic VAD turn
 * detection) and server events are forwarded to the callbacks in `options`.
 *
 * @param {object} options
 * @param {string} [options.mode] - `"say"` disables turn detection.
 * @param {boolean} [options.ack] - Forwarded as `create_response` in the
 *   turn-detection settings.
 * @param {string} [options.voice] - Voice for the session.
 * @param {{apiKey: string, baseUrl?: string}} [options.auth] - Explicit
 *   client credentials; `baseUrl` defaults to "https://api.openai.com/v1".
 * @param {Function} options.onAudioDelta - Receives each decoded audio chunk (Buffer).
 * @param {Function} [options.onAudioDone] - Called when response audio ends.
 * @param {Function} options.onTranscript - Receives the completed input transcript.
 * @param {Function} options.onSpeechStarted - Called when input speech is detected.
 * @param {Function} options.onInitialResponseDone - Called on the first `response.done`.
 * @param {Function} options.onDone - Called on the second `response.done`.
 * @param {Function} options.onError - Receives an error message string.
 * @returns {{connect: Function, sendMessage: Function, sendAudio: Function, close: Function}}
 * @throws {Error} from `sendMessage`/`sendAudio` when called before `connect()`.
 */
function createRealtimeSession(options) {
  let rt;
  let responseCount = 0;

  // Return the live client, or fail with a clear message instead of a
  // "cannot read properties of undefined" TypeError.
  function connectedClient(action) {
    if (!rt) {
      throw new Error(
        `Realtime session is not connected; call connect() before ${action}()`
      );
    }
    return rt;
  }

  // Push the session settings to the server.
  function configureSession() {
    const turnDetection = options.mode === "say" ? void 0 : {
      type: "semantic_vad",
      eagerness: "medium",
      create_response: options.ack,
      interrupt_response: true
    };
    rt.send({
      type: "session.update",
      session: {
        instructions: SYSTEM_INSTRUCTIONS,
        voice: options.voice,
        input_audio_format: "pcm16",
        output_audio_format: "pcm16",
        input_audio_transcription: { model: "gpt-4o-transcribe" },
        turn_detection: turnDetection
      }
    });
  }

  // Wire server events to the caller-supplied callbacks.
  function bindEvents() {
    rt.on("response.audio.delta", (event) => {
      const pcm16 = Buffer.from(event.delta, "base64");
      options.onAudioDelta(pcm16);
    });
    rt.on("response.audio.done", () => {
      options.onAudioDone?.();
    });
    rt.on("conversation.item.input_audio_transcription.completed", (event) => {
      options.onTranscript(event.transcript);
    });
    rt.on("input_audio_buffer.speech_started", () => {
      options.onSpeechStarted();
    });
    rt.on("response.done", () => {
      // First completed response is the read-aloud; the second is the
      // follow-up response, after which the exchange is reported as done.
      responseCount++;
      if (responseCount === 1) {
        options.onInitialResponseDone();
      } else if (responseCount === 2) {
        options.onDone();
      }
    });
    rt.on("error", (event) => {
      // Prefer the nested server error message, then the event's own message,
      // so errors without an `error` payload are not reported as "unknown".
      options.onError(
        event.error?.message ?? event.message ?? "Unknown realtime error"
      );
    });
  }

  return {
    /**
     * Open the WebSocket. Resolves once the socket is open and configured;
     * rejects if the socket reports an error first.
     */
    connect() {
      return new Promise((resolve, reject) => {
        const client = options.auth ? {
          apiKey: options.auth.apiKey,
          baseURL: options.auth.baseUrl ?? "https://api.openai.com/v1"
        } : void 0;
        rt = new OpenAIRealtimeWS({ model: "gpt-4o-realtime-preview" }, client);
        rt.socket.on("open", () => {
          configureSession();
          bindEvents();
          resolve();
        });
        rt.socket.on("error", (err) => {
          reject(new Error(`WebSocket connection failed: ${err.message}`));
        });
      });
    },
    /** Queue `text` as a user message and request a response that reads it aloud. */
    sendMessage(text) {
      const client = connectedClient("sendMessage");
      client.send({
        type: "conversation.item.create",
        item: {
          type: "message",
          role: "user",
          content: [
            {
              type: "input_text",
              text: `Read this aloud exactly as written, word for word. Do not add, remove, or change anything:\n\n${text}`
            }
          ]
        }
      });
      client.send({ type: "response.create" });
    },
    /** Append a chunk of pcm16 audio (Buffer) to the input audio buffer. */
    sendAudio(pcm16) {
      connectedClient("sendAudio").send({
        type: "input_audio_buffer.append",
        audio: pcm16.toString("base64")
      });
    },
    /** Close the connection; a no-op when `connect()` was never called. */
    close() {
      rt?.close();
    }
  };
}
112
+
113
+ export {
114
+ createRealtimeSession
115
+ };