agent-voice 0.2.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,307 @@
1
+ import {
2
+ createRealtimeSession
3
+ } from "./chunk-3YEHGYHI.js";
4
+ import {
5
+ DEFAULT_VOICE,
6
+ SAMPLE_RATE
7
+ } from "./chunk-YU5FF2L7.js";
8
+
9
+ // src/ask.ts
10
+ import { createRequire } from "module";
11
+ var require2 = createRequire(import.meta.url);
12
/**
 * Computes the root-mean-square amplitude of signed 16-bit little-endian PCM.
 *
 * @param {Buffer} pcm16 - Raw PCM bytes; a trailing odd byte is ignored.
 * @returns {number} RMS over all complete samples, or 0 when there are none.
 */
function pcm16Rms(pcm16) {
  const sampleCount = Math.floor(pcm16.length / 2);
  if (sampleCount === 0) {
    return 0;
  }
  // Walk the buffer by byte offset; each sample occupies two bytes.
  const byteLimit = sampleCount * 2;
  let energy = 0;
  for (let offset = 0; offset < byteLimit; offset += 2) {
    const sample = pcm16.readInt16LE(offset);
    energy += sample * sample;
  }
  return Math.sqrt(energy / sampleCount);
}
22
/**
 * Reads an integer from an environment variable.
 *
 * @param {string} name - Environment variable to look up in `process.env`.
 * @param {number} fallback - Value returned when the variable is unset or
 *   does not parse to a finite base-10 integer.
 * @returns {number} The parsed integer, or `fallback`.
 */
function readEnvInt(name, fallback) {
  const text = process.env[name];
  if (text === undefined || text === null) {
    return fallback;
  }
  const value = Number.parseInt(text, 10);
  if (Number.isFinite(value)) {
    return value;
  }
  return fallback;
}
28
/**
 * Sends `message` to a realtime voice session, plays the assistant audio
 * through an audio engine, streams captured microphone audio back, and
 * resolves with the transcript of the human's spoken reply.
 *
 * @param {string} message - Text handed to `session.sendMessage`.
 * @param {object} [options]
 * @param {string} [options.voice=DEFAULT_VOICE] - Voice passed to the session.
 * @param {number} [options.timeout=30] - Seconds to wait for speech after the
 *   initial response finishes, and for a transcript after speech starts.
 * @param {boolean} [options.ack=false] - When false the promise resolves as
 *   soon as a transcript is accepted; when true it resolves from `onDone`.
 * @param {object} [options.auth] - Passed through to the session factory.
 * @param {Function} [options.createSession] - Session factory; defaults to
 *   `createRealtimeSession`.
 * @param {Function} [options.createAudioEngine] - Audio engine factory; when
 *   omitted, `AudioEngine` from "agent-voice-audio" is loaded on demand.
 * @param {Function} [options.onTrace] - Receives `{ atMs, event, detail }`.
 * @param {Function} [options.onAudioFrameSent] - Called with each processed
 *   capture frame just before it is sent to the session.
 * @param {Function} [options.onAssistantAudio] - Called with each assistant
 *   audio chunk before it is played.
 * @param {Function} [options.onMicAudio] - Called with each raw capture frame.
 * @returns {Promise<string>} The accepted transcript.
 * @throws {Error} Rejects on connection failure, realtime errors, capture read
 *   failures, or when one of the timeouts elapses.
 */
async function ask(message, options = {}) {
  const {
    voice = DEFAULT_VOICE,
    timeout = 30,
    ack = false,
    auth,
    createSession,
    createAudioEngine,
    onTrace,
    onAudioFrameSent,
    onAssistantAudio,
    onMicAudio
  } = options;

  // Tunables are read once per call instead of once per captured frame.
  const streamDelayMs = readEnvInt("AGENT_VOICE_AEC_STREAM_DELAY_MS", 30);
  const configuredMinSpeechRms = readEnvInt("AGENT_VOICE_MIN_SPEECH_RMS", 220);
  const relaxAfterMs = readEnvInt(
    "AGENT_VOICE_MIN_SPEECH_RMS_RELAX_AFTER_MS",
    500
  );
  const relaxedMinSpeechRms = readEnvInt(
    "AGENT_VOICE_MIN_SPEECH_RMS_RELAXED",
    120
  );
  const evidencePreRollMs = readEnvInt(
    "AGENT_VOICE_SPEECH_EVIDENCE_PREROLL_MS",
    200
  );
  const evidencePostRollMs = readEnvInt(
    "AGENT_VOICE_SPEECH_EVIDENCE_POSTROLL_MS",
    1500
  );
  const echoGuardMs = readEnvInt("AGENT_VOICE_ECHO_GUARD_MS", 1500);

  // Only load the audio package when the caller did not inject an engine.
  const engineFactory = createAudioEngine ?? ((engineOptions) => {
    const { AudioEngine } = require2("agent-voice-audio");
    return new AudioEngine(engineOptions);
  });
  const engine = engineFactory({
    sampleRate: SAMPLE_RATE,
    channels: 1,
    enableAec: true,
    streamDelayMs
  });
  engine.start();

  const debug = process.env.AGENT_VOICE_DEBUG_ASK_EVENTS === "1";
  const startMs = Date.now();

  // Writes a timestamped line to stderr when debug logging is enabled.
  function logEvent(event, detail) {
    if (!debug) return;
    const elapsed = Date.now() - startMs;
    const suffix = detail ? ` ${detail}` : "";
    process.stderr.write(`[ask ${elapsed}ms] ${event}${suffix}\n`);
  }

  // Forwards a structured event to the optional `onTrace` callback.
  function trace(event, detail) {
    onTrace?.({ atMs: Date.now() - startMs, event, detail });
  }

  function describeError(err) {
    return err instanceof Error ? err.message : String(err);
  }

  logEvent("start");
  trace("start");

  return new Promise((resolve, reject) => {
    let transcript = "";
    let timeoutTimer = null;
    let responseStartTimer = null;
    let transcriptTimer = null;
    let capturePollTimer = null;
    let speechDetected = false;
    let speechStartedAtMs = 0;
    let initialResponseDone = false;
    let heardAssistantAudio = false;
    let lastAssistantAudioAt = 0;
    let nearEndEvidenceSeen = false;
    let nearEndEvidenceAtMs = 0;
    let nearEndEvidenceConfirmed = false;
    let cleaned = false;
    let settled = false;
    // Declared up front so cleanup() is safe even if the factory throws.
    let session = null;

    // Runs one teardown step; a failure must not prevent the remaining steps.
    function releaseQuietly(label, release) {
      try {
        release();
      } catch (err) {
        logEvent("cleanup:error", `${label}: ${describeError(err)}`);
      }
    }

    // Stops timers, the audio engine and the session exactly once.
    function cleanup() {
      if (cleaned) return;
      cleaned = true;
      logEvent("cleanup:start");
      trace("cleanup:start");
      if (timeoutTimer) clearTimeout(timeoutTimer);
      if (responseStartTimer) clearTimeout(responseStartTimer);
      if (transcriptTimer) clearTimeout(transcriptTimer);
      if (capturePollTimer) clearInterval(capturePollTimer);
      releaseQuietly("engine.stop", () => engine.stop());
      releaseQuietly("engine.close", () => engine.close());
      releaseQuietly("session.close", () => session?.close());
      logEvent("cleanup:done");
      trace("cleanup:done");
    }

    // The promise is settled in `finally` so a failing cleanup cannot hang it.
    function resolveOnce(value) {
      if (settled) return;
      settled = true;
      try {
        cleanup();
      } finally {
        resolve(value);
      }
    }

    function rejectOnce(error) {
      if (settled) return;
      settled = true;
      try {
        cleanup();
      } finally {
        reject(error);
      }
    }

    // Poll the engine every 10 ms for captured audio and forward it.
    capturePollTimer = setInterval(() => {
      if (settled) return;
      let rawFrames = [];
      let processedFrames = [];
      try {
        rawFrames = engine.readRawCapture(64);
        processedFrames = engine.readProcessedCapture(64);
      } catch (err) {
        const detail = describeError(err);
        rejectOnce(new Error(`audio engine capture read failed: ${detail}`));
        trace("audio:capture_read_error", { error: detail });
        return;
      }
      for (const frame of rawFrames) onMicAudio?.(frame);
      // Nothing is sent upstream until the assistant has produced audio.
      if (!heardAssistantAudio) return;
      for (const frame of processedFrames) {
        const rms = pcm16Rms(frame);
        // Once speech has been going for `relaxAfterMs`, use the relaxed threshold.
        const useRelaxed = speechDetected && speechStartedAtMs > 0 && Date.now() - speechStartedAtMs >= relaxAfterMs;
        const minSpeechRms = useRelaxed ? relaxedMinSpeechRms : configuredMinSpeechRms;
        if (rms >= minSpeechRms) {
          nearEndEvidenceSeen = true;
          nearEndEvidenceAtMs = Date.now();
          if (!nearEndEvidenceConfirmed && speechStartedAtMs > 0) {
            const windowStart = speechStartedAtMs - evidencePreRollMs;
            const windowEnd = speechStartedAtMs + evidencePostRollMs;
            if (nearEndEvidenceAtMs >= windowStart && nearEndEvidenceAtMs <= windowEnd) {
              nearEndEvidenceConfirmed = true;
            }
          }
          trace("audio:near_end_evidence", { rms, minSpeechRms });
        }
        onAudioFrameSent?.(frame);
        session.sendAudio(frame);
      }
      if (processedFrames.length > 0) {
        trace("audio:sent_capture", { frames: processedFrames.length });
      }
    }, 10);

    try {
      session = (createSession ?? createRealtimeSession)({
        voice,
        mode: "default",
        ack,
        auth,
        onAudioDelta(pcm16) {
          logEvent("realtime:audio_delta", `bytes=${pcm16.length}`);
          trace("realtime:audio_delta", { bytes: pcm16.length });
          heardAssistantAudio = true;
          lastAssistantAudioAt = Date.now();
          onAssistantAudio?.(pcm16);
          engine.play(pcm16);
        },
        onTranscript(text) {
          // Transcripts arriving right after assistant audio are dropped as echo.
          const sinceAssistantMs = Date.now() - lastAssistantAudioAt;
          if (heardAssistantAudio && sinceAssistantMs < echoGuardMs) {
            logEvent(
              "realtime:transcript_ignored_echo_guard",
              `since_assistant_ms=${sinceAssistantMs} text="${text}"`
            );
            trace("realtime:transcript_ignored_echo_guard", {
              sinceAssistantMs,
              text
            });
            return;
          }
          logEvent("realtime:transcript", `text="${text}"`);
          trace("realtime:transcript", { text });
          // Require locally captured audio near the reported speech start.
          if (speechDetected && !nearEndEvidenceConfirmed) {
            trace("realtime:transcript_ignored_no_near_end_evidence", {
              text,
              speechStartedAtMs,
              nearEndEvidenceSeen,
              nearEndEvidenceAtMs
            });
            return;
          }
          if (transcriptTimer) {
            clearTimeout(transcriptTimer);
            transcriptTimer = null;
          }
          transcript = text;
          // NOTE(review): in ack mode resolution waits for onDone and no timer
          // covers that gap — confirm this is intended.
          if (!ack) resolveOnce(transcript);
        },
        onSpeechStarted() {
          logEvent("realtime:speech_started");
          trace("realtime:speech_started");
          speechDetected = true;
          speechStartedAtMs = Date.now();
          // Evidence seen shortly before the speech event also counts.
          if (nearEndEvidenceSeen && !nearEndEvidenceConfirmed && nearEndEvidenceAtMs >= speechStartedAtMs - evidencePreRollMs) {
            nearEndEvidenceConfirmed = true;
          }
          if (timeoutTimer) {
            clearTimeout(timeoutTimer);
            timeoutTimer = null;
          }
          if (transcriptTimer) clearTimeout(transcriptTimer);
          // Never arm a timer after settlement; it would only keep the process alive.
          transcriptTimer = settled ? null : setTimeout(() => {
            logEvent("timeout:no_transcript_after_speech");
            trace("timeout:no_transcript_after_speech", {
              timeoutSeconds: timeout
            });
            rejectOnce(
              new Error(
                `No transcript received within ${timeout}s after speech started`
              )
            );
          }, timeout * 1e3);
          if (!initialResponseDone && heardAssistantAudio) {
            // Presumably an empty buffer interrupts playback — confirm against
            // the audio engine's API.
            try {
              engine.play(Buffer.alloc(0));
            } catch {
              // Best effort, as in the original.
            }
          }
        },
        onInitialResponseDone() {
          logEvent("realtime:initial_response_done");
          trace("realtime:initial_response_done");
          initialResponseDone = true;
          if (settled) return;
          timeoutTimer = setTimeout(() => {
            if (!speechDetected) {
              logEvent("timeout:no_speech");
              trace("timeout:no_speech", { timeoutSeconds: timeout });
              rejectOnce(
                new Error(`No speech detected within ${timeout}s timeout`)
              );
            }
          }, timeout * 1e3);
        },
        onDone() {
          logEvent("realtime:done");
          trace("realtime:done");
          if (ack) resolveOnce(transcript);
        },
        onError(error) {
          logEvent("realtime:error", error);
          trace("realtime:error", { error });
          rejectOnce(new Error(error));
        }
      });
    } catch (err) {
      // A throwing factory must still release the engine and the poll timer.
      rejectOnce(err);
      return;
    }

    session.connect().then(
      () => {
        logEvent("realtime:connected");
        trace("realtime:connected");
        // The call may already have failed while the connection was opening.
        if (settled) return;
        logEvent("realtime:send_message");
        trace("realtime:send_message");
        try {
          session.sendMessage(message);
        } catch (err) {
          rejectOnce(err instanceof Error ? err : new Error(String(err)));
          return;
        }
        if (settled) return;
        responseStartTimer = setTimeout(() => {
          if (!heardAssistantAudio) {
            logEvent("timeout:no_assistant_audio");
            trace("timeout:no_assistant_audio");
            rejectOnce(
              new Error("No assistant audio received after sending message")
            );
          }
        }, 1e4);
      },
      (err) => {
        logEvent("realtime:connect_error", describeError(err));
        trace("realtime:connect_error", { error: describeError(err) });
        rejectOnce(err instanceof Error ? err : new Error(String(err)));
      }
    );
  });
}
305
+ export {
306
+ ask
307
+ };
@@ -75,6 +75,7 @@ async function ask(message, options = {}) {
75
75
  let lastAssistantAudioAt = 0;
76
76
  let nearEndEvidenceSeen = false;
77
77
  let nearEndEvidenceAtMs = 0;
78
+ let nearEndEvidenceConfirmed = false;
78
79
  let cleaned = false;
79
80
  let settled = false;
80
81
  async function cleanup() {
@@ -127,10 +128,35 @@ async function ask(message, options = {}) {
127
128
  if (!heardAssistantAudio) return;
128
129
  for (const frame of processedFrames) {
129
130
  const rms = pcm16Rms(frame);
130
- const minSpeechRms = readEnvInt("AGENT_VOICE_MIN_SPEECH_RMS", 550);
131
+ const configuredMinSpeechRms = readEnvInt(
132
+ "AGENT_VOICE_MIN_SPEECH_RMS",
133
+ 220
134
+ );
135
+ const relaxAfterMs = readEnvInt(
136
+ "AGENT_VOICE_MIN_SPEECH_RMS_RELAX_AFTER_MS",
137
+ 500
138
+ );
139
+ const relaxedMinSpeechRms = readEnvInt(
140
+ "AGENT_VOICE_MIN_SPEECH_RMS_RELAXED",
141
+ 120
142
+ );
143
+ const minSpeechRms = speechDetected && speechStartedAtMs > 0 && Date.now() - speechStartedAtMs >= relaxAfterMs ? relaxedMinSpeechRms : configuredMinSpeechRms;
131
144
  if (rms >= minSpeechRms) {
132
145
  nearEndEvidenceSeen = true;
133
146
  nearEndEvidenceAtMs = Date.now();
147
+ if (!nearEndEvidenceConfirmed && speechStartedAtMs > 0) {
148
+ const evidencePreRollMs = readEnvInt(
149
+ "AGENT_VOICE_SPEECH_EVIDENCE_PREROLL_MS",
150
+ 200
151
+ );
152
+ const evidencePostRollMs = readEnvInt(
153
+ "AGENT_VOICE_SPEECH_EVIDENCE_POSTROLL_MS",
154
+ 1500
155
+ );
156
+ if (nearEndEvidenceAtMs >= speechStartedAtMs - evidencePreRollMs && nearEndEvidenceAtMs <= speechStartedAtMs + evidencePostRollMs) {
157
+ nearEndEvidenceConfirmed = true;
158
+ }
159
+ }
134
160
  trace("audio:near_end_evidence", { rms, minSpeechRms });
135
161
  }
136
162
  onAudioFrameSent?.(frame);
@@ -169,23 +195,14 @@ async function ask(message, options = {}) {
169
195
  }
170
196
  logEvent("realtime:transcript", `text="${text}"`);
171
197
  trace("realtime:transcript", { text });
172
- if (speechDetected) {
173
- const evidenceWindowMs = readEnvInt(
174
- "AGENT_VOICE_SPEECH_EVIDENCE_WINDOW_MS",
175
- 1200
176
- );
177
- const evidenceAgeMs = nearEndEvidenceSeen ? Math.abs(nearEndEvidenceAtMs - speechStartedAtMs) : Number.POSITIVE_INFINITY;
178
- if (!nearEndEvidenceSeen || evidenceAgeMs > evidenceWindowMs) {
179
- trace("realtime:transcript_ignored_no_near_end_evidence", {
180
- text,
181
- speechStartedAtMs,
182
- nearEndEvidenceSeen,
183
- nearEndEvidenceAtMs,
184
- evidenceAgeMs,
185
- evidenceWindowMs
186
- });
187
- return;
188
- }
198
+ if (speechDetected && !nearEndEvidenceConfirmed) {
199
+ trace("realtime:transcript_ignored_no_near_end_evidence", {
200
+ text,
201
+ speechStartedAtMs,
202
+ nearEndEvidenceSeen,
203
+ nearEndEvidenceAtMs
204
+ });
205
+ return;
189
206
  }
190
207
  if (transcriptTimer) {
191
208
  clearTimeout(transcriptTimer);
@@ -199,6 +216,15 @@ async function ask(message, options = {}) {
199
216
  trace("realtime:speech_started");
200
217
  speechDetected = true;
201
218
  speechStartedAtMs = Date.now();
219
+ if (nearEndEvidenceSeen && !nearEndEvidenceConfirmed) {
220
+ const evidencePreRollMs = readEnvInt(
221
+ "AGENT_VOICE_SPEECH_EVIDENCE_PREROLL_MS",
222
+ 200
223
+ );
224
+ if (nearEndEvidenceAtMs >= speechStartedAtMs - evidencePreRollMs) {
225
+ nearEndEvidenceConfirmed = true;
226
+ }
227
+ }
202
228
  if (timeoutTimer) {
203
229
  clearTimeout(timeoutTimer);
204
230
  timeoutTimer = null;
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env node
2
2
  import {
3
3
  writeAuthConfig
4
- } from "./chunk-RGYWLATZ.js";
4
+ } from "./chunk-ZNUQXGGO.js";
5
5
  import "./chunk-AHLLYIEW.js";
6
6
 
7
7
  // src/auth.ts
@@ -0,0 +1,115 @@
1
+ // src/realtime.ts
2
+ import { OpenAIRealtimeWS } from "openai/beta/realtime/ws";
3
// System prompt for the realtime voice session; it is sent as the
// `instructions` field of the `session.update` event. Leading and trailing
// whitespace of the template literal is removed with `.trim()`.
var SYSTEM_INSTRUCTIONS = `
# Role
Voice relay between an AI agent and a human.

# Instructions
- When given a text message, read it aloud EXACTLY as written. Do not add, remove, or rephrase anything.
- After the human responds, acknowledge briefly \u2014 a few words only. Vary your phrasing.
- NEVER repeat back what the user said verbatim.
- NEVER ask follow-up questions.
- Keep every response under one sentence.

# Tone
- Calm, neutral, concise.
`.trim();
17
/**
 * Creates a small wrapper around one `OpenAIRealtimeWS` connection.
 *
 * @param {object} options
 * @param {string} options.voice - Voice sent in the session configuration.
 * @param {string} options.mode - "say" disables turn detection; any other
 *   value enables semantic VAD.
 * @param {boolean} options.ack - Sent as `create_response` in turn detection.
 * @param {{apiKey: string, baseUrl?: string}} [options.auth] - Optional
 *   client settings; `baseUrl` defaults to "https://api.openai.com/v1".
 * @param {Function} options.onAudioDelta - Receives each decoded audio chunk
 *   as a Buffer.
 * @param {Function} [options.onAudioDone] - Called on `response.audio.done`.
 * @param {Function} options.onTranscript - Receives the completed input
 *   transcription text.
 * @param {Function} options.onSpeechStarted - Called when input speech starts.
 * @param {Function} options.onInitialResponseDone - Called on the first
 *   `response.done`.
 * @param {Function} options.onDone - Called on the second `response.done`.
 * @param {Function} options.onError - Receives an error message string.
 * @returns {{connect: Function, sendMessage: Function, sendAudio: Function,
 *   close: Function}} Session handle; `connect()` must resolve before
 *   `sendMessage`/`sendAudio` are used.
 */
function createRealtimeSession(options) {
  let rt;
  let responseCount = 0;

  // Sends the session configuration (prompt, voice, formats, turn detection).
  function configureSession() {
    const turnDetection = options.mode === "say" ? void 0 : {
      type: "semantic_vad",
      eagerness: "medium",
      create_response: options.ack,
      interrupt_response: true
    };
    rt.send({
      type: "session.update",
      session: {
        instructions: SYSTEM_INSTRUCTIONS,
        voice: options.voice,
        input_audio_format: "pcm16",
        output_audio_format: "pcm16",
        input_audio_transcription: { model: "gpt-4o-transcribe" },
        turn_detection: turnDetection
      }
    });
  }

  // Maps realtime server events onto the caller's callbacks.
  function bindEvents() {
    rt.on("response.audio.delta", (event) => {
      const pcm16 = Buffer.from(event.delta, "base64");
      options.onAudioDelta(pcm16);
    });
    rt.on("response.audio.done", () => {
      options.onAudioDone?.();
    });
    rt.on("conversation.item.input_audio_transcription.completed", (event) => {
      options.onTranscript(event.transcript);
    });
    rt.on("input_audio_buffer.speech_started", () => {
      options.onSpeechStarted();
    });
    rt.on("response.done", () => {
      // The first completed response is the read-aloud; the second ends the exchange.
      responseCount++;
      if (responseCount === 1) {
        options.onInitialResponseDone();
      } else if (responseCount === 2) {
        options.onDone();
      }
    });
    rt.on("error", (event) => {
      // Prefer the nested API error, then the event's own message, so errors
      // without an `error` payload are not reported as unknown.
      options.onError(
        event?.error?.message ?? event?.message ?? "Unknown realtime error"
      );
    });
  }

  return {
    // Opens the socket; resolves once it is open and the session is configured.
    connect() {
      return new Promise((resolve, reject) => {
        const client = options.auth ? {
          apiKey: options.auth.apiKey,
          baseURL: options.auth.baseUrl ?? "https://api.openai.com/v1"
        } : void 0;
        rt = new OpenAIRealtimeWS({ model: "gpt-4o-realtime-preview" }, client);
        rt.socket.on("open", () => {
          configureSession();
          bindEvents();
          resolve();
        });
        rt.socket.on("error", (err) => {
          reject(new Error(`WebSocket connection failed: ${err.message}`));
        });
      });
    },
    // Adds the text as a user message and requests a response.
    sendMessage(text) {
      rt.send({
        type: "conversation.item.create",
        item: {
          type: "message",
          role: "user",
          content: [
            {
              type: "input_text",
              text: `Read this aloud exactly as written, word for word. Do not add, remove, or change anything:\n\n${text}`
            }
          ]
        }
      });
      rt.send({ type: "response.create" });
    },
    // Appends one PCM16 chunk (base64-encoded) to the input audio buffer.
    sendAudio(pcm16) {
      rt.send({
        type: "input_audio_buffer.append",
        audio: pcm16.toString("base64")
      });
    },
    // Closes the connection; a no-op when connect() was never called.
    close() {
      rt?.close();
    }
  };
}
112
+
113
+ export {
114
+ createRealtimeSession
115
+ };