@dtelecom/agents-js 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs ADDED
@@ -0,0 +1,1201 @@
+ import {
+   BaseSTTStream
+ } from "./chunk-6OWWB2X7.mjs";
+ import {
+   Embedder,
+   MemoryStore,
+   RoomMemory
+ } from "./chunk-RQKGHAFV.mjs";
+ import {
+   createLogger,
+   getLogLevel,
+   setLogLevel
+ } from "./chunk-BN7PIFNJ.mjs";
+
+ // src/core/voice-agent.ts
+ import { EventEmitter as EventEmitter2 } from "events";
+
+ // src/room/room-connection.ts
+ import { Room, LocalAudioTrack, AudioSource, TrackSource } from "@dtelecom/server-sdk-node";
+ var log = createLogger("RoomConnection");
+ var RoomConnection = class {
+   room;
+   audioSource = null;
+   localTrack = null;
+   _connected = false;
+   constructor() {
+     this.room = new Room();
+   }
+   get connected() {
+     return this._connected;
+   }
+   /**
+    * Connect to a dTelecom room.
+    *
+    * 1. Create an Ed25519 JWT via AccessToken
+    * 2. Discover nearest SFU via getWsUrl()
+    * 3. Connect Room via WebRTC
+    * 4. Publish an audio track for the agent to speak through
+    */
+   async connect(options) {
+     const { room: roomName, apiKey, apiSecret, identity = "agent", name } = options;
+     log.info(`Connecting to room "${roomName}" as "${identity}"...`);
+     const { AccessToken } = await import("@dtelecom/server-sdk-js");
+     const token = new AccessToken(apiKey, apiSecret, {
+       identity,
+       name: name ?? identity
+     });
+     token.addGrant({
+       roomJoin: true,
+       room: roomName,
+       canPublish: true,
+       canSubscribe: true,
+       canPublishData: true
+     });
+     const wsUrl = await token.getWsUrl();
+     const jwt = token.toJwt();
+     log.info(`SFU URL: ${wsUrl}`);
+     await this.room.connect(wsUrl, jwt, { autoSubscribe: true });
+     this._connected = true;
+     log.info("Connected successfully");
+   }
+   /**
+    * Publish an audio track so the agent can speak.
+    * Returns the AudioSource to feed PCM16 audio into.
+    */
+   async publishAudioTrack() {
+     if (this.audioSource) return this.audioSource;
+     this.audioSource = new AudioSource(48e3, 1);
+     this.localTrack = LocalAudioTrack.createAudioTrack("agent-voice", this.audioSource);
+     await this.room.localParticipant.publishTrack(this.localTrack, {
+       name: "agent-voice",
+       source: TrackSource.MICROPHONE
+     });
+     log.info("Audio track published");
+     return this.audioSource;
+   }
+   /** Disconnect from the room and clean up resources. */
+   async disconnect() {
+     if (!this._connected) return;
+     if (this.localTrack) {
+       await this.room.localParticipant.unpublishTrack(this.localTrack);
+       this.localTrack = null;
+     }
+     if (this.audioSource) {
+       this.audioSource.destroy();
+       this.audioSource = null;
+     }
+     await this.room.disconnect();
+     this._connected = false;
+     log.info("Disconnected from room");
+   }
+ };
+
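For orientation, here is a minimal sketch of driving RoomConnection directly (VoiceAgent, defined later in this bundle, normally does this internally). The room name and environment variable names are placeholders, not part of the package:

```js
// Hypothetical standalone use of RoomConnection.
import { RoomConnection } from "@dtelecom/agents-js";

const conn = new RoomConnection();
await conn.connect({
  room: "demo-room",                          // placeholder room name
  apiKey: process.env.DTELECOM_API_KEY,       // placeholder env vars
  apiSecret: process.env.DTELECOM_API_SECRET,
  identity: "agent",
});
const source = await conn.publishAudioTrack(); // 48kHz mono AudioSource
// ...feed PCM16 audio frames into `source` here...
await conn.disconnect();
```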
+ // src/room/audio-input.ts
+ var log2 = createLogger("AudioInput");
+ var AudioInput = class {
+   participantIdentity;
+   stream;
+   _closed = false;
+   frameCount = 0;
+   constructor(track, participantIdentity) {
+     this.participantIdentity = participantIdentity;
+     this.stream = track.createStream(16e3, 1);
+     log2.info(`AudioInput created for "${participantIdentity}" (trackSid=${track.sid})`);
+   }
+   get closed() {
+     return this._closed;
+   }
+   /**
+    * Async iterate over PCM16 buffers from this participant.
+    * Each yielded Buffer is 16kHz mono PCM16 LE.
+    */
+   async *frames() {
+     for await (const frame of this.stream) {
+       if (this._closed) break;
+       this.frameCount++;
+       if (this.frameCount === 1 || this.frameCount % 500 === 0) {
+         log2.info(`[${this.participantIdentity}] frame #${this.frameCount}`);
+       }
+       yield frame.toBuffer();
+     }
+     log2.info(`[${this.participantIdentity}] frame iterator ended (total: ${this.frameCount})`);
+   }
+   /** Async iterate over AudioFrame objects. */
+   async *audioFrames() {
+     for await (const frame of this.stream) {
+       if (this._closed) break;
+       yield frame;
+     }
+   }
+   close() {
+     if (this._closed) return;
+     this._closed = true;
+     this.stream.close();
+     log2.debug(`AudioInput closed for participant "${this.participantIdentity}"`);
+   }
+ };
+
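A sketch of how an AudioInput is typically consumed, mirroring what pipeAudioToSTT does near the end of this file. Here `track`, `participant`, and `sttStream` are assumed to come from the surrounding room code:

```js
// `track` is a subscribed remote audio track; `sttStream` is any object
// exposing sendAudio(Buffer), as the pipeline's STT streams do.
const input = new AudioInput(track, participant.identity);
for await (const pcm16 of input.frames()) {
  sttStream.sendAudio(pcm16); // each Buffer is 16kHz mono PCM16 LE
}
input.close();
```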
+ // src/room/audio-output.ts
+ import { AudioFrame } from "@dtelecom/server-sdk-node";
+ import { writeFileSync, appendFileSync, existsSync, mkdirSync } from "fs";
+ import { join } from "path";
+ var log3 = createLogger("AudioOutput");
+ var SAMPLE_RATE = 16e3;
+ var CHANNELS = 1;
+ var FRAME_DURATION_MS = 20;
+ var SAMPLES_PER_FRAME = SAMPLE_RATE * FRAME_DURATION_MS / 1e3;
+ var SILENCE = new Int16Array(SAMPLES_PER_FRAME);
+ var AudioOutput = class {
+   source;
+   _playing = false;
+   _responding = false;
+   _stopped = false;
+   silenceInterval = null;
+   /** When set, raw PCM from TTS is saved to this directory as WAV files for debugging. */
+   dumpDir = null;
+   dumpCounter = 0;
+   constructor(source) {
+     this.source = source;
+   }
+   get playing() {
+     return this._playing;
+   }
+   /**
+    * Mark the start of a multi-sentence response.
+    * Suppresses silence injection between sentences so partial frames
+    * in AudioSource's buffer don't get corrupted by interleaved silence.
+    */
+   beginResponse() {
+     this._responding = true;
+   }
+   /** Mark the end of a response — re-enable silence keepalive. */
+   endResponse() {
+     this._responding = false;
+   }
+   /**
+    * Start sparse silence keepalive to prevent the SFU from dropping the track.
+    * With Opus DTX enabled, the encoder handles silence natively — we only need
+    * an occasional packet to keep the SSRC alive.
+    */
+   startSilence() {
+     if (this.silenceInterval) return;
+     log3.debug("Starting silence keepalive (sparse, 3s interval)");
+     this.silenceInterval = setInterval(() => {
+       if (!this._playing && !this._responding && !this._stopped) {
+         const f = new AudioFrame(SILENCE, SAMPLE_RATE, CHANNELS, SAMPLES_PER_FRAME);
+         this.source.captureFrame(f).catch(() => {
+         });
+       }
+     }, 3e3);
+   }
+   /**
+    * Write a PCM16 buffer to the audio output.
+    * The buffer is split into 20ms frames and fed to AudioSource.
+    */
+   async writeBuffer(pcm16) {
+     this._playing = true;
+     try {
+       await this.writeFrames(pcm16);
+     } finally {
+       this._playing = false;
+     }
+   }
+   /**
+    * Write a stream of PCM16 buffers (from TTS) to the audio output.
+    * Supports cancellation via AbortSignal.
+    */
+   async writeStream(stream, signal) {
+     this._playing = true;
+     const streamStart = performance.now();
+     let chunkCount = 0;
+     let totalBytes = 0;
+     log3.debug("writeStream: started");
+     const rawChunks = this.dumpDir ? [] : null;
+     try {
+       for await (const chunk of stream) {
+         if (signal?.aborted) {
+           log3.debug(`writeStream: cancelled after ${chunkCount} chunks, ${(performance.now() - streamStart).toFixed(0)}ms`);
+           break;
+         }
+         chunkCount++;
+         totalBytes += chunk.byteLength;
+         rawChunks?.push(Buffer.from(chunk));
+         await this.writeFrames(chunk);
+       }
+     } finally {
+       this._playing = false;
+       const elapsed = performance.now() - streamStart;
+       const audioDurationMs = totalBytes / 2 / SAMPLE_RATE * 1e3;
+       log3.info(
+         `writeStream: done \u2014 ${chunkCount} chunks, ${totalBytes} bytes, audio=${audioDurationMs.toFixed(0)}ms, wall=${elapsed.toFixed(0)}ms`
+       );
+       if (rawChunks && rawChunks.length > 0 && this.dumpDir) {
+         try {
+           if (!existsSync(this.dumpDir)) mkdirSync(this.dumpDir, { recursive: true });
+           const filePath = join(this.dumpDir, `tts-raw-${++this.dumpCounter}.wav`);
+           writeWav(filePath, rawChunks, SAMPLE_RATE);
+           log3.info(`writeStream: saved raw TTS to ${filePath}`);
+         } catch (err) {
+           log3.warn("writeStream: failed to save WAV dump:", err);
+         }
+       }
+     }
+   }
+   /**
+    * Split a PCM16 buffer into 20ms frames and write them at real-time pace.
+    * Partial frames at the end are sent directly — AudioSource handles
+    * accumulation in its internal buffer.
+    */
+   async writeFrames(pcm16) {
+     const aligned = Buffer.alloc(pcm16.byteLength);
+     pcm16.copy(aligned);
+     const samples = new Int16Array(
+       aligned.buffer,
+       aligned.byteOffset,
+       aligned.byteLength / 2
+     );
+     let offset = 0;
+     while (offset < samples.length) {
+       const end = Math.min(offset + SAMPLES_PER_FRAME, samples.length);
+       const frameSamples = samples.subarray(offset, end);
+       const frame = new AudioFrame(
+         frameSamples,
+         SAMPLE_RATE,
+         CHANNELS,
+         frameSamples.length
+       );
+       await this.source.captureFrame(frame);
+       if (frameSamples.length === SAMPLES_PER_FRAME) {
+         await sleep(FRAME_DURATION_MS);
+       }
+       offset = end;
+     }
+   }
+   /**
+    * Write silence frames for the given duration.
+    * Used to pad the end of a response so the last Opus frame is fully flushed
+    * and the audio doesn't cut off abruptly.
+    */
+   async writeSilence(durationMs) {
+     const frameCount = Math.ceil(durationMs / FRAME_DURATION_MS);
+     for (let i = 0; i < frameCount; i++) {
+       const frame = new AudioFrame(SILENCE, SAMPLE_RATE, CHANNELS, SAMPLES_PER_FRAME);
+       await this.source.captureFrame(frame);
+       await sleep(FRAME_DURATION_MS);
+     }
+   }
+   /** Flush any buffered audio in AudioSource */
+   flush() {
+     this.source.flush();
+     this._playing = false;
+   }
+   /** Stop the silence keepalive */
+   stop() {
+     this._stopped = true;
+     if (this.silenceInterval) {
+       clearInterval(this.silenceInterval);
+       this.silenceInterval = null;
+     }
+   }
+ };
+ function sleep(ms) {
+   return new Promise((resolve) => setTimeout(resolve, ms));
+ }
+ function writeWav(filePath, pcmChunks, sampleRate) {
+   const dataSize = pcmChunks.reduce((sum, b) => sum + b.byteLength, 0);
+   const header = Buffer.alloc(44);
+   header.write("RIFF", 0);
+   header.writeUInt32LE(36 + dataSize, 4);
+   header.write("WAVE", 8);
+   header.write("fmt ", 12);
+   header.writeUInt32LE(16, 16);
+   header.writeUInt16LE(1, 20);
+   header.writeUInt16LE(1, 22);
+   header.writeUInt32LE(sampleRate, 24);
+   header.writeUInt32LE(sampleRate * 2, 28);
+   header.writeUInt16LE(2, 32);
+   header.writeUInt16LE(16, 34);
+   header.write("data", 36);
+   header.writeUInt32LE(dataSize, 40);
+   writeFileSync(filePath, header);
+   for (const chunk of pcmChunks) {
+     appendFileSync(filePath, chunk);
+   }
+ }
+
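A sketch of feeding AudioOutput by hand, which can be useful for verifying the publish path end to end. The 440 Hz test tone and the `source` variable (the AudioSource returned by publishAudioTrack()) are assumptions for illustration; note the buffer must be 16kHz mono PCM16, the SAMPLE_RATE this class assumes:

```js
// Generate one second of a 440Hz tone as 16kHz mono PCM16.
const samples = new Int16Array(16000);
for (let i = 0; i < samples.length; i++) {
  samples[i] = Math.round(0.2 * 32767 * Math.sin((2 * Math.PI * 440 * i) / 16000));
}
const output = new AudioOutput(source); // `source` from publishAudioTrack()
output.startSilence();                  // keepalive between utterances
await output.writeBuffer(Buffer.from(samples.buffer));
output.stop();
```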
+ // src/core/pipeline.ts
+ import { EventEmitter } from "events";
+
+ // src/core/context-manager.ts
+ var log4 = createLogger("ContextManager");
+ function estimateTokens(text) {
+   return Math.ceil(text.length / 4);
+ }
+ var ContextManager = class {
+   instructions;
+   maxContextTokens;
+   recentTurnsToKeep;
+   turns = [];
+   summary = null;
+   constructor(options) {
+     this.instructions = options.instructions;
+     this.maxContextTokens = options.maxContextTokens ?? 5e3;
+     this.recentTurnsToKeep = options.recentTurnsToKeep ?? 4;
+   }
+   /** Add a user's speech turn to the conversation */
+   addUserTurn(speaker, text) {
+     this.turns.push({
+       speaker,
+       text,
+       isAgent: false,
+       timestamp: Date.now()
+     });
+   }
+   /** Add the agent's response to the conversation */
+   addAgentTurn(text) {
+     this.turns.push({
+       speaker: "assistant",
+       text,
+       isAgent: true,
+       timestamp: Date.now()
+     });
+   }
+   /**
+    * Build the messages array for the LLM call.
+    *
+    * Structure:
+    * [system prompt]
+    * [memory context, if provided]
+    * [conversation summary, if any]
+    * [recent verbatim turns]
+    *
+    * @param memoryContext - Optional relevant context injected by the application
+    */
+   buildMessages(memoryContext) {
+     const messages = [];
+     messages.push({ role: "system", content: this.instructions });
+     if (memoryContext) {
+       messages.push({
+         role: "system",
+         content: `Relevant context from past conversations:
+ ${memoryContext}`
+       });
+     }
+     if (this.summary) {
+       messages.push({
+         role: "system",
+         content: `Conversation summary so far:
+ ${this.summary}`
+       });
+     }
+     const turnsToInclude = this.summary ? this.turns.slice(-this.recentTurnsToKeep) : this.turns;
+     for (const turn of turnsToInclude) {
+       if (turn.isAgent) {
+         messages.push({ role: "assistant", content: turn.text });
+       } else {
+         messages.push({
+           role: "user",
+           content: `[${turn.speaker}]: ${turn.text}`
+         });
+       }
+     }
+     return messages;
+   }
+   /** Check if summarization should be triggered */
+   shouldSummarize() {
+     const totalTokens = this.turns.reduce(
+       (acc, t) => acc + estimateTokens(t.text) + 10,
+       estimateTokens(this.instructions)
+     );
+     return totalTokens > this.maxContextTokens;
+   }
+   /**
+    * Summarize older turns using the LLM.
+    * Keeps the most recent turns verbatim.
+    */
+   async summarize(llm) {
+     if (this.turns.length <= this.recentTurnsToKeep) return;
+     const olderTurns = this.turns.slice(0, -this.recentTurnsToKeep);
+     const transcript = olderTurns.map((t) => `[${t.speaker}]: ${t.text}`).join("\n");
+     const summaryPrompt = [
+       {
+         role: "system",
+         content: "Summarize this conversation concisely, preserving key facts, decisions, and action items."
+       },
+       { role: "user", content: transcript }
+     ];
+     let summaryText = "";
+     for await (const chunk of llm.chat(summaryPrompt)) {
+       if (chunk.type === "token" && chunk.token) {
+         summaryText += chunk.token;
+       }
+     }
+     this.summary = this.summary ? `${this.summary}
+
+ ${summaryText}` : summaryText;
+     this.turns = this.turns.slice(-this.recentTurnsToKeep);
+     log4.info(`Summarized ${olderTurns.length} turns, ${this.turns.length} recent turns kept`);
+   }
+   /** Get the full transcript */
+   getFullTranscript() {
+     return this.turns.map((t) => `[${t.speaker}]: ${t.text}`).join("\n");
+   }
+   /** Reset the context */
+   reset() {
+     this.turns = [];
+     this.summary = null;
+   }
+ };
+
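To make the prompt layout concrete, here is a sketch of what buildMessages() produces for a short exchange (speaker names invented):

```js
const ctx = new ContextManager({ instructions: "You are a concise voice assistant." });
ctx.addUserTurn("alice", "What time is the standup?");
ctx.addAgentTurn("The standup is at 10am.");
ctx.addUserTurn("bob", "Can we move it to 11?");
const messages = ctx.buildMessages();
// => [
//   { role: "system", content: "You are a concise voice assistant." },
//   { role: "user", content: "[alice]: What time is the standup?" },
//   { role: "assistant", content: "The standup is at 10am." },
//   { role: "user", content: "[bob]: Can we move it to 11?" },
// ]
```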
+ // src/core/sentence-splitter.ts
+ var MIN_CHUNK = 20;
+ var MAX_CHUNK = 150;
+ var SentenceSplitter = class {
+   buffer = "";
+   /** Add a token and get back any speakable chunks */
+   push(token) {
+     this.buffer += token;
+     return this.extractChunks();
+   }
+   /** Flush any remaining text as a final chunk */
+   flush() {
+     const text = this.buffer.trim();
+     this.buffer = "";
+     return text.length > 0 ? text : null;
+   }
+   /** Reset the splitter */
+   reset() {
+     this.buffer = "";
+   }
+   extractChunks() {
+     const chunks = [];
+     while (true) {
+       const sentenceMatch = this.buffer.match(/[^.!?]*[.!?]\s*/);
+       if (sentenceMatch && sentenceMatch.index !== void 0) {
+         const end = sentenceMatch.index + sentenceMatch[0].length;
+         const chunk = this.buffer.slice(0, end).trim();
+         if (chunk.length >= MIN_CHUNK) {
+           chunks.push(chunk);
+           this.buffer = this.buffer.slice(end);
+           continue;
+         }
+       }
+       if (this.buffer.length >= MAX_CHUNK) {
+         const clauseMatch = this.buffer.match(/[,;:\u2014]\s*/);
+         if (clauseMatch && clauseMatch.index !== void 0 && clauseMatch.index >= MIN_CHUNK) {
+           const end = clauseMatch.index + clauseMatch[0].length;
+           const chunk = this.buffer.slice(0, end).trim();
+           chunks.push(chunk);
+           this.buffer = this.buffer.slice(end);
+           continue;
+         }
+         const spaceIdx = this.buffer.lastIndexOf(" ", MAX_CHUNK);
+         if (spaceIdx >= MIN_CHUNK) {
+           const chunk = this.buffer.slice(0, spaceIdx).trim();
+           chunks.push(chunk);
+           this.buffer = this.buffer.slice(spaceIdx);
+           continue;
+         }
+       }
+       break;
+     }
+     return chunks;
+   }
+ };
+
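A sketch of the splitter's streaming behavior (token strings invented): a sentence is emitted as soon as it ends and passes MIN_CHUNK (20 characters), while a short trailing sentence stays buffered until flush():

```js
const splitter = new SentenceSplitter();
const out = [];
for (const token of ["The meeting starts", " at ten o'clock. ", "Bring the numbers."]) {
  out.push(...splitter.push(token));
}
// out => ["The meeting starts at ten o'clock."]
const tail = splitter.flush(); // => "Bring the numbers." (under MIN_CHUNK, so held back)
```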
+ // src/core/turn-detector.ts
+ var log5 = createLogger("TurnDetector");
+ var TurnDetector = class {
+   silenceTimeoutMs;
+   silenceTimer = null;
+   _onTurnEnd = null;
+   lastFinalText = "";
+   constructor(options = {}) {
+     this.silenceTimeoutMs = options.silenceTimeoutMs ?? 800;
+   }
+   /** Set the callback for when a turn ends */
+   set onTurnEnd(cb) {
+     this._onTurnEnd = cb;
+   }
+   /**
+    * Feed a transcription result.
+    * Returns true if this result represents a completed turn.
+    */
+   handleTranscription(text, isFinal) {
+     this.clearTimer();
+     if (isFinal && text.trim().length > 0) {
+       this.lastFinalText = text;
+       this.silenceTimer = setTimeout(() => {
+         log5.debug(`Turn ended after ${this.silenceTimeoutMs}ms silence`);
+         this._onTurnEnd?.();
+       }, this.silenceTimeoutMs);
+       return false;
+     }
+     if (!isFinal && text.trim().length > 0) {
+       this.clearTimer();
+     }
+     return false;
+   }
+   /** Force-trigger turn end */
+   forceTurnEnd() {
+     this.clearTimer();
+     this._onTurnEnd?.();
+   }
+   /** Reset state */
+   reset() {
+     this.clearTimer();
+     this.lastFinalText = "";
+   }
+   clearTimer() {
+     if (this.silenceTimer) {
+       clearTimeout(this.silenceTimer);
+       this.silenceTimer = null;
+     }
+   }
+ };
+
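A sketch of wiring the detector to STT results (the transcript strings are invented). A final transcript arms the silence timer; the callback fires only if nothing else arrives within the timeout:

```js
const detector = new TurnDetector({ silenceTimeoutMs: 800 });
detector.onTurnEnd = () => {
  console.log("turn complete \u2014 hand the transcript to the LLM");
};
detector.handleTranscription("what's on the agenda", false); // interim: timer cleared
detector.handleTranscription("What's on the agenda?", true); // final: 800ms timer armed
```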
+ // src/core/barge-in.ts
+ var log6 = createLogger("BargeIn");
+ var BargeIn = class {
+   abortController = null;
+   _interrupted = false;
+   _onInterrupt = null;
+   get interrupted() {
+     return this._interrupted;
+   }
+   /** Set the callback for when barge-in occurs */
+   set onInterrupt(cb) {
+     this._onInterrupt = cb;
+   }
+   /**
+    * Create a new AbortController for the current response cycle.
+    * Call this at the start of each STT->LLM->TTS cycle.
+    */
+   startCycle() {
+     this.abortController = new AbortController();
+     this._interrupted = false;
+     return this.abortController.signal;
+   }
+   /** Trigger barge-in. Called when STT detects speech during agent output. */
+   trigger() {
+     if (this._interrupted) return;
+     this._interrupted = true;
+     log6.info("Barge-in detected \u2014 cancelling current response");
+     if (this.abortController) {
+       this.abortController.abort();
+       this.abortController = null;
+     }
+     this._onInterrupt?.();
+   }
+   /** Reset after the interrupted cycle is cleaned up */
+   reset() {
+     this._interrupted = false;
+     this.abortController = null;
+   }
+ };
+
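A sketch of one response cycle guarded by BargeIn; `audioOutput` stands in for the AudioOutput instance the pipeline owns. The signal from startCycle() is threaded through the LLM and TTS calls, and trigger() aborts both:

```js
const bargeIn = new BargeIn();
bargeIn.onInterrupt = () => audioOutput.flush(); // drop buffered agent audio
const signal = bargeIn.startCycle();
// ...pass `signal` to llm.chat(messages, signal) and tts.synthesize(text, signal)...
// When STT reports user speech while agent audio is playing:
bargeIn.trigger(); // aborts the signal and fires onInterrupt once
bargeIn.reset();   // clean slate for the next cycle
```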
+ // src/core/pipeline.ts
+ var log7 = createLogger("Pipeline");
+ var AUDIO_DRAIN_MS = 800;
+ function sleep2(ms) {
+   return new Promise((resolve) => setTimeout(resolve, ms));
+ }
+ var Pipeline = class extends EventEmitter {
+   stt;
+   llm;
+   tts;
+   audioOutput;
+   context;
+   turnDetector;
+   bargeIn;
+   splitter;
+   respondMode;
+   agentName;
+   nameVariants;
+   beforeRespond;
+   memory;
+   /** Active STT streams, keyed by participant identity */
+   sttStreams = /* @__PURE__ */ new Map();
+   _processing = false;
+   _running = false;
+   _agentState = "idle";
+   /** Queued turn while current one is still processing */
+   pendingTurn = null;
+   constructor(options) {
+     super();
+     this.stt = options.stt;
+     this.llm = options.llm;
+     this.tts = options.tts;
+     this.audioOutput = options.audioOutput;
+     this.respondMode = options.respondMode ?? "always";
+     this.agentName = (options.agentName ?? "assistant").toLowerCase();
+     this.nameVariants = (options.nameVariants ?? []).map((n) => n.toLowerCase());
+     this.beforeRespond = options.beforeRespond;
+     this.memory = options.memory;
+     this.context = new ContextManager({
+       instructions: options.instructions
+     });
+     this.turnDetector = new TurnDetector({
+       silenceTimeoutMs: options.silenceTimeoutMs
+     });
+     this.bargeIn = new BargeIn();
+     this.splitter = new SentenceSplitter();
+     this.turnDetector.onTurnEnd = () => {
+     };
+     this.bargeIn.onInterrupt = () => {
+       this.audioOutput.flush();
+       this.splitter.reset();
+       this.setAgentState("idle");
+     };
+     if (this.llm.warmup) {
+       this.llm.warmup(options.instructions).catch((err) => {
+         log7.warn("LLM warmup failed:", err);
+       });
+     }
+     if (this.tts?.warmup) {
+       this.tts.warmup().catch((err) => {
+         log7.warn("TTS warmup failed:", err);
+       });
+     }
+   }
+   get processing() {
+     return this._processing;
+   }
+   get running() {
+     return this._running;
+   }
+   get agentState() {
+     return this._agentState;
+   }
+   setAgentState(state) {
+     if (this._agentState !== state) {
+       this._agentState = state;
+       this.emit("agentState", state);
+     }
+   }
+   addParticipant(identity) {
+     const existing = this.sttStreams.get(identity);
+     if (existing) {
+       existing.close();
+       this.sttStreams.delete(identity);
+       log7.info(`Replacing STT stream for "${identity}"`);
+     }
+     const stream = this.stt.createStream();
+     this.sttStreams.set(identity, stream);
+     this._running = true;
+     stream.on("transcription", (result) => {
+       this.handleTranscription(identity, result);
+     });
+     stream.on("error", (error) => {
+       log7.error(`STT error for ${identity}:`, error);
+       this.emit("error", error);
+     });
+     log7.info(`STT stream started for participant "${identity}"`);
+     return stream;
+   }
+   async removeParticipant(identity) {
+     const stream = this.sttStreams.get(identity);
+     if (stream) {
+       await stream.close();
+       this.sttStreams.delete(identity);
+       log7.info(`STT stream removed for participant "${identity}"`);
+     }
+   }
+   async stop() {
+     this._running = false;
+     this.turnDetector.reset();
+     this.bargeIn.reset();
+     this.splitter.reset();
+     for (const [, stream] of this.sttStreams) {
+       await stream.close();
+     }
+     this.sttStreams.clear();
+     log7.info("Pipeline stopped");
+   }
+   getContextManager() {
+     return this.context;
+   }
+   lastFinalAt = 0;
+   lastSttDuration = 0;
+   async handleTranscription(speaker, result) {
+     this.emit("transcription", { ...result, speaker });
+     if (!result.isFinal && result.text.trim()) {
+       this.setAgentState("listening");
+     }
+     if (this.audioOutput.playing && result.text.trim().length > 0) {
+       this.bargeIn.trigger();
+     }
+     if (result.isFinal && result.text.trim()) {
+       const text = result.text.trim();
+       this.lastFinalAt = performance.now();
+       this.lastSttDuration = result.sttDuration ?? 0;
+       this.memory?.storeTurn(speaker, text, false);
+       if (await this.shouldRespond(speaker, text)) {
+         this.processTurn(speaker, text);
+       } else {
+         log7.info(`Not responding to "${speaker}": "${text.slice(0, 60)}" (mode=${this.respondMode})`);
+         this.setAgentState("idle");
+       }
+     } else if (result.isFinal) {
+       this.setAgentState("idle");
+     }
+   }
+   /**
+    * Determine if the agent should respond to this turn.
+    * In 'always' mode: responds to everything.
+    * In 'addressed' mode: only when agent name is mentioned + optional beforeRespond hook.
+    */
+   async shouldRespond(speaker, text) {
+     if (this.respondMode === "always") return true;
+     const lower = text.toLowerCase();
+     const nameMatch = lower.includes(this.agentName) || this.nameVariants.some((v) => lower.includes(v));
+     if (!nameMatch) return false;
+     if (this.beforeRespond) {
+       return this.beforeRespond(speaker, text);
+     }
+     return true;
+   }
+   async processTurn(speaker, text) {
+     if (this._processing) {
+       log7.info(`Queuing turn (current still processing): "${text}"`);
+       this.pendingTurn = { speaker, text };
+       this.bargeIn.trigger();
+       return;
+     }
+     this._processing = true;
+     const tSpeechEnd = this.lastFinalAt;
+     const sttDuration = this.lastSttDuration;
+     let tLlmFirstToken = 0;
+     let tFirstSentence = 0;
+     let tFirstAudioPlayed = 0;
+     log7.info(`Processing turn from "${speaker}": ${text}`);
+     try {
+       this.context.addUserTurn(speaker, text);
+       if (this.context.shouldSummarize()) {
+         await this.context.summarize(this.llm);
+       }
+       const signal = this.bargeIn.startCycle();
+       let memoryContext = "";
+       if (this.memory) {
+         try {
+           memoryContext = await this.memory.searchRelevant(text);
+         } catch (err) {
+           log7.warn("Memory search failed:", err);
+         }
+       }
+       const messages = this.context.buildMessages(memoryContext || void 0);
+       let fullResponse = "";
+       this.setAgentState("thinking");
+       const sentenceQueue = [];
+       let producerDone = false;
+       let wakeConsumer = null;
+       const wake = () => {
+         wakeConsumer?.();
+       };
+       const producer = async () => {
+         let isFirstToken = true;
+         let isFirstSentence = true;
+         const llmStream = this.llm.chat(messages, signal);
+         try {
+           while (!signal.aborted) {
+             const { value: chunk, done } = await llmStream.next();
+             if (done || !chunk) break;
+             if (signal.aborted) break;
+             if (chunk.type === "token" && chunk.token) {
+               if (isFirstToken) {
+                 tLlmFirstToken = performance.now();
+                 isFirstToken = false;
+                 log7.info(`llm_first_token: ${(tLlmFirstToken - tSpeechEnd).toFixed(0)}ms`);
+               }
+               fullResponse += chunk.token;
+               const sentences = this.splitter.push(chunk.token);
+               for (const sentence of sentences) {
+                 if (signal.aborted) break;
+                 if (isFirstSentence) {
+                   tFirstSentence = performance.now();
+                   isFirstSentence = false;
+                   log7.info(`first_sentence: ${(tFirstSentence - tSpeechEnd).toFixed(0)}ms \u2014 "${sentence.slice(0, 60)}"`);
+                 }
+                 sentenceQueue.push(sentence);
+                 wake();
+               }
+             }
+           }
+         } finally {
+           await llmStream.return(void 0);
+         }
+         if (!signal.aborted) {
+           const remaining = this.splitter.flush();
+           if (remaining) {
+             if (isFirstSentence) {
+               tFirstSentence = performance.now();
+               isFirstSentence = false;
+               log7.info(`first_sentence (flush): ${(tFirstSentence - tSpeechEnd).toFixed(0)}ms \u2014 "${remaining.slice(0, 60)}"`);
+             }
+             sentenceQueue.push(remaining);
+             wake();
+           }
+         }
+         producerDone = true;
+         wake();
+       };
+       const consumer = async () => {
+         this.audioOutput.beginResponse();
+         try {
+           while (true) {
+             if (signal.aborted) break;
+             if (sentenceQueue.length > 0) {
+               const sentence = sentenceQueue.shift();
+               if (!/\w/.test(sentence)) {
+                 log7.debug(`Skipping non-word sentence: "${sentence}"`);
+                 continue;
+               }
+               await this.synthesizeAndPlay(sentence, signal, (t) => {
+                 if (!tFirstAudioPlayed) {
+                   tFirstAudioPlayed = t;
+                   this.setAgentState("speaking");
+                 }
+                 this.emit("sentence", sentence);
+               });
+               continue;
+             }
+             if (producerDone) break;
+             await new Promise((resolve) => {
+               wakeConsumer = resolve;
+             });
+             wakeConsumer = null;
+           }
+         } finally {
+           if (!signal.aborted) {
+             await this.audioOutput.writeSilence(40);
+           }
+           this.audioOutput.endResponse();
+         }
+       };
+       await Promise.all([producer(), consumer()]);
+       const ttftMs = tLlmFirstToken ? tLlmFirstToken - tSpeechEnd : 0;
+       const llmMs = tFirstSentence ? tFirstSentence - tSpeechEnd : 0;
+       const ttsMs = tFirstAudioPlayed && tFirstSentence ? tFirstAudioPlayed - tFirstSentence : 0;
+       const overallMs = sttDuration + llmMs + ttsMs;
+       log7.info(
+         `LATENCY "${text.slice(0, 30)}": STT=${sttDuration.toFixed(0)}ms LLM=${llmMs.toFixed(0)}ms (TTFT=${ttftMs.toFixed(0)}ms) TTS=${ttsMs.toFixed(0)}ms Overall=${overallMs.toFixed(0)}ms`
+       );
+       if (fullResponse.trim()) {
+         this.context.addAgentTurn(fullResponse.trim());
+         this.memory?.storeTurn("assistant", fullResponse.trim(), true);
+         this.emit("response", fullResponse.trim());
+       }
+       await sleep2(AUDIO_DRAIN_MS);
+       this.setAgentState("idle");
+     } catch (err) {
+       if (err instanceof Error && err.name === "AbortError") {
+         log7.debug("Turn processing aborted (barge-in)");
+       } else {
+         log7.error("Error processing turn:", err);
+         this.emit("error", err instanceof Error ? err : new Error(String(err)));
+       }
+     } finally {
+       this._processing = false;
+       this.bargeIn.reset();
+       if (this.pendingTurn) {
+         const { speaker: nextSpeaker, text: nextText } = this.pendingTurn;
+         this.pendingTurn = null;
+         log7.info(`Processing queued turn from "${nextSpeaker}": ${nextText}`);
+         this.processTurn(nextSpeaker, nextText);
+       }
+     }
+   }
+   /**
+    * Speak text directly via TTS, bypassing the LLM.
+    * Supports barge-in — if the student speaks, the greeting is cut short.
+    * Adds the text to conversation context so the LLM knows what was said.
+    */
+   async say(text) {
+     if (this._processing) {
+       log7.warn("say() called while processing \u2014 skipping");
+       return;
+     }
+     this._processing = true;
+     log7.info(`say(): "${text.slice(0, 60)}"`);
+     try {
+       const signal = this.bargeIn.startCycle();
+       this.audioOutput.beginResponse();
+       this.setAgentState("thinking");
+       await this.synthesizeAndPlay(text, signal, () => {
+         this.setAgentState("speaking");
+         this.emit("sentence", text);
+       });
+       if (!signal.aborted) {
+         await this.audioOutput.writeSilence(40);
+         this.context.addAgentTurn(text);
+         this.memory?.storeTurn("assistant", text, true);
+         this.emit("response", text);
+       }
+       await sleep2(AUDIO_DRAIN_MS);
+       this.setAgentState("idle");
+     } catch (err) {
+       if (err instanceof Error && err.name === "AbortError") {
+         log7.debug("say() aborted (barge-in)");
+       } else {
+         log7.error("Error in say():", err);
+         this.emit("error", err instanceof Error ? err : new Error(String(err)));
+       }
+     } finally {
+       this._processing = false;
+       this.audioOutput.endResponse();
+       this.bargeIn.reset();
+       if (this.pendingTurn) {
+         const { speaker: nextSpeaker, text: nextText } = this.pendingTurn;
+         this.pendingTurn = null;
+         log7.info(`Processing queued turn from "${nextSpeaker}": ${nextText}`);
+         this.processTurn(nextSpeaker, nextText);
+       }
+     }
+   }
+   async synthesizeAndPlay(text, signal, onFirstAudio) {
+     if (!this.tts || signal.aborted) {
+       log7.info(`[Agent says]: ${text}`);
+       return;
+     }
+     try {
+       const ttsStart = performance.now();
+       let firstChunk = true;
+       let ttsChunkCount = 0;
+       const ttsStream = this.tts.synthesize(text, signal);
+       const measuredStream = async function* () {
+         for await (const chunk of ttsStream) {
+           ttsChunkCount++;
+           if (firstChunk) {
+             firstChunk = false;
+             const now = performance.now();
+             log7.info(`tts_first_audio: ${(now - ttsStart).toFixed(0)}ms for "${text.slice(0, 40)}"`);
+             onFirstAudio(now);
+           }
+           yield chunk;
+         }
+       };
+       await this.audioOutput.writeStream(measuredStream(), signal);
+       log7.info(`synthesizeAndPlay done: ${(performance.now() - ttsStart).toFixed(0)}ms, ${ttsChunkCount} chunks for "${text.slice(0, 40)}"`);
+     } catch (err) {
+       if (err instanceof Error && err.name === "AbortError") return;
+       throw err;
+     }
+   }
+ };
+
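The Pipeline does not define its provider interfaces in this bundle, but their shape can be inferred from the call sites above: `stt.createStream()` returns an emitter with `sendAudio`, `llm.chat(messages, signal)` is an async generator of token chunks, and `tts.synthesize(text, signal)` yields PCM16 buffers. A minimal LLM stub under that inferred contract (names invented):

```js
// Hypothetical provider stub matching the inferred Pipeline contract:
// chat() yields { type: "token", token } chunks and honors the AbortSignal.
const echoLlm = {
  async *chat(messages, signal) {
    const last = messages[messages.length - 1].content;
    for (const word of `You said: ${last}`.split(" ")) {
      if (signal?.aborted) return;
      yield { type: "token", token: word + " " };
    }
  },
};
```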
+ // src/core/voice-agent.ts
+ var log8 = createLogger("VoiceAgent");
+ var VoiceAgent = class extends EventEmitter2 {
+   config;
+   connection = null;
+   pipeline = null;
+   audioInputs = /* @__PURE__ */ new Map();
+   audioOutput = null;
+   memory = null;
+   _running = false;
+   constructor(config) {
+     super();
+     this.config = config;
+   }
+   get running() {
+     return this._running;
+   }
+   get room() {
+     return this.connection?.room ?? null;
+   }
+   /** Enable saving raw TTS audio as WAV files to `dir` for debugging. */
+   enableAudioDump(dir) {
+     this._dumpDir = dir;
+     if (this.audioOutput) {
+       this.audioOutput.dumpDir = dir;
+     }
+   }
+   _dumpDir = null;
+   /**
+    * Speak text directly via TTS, bypassing the LLM.
+    * Use for greetings or announcements. Supports barge-in.
+    */
+   async say(text) {
+     if (!this.pipeline) {
+       throw new Error("Agent not started \u2014 call start() first");
+     }
+     await this.pipeline.say(text);
+   }
+   /** Start the agent — connect to room and begin listening. */
+   async start(options) {
+     if (this._running) {
+       throw new Error("Agent is already running");
+     }
+     log8.info(`Starting agent for room "${options.room}"...`);
+     if (this.config.memory?.enabled) {
+       const { RoomMemory: RoomMemory2 } = await import("./room-memory-VAREPHY6.mjs");
+       this.memory = new RoomMemory2({
+         dbPath: this.config.memory.dbPath ?? "./data/memory.db",
+         room: options.room
+       });
+       await this.memory.init();
+       this.memory.startSession();
+       log8.info("Memory initialized");
+     }
+     this.connection = new RoomConnection();
+     await this.connection.connect({
+       room: options.room,
+       apiKey: options.apiKey,
+       apiSecret: options.apiSecret,
+       identity: options.identity ?? "agent",
+       name: options.name ?? options.identity ?? "AI Agent"
+     });
+     const source = await this.connection.publishAudioTrack();
+     this.audioOutput = new AudioOutput(source);
+     if (this._dumpDir) this.audioOutput.dumpDir = this._dumpDir;
+     this.audioOutput.startSilence();
+     this.pipeline = new Pipeline({
+       stt: this.config.stt,
+       llm: this.config.llm,
+       tts: this.config.tts,
+       instructions: this.config.instructions,
+       audioOutput: this.audioOutput,
+       respondMode: this.config.respondMode,
+       agentName: this.config.agentName,
+       nameVariants: this.config.nameVariants,
+       memory: this.memory ?? void 0
+     });
+     this.pipeline.on("transcription", (result) => this.emit("transcription", result));
+     this.pipeline.on("sentence", (text) => this.emit("sentence", text));
+     this.pipeline.on("response", (text) => this.emit("response", text));
+     this.pipeline.on("agentState", (state) => this.emit("agentState", state));
+     this.pipeline.on("error", (error) => this.emit("error", error));
+     for (const participant of this.connection.room.remoteParticipants.values()) {
+       for (const [, pub] of participant.trackPublications) {
+         if (pub.track) {
+           this.handleTrackSubscribed(pub.track, pub, participant);
+         }
+       }
+     }
+     this.connection.room.on("trackSubscribed", (track, pub, participant) => {
+       this.handleTrackSubscribed(track, pub, participant);
+     });
+     this.connection.room.on("trackUnsubscribed", (track, _pub, participant) => {
+       this.handleTrackUnsubscribed(track, participant);
+     });
+     this.connection.room.on("participantDisconnected", (participant) => {
+       this.handleParticipantDisconnected(participant);
+     });
+     this.connection.room.on("disconnected", (reason) => {
+       log8.info(`Room disconnected: ${reason}`);
+       this.emit("disconnected", reason);
+     });
+     if (this.config.onDataMessage) {
+       this.setupDataChannel(this.config.onDataMessage);
+     }
+     this._running = true;
+     this.emit("connected");
+     log8.info("Agent started and listening");
+   }
+   /** Stop the agent — disconnect and clean up. */
+   async stop() {
+     if (!this._running) return;
+     log8.info("Stopping agent...");
+     this._running = false;
+     if (this.pipeline) {
+       await this.pipeline.stop();
+       this.pipeline = null;
+     }
+     if (this.memory) {
+       try {
+         await this.memory.endSession(this.config.llm);
+         await this.memory.close();
+       } catch (err) {
+         log8.error("Error closing memory:", err);
+       }
+       this.memory = null;
+     }
+     for (const [, input] of this.audioInputs) {
+       input.close();
+     }
+     this.audioInputs.clear();
+     if (this.audioOutput) {
+       this.audioOutput.stop();
+       this.audioOutput = null;
+     }
+     if (this.connection) {
+       await this.connection.disconnect();
+       this.connection = null;
+     }
+     this.emit("disconnected", "agent_stopped");
+     log8.info("Agent stopped");
+   }
+   setupDataChannel(handler) {
+     if (!this.connection) return;
+     this.connection.room.on("dataReceived", (payload, participant, _kind, topic) => {
+       const identity = participant?.identity ?? "unknown";
+       handler(payload, identity, topic);
+     });
+     log8.info("Data channel handler registered");
+   }
+   handleTrackSubscribed(track, _publication, participant) {
+     const identity = participant.identity;
+     log8.info(`Track subscribed from "${identity}" (sid=${track.sid})`);
+     this.memory?.addParticipant(identity);
+     const existing = this.audioInputs.get(identity);
+     if (existing) {
+       log8.info(`Closing old AudioInput for "${identity}" (re-subscription)`);
+       existing.close();
+     }
+     const audioInput = new AudioInput(track, identity);
+     this.audioInputs.set(identity, audioInput);
+     const sttStream = this.pipeline.addParticipant(identity);
+     this.pipeAudioToSTT(audioInput, sttStream, identity);
+   }
+   handleTrackUnsubscribed(_track, participant) {
+     const identity = participant.identity;
+     log8.info(`Track unsubscribed from "${identity}"`);
+     const input = this.audioInputs.get(identity);
+     if (input) {
+       input.close();
+       this.audioInputs.delete(identity);
+     }
+   }
+   handleParticipantDisconnected(participant) {
+     const identity = participant.identity;
+     log8.info(`Participant disconnected: "${identity}"`);
+     const input = this.audioInputs.get(identity);
+     if (input) {
+       input.close();
+       this.audioInputs.delete(identity);
+     }
+     this.pipeline?.removeParticipant(identity);
+   }
+   async pipeAudioToSTT(input, sttStream, identity) {
+     try {
+       for await (const buffer of input.frames()) {
+         if (!this._running) break;
+         sttStream.sendAudio(buffer);
+       }
+     } catch (err) {
+       if (this._running) {
+         log8.error(`Audio pipe error for "${identity}":`, err);
+       }
+     }
+   }
+ };
+ export {
+   AudioInput,
+   AudioOutput,
+   BargeIn,
+   BaseSTTStream,
+   ContextManager,
+   Embedder,
+   MemoryStore,
+   Pipeline,
+   RoomConnection,
+   RoomMemory,
+   SentenceSplitter,
+   TurnDetector,
+   VoiceAgent,
+   createLogger,
+   getLogLevel,
+   setLogLevel
+ };
+ //# sourceMappingURL=index.mjs.map
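Putting it together, a hypothetical end-to-end sketch of the public API exported above. The provider instances, room name, and environment variables are placeholders, not part of the package:

```js
import { VoiceAgent } from "@dtelecom/agents-js";

// `stt`, `llm`, and `tts` are provider instances matching the shapes the
// Pipeline consumes (see the sketch after the Pipeline class above).
const agent = new VoiceAgent({
  stt, llm, tts,
  instructions: "You are a helpful meeting assistant.",
  respondMode: "addressed",          // only respond when addressed by name
  agentName: "aria",                 // placeholder name
  nameVariants: ["arya", "aria bot"],
});
agent.on("transcription", (r) => console.log(`${r.speaker}: ${r.text}`));
agent.on("response", (text) => console.log(`agent: ${text}`));
await agent.start({
  room: "demo-room",                        // placeholder room
  apiKey: process.env.DTELECOM_API_KEY,     // placeholder env vars
  apiSecret: process.env.DTELECOM_API_SECRET,
  identity: "aria",
});
await agent.say("Hi everyone, I'm listening."); // greeting, supports barge-in
```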