@dtelecom/agents-js 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,1750 @@
+ "use strict";
+ var __create = Object.create;
+ var __defProp = Object.defineProperty;
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
+ var __getOwnPropNames = Object.getOwnPropertyNames;
+ var __getProtoOf = Object.getPrototypeOf;
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
+ var __esm = (fn, res) => function __init() {
+ return fn && (res = (0, fn[__getOwnPropNames(fn)[0]])(fn = 0)), res;
+ };
+ var __export = (target, all) => {
+ for (var name in all)
+ __defProp(target, name, { get: all[name], enumerable: true });
+ };
+ var __copyProps = (to, from, except, desc) => {
+ if (from && typeof from === "object" || typeof from === "function") {
+ for (let key of __getOwnPropNames(from))
+ if (!__hasOwnProp.call(to, key) && key !== except)
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
+ }
+ return to;
+ };
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
+ // If the importer is in node compatibility mode or this is not an ESM
+ // file that has been converted to a CommonJS file using a Babel-
+ // compatible transform (i.e. "__esModule" has not been set), then set
+ // "default" to the CommonJS "module.exports" for node compatibility.
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
+ mod
+ ));
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
+
+ // src/utils/logger.ts
+ function detectLevel() {
+ const debug = typeof process !== "undefined" && process.env?.DEBUG;
+ if (debug && (debug === "*" || debug.includes("@dtelecom/agents"))) {
+ return "debug";
+ }
+ return "info";
+ }
+ function setLogLevel(level) {
+ globalLevel = level;
+ }
+ function getLogLevel() {
+ return globalLevel;
+ }
+ function timestamp() {
+ const d = /* @__PURE__ */ new Date();
+ const h = String(d.getHours()).padStart(2, "0");
+ const m = String(d.getMinutes()).padStart(2, "0");
+ const s = String(d.getSeconds()).padStart(2, "0");
+ const ms = String(d.getMilliseconds()).padStart(3, "0");
+ return `${h}:${m}:${s}.${ms}`;
+ }
+ function createLogger(tag) {
+ const prefix = `[@dtelecom/agents:${tag}]`;
+ return {
+ debug(...args) {
+ if (LEVELS[globalLevel] <= LEVELS.debug) console.debug(timestamp(), prefix, ...args);
+ },
+ info(...args) {
+ if (LEVELS[globalLevel] <= LEVELS.info) console.info(timestamp(), prefix, ...args);
+ },
+ warn(...args) {
+ if (LEVELS[globalLevel] <= LEVELS.warn) console.warn(timestamp(), prefix, ...args);
+ },
+ error(...args) {
+ if (LEVELS[globalLevel] <= LEVELS.error) console.error(timestamp(), prefix, ...args);
+ }
+ };
+ }
+ var LEVELS, globalLevel;
+ var init_logger = __esm({
+ "src/utils/logger.ts"() {
+ "use strict";
+ LEVELS = {
+ debug: 0,
+ info: 1,
+ warn: 2,
+ error: 3,
+ silent: 4
+ };
+ globalLevel = detectLevel();
+ }
+ });
+
+ // src/memory/memory-store.ts
+ var import_better_sqlite3, sqliteVec, log8, MemoryStore;
+ var init_memory_store = __esm({
+ "src/memory/memory-store.ts"() {
+ "use strict";
+ import_better_sqlite3 = __toESM(require("better-sqlite3"));
+ sqliteVec = __toESM(require("sqlite-vec"));
+ init_logger();
+ log8 = createLogger("MemoryStore");
+ MemoryStore = class {
+ db;
+ constructor(dbPath) {
+ this.db = new import_better_sqlite3.default(dbPath);
+ this.db.pragma("journal_mode = WAL");
+ this.db.pragma("synchronous = NORMAL");
+ sqliteVec.load(this.db);
+ this.createTables();
+ log8.info(`Memory store opened: ${dbPath}`);
+ }
+ createTables() {
+ this.db.exec(`
+ CREATE TABLE IF NOT EXISTS turns (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ room TEXT NOT NULL,
+ session_id TEXT NOT NULL,
+ speaker TEXT NOT NULL,
+ text TEXT NOT NULL,
+ is_agent BOOLEAN DEFAULT 0,
+ created_at INTEGER NOT NULL
+ );
+
+ CREATE TABLE IF NOT EXISTS sessions (
+ id TEXT PRIMARY KEY,
+ room TEXT NOT NULL,
+ started_at INTEGER NOT NULL,
+ ended_at INTEGER,
+ participants TEXT,
+ summary TEXT,
+ turn_count INTEGER DEFAULT 0
+ );
+
+ CREATE INDEX IF NOT EXISTS idx_turns_room_session ON turns(room, session_id);
+ CREATE INDEX IF NOT EXISTS idx_turns_room_time ON turns(room, created_at);
+ CREATE INDEX IF NOT EXISTS idx_sessions_room ON sessions(room);
+ `);
+ const hasVecTable = this.db.prepare(
+ "SELECT name FROM sqlite_master WHERE type='table' AND name=?"
+ );
+ if (!hasVecTable.get("turn_vectors")) {
+ this.db.exec(`
+ CREATE VIRTUAL TABLE turn_vectors USING vec0(
+ turn_id INTEGER PRIMARY KEY,
+ embedding FLOAT[384] distance_metric=cosine
+ );
+ `);
+ }
+ if (!hasVecTable.get("session_vectors")) {
+ this.db.exec(`
+ CREATE VIRTUAL TABLE session_vectors USING vec0(
+ session_id TEXT PRIMARY KEY,
+ embedding FLOAT[384] distance_metric=cosine
+ );
+ `);
+ }
+ }
+ /** Insert a turn and its embedding vector. */
+ insertTurn(room, sessionId, speaker, text, isAgent, embedding) {
+ const stmt = this.db.prepare(`
+ INSERT INTO turns (room, session_id, speaker, text, is_agent, created_at)
+ VALUES (?, ?, ?, ?, ?, ?)
+ `);
+ const info = stmt.run(room, sessionId, speaker, text, isAgent ? 1 : 0, Date.now());
+ const turnId = info.lastInsertRowid;
+ this.db.prepare(
+ "INSERT INTO turn_vectors (turn_id, embedding) VALUES (?, ?)"
+ ).run(BigInt(turnId), Buffer.from(embedding.buffer));
+ return Number(turnId);
+ }
+ /** Create a new session record. */
+ insertSession(id, room) {
+ this.db.prepare(`
+ INSERT INTO sessions (id, room, started_at)
+ VALUES (?, ?, ?)
+ `).run(id, room, Date.now());
+ }
+ /** Update a session with summary and end time. */
+ updateSessionSummary(sessionId, summary, turnCount, participants, embedding) {
+ this.db.prepare(`
+ UPDATE sessions
+ SET summary = ?, ended_at = ?, turn_count = ?, participants = ?
+ WHERE id = ?
+ `).run(summary, Date.now(), turnCount, JSON.stringify(participants), sessionId);
+ this.db.prepare(
+ "INSERT INTO session_vectors (session_id, embedding) VALUES (?, ?)"
+ ).run(sessionId, Buffer.from(embedding.buffer));
+ }
+ /** End a session without summary (e.g., too few turns). */
+ endSession(sessionId, turnCount, participants) {
+ this.db.prepare(`
+ UPDATE sessions
+ SET ended_at = ?, turn_count = ?, participants = ?
+ WHERE id = ?
+ `).run(Date.now(), turnCount, JSON.stringify(participants), sessionId);
+ }
+ /** KNN search turns by embedding similarity. */
+ searchTurns(room, queryEmbedding, limit) {
+ const rows = this.db.prepare(`
+ SELECT t.speaker, t.text, t.created_at, t.session_id, tv.distance
+ FROM turn_vectors tv
+ JOIN turns t ON t.id = tv.turn_id
+ WHERE t.room = ?
+ AND tv.embedding MATCH ?
+ AND k = ?
+ ORDER BY tv.distance
+ `).all(room, Buffer.from(queryEmbedding.buffer), limit * 2);
+ return rows.slice(0, limit).map((r) => ({
+ speaker: r.speaker,
+ text: r.text,
+ created_at: r.created_at,
+ session_id: r.session_id,
+ distance: r.distance
+ }));
+ }
+ /** KNN search session summaries by embedding similarity. */
+ searchSessions(room, queryEmbedding, limit) {
+ const rows = this.db.prepare(`
+ SELECT s.id as session_id, s.summary, s.started_at, sv.distance
+ FROM session_vectors sv
+ JOIN sessions s ON s.id = sv.session_id
+ WHERE s.room = ?
+ AND sv.embedding MATCH ?
+ AND k = ?
+ ORDER BY sv.distance
+ `).all(room, Buffer.from(queryEmbedding.buffer), limit * 2);
+ return rows.filter((r) => r.summary).slice(0, limit);
+ }
+ /** Get the last N turns from a specific session. */
+ getRecentTurns(room, sessionId, limit) {
+ return this.db.prepare(`
+ SELECT * FROM turns
+ WHERE room = ? AND session_id = ?
+ ORDER BY created_at DESC
+ LIMIT ?
+ `).all(room, sessionId, limit);
+ }
+ /** Get all turns for a session (for summarization). */
+ getSessionTurns(sessionId) {
+ return this.db.prepare(`
+ SELECT * FROM turns
+ WHERE session_id = ?
+ ORDER BY created_at ASC
+ `).all(sessionId);
+ }
+ /** Get total turn count for a session. */
+ getSessionTurnCount(sessionId) {
+ const row = this.db.prepare(
+ "SELECT COUNT(*) as count FROM turns WHERE session_id = ?"
+ ).get(sessionId);
+ return row.count;
+ }
+ /** Close the database. */
+ close() {
+ this.db.close();
+ log8.info("Memory store closed");
+ }
+ };
+ }
+ });
+
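For orientation, a minimal usage sketch of the MemoryStore class bundled above. The import specifier is assumed from the diff header, and the 384-dim Float32Array matches the FLOAT[384] vec0 columns; treat this as an illustration, not official documentation:

    const { MemoryStore } = require("@dtelecom/agents-js");

    const store = new MemoryStore("./data/memory.db");
    store.insertSession("session-1", "classroom-42");

    // Embeddings must be 384-dim Float32Arrays to match the vec0 schema.
    const embedding = new Float32Array(384).fill(0.05);
    store.insertTurn("classroom-42", "session-1", "alice", "Hello there!", false, embedding);

    // KNN search over stored turns; lower cosine distance = more similar.
    const hits = store.searchTurns("classroom-42", embedding, 5);
    console.log(hits.map((h) => `${h.speaker}: ${h.text} (d=${h.distance.toFixed(3)})`));

    store.close();
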
+ // src/memory/embedder.ts
+ var log9, MODEL_NAME, EMBEDDING_DIM, Embedder;
+ var init_embedder = __esm({
+ "src/memory/embedder.ts"() {
+ "use strict";
+ init_logger();
+ log9 = createLogger("Embedder");
+ MODEL_NAME = "Xenova/all-MiniLM-L6-v2";
+ EMBEDDING_DIM = 384;
+ Embedder = class {
+ pipeline = null;
+ initPromise = null;
+ get dimensions() {
+ return EMBEDDING_DIM;
+ }
+ /** Load the embedding model. Call once at startup. */
+ async init() {
+ if (this.pipeline) return;
+ if (this.initPromise) return this.initPromise;
+ this.initPromise = this.loadModel();
+ return this.initPromise;
+ }
+ async loadModel() {
+ const start = performance.now();
+ log9.info(`Loading embedding model "${MODEL_NAME}"...`);
+ const { pipeline } = await import("@huggingface/transformers");
+ this.pipeline = await pipeline("feature-extraction", MODEL_NAME);
+ log9.info(`Embedding model loaded in ${(performance.now() - start).toFixed(0)}ms`);
+ }
+ /** Embed a single text. Returns Float32Array of length 384. */
+ async embed(text) {
+ await this.init();
+ const result = await this.pipeline(text, {
+ pooling: "mean",
+ normalize: true
+ });
+ return new Float32Array(result.data);
+ }
+ /** Cosine similarity between two normalized vectors. Returns value in [-1, 1]. */
+ static cosineSimilarity(a, b) {
+ let dot = 0;
+ for (let i = 0; i < a.length; i++) {
+ dot += a[i] * b[i];
+ }
+ return dot;
+ }
+ /** Embed multiple texts in one call (more efficient than calling embed() in a loop). */
+ async embedBatch(texts) {
+ if (texts.length === 0) return [];
+ await this.init();
+ const results = [];
+ for (const text of texts) {
+ const result = await this.pipeline(text, {
+ pooling: "mean",
+ normalize: true
+ });
+ results.push(new Float32Array(result.data));
+ }
+ return results;
+ }
+ };
+ }
+ });
+
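The Embedder can also be used standalone. Because vectors come back mean-pooled and L2-normalized, cosineSimilarity is implemented as a plain dot product. A sketch, under the same assumed import specifier:

    const { Embedder } = require("@dtelecom/agents-js");

    async function main() {
      const embedder = new Embedder();
      await embedder.init(); // loads Xenova/all-MiniLM-L6-v2 on first use

      const [a, b] = await embedder.embedBatch([
        "How do I order coffee in Spanish?",
        "Ordering drinks at a cafe",
      ]);
      // Normalized vectors: dot product == cosine similarity, in [-1, 1].
      console.log(Embedder.cosineSimilarity(a, b).toFixed(3));
    }

    main();
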
+ // src/memory/room-memory.ts
+ var room_memory_exports = {};
+ __export(room_memory_exports, {
+ RoomMemory: () => RoomMemory
+ });
+ var import_crypto, log10, RoomMemory;
+ var init_room_memory = __esm({
+ "src/memory/room-memory.ts"() {
+ "use strict";
+ import_crypto = require("crypto");
+ init_memory_store();
+ init_embedder();
+ init_logger();
+ log10 = createLogger("RoomMemory");
+ RoomMemory = class {
+ store;
+ embedder;
+ room;
+ sessionId = null;
+ participants = /* @__PURE__ */ new Set();
+ pendingTurns = [];
+ flushTimer = null;
+ flushIntervalMs;
+ flushing = false;
+ constructor(config) {
+ this.store = new MemoryStore(config.dbPath);
+ this.embedder = new Embedder();
+ this.room = config.room;
+ this.flushIntervalMs = config.flushIntervalMs ?? 5e3;
+ }
+ /** Get the embedder instance (for reuse in other components). */
+ getEmbedder() {
+ return this.embedder;
+ }
+ /** Initialize embedder (loads model). Call once at startup. */
+ async init() {
+ await this.embedder.init();
+ }
+ /** Start a new session for this room. */
+ startSession() {
+ this.sessionId = (0, import_crypto.randomUUID)();
+ this.participants.clear();
+ this.store.insertSession(this.sessionId, this.room);
+ this.flushTimer = setInterval(() => {
+ this.flushPending().catch((err) => {
+ log10.error("Error flushing pending turns:", err);
+ });
+ }, this.flushIntervalMs);
+ log10.info(`Session started: ${this.sessionId}`);
+ return this.sessionId;
+ }
+ /** Track a participant joining. */
+ addParticipant(identity) {
+ this.participants.add(identity);
+ }
+ /**
+ * Store a turn to memory. Non-blocking — queues for batch embedding.
+ * Call this for EVERY final transcription, even if agent doesn't respond.
+ */
+ storeTurn(speaker, text, isAgent) {
+ if (!this.sessionId) {
+ log10.warn("storeTurn called without active session");
+ return;
+ }
+ this.pendingTurns.push({ speaker, text, isAgent });
+ if (this.pendingTurns.length >= 5) {
+ this.flushPending().catch((err) => {
+ log10.error("Error flushing pending turns:", err);
+ });
+ }
+ }
+ /** Flush pending turns: embed and insert into database. */
+ async flushPending() {
+ if (this.flushing || this.pendingTurns.length === 0 || !this.sessionId) return;
+ this.flushing = true;
+ const batch = this.pendingTurns.splice(0);
+ const texts = batch.map((t) => `[${t.speaker}]: ${t.text}`);
+ try {
+ const embeddings = await this.embedder.embedBatch(texts);
+ for (let i = 0; i < batch.length; i++) {
+ const turn = batch[i];
+ this.store.insertTurn(
+ this.room,
+ this.sessionId,
+ turn.speaker,
+ turn.text,
+ turn.isAgent,
+ embeddings[i]
+ );
+ }
+ log10.debug(`Flushed ${batch.length} turns to memory`);
+ } catch (err) {
+ log10.error("Error embedding/storing turns:", err);
+ this.pendingTurns.unshift(...batch);
+ } finally {
+ this.flushing = false;
+ }
+ }
+ /**
+ * Search memory for context relevant to a query.
+ * Returns formatted string ready to inject into LLM system prompt.
+ */
+ async searchRelevant(query, turnLimit = 5, sessionLimit = 2) {
+ const queryEmbedding = await this.embedder.embed(query);
+ const turns = this.store.searchTurns(this.room, queryEmbedding, turnLimit);
+ const sessions = this.store.searchSessions(this.room, queryEmbedding, sessionLimit);
+ if (turns.length === 0 && sessions.length === 0) {
+ return "";
+ }
+ const parts = [];
+ if (sessions.length > 0) {
+ parts.push("Past session summaries:");
+ for (const s of sessions) {
+ const date = new Date(s.started_at).toLocaleDateString();
+ parts.push(` [${date}]: ${s.summary}`);
+ }
+ }
+ if (turns.length > 0) {
+ parts.push("Relevant past turns:");
+ for (const t of turns) {
+ const date = new Date(t.created_at).toLocaleDateString();
+ const time = new Date(t.created_at).toLocaleTimeString([], { hour: "2-digit", minute: "2-digit" });
+ parts.push(` [${date} ${time}, ${t.speaker}]: ${t.text}`);
+ }
+ }
+ return parts.join("\n");
+ }
+ /**
+ * End the current session. Generates an LLM summary and stores it.
+ */
+ async endSession(llm) {
+ if (!this.sessionId) return;
+ await this.flushPending();
+ if (this.flushTimer) {
+ clearInterval(this.flushTimer);
+ this.flushTimer = null;
+ }
+ const turnCount = this.store.getSessionTurnCount(this.sessionId);
+ const participantList = Array.from(this.participants);
+ if (turnCount < 3) {
+ this.store.endSession(this.sessionId, turnCount, participantList);
+ log10.info(`Session ended (${turnCount} turns, no summary)`);
+ this.sessionId = null;
+ return;
+ }
+ try {
+ const turns = this.store.getSessionTurns(this.sessionId);
+ const transcript = turns.map((t) => `[${t.speaker}]: ${t.text}`).join("\n");
+ const messages = [
+ {
+ role: "system",
+ content: "Summarize this tutoring session concisely. Include: topics covered, phrases practiced, mistakes the student made, what they struggled with, and what they did well. Be factual and brief."
+ },
+ { role: "user", content: transcript }
+ ];
+ let summary = "";
+ for await (const chunk of llm.chat(messages)) {
+ if (chunk.type === "token" && chunk.token) {
+ summary += chunk.token;
+ }
+ }
+ if (summary.trim()) {
+ const embedding = await this.embedder.embed(summary.trim());
+ this.store.updateSessionSummary(
+ this.sessionId,
+ summary.trim(),
+ turnCount,
+ participantList,
+ embedding
+ );
+ log10.info(`Session ended with summary (${turnCount} turns, ${participantList.length} participants)`);
+ } else {
+ this.store.endSession(this.sessionId, turnCount, participantList);
+ log10.info(`Session ended (${turnCount} turns, summary was empty)`);
+ }
+ } catch (err) {
+ log10.error("Error generating session summary:", err);
+ this.store.endSession(this.sessionId, turnCount, participantList);
+ }
+ this.sessionId = null;
+ }
+ /** Close the memory store. Flush pending turns first. */
+ async close() {
+ if (this.flushTimer) {
+ clearInterval(this.flushTimer);
+ this.flushTimer = null;
+ }
+ await this.flushPending();
+ this.store.close();
+ }
+ };
+ }
+ });
+
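The intended RoomMemory lifecycle, reconstructed from the methods above. The llm argument is a stand-in for any adapter whose chat(messages) returns an async iterable of { type: "token", token } chunks, which is what endSession() expects:

    const { RoomMemory } = require("@dtelecom/agents-js");

    async function demo(llm) {
      const memory = new RoomMemory({ dbPath: "./data/memory.db", room: "classroom-42" });
      await memory.init();   // loads the embedding model
      memory.startSession(); // starts the periodic background flush

      memory.addParticipant("alice");
      memory.storeTurn("alice", "Can we review yesterday's vocabulary?", false);

      // Before an LLM call: formatted context ready for the system prompt.
      const context = await memory.searchRelevant("yesterday's vocabulary");

      await memory.endSession(llm); // summarizes the session via the LLM
      await memory.close();
      return context;
    }
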
+ // src/index.ts
+ var src_exports = {};
+ __export(src_exports, {
+ AudioInput: () => AudioInput,
+ AudioOutput: () => AudioOutput,
+ BargeIn: () => BargeIn,
+ BaseSTTStream: () => BaseSTTStream,
+ ContextManager: () => ContextManager,
+ Embedder: () => Embedder,
+ MemoryStore: () => MemoryStore,
+ Pipeline: () => Pipeline,
+ RoomConnection: () => RoomConnection,
+ RoomMemory: () => RoomMemory,
+ SentenceSplitter: () => SentenceSplitter,
+ TurnDetector: () => TurnDetector,
+ VoiceAgent: () => VoiceAgent,
+ createLogger: () => createLogger,
+ getLogLevel: () => getLogLevel,
+ setLogLevel: () => setLogLevel
+ });
+ module.exports = __toCommonJS(src_exports);
+
+ // src/core/voice-agent.ts
+ var import_events2 = require("events");
+
+ // src/room/room-connection.ts
+ var import_server_sdk_node = require("@dtelecom/server-sdk-node");
+ init_logger();
+ var log = createLogger("RoomConnection");
+ var RoomConnection = class {
+ room;
+ audioSource = null;
+ localTrack = null;
+ _connected = false;
+ constructor() {
+ this.room = new import_server_sdk_node.Room();
+ }
+ get connected() {
+ return this._connected;
+ }
+ /**
+ * Connect to a dTelecom room.
+ *
+ * 1. Create an Ed25519 JWT via AccessToken
+ * 2. Discover nearest SFU via getWsUrl()
+ * 3. Connect Room via WebRTC
+ * 4. Publish an audio track for the agent to speak through
+ */
+ async connect(options) {
+ const { room: roomName, apiKey, apiSecret, identity = "agent", name } = options;
+ log.info(`Connecting to room "${roomName}" as "${identity}"...`);
+ const { AccessToken } = await import("@dtelecom/server-sdk-js");
+ const token = new AccessToken(apiKey, apiSecret, {
+ identity,
+ name: name ?? identity
+ });
+ token.addGrant({
+ roomJoin: true,
+ room: roomName,
+ canPublish: true,
+ canSubscribe: true,
+ canPublishData: true
+ });
+ const wsUrl = await token.getWsUrl();
+ const jwt = token.toJwt();
+ log.info(`SFU URL: ${wsUrl}`);
+ await this.room.connect(wsUrl, jwt, { autoSubscribe: true });
+ this._connected = true;
+ log.info("Connected successfully");
+ }
+ /**
+ * Publish an audio track so the agent can speak.
+ * Returns the AudioSource to feed PCM16 audio into.
+ */
+ async publishAudioTrack() {
+ if (this.audioSource) return this.audioSource;
+ this.audioSource = new import_server_sdk_node.AudioSource(48e3, 1);
+ this.localTrack = import_server_sdk_node.LocalAudioTrack.createAudioTrack("agent-voice", this.audioSource);
+ await this.room.localParticipant.publishTrack(this.localTrack, {
+ name: "agent-voice",
+ source: import_server_sdk_node.TrackSource.MICROPHONE
+ });
+ log.info("Audio track published");
+ return this.audioSource;
+ }
+ /** Disconnect from the room and clean up resources. */
+ async disconnect() {
+ if (!this._connected) return;
+ if (this.localTrack) {
+ await this.room.localParticipant.unpublishTrack(this.localTrack);
+ this.localTrack = null;
+ }
+ if (this.audioSource) {
+ this.audioSource.destroy();
+ this.audioSource = null;
+ }
+ await this.room.disconnect();
+ this._connected = false;
+ log.info("Disconnected from room");
+ }
+ };
+
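The connection flow in isolation (credentials are placeholders; per the doc comment above, getWsUrl() discovers the nearest SFU before the WebRTC connect):

    const { RoomConnection } = require("@dtelecom/agents-js");

    async function joinRoom() {
      const conn = new RoomConnection();
      await conn.connect({
        room: "classroom-42",
        apiKey: process.env.DTELECOM_API_KEY,
        apiSecret: process.env.DTELECOM_API_SECRET,
        identity: "agent",
      });
      const source = await conn.publishAudioTrack(); // AudioSource for outgoing PCM16
      // ...feed audio into `source`, then:
      await conn.disconnect();
    }
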
+ // src/room/audio-input.ts
+ init_logger();
+ var log2 = createLogger("AudioInput");
+ var AudioInput = class {
+ participantIdentity;
+ stream;
+ _closed = false;
+ frameCount = 0;
+ constructor(track, participantIdentity) {
+ this.participantIdentity = participantIdentity;
+ this.stream = track.createStream(16e3, 1);
+ log2.info(`AudioInput created for "${participantIdentity}" (trackSid=${track.sid})`);
+ }
+ get closed() {
+ return this._closed;
+ }
+ /**
+ * Async iterate over PCM16 buffers from this participant.
+ * Each yielded Buffer is 16kHz mono PCM16 LE.
+ */
+ async *frames() {
+ for await (const frame of this.stream) {
+ if (this._closed) break;
+ this.frameCount++;
+ if (this.frameCount === 1 || this.frameCount % 500 === 0) {
+ log2.info(`[${this.participantIdentity}] frame #${this.frameCount}`);
+ }
+ yield frame.toBuffer();
+ }
+ log2.info(`[${this.participantIdentity}] frame iterator ended (total: ${this.frameCount})`);
+ }
+ /** Async iterate over AudioFrame objects. */
+ async *audioFrames() {
+ for await (const frame of this.stream) {
+ if (this._closed) break;
+ yield frame;
+ }
+ }
+ close() {
+ if (this._closed) return;
+ this._closed = true;
+ this.stream.close();
+ log2.debug(`AudioInput closed for participant "${this.participantIdentity}"`);
+ }
+ };
+
+ // src/room/audio-output.ts
+ var import_server_sdk_node2 = require("@dtelecom/server-sdk-node");
+ var import_fs = require("fs");
+ var import_path = require("path");
+ init_logger();
+ var log3 = createLogger("AudioOutput");
+ var SAMPLE_RATE = 16e3;
+ var CHANNELS = 1;
+ var FRAME_DURATION_MS = 20;
+ var SAMPLES_PER_FRAME = SAMPLE_RATE * FRAME_DURATION_MS / 1e3;
+ var SILENCE = new Int16Array(SAMPLES_PER_FRAME);
+ var AudioOutput = class {
+ source;
+ _playing = false;
+ _responding = false;
+ _stopped = false;
+ silenceInterval = null;
+ /** When set, raw PCM from TTS is saved to this directory as WAV files for debugging. */
+ dumpDir = null;
+ dumpCounter = 0;
+ constructor(source) {
+ this.source = source;
+ }
+ get playing() {
+ return this._playing;
+ }
+ /**
+ * Mark the start of a multi-sentence response.
+ * Suppresses silence injection between sentences so partial frames
+ * in AudioSource's buffer don't get corrupted by interleaved silence.
+ */
+ beginResponse() {
+ this._responding = true;
+ }
+ /** Mark the end of a response — re-enable silence keepalive. */
+ endResponse() {
+ this._responding = false;
+ }
+ /**
+ * Start sparse silence keepalive to prevent the SFU from dropping the track.
+ * With Opus DTX enabled, the encoder handles silence natively — we only need
+ * an occasional packet to keep the SSRC alive.
+ */
+ startSilence() {
+ if (this.silenceInterval) return;
+ log3.debug("Starting silence keepalive (sparse, 3s interval)");
+ this.silenceInterval = setInterval(() => {
+ if (!this._playing && !this._responding && !this._stopped) {
+ const f = new import_server_sdk_node2.AudioFrame(SILENCE, SAMPLE_RATE, CHANNELS, SAMPLES_PER_FRAME);
+ this.source.captureFrame(f).catch(() => {
+ });
+ }
+ }, 3e3);
+ }
+ /**
+ * Write a PCM16 buffer to the audio output.
+ * The buffer is split into 20ms frames and fed to AudioSource.
+ */
+ async writeBuffer(pcm16) {
+ this._playing = true;
+ try {
+ await this.writeFrames(pcm16);
+ } finally {
+ this._playing = false;
+ }
+ }
+ /**
+ * Write a stream of PCM16 buffers (from TTS) to the audio output.
+ * Supports cancellation via AbortSignal.
+ */
+ async writeStream(stream, signal) {
+ this._playing = true;
+ const streamStart = performance.now();
+ let chunkCount = 0;
+ let totalBytes = 0;
+ log3.debug("writeStream: started");
+ const rawChunks = this.dumpDir ? [] : null;
+ try {
+ for await (const chunk of stream) {
+ if (signal?.aborted) {
+ log3.debug(`writeStream: cancelled after ${chunkCount} chunks, ${(performance.now() - streamStart).toFixed(0)}ms`);
+ break;
+ }
+ chunkCount++;
+ totalBytes += chunk.byteLength;
+ rawChunks?.push(Buffer.from(chunk));
+ await this.writeFrames(chunk);
+ }
+ } finally {
+ this._playing = false;
+ const elapsed = performance.now() - streamStart;
+ const audioDurationMs = totalBytes / 2 / SAMPLE_RATE * 1e3;
+ log3.info(
+ `writeStream: done \u2014 ${chunkCount} chunks, ${totalBytes} bytes, audio=${audioDurationMs.toFixed(0)}ms, wall=${elapsed.toFixed(0)}ms`
+ );
+ if (rawChunks && rawChunks.length > 0 && this.dumpDir) {
+ try {
+ if (!(0, import_fs.existsSync)(this.dumpDir)) (0, import_fs.mkdirSync)(this.dumpDir, { recursive: true });
+ const filePath = (0, import_path.join)(this.dumpDir, `tts-raw-${++this.dumpCounter}.wav`);
+ writeWav(filePath, rawChunks, SAMPLE_RATE);
+ log3.info(`writeStream: saved raw TTS to ${filePath}`);
+ } catch (err) {
+ log3.warn("writeStream: failed to save WAV dump:", err);
+ }
+ }
+ }
+ }
+ /**
+ * Split a PCM16 buffer into 20ms frames and write them at real-time pace.
+ * Partial frames at the end are sent directly — AudioSource handles
+ * accumulation in its internal buffer.
+ */
+ async writeFrames(pcm16) {
+ const aligned = Buffer.alloc(pcm16.byteLength);
+ pcm16.copy(aligned);
+ const samples = new Int16Array(
+ aligned.buffer,
+ aligned.byteOffset,
+ aligned.byteLength / 2
+ );
+ let offset = 0;
+ while (offset < samples.length) {
+ const end = Math.min(offset + SAMPLES_PER_FRAME, samples.length);
+ const frameSamples = samples.subarray(offset, end);
+ const frame = new import_server_sdk_node2.AudioFrame(
+ frameSamples,
+ SAMPLE_RATE,
+ CHANNELS,
+ frameSamples.length
+ );
+ await this.source.captureFrame(frame);
+ if (frameSamples.length === SAMPLES_PER_FRAME) {
+ await sleep(FRAME_DURATION_MS);
+ }
+ offset = end;
+ }
+ }
+ /**
+ * Write silence frames for the given duration.
+ * Used to pad the end of a response so the last Opus frame is fully flushed
+ * and the audio doesn't cut off abruptly.
+ */
+ async writeSilence(durationMs) {
+ const frameCount = Math.ceil(durationMs / FRAME_DURATION_MS);
+ for (let i = 0; i < frameCount; i++) {
+ const frame = new import_server_sdk_node2.AudioFrame(SILENCE, SAMPLE_RATE, CHANNELS, SAMPLES_PER_FRAME);
+ await this.source.captureFrame(frame);
+ await sleep(FRAME_DURATION_MS);
+ }
+ }
+ /** Flush any buffered audio in AudioSource */
+ flush() {
+ this.source.flush();
+ this._playing = false;
+ }
+ /** Stop the silence keepalive */
+ stop() {
+ this._stopped = true;
+ if (this.silenceInterval) {
+ clearInterval(this.silenceInterval);
+ this.silenceInterval = null;
+ }
+ }
+ };
+ function sleep(ms) {
+ return new Promise((resolve) => setTimeout(resolve, ms));
+ }
+ function writeWav(filePath, pcmChunks, sampleRate) {
+ const dataSize = pcmChunks.reduce((sum, b) => sum + b.byteLength, 0);
+ const header = Buffer.alloc(44);
+ header.write("RIFF", 0);
+ header.writeUInt32LE(36 + dataSize, 4);
+ header.write("WAVE", 8);
+ header.write("fmt ", 12);
+ header.writeUInt32LE(16, 16);
+ header.writeUInt16LE(1, 20);
+ header.writeUInt16LE(1, 22);
+ header.writeUInt32LE(sampleRate, 24);
+ header.writeUInt32LE(sampleRate * 2, 28);
+ header.writeUInt16LE(2, 32);
+ header.writeUInt16LE(16, 34);
+ header.write("data", 36);
+ header.writeUInt32LE(dataSize, 40);
+ (0, import_fs.writeFileSync)(filePath, header);
+ for (const chunk of pcmChunks) {
+ (0, import_fs.appendFileSync)(filePath, chunk);
+ }
+ }
+
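The frame arithmetic above is worth spelling out: at 16 kHz mono PCM16, one 20 ms frame is 320 samples, i.e. 640 bytes, so writeFrames paces roughly one captureFrame per 20 ms of real time. A sanity check using the same constants:

    const SAMPLE_RATE = 16e3;        // 16,000 samples per second
    const FRAME_DURATION_MS = 20;
    const SAMPLES_PER_FRAME = SAMPLE_RATE * FRAME_DURATION_MS / 1e3; // 320
    const BYTES_PER_FRAME = SAMPLES_PER_FRAME * 2;                   // 640 (16-bit samples)

    // One second of audio = 50 frames = 32,000 bytes, which is why the
    // writeStream log computes audioDurationMs as totalBytes / 2 / SAMPLE_RATE * 1e3.
    console.log(SAMPLES_PER_FRAME, BYTES_PER_FRAME); // 320 640
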
+ // src/core/pipeline.ts
+ var import_events = require("events");
+
+ // src/core/context-manager.ts
+ init_logger();
+ var log4 = createLogger("ContextManager");
+ function estimateTokens(text) {
+ return Math.ceil(text.length / 4);
+ }
+ var ContextManager = class {
+ instructions;
+ maxContextTokens;
+ recentTurnsToKeep;
+ turns = [];
+ summary = null;
+ constructor(options) {
+ this.instructions = options.instructions;
+ this.maxContextTokens = options.maxContextTokens ?? 5e3;
+ this.recentTurnsToKeep = options.recentTurnsToKeep ?? 4;
+ }
+ /** Add a user's speech turn to the conversation */
+ addUserTurn(speaker, text) {
+ this.turns.push({
+ speaker,
+ text,
+ isAgent: false,
+ timestamp: Date.now()
+ });
+ }
+ /** Add the agent's response to the conversation */
+ addAgentTurn(text) {
+ this.turns.push({
+ speaker: "assistant",
+ text,
+ isAgent: true,
+ timestamp: Date.now()
+ });
+ }
+ /**
+ * Build the messages array for the LLM call.
+ *
+ * Structure:
+ * [system prompt]
+ * [memory context, if provided]
+ * [conversation summary, if any]
+ * [recent verbatim turns]
+ *
+ * @param memoryContext - Optional relevant context injected by the application
+ */
+ buildMessages(memoryContext) {
+ const messages = [];
+ messages.push({ role: "system", content: this.instructions });
+ if (memoryContext) {
+ messages.push({
+ role: "system",
+ content: `Relevant context from past conversations:
+ ${memoryContext}`
+ });
+ }
+ if (this.summary) {
+ messages.push({
+ role: "system",
+ content: `Conversation summary so far:
+ ${this.summary}`
+ });
+ }
+ const turnsToInclude = this.summary ? this.turns.slice(-this.recentTurnsToKeep) : this.turns;
+ for (const turn of turnsToInclude) {
+ if (turn.isAgent) {
+ messages.push({ role: "assistant", content: turn.text });
+ } else {
+ messages.push({
+ role: "user",
+ content: `[${turn.speaker}]: ${turn.text}`
+ });
+ }
+ }
+ return messages;
+ }
+ /** Check if summarization should be triggered */
+ shouldSummarize() {
+ const totalTokens = this.turns.reduce(
+ (acc, t) => acc + estimateTokens(t.text) + 10,
+ estimateTokens(this.instructions)
+ );
+ return totalTokens > this.maxContextTokens;
+ }
+ /**
+ * Summarize older turns using the LLM.
+ * Keeps the most recent turns verbatim.
+ */
+ async summarize(llm) {
+ if (this.turns.length <= this.recentTurnsToKeep) return;
+ const olderTurns = this.turns.slice(0, -this.recentTurnsToKeep);
+ const transcript = olderTurns.map((t) => `[${t.speaker}]: ${t.text}`).join("\n");
+ const summaryPrompt = [
+ {
+ role: "system",
+ content: "Summarize this conversation concisely, preserving key facts, decisions, and action items."
+ },
+ { role: "user", content: transcript }
+ ];
+ let summaryText = "";
+ for await (const chunk of llm.chat(summaryPrompt)) {
+ if (chunk.type === "token" && chunk.token) {
+ summaryText += chunk.token;
+ }
+ }
+ this.summary = this.summary ? `${this.summary}
+
+ ${summaryText}` : summaryText;
+ this.turns = this.turns.slice(-this.recentTurnsToKeep);
+ log4.info(`Summarized ${olderTurns.length} turns, ${this.turns.length} recent turns kept`);
+ }
+ /** Get the full transcript */
+ getFullTranscript() {
+ return this.turns.map((t) => `[${t.speaker}]: ${t.text}`).join("\n");
+ }
+ /** Reset the context */
+ reset() {
+ this.turns = [];
+ this.summary = null;
+ }
+ };
+
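A sketch of how ContextManager shapes an LLM request. Note that estimateTokens is a deliberately rough heuristic (about 4 characters per token), so summarization triggers approximately, not exactly, at maxContextTokens:

    const { ContextManager } = require("@dtelecom/agents-js");

    const ctx = new ContextManager({
      instructions: "You are a friendly Spanish tutor.",
      maxContextTokens: 5000,
      recentTurnsToKeep: 4,
    });

    ctx.addUserTurn("alice", "How do I say 'good morning'?");
    ctx.addAgentTurn("You say 'buenos dias'.");

    // System prompt first, optional memory/summary blocks, then recent turns.
    const messages = ctx.buildMessages("Alice practiced greetings last week.");
    // [{ role: "system", content: "You are a friendly..." },
    //  { role: "system", content: "Relevant context from past conversations:..." },
    //  { role: "user", content: "[alice]: How do I say 'good morning'?" },
    //  { role: "assistant", content: "You say 'buenos dias'." }]
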
+ // src/core/sentence-splitter.ts
+ var MIN_CHUNK = 20;
+ var MAX_CHUNK = 150;
+ var SentenceSplitter = class {
+ buffer = "";
+ /** Add a token and get back any speakable chunks */
+ push(token) {
+ this.buffer += token;
+ return this.extractChunks();
+ }
+ /** Flush any remaining text as a final chunk */
+ flush() {
+ const text = this.buffer.trim();
+ this.buffer = "";
+ return text.length > 0 ? text : null;
+ }
+ /** Reset the splitter */
+ reset() {
+ this.buffer = "";
+ }
+ extractChunks() {
+ const chunks = [];
+ while (true) {
+ const sentenceMatch = this.buffer.match(/[^.!?]*[.!?]\s*/);
+ if (sentenceMatch && sentenceMatch.index !== void 0) {
+ const end = sentenceMatch.index + sentenceMatch[0].length;
+ const chunk = this.buffer.slice(0, end).trim();
+ if (chunk.length >= MIN_CHUNK) {
+ chunks.push(chunk);
+ this.buffer = this.buffer.slice(end);
+ continue;
+ }
+ }
+ if (this.buffer.length >= MAX_CHUNK) {
+ const clauseMatch = this.buffer.match(/[,;:\u2014]\s*/);
+ if (clauseMatch && clauseMatch.index !== void 0 && clauseMatch.index >= MIN_CHUNK) {
+ const end = clauseMatch.index + clauseMatch[0].length;
+ const chunk = this.buffer.slice(0, end).trim();
+ chunks.push(chunk);
+ this.buffer = this.buffer.slice(end);
+ continue;
+ }
+ const spaceIdx = this.buffer.lastIndexOf(" ", MAX_CHUNK);
+ if (spaceIdx >= MIN_CHUNK) {
+ const chunk = this.buffer.slice(0, spaceIdx).trim();
+ chunks.push(chunk);
+ this.buffer = this.buffer.slice(spaceIdx);
+ continue;
+ }
+ }
+ break;
+ }
+ return chunks;
+ }
+ };
+
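The splitter trades completeness for latency: a chunk is released at a sentence boundary once it clears MIN_CHUNK (20 characters), with clause- and word-boundary fallbacks once the buffer passes MAX_CHUNK (150). A quick illustration:

    const { SentenceSplitter } = require("@dtelecom/agents-js");

    const splitter = new SentenceSplitter();
    const spoken = [];
    for (const token of ["Buenos dias", " means good", " morning.", " Try it."]) {
      spoken.push(...splitter.push(token));
    }
    // push() released "Buenos dias means good morning." the moment its period
    // arrived (31 chars >= MIN_CHUNK); "Try it." is under 20 chars, so it waits.
    const tail = splitter.flush(); // "Try it."
    console.log(spoken, tail);
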
+ // src/core/turn-detector.ts
+ init_logger();
+ var log5 = createLogger("TurnDetector");
+ var TurnDetector = class {
+ silenceTimeoutMs;
+ silenceTimer = null;
+ _onTurnEnd = null;
+ lastFinalText = "";
+ constructor(options = {}) {
+ this.silenceTimeoutMs = options.silenceTimeoutMs ?? 800;
+ }
+ /** Set the callback for when a turn ends */
+ set onTurnEnd(cb) {
+ this._onTurnEnd = cb;
+ }
+ /**
+ * Feed a transcription result.
+ * Returns true if this result represents a completed turn.
+ */
+ handleTranscription(text, isFinal) {
+ this.clearTimer();
+ if (isFinal && text.trim().length > 0) {
+ this.lastFinalText = text;
+ this.silenceTimer = setTimeout(() => {
+ log5.debug(`Turn ended after ${this.silenceTimeoutMs}ms silence`);
+ this._onTurnEnd?.();
+ }, this.silenceTimeoutMs);
+ return false;
+ }
+ if (!isFinal && text.trim().length > 0) {
+ this.clearTimer();
+ }
+ return false;
+ }
+ /** Force-trigger turn end */
+ forceTurnEnd() {
+ this.clearTimer();
+ this._onTurnEnd?.();
+ }
+ /** Reset state */
+ reset() {
+ this.clearTimer();
+ this.lastFinalText = "";
+ }
+ clearTimer() {
+ if (this.silenceTimer) {
+ clearTimeout(this.silenceTimer);
+ this.silenceTimer = null;
+ }
+ }
+ };
+
+ // src/core/barge-in.ts
+ init_logger();
+ var log6 = createLogger("BargeIn");
+ var BargeIn = class {
+ abortController = null;
+ _interrupted = false;
+ _onInterrupt = null;
+ get interrupted() {
+ return this._interrupted;
+ }
+ /** Set the callback for when barge-in occurs */
+ set onInterrupt(cb) {
+ this._onInterrupt = cb;
+ }
+ /**
+ * Create a new AbortController for the current response cycle.
+ * Call this at the start of each STT->LLM->TTS cycle.
+ */
+ startCycle() {
+ this.abortController = new AbortController();
+ this._interrupted = false;
+ return this.abortController.signal;
+ }
+ /** Trigger barge-in. Called when STT detects speech during agent output. */
+ trigger() {
+ if (this._interrupted) return;
+ this._interrupted = true;
+ log6.info("Barge-in detected \u2014 cancelling current response");
+ if (this.abortController) {
+ this.abortController.abort();
+ this.abortController = null;
+ }
+ this._onInterrupt?.();
+ }
+ /** Reset after the interrupted cycle is cleaned up */
+ reset() {
+ this._interrupted = false;
+ this.abortController = null;
+ }
+ };
+
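BargeIn is essentially an AbortController wrapper with an interrupted latch; a single signal threads through the LLM, TTS, and playback stages of one response. A standalone sketch:

    const { BargeIn } = require("@dtelecom/agents-js");

    const bargeIn = new BargeIn();
    bargeIn.onInterrupt = () => console.log("flush audio, reset splitter");

    const signal = bargeIn.startCycle(); // fresh AbortSignal per response cycle
    signal.addEventListener("abort", () => console.log("downstream work cancelled"));

    bargeIn.trigger();                // STT heard speech during playback
    console.log(bargeIn.interrupted); // true
    bargeIn.reset();                  // ready for the next cycle
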
+ // src/core/pipeline.ts
+ init_logger();
+ var log7 = createLogger("Pipeline");
+ var AUDIO_DRAIN_MS = 800;
+ function sleep2(ms) {
+ return new Promise((resolve) => setTimeout(resolve, ms));
+ }
+ var Pipeline = class extends import_events.EventEmitter {
+ stt;
+ llm;
+ tts;
+ audioOutput;
+ context;
+ turnDetector;
+ bargeIn;
+ splitter;
+ respondMode;
+ agentName;
+ nameVariants;
+ beforeRespond;
+ memory;
+ /** Active STT streams, keyed by participant identity */
+ sttStreams = /* @__PURE__ */ new Map();
+ _processing = false;
+ _running = false;
+ _agentState = "idle";
+ /** Queued turn while current one is still processing */
+ pendingTurn = null;
+ constructor(options) {
+ super();
+ this.stt = options.stt;
+ this.llm = options.llm;
+ this.tts = options.tts;
+ this.audioOutput = options.audioOutput;
+ this.respondMode = options.respondMode ?? "always";
+ this.agentName = (options.agentName ?? "assistant").toLowerCase();
+ this.nameVariants = (options.nameVariants ?? []).map((n) => n.toLowerCase());
+ this.beforeRespond = options.beforeRespond;
+ this.memory = options.memory;
+ this.context = new ContextManager({
+ instructions: options.instructions
+ });
+ this.turnDetector = new TurnDetector({
+ silenceTimeoutMs: options.silenceTimeoutMs
+ });
+ this.bargeIn = new BargeIn();
+ this.splitter = new SentenceSplitter();
+ this.turnDetector.onTurnEnd = () => {
+ };
+ this.bargeIn.onInterrupt = () => {
+ this.audioOutput.flush();
+ this.splitter.reset();
+ this.setAgentState("idle");
+ };
+ if (this.llm.warmup) {
+ this.llm.warmup(options.instructions).catch((err) => {
+ log7.warn("LLM warmup failed:", err);
+ });
+ }
+ if (this.tts?.warmup) {
+ this.tts.warmup().catch((err) => {
+ log7.warn("TTS warmup failed:", err);
+ });
+ }
+ }
+ get processing() {
+ return this._processing;
+ }
+ get running() {
+ return this._running;
+ }
+ get agentState() {
+ return this._agentState;
+ }
+ setAgentState(state) {
+ if (this._agentState !== state) {
+ this._agentState = state;
+ this.emit("agentState", state);
+ }
+ }
+ addParticipant(identity) {
+ const existing = this.sttStreams.get(identity);
+ if (existing) {
+ existing.close();
+ this.sttStreams.delete(identity);
+ log7.info(`Replacing STT stream for "${identity}"`);
+ }
+ const stream = this.stt.createStream();
+ this.sttStreams.set(identity, stream);
+ this._running = true;
+ stream.on("transcription", (result) => {
+ this.handleTranscription(identity, result);
+ });
+ stream.on("error", (error) => {
+ log7.error(`STT error for ${identity}:`, error);
+ this.emit("error", error);
+ });
+ log7.info(`STT stream started for participant "${identity}"`);
+ return stream;
+ }
+ async removeParticipant(identity) {
+ const stream = this.sttStreams.get(identity);
+ if (stream) {
+ await stream.close();
+ this.sttStreams.delete(identity);
+ log7.info(`STT stream removed for participant "${identity}"`);
+ }
+ }
+ async stop() {
+ this._running = false;
+ this.turnDetector.reset();
+ this.bargeIn.reset();
+ this.splitter.reset();
+ for (const [, stream] of this.sttStreams) {
+ await stream.close();
+ }
+ this.sttStreams.clear();
+ log7.info("Pipeline stopped");
+ }
+ getContextManager() {
+ return this.context;
+ }
+ lastFinalAt = 0;
+ lastSttDuration = 0;
+ async handleTranscription(speaker, result) {
+ this.emit("transcription", { ...result, speaker });
+ if (!result.isFinal && result.text.trim()) {
+ this.setAgentState("listening");
+ }
+ if (this.audioOutput.playing && result.text.trim().length > 0) {
+ this.bargeIn.trigger();
+ }
+ if (result.isFinal && result.text.trim()) {
+ const text = result.text.trim();
+ this.lastFinalAt = performance.now();
+ this.lastSttDuration = result.sttDuration ?? 0;
+ this.memory?.storeTurn(speaker, text, false);
+ if (await this.shouldRespond(speaker, text)) {
+ this.processTurn(speaker, text);
+ } else {
+ log7.info(`Not responding to "${speaker}": "${text.slice(0, 60)}" (mode=${this.respondMode})`);
+ this.setAgentState("idle");
+ }
+ } else if (result.isFinal) {
+ this.setAgentState("idle");
+ }
+ }
+ /**
+ * Determine if the agent should respond to this turn.
+ * In 'always' mode: responds to everything.
+ * In 'addressed' mode: only when agent name is mentioned + optional beforeRespond hook.
+ */
+ async shouldRespond(speaker, text) {
+ if (this.respondMode === "always") return true;
+ const lower = text.toLowerCase();
+ const nameMatch = lower.includes(this.agentName) || this.nameVariants.some((v) => lower.includes(v));
+ if (!nameMatch) return false;
+ if (this.beforeRespond) {
+ return this.beforeRespond(speaker, text);
+ }
+ return true;
+ }
+ async processTurn(speaker, text) {
+ if (this._processing) {
+ log7.info(`Queuing turn (current still processing): "${text}"`);
+ this.pendingTurn = { speaker, text };
+ this.bargeIn.trigger();
+ return;
+ }
+ this._processing = true;
+ const tSpeechEnd = this.lastFinalAt;
+ const sttDuration = this.lastSttDuration;
+ let tLlmFirstToken = 0;
+ let tFirstSentence = 0;
+ let tFirstAudioPlayed = 0;
+ log7.info(`Processing turn from "${speaker}": ${text}`);
+ try {
+ this.context.addUserTurn(speaker, text);
+ if (this.context.shouldSummarize()) {
+ await this.context.summarize(this.llm);
+ }
+ const signal = this.bargeIn.startCycle();
+ let memoryContext = "";
+ if (this.memory) {
+ try {
+ memoryContext = await this.memory.searchRelevant(text);
+ } catch (err) {
+ log7.warn("Memory search failed:", err);
+ }
+ }
+ const messages = this.context.buildMessages(memoryContext || void 0);
+ let fullResponse = "";
+ this.setAgentState("thinking");
+ const sentenceQueue = [];
+ let producerDone = false;
+ let wakeConsumer = null;
+ const wake = () => {
+ wakeConsumer?.();
+ };
+ const producer = async () => {
+ let isFirstToken = true;
+ let isFirstSentence = true;
+ const llmStream = this.llm.chat(messages, signal);
+ try {
+ while (!signal.aborted) {
+ const { value: chunk, done } = await llmStream.next();
+ if (done || !chunk) break;
+ if (signal.aborted) break;
+ if (chunk.type === "token" && chunk.token) {
+ if (isFirstToken) {
+ tLlmFirstToken = performance.now();
+ isFirstToken = false;
+ log7.info(`llm_first_token: ${(tLlmFirstToken - tSpeechEnd).toFixed(0)}ms`);
+ }
+ fullResponse += chunk.token;
+ const sentences = this.splitter.push(chunk.token);
+ for (const sentence of sentences) {
+ if (signal.aborted) break;
+ if (isFirstSentence) {
+ tFirstSentence = performance.now();
+ isFirstSentence = false;
+ log7.info(`first_sentence: ${(tFirstSentence - tSpeechEnd).toFixed(0)}ms \u2014 "${sentence.slice(0, 60)}"`);
+ }
+ sentenceQueue.push(sentence);
+ wake();
+ }
+ }
+ }
+ } finally {
+ await llmStream.return(void 0);
+ }
+ if (!signal.aborted) {
+ const remaining = this.splitter.flush();
+ if (remaining) {
+ if (isFirstSentence) {
+ tFirstSentence = performance.now();
+ isFirstSentence = false;
+ log7.info(`first_sentence (flush): ${(tFirstSentence - tSpeechEnd).toFixed(0)}ms \u2014 "${remaining.slice(0, 60)}"`);
+ }
+ sentenceQueue.push(remaining);
+ wake();
+ }
+ }
+ producerDone = true;
+ wake();
+ };
+ const consumer = async () => {
+ this.audioOutput.beginResponse();
+ try {
+ while (true) {
+ if (signal.aborted) break;
+ if (sentenceQueue.length > 0) {
+ const sentence = sentenceQueue.shift();
+ if (!/\w/.test(sentence)) {
+ log7.debug(`Skipping non-word sentence: "${sentence}"`);
+ continue;
+ }
+ await this.synthesizeAndPlay(sentence, signal, (t) => {
+ if (!tFirstAudioPlayed) {
+ tFirstAudioPlayed = t;
+ this.setAgentState("speaking");
+ }
+ this.emit("sentence", sentence);
+ });
+ continue;
+ }
+ if (producerDone) break;
+ await new Promise((resolve) => {
+ wakeConsumer = resolve;
+ });
+ wakeConsumer = null;
+ }
+ } finally {
+ if (!signal.aborted) {
+ await this.audioOutput.writeSilence(40);
+ }
+ this.audioOutput.endResponse();
+ }
+ };
+ await Promise.all([producer(), consumer()]);
+ const ttftMs = tLlmFirstToken ? tLlmFirstToken - tSpeechEnd : 0;
+ const llmMs = tFirstSentence ? tFirstSentence - tSpeechEnd : 0;
+ const ttsMs = tFirstAudioPlayed && tFirstSentence ? tFirstAudioPlayed - tFirstSentence : 0;
+ const overallMs = sttDuration + llmMs + ttsMs;
+ log7.info(
+ `LATENCY "${text.slice(0, 30)}": STT=${sttDuration.toFixed(0)}ms LLM=${llmMs.toFixed(0)}ms (TTFT=${ttftMs.toFixed(0)}ms) TTS=${ttsMs.toFixed(0)}ms Overall=${overallMs.toFixed(0)}ms`
+ );
+ if (fullResponse.trim()) {
+ this.context.addAgentTurn(fullResponse.trim());
+ this.memory?.storeTurn("assistant", fullResponse.trim(), true);
+ this.emit("response", fullResponse.trim());
+ }
+ await sleep2(AUDIO_DRAIN_MS);
+ this.setAgentState("idle");
+ } catch (err) {
+ if (err instanceof Error && err.name === "AbortError") {
+ log7.debug("Turn processing aborted (barge-in)");
+ } else {
+ log7.error("Error processing turn:", err);
+ this.emit("error", err instanceof Error ? err : new Error(String(err)));
+ }
+ } finally {
+ this._processing = false;
+ this.bargeIn.reset();
+ if (this.pendingTurn) {
+ const { speaker: nextSpeaker, text: nextText } = this.pendingTurn;
+ this.pendingTurn = null;
+ log7.info(`Processing queued turn from "${nextSpeaker}": ${nextText}`);
+ this.processTurn(nextSpeaker, nextText);
+ }
+ }
+ }
+ /**
+ * Speak text directly via TTS, bypassing the LLM.
+ * Supports barge-in — if the student speaks, the greeting is cut short.
+ * Adds the text to conversation context so the LLM knows what was said.
+ */
+ async say(text) {
+ if (this._processing) {
+ log7.warn("say() called while processing \u2014 skipping");
+ return;
+ }
+ this._processing = true;
+ log7.info(`say(): "${text.slice(0, 60)}"`);
+ try {
+ const signal = this.bargeIn.startCycle();
+ this.audioOutput.beginResponse();
+ this.setAgentState("thinking");
+ await this.synthesizeAndPlay(text, signal, () => {
+ this.setAgentState("speaking");
+ this.emit("sentence", text);
+ });
+ if (!signal.aborted) {
+ await this.audioOutput.writeSilence(40);
+ this.context.addAgentTurn(text);
+ this.memory?.storeTurn("assistant", text, true);
+ this.emit("response", text);
+ }
+ await sleep2(AUDIO_DRAIN_MS);
+ this.setAgentState("idle");
+ } catch (err) {
+ if (err instanceof Error && err.name === "AbortError") {
+ log7.debug("say() aborted (barge-in)");
+ } else {
+ log7.error("Error in say():", err);
+ this.emit("error", err instanceof Error ? err : new Error(String(err)));
+ }
+ } finally {
+ this._processing = false;
+ this.audioOutput.endResponse();
+ this.bargeIn.reset();
+ if (this.pendingTurn) {
+ const { speaker: nextSpeaker, text: nextText } = this.pendingTurn;
+ this.pendingTurn = null;
+ log7.info(`Processing queued turn from "${nextSpeaker}": ${nextText}`);
+ this.processTurn(nextSpeaker, nextText);
+ }
+ }
+ }
+ async synthesizeAndPlay(text, signal, onFirstAudio) {
+ if (!this.tts || signal.aborted) {
+ log7.info(`[Agent says]: ${text}`);
+ return;
+ }
+ try {
+ const ttsStart = performance.now();
+ let firstChunk = true;
+ let ttsChunkCount = 0;
+ const ttsStream = this.tts.synthesize(text, signal);
+ const measuredStream = async function* () {
+ for await (const chunk of ttsStream) {
+ ttsChunkCount++;
+ if (firstChunk) {
+ firstChunk = false;
+ const now = performance.now();
+ log7.info(`tts_first_audio: ${(now - ttsStart).toFixed(0)}ms for "${text.slice(0, 40)}"`);
+ onFirstAudio(now);
+ }
+ yield chunk;
+ }
+ };
+ await this.audioOutput.writeStream(measuredStream(), signal);
+ log7.info(`synthesizeAndPlay done: ${(performance.now() - ttsStart).toFixed(0)}ms, ${ttsChunkCount} chunks for "${text.slice(0, 40)}"`);
+ } catch (err) {
+ if (err instanceof Error && err.name === "AbortError") return;
+ throw err;
+ }
+ }
+ };
+
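Note how the LATENCY log line composes per-stage figures rather than timing one wall clock: Overall = STT duration + time to first speakable sentence (measured from speech end) + time from that sentence to first audio. Illustrative numbers only:

    const sttMs = 240;  // result.sttDuration reported by the STT adapter
    const llmMs = 410;  // first sentence ready, measured from speech end
    const ttsMs = 180;  // first TTS audio chunk after that sentence
    const overallMs = sttMs + llmMs + ttsMs; // 830, the "Overall=" figure
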
+ // src/core/voice-agent.ts
+ init_logger();
+ var log11 = createLogger("VoiceAgent");
+ var VoiceAgent = class extends import_events2.EventEmitter {
+ config;
+ connection = null;
+ pipeline = null;
+ audioInputs = /* @__PURE__ */ new Map();
+ audioOutput = null;
+ memory = null;
+ _running = false;
+ constructor(config) {
+ super();
+ this.config = config;
+ }
+ get running() {
+ return this._running;
+ }
+ get room() {
+ return this.connection?.room ?? null;
+ }
+ /** Enable saving raw TTS audio as WAV files to `dir` for debugging. */
+ enableAudioDump(dir) {
+ this._dumpDir = dir;
+ if (this.audioOutput) {
+ this.audioOutput.dumpDir = dir;
+ }
+ }
+ _dumpDir = null;
+ /**
+ * Speak text directly via TTS, bypassing the LLM.
+ * Use for greetings or announcements. Supports barge-in.
+ */
+ async say(text) {
+ if (!this.pipeline) {
+ throw new Error("Agent not started \u2014 call start() first");
+ }
+ await this.pipeline.say(text);
+ }
+ /** Start the agent — connect to room and begin listening. */
+ async start(options) {
+ if (this._running) {
+ throw new Error("Agent is already running");
+ }
+ log11.info(`Starting agent for room "${options.room}"...`);
+ if (this.config.memory?.enabled) {
+ const { RoomMemory: RoomMemory2 } = await Promise.resolve().then(() => (init_room_memory(), room_memory_exports));
+ this.memory = new RoomMemory2({
+ dbPath: this.config.memory.dbPath ?? "./data/memory.db",
+ room: options.room
+ });
+ await this.memory.init();
+ this.memory.startSession();
+ log11.info("Memory initialized");
+ }
+ this.connection = new RoomConnection();
+ await this.connection.connect({
+ room: options.room,
+ apiKey: options.apiKey,
+ apiSecret: options.apiSecret,
+ identity: options.identity ?? "agent",
+ name: options.name ?? options.identity ?? "AI Agent"
+ });
+ const source = await this.connection.publishAudioTrack();
+ this.audioOutput = new AudioOutput(source);
+ if (this._dumpDir) this.audioOutput.dumpDir = this._dumpDir;
+ this.audioOutput.startSilence();
+ this.pipeline = new Pipeline({
+ stt: this.config.stt,
+ llm: this.config.llm,
+ tts: this.config.tts,
+ instructions: this.config.instructions,
+ audioOutput: this.audioOutput,
+ respondMode: this.config.respondMode,
+ agentName: this.config.agentName,
+ nameVariants: this.config.nameVariants,
+ memory: this.memory ?? void 0
+ });
+ this.pipeline.on("transcription", (result) => this.emit("transcription", result));
+ this.pipeline.on("sentence", (text) => this.emit("sentence", text));
+ this.pipeline.on("response", (text) => this.emit("response", text));
+ this.pipeline.on("agentState", (state) => this.emit("agentState", state));
+ this.pipeline.on("error", (error) => this.emit("error", error));
+ for (const participant of this.connection.room.remoteParticipants.values()) {
+ for (const [, pub] of participant.trackPublications) {
+ if (pub.track) {
+ this.handleTrackSubscribed(pub.track, pub, participant);
+ }
+ }
+ }
+ this.connection.room.on("trackSubscribed", (track, pub, participant) => {
+ this.handleTrackSubscribed(track, pub, participant);
+ });
+ this.connection.room.on("trackUnsubscribed", (track, _pub, participant) => {
+ this.handleTrackUnsubscribed(track, participant);
+ });
+ this.connection.room.on("participantDisconnected", (participant) => {
+ this.handleParticipantDisconnected(participant);
+ });
+ this.connection.room.on("disconnected", (reason) => {
+ log11.info(`Room disconnected: ${reason}`);
+ this.emit("disconnected", reason);
+ });
+ if (this.config.onDataMessage) {
+ this.setupDataChannel(this.config.onDataMessage);
+ }
+ this._running = true;
+ this.emit("connected");
+ log11.info("Agent started and listening");
+ }
+ /** Stop the agent — disconnect and clean up. */
+ async stop() {
+ if (!this._running) return;
+ log11.info("Stopping agent...");
+ this._running = false;
+ if (this.pipeline) {
+ await this.pipeline.stop();
+ this.pipeline = null;
+ }
+ if (this.memory) {
+ try {
+ await this.memory.endSession(this.config.llm);
+ await this.memory.close();
+ } catch (err) {
+ log11.error("Error closing memory:", err);
+ }
+ this.memory = null;
+ }
+ for (const [, input] of this.audioInputs) {
+ input.close();
+ }
+ this.audioInputs.clear();
+ if (this.audioOutput) {
+ this.audioOutput.stop();
+ this.audioOutput = null;
+ }
+ if (this.connection) {
+ await this.connection.disconnect();
+ this.connection = null;
+ }
+ this.emit("disconnected", "agent_stopped");
+ log11.info("Agent stopped");
+ }
+ setupDataChannel(handler) {
+ if (!this.connection) return;
+ this.connection.room.on("dataReceived", (payload, participant, _kind, topic) => {
+ const identity = participant?.identity ?? "unknown";
+ handler(payload, identity, topic);
+ });
+ log11.info("Data channel handler registered");
+ }
+ handleTrackSubscribed(track, _publication, participant) {
+ const identity = participant.identity;
+ log11.info(`Track subscribed from "${identity}" (sid=${track.sid})`);
+ this.memory?.addParticipant(identity);
+ const existing = this.audioInputs.get(identity);
+ if (existing) {
+ log11.info(`Closing old AudioInput for "${identity}" (re-subscription)`);
+ existing.close();
+ }
+ const audioInput = new AudioInput(track, identity);
+ this.audioInputs.set(identity, audioInput);
+ const sttStream = this.pipeline.addParticipant(identity);
+ this.pipeAudioToSTT(audioInput, sttStream, identity);
+ }
+ handleTrackUnsubscribed(_track, participant) {
+ const identity = participant.identity;
+ log11.info(`Track unsubscribed from "${identity}"`);
+ const input = this.audioInputs.get(identity);
+ if (input) {
+ input.close();
+ this.audioInputs.delete(identity);
+ }
+ }
+ handleParticipantDisconnected(participant) {
+ const identity = participant.identity;
+ log11.info(`Participant disconnected: "${identity}"`);
+ const input = this.audioInputs.get(identity);
+ if (input) {
+ input.close();
+ this.audioInputs.delete(identity);
+ }
+ this.pipeline?.removeParticipant(identity);
+ }
+ async pipeAudioToSTT(input, sttStream, identity) {
+ try {
+ for await (const buffer of input.frames()) {
+ if (!this._running) break;
+ sttStream.sendAudio(buffer);
+ }
+ } catch (err) {
+ if (this._running) {
+ log11.error(`Audio pipe error for "${identity}":`, err);
+ }
+ }
+ }
+ };
+
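Putting it together, an end-to-end sketch of VoiceAgent. The stt/llm/tts adapters are hypothetical placeholders supplied by the application: this file defines the surrounding machinery (BaseSTTStream, llm.chat, tts.synthesize) but ships no providers.

    const { VoiceAgent } = require("@dtelecom/agents-js");

    async function run({ stt, llm, tts }) {
      const agent = new VoiceAgent({
        stt,
        llm,
        tts,
        instructions: "You are a friendly Spanish tutor.",
        respondMode: "addressed",          // only react when addressed by name
        agentName: "Sofia",
        nameVariants: ["sophia", "sofía"],
        memory: { enabled: true, dbPath: "./data/memory.db" },
      });

      agent.on("transcription", (r) => console.log(`${r.speaker}: ${r.text}`));
      agent.on("agentState", (s) => console.log(`state -> ${s}`));

      await agent.start({
        room: "classroom-42",
        apiKey: process.env.DTELECOM_API_KEY,
        apiSecret: process.env.DTELECOM_API_SECRET,
      });
      await agent.say("Hola! I'm ready when you are."); // barge-in aware greeting

      process.on("SIGINT", () => agent.stop());
    }
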
+ // src/core/base-stt-stream.ts
+ var import_events3 = require("events");
+ var BaseSTTStream = class extends import_events3.EventEmitter {
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
+ on(event, cb) {
+ return super.on(event, cb);
+ }
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
+ emit(event, ...args) {
+ return super.emit(event, ...args);
+ }
+ };
+
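Because BaseSTTStream is just a typed EventEmitter, a provider adapter only needs to emit "transcription" and "error" and expose sendAudio()/close(), which is all Pipeline.addParticipant and pipeAudioToSTT rely on. A skeletal adapter (the provider client here is hypothetical):

    const { BaseSTTStream } = require("@dtelecom/agents-js");

    class MySTTStream extends BaseSTTStream {
      constructor(client) { // `client`: your provider's websocket session
        super();
        this.client = client;
        this.client.on("result", (r) => {
          // Pipeline consumes { text, isFinal, sttDuration? } objects.
          this.emit("transcription", { text: r.text, isFinal: r.final, sttDuration: r.ms });
        });
        this.client.on("error", (e) => this.emit("error", e));
      }
      sendAudio(pcm16) { this.client.write(pcm16); } // 16 kHz mono PCM16 buffers
      async close() { await this.client.end(); }
    }

    // The stt plugin handed to VoiceAgent only needs createStream();
    // connectProvider() is a hypothetical factory for the provider session.
    const mySTT = { createStream: () => new MySTTStream(connectProvider()) };
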
+ // src/index.ts
+ init_embedder();
+ init_memory_store();
+ init_room_memory();
+ init_logger();
+ // Annotate the CommonJS export names for ESM import in node:
+ 0 && (module.exports = {
+ AudioInput,
+ AudioOutput,
+ BargeIn,
+ BaseSTTStream,
+ ContextManager,
+ Embedder,
+ MemoryStore,
+ Pipeline,
+ RoomConnection,
+ RoomMemory,
+ SentenceSplitter,
+ TurnDetector,
+ VoiceAgent,
+ createLogger,
+ getLogLevel,
+ setLogLevel
+ });
+ //# sourceMappingURL=index.js.map