@dtelecom/agents-js 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -1,8 +1,8 @@
1
1
  import * as _dtelecom_server_sdk_node from '@dtelecom/server-sdk-node';
2
2
  import { Room, AudioSource, RemoteAudioTrack, AudioFrame } from '@dtelecom/server-sdk-node';
3
3
  import { EventEmitter } from 'events';
4
- import { A as AgentConfig, a as AgentStartOptions, M as Message, L as LLMPlugin, P as PipelineOptions, b as AgentState, S as STTStream, T as TranscriptionResult } from './types-BBKtiPvm.mjs';
5
- export { c as AgentEvents, d as AudioOutput, D as DataMessageHandler, e as LLMChatOptions, f as LLMChunk, g as MemoryConfig, h as PipelineEvents, R as RespondMode, i as STTPlugin, j as STTStreamOptions, k as TTSPlugin } from './types-BBKtiPvm.mjs';
4
+ import { A as AgentConfig, a as AgentStartOptions, M as Message, L as LLMPlugin, P as PipelineOptions, b as AgentState, S as STTStream, T as TranscriptionResult } from './types-Di_jxIgs.mjs';
5
+ export { c as AgentEvents, d as AudioOutput, D as DataMessageHandler, e as LLMChatOptions, f as LLMChunk, g as MemoryConfig, h as PipelineEvents, R as RespondMode, i as STTPlugin, j as STTStreamOptions, k as TTSPlugin, l as ToolCallResult, m as ToolDefinition } from './types-Di_jxIgs.mjs';
6
6
 
7
7
  declare class VoiceAgent extends EventEmitter {
8
8
  private readonly config;
@@ -111,6 +111,7 @@ declare class Pipeline extends EventEmitter {
111
111
  private readonly nameVariants;
112
112
  private readonly beforeRespond?;
113
113
  private readonly memory?;
114
+ private readonly tools?;
114
115
  /** Strip provider-specific markup (e.g. SSML lang tags) for display. */
115
116
  private cleanText;
116
117
  /** Active STT streams, keyed by participant identity */
@@ -121,9 +122,9 @@ declare class Pipeline extends EventEmitter {
121
122
  /** Queued turn while current one is still processing */
122
123
  private pendingTurn;
123
124
  constructor(options: PipelineOptions);
124
- /** One-shot warmup — safe to call from constructor, resolves when both LLM and TTS are ready. */
125
- private _warmupPromise;
126
- private warmup;
125
+ private readonly _warmupPromise;
126
+ private readonly _ttsWarmupPromise;
127
+ private readonly _llmWarmupPromise;
127
128
  get processing(): boolean;
128
129
  get running(): boolean;
129
130
  get agentState(): AgentState;
package/dist/index.d.ts CHANGED
@@ -1,8 +1,8 @@
1
1
  import * as _dtelecom_server_sdk_node from '@dtelecom/server-sdk-node';
2
2
  import { Room, AudioSource, RemoteAudioTrack, AudioFrame } from '@dtelecom/server-sdk-node';
3
3
  import { EventEmitter } from 'events';
4
- import { A as AgentConfig, a as AgentStartOptions, M as Message, L as LLMPlugin, P as PipelineOptions, b as AgentState, S as STTStream, T as TranscriptionResult } from './types-BBKtiPvm.js';
5
- export { c as AgentEvents, d as AudioOutput, D as DataMessageHandler, e as LLMChatOptions, f as LLMChunk, g as MemoryConfig, h as PipelineEvents, R as RespondMode, i as STTPlugin, j as STTStreamOptions, k as TTSPlugin } from './types-BBKtiPvm.js';
4
+ import { A as AgentConfig, a as AgentStartOptions, M as Message, L as LLMPlugin, P as PipelineOptions, b as AgentState, S as STTStream, T as TranscriptionResult } from './types-Di_jxIgs.js';
5
+ export { c as AgentEvents, d as AudioOutput, D as DataMessageHandler, e as LLMChatOptions, f as LLMChunk, g as MemoryConfig, h as PipelineEvents, R as RespondMode, i as STTPlugin, j as STTStreamOptions, k as TTSPlugin, l as ToolCallResult, m as ToolDefinition } from './types-Di_jxIgs.js';
6
6
 
7
7
  declare class VoiceAgent extends EventEmitter {
8
8
  private readonly config;
@@ -111,6 +111,7 @@ declare class Pipeline extends EventEmitter {
111
111
  private readonly nameVariants;
112
112
  private readonly beforeRespond?;
113
113
  private readonly memory?;
114
+ private readonly tools?;
114
115
  /** Strip provider-specific markup (e.g. SSML lang tags) for display. */
115
116
  private cleanText;
116
117
  /** Active STT streams, keyed by participant identity */
@@ -121,9 +122,9 @@ declare class Pipeline extends EventEmitter {
121
122
  /** Queued turn while current one is still processing */
122
123
  private pendingTurn;
123
124
  constructor(options: PipelineOptions);
124
- /** One-shot warmup — safe to call from constructor, resolves when both LLM and TTS are ready. */
125
- private _warmupPromise;
126
- private warmup;
125
+ private readonly _warmupPromise;
126
+ private readonly _ttsWarmupPromise;
127
+ private readonly _llmWarmupPromise;
127
128
  get processing(): boolean;
128
129
  get running(): boolean;
129
130
  get agentState(): AgentState;
package/dist/index.js CHANGED
@@ -1151,6 +1151,49 @@ var AUDIO_DRAIN_MS = 800;
1151
1151
  function sleep2(ms) {
1152
1152
  return new Promise((resolve) => setTimeout(resolve, ms));
1153
1153
  }
1154
+ function prefetchTTS(tts, text, signal) {
1155
+ const buffer = [];
1156
+ let done = false;
1157
+ let error = null;
1158
+ let wake = null;
1159
+ const notify = () => {
1160
+ if (wake) {
1161
+ const w = wake;
1162
+ wake = null;
1163
+ w();
1164
+ }
1165
+ };
1166
+ void (async () => {
1167
+ try {
1168
+ const stream = tts.synthesize(text, signal);
1169
+ for await (const chunk of stream) {
1170
+ if (signal?.aborted) break;
1171
+ buffer.push(chunk);
1172
+ notify();
1173
+ }
1174
+ } catch (e) {
1175
+ if (!(e instanceof Error && e.name === "AbortError")) error = e;
1176
+ } finally {
1177
+ done = true;
1178
+ notify();
1179
+ }
1180
+ })();
1181
+ return async function* () {
1182
+ let index = 0;
1183
+ while (true) {
1184
+ if (signal?.aborted) return;
1185
+ if (error) throw error;
1186
+ if (index < buffer.length) {
1187
+ yield buffer[index++];
1188
+ continue;
1189
+ }
1190
+ if (done) return;
1191
+ await new Promise((r) => {
1192
+ wake = r;
1193
+ });
1194
+ }
1195
+ };
1196
+ }
1154
1197
  var Pipeline = class extends import_events.EventEmitter {
1155
1198
  stt;
1156
1199
  llm;
@@ -1165,6 +1208,7 @@ var Pipeline = class extends import_events.EventEmitter {
1165
1208
  nameVariants;
1166
1209
  beforeRespond;
1167
1210
  memory;
1211
+ tools;
1168
1212
  /** Strip provider-specific markup (e.g. SSML lang tags) for display. */
1169
1213
  cleanText(text) {
1170
1214
  return this.tts?.cleanText ? this.tts.cleanText(text) : text;
@@ -1187,6 +1231,7 @@ var Pipeline = class extends import_events.EventEmitter {
1187
1231
  this.nameVariants = (options.nameVariants ?? []).map((n) => n.toLowerCase());
1188
1232
  this.beforeRespond = options.beforeRespond;
1189
1233
  this.memory = options.memory;
1234
+ this.tools = options.tools;
1190
1235
  this.context = new ContextManager({
1191
1236
  instructions: options.instructions,
1192
1237
  maxContextTokens: options.maxContextTokens
@@ -1203,28 +1248,18 @@ var Pipeline = class extends import_events.EventEmitter {
1203
1248
  this.splitter.reset();
1204
1249
  this.setAgentState("idle");
1205
1250
  };
1206
- this._warmupPromise = this.warmup(options.instructions);
1251
+ this._ttsWarmupPromise = this.tts?.warmup ? this.tts.warmup().catch((err) => {
1252
+ log7.warn("TTS warmup failed (non-fatal):", err);
1253
+ }) : Promise.resolve();
1254
+ this._llmWarmupPromise = this.llm.warmup ? this.llm.warmup(options.instructions).catch((err) => {
1255
+ log7.warn("LLM warmup failed (non-fatal):", err);
1256
+ }) : Promise.resolve();
1257
+ this._warmupPromise = Promise.all([this._ttsWarmupPromise, this._llmWarmupPromise]).then(() => {
1258
+ });
1207
1259
  }
1208
- /** One-shot warmup — safe to call from constructor, resolves when both LLM and TTS are ready. */
1209
1260
  _warmupPromise;
1210
- async warmup(instructions) {
1211
- const tasks = [];
1212
- if (this.llm.warmup) {
1213
- tasks.push(
1214
- this.llm.warmup(instructions).catch((err) => {
1215
- log7.warn("LLM warmup failed:", err);
1216
- })
1217
- );
1218
- }
1219
- if (this.tts?.warmup) {
1220
- tasks.push(
1221
- this.tts.warmup().catch((err) => {
1222
- log7.warn("TTS warmup failed:", err);
1223
- })
1224
- );
1225
- }
1226
- await Promise.all(tasks);
1227
- }
1261
+ _ttsWarmupPromise;
1262
+ _llmWarmupPromise;
1228
1263
  get processing() {
1229
1264
  return this._processing;
1230
1265
  }
@@ -1400,7 +1435,7 @@ var Pipeline = class extends import_events.EventEmitter {
1400
1435
  segBuf.length = 0;
1401
1436
  pushSentence(combined);
1402
1437
  };
1403
- const llmStream = this.llm.chat(messages, signal);
1438
+ const llmStream = this.llm.chat(messages, signal, { tools: this.tools });
1404
1439
  try {
1405
1440
  while (!signal.aborted) {
1406
1441
  const { value: chunk, done } = await llmStream.next();
@@ -1429,6 +1464,9 @@ var Pipeline = class extends import_events.EventEmitter {
1429
1464
  for (const sentence of sentences) {
1430
1465
  pushSentence(sentence);
1431
1466
  }
1467
+ } else if (chunk.type === "tool_call" && chunk.toolCall) {
1468
+ log7.info(`Tool call: ${chunk.toolCall.name}(${chunk.toolCall.arguments})`);
1469
+ this.emit("toolCall", chunk.toolCall);
1432
1470
  }
1433
1471
  }
1434
1472
  } finally {
@@ -1451,29 +1489,55 @@ var Pipeline = class extends import_events.EventEmitter {
1451
1489
  };
1452
1490
  const consumer = async () => {
1453
1491
  this.audioOutput.beginResponse();
1492
+ const state = { prefetched: null };
1454
1493
  try {
1455
1494
  while (true) {
1456
1495
  if (signal.aborted) break;
1457
- if (sentenceQueue.length > 0) {
1458
- const sentence = sentenceQueue.shift();
1496
+ let sentence;
1497
+ let existingStream;
1498
+ if (state.prefetched) {
1499
+ sentence = state.prefetched.sentence;
1500
+ existingStream = state.prefetched.streamFn();
1501
+ state.prefetched = null;
1502
+ } else if (sentenceQueue.length > 0) {
1503
+ sentence = sentenceQueue.shift();
1459
1504
  if (!/\w/.test(sentence)) {
1460
1505
  log7.debug(`Skipping non-word sentence: "${sentence}"`);
1461
1506
  continue;
1462
1507
  }
1508
+ existingStream = void 0;
1509
+ } else if (producerDone) {
1510
+ break;
1511
+ } else {
1512
+ await new Promise((resolve) => {
1513
+ wakeConsumer = resolve;
1514
+ });
1515
+ wakeConsumer = null;
1516
+ continue;
1517
+ }
1518
+ const tryPrefetch = () => {
1519
+ if (state.prefetched || !this.tts) return;
1520
+ if (sentenceQueue.length > 0) {
1521
+ const next = sentenceQueue.shift();
1522
+ if (/\w/.test(next)) {
1523
+ state.prefetched = { sentence: next, streamFn: prefetchTTS(this.tts, next, signal) };
1524
+ }
1525
+ }
1526
+ };
1527
+ tryPrefetch();
1528
+ try {
1463
1529
  await this.synthesizeAndPlay(sentence, signal, (t) => {
1464
1530
  if (!tFirstAudioPlayed) {
1465
1531
  tFirstAudioPlayed = t;
1466
1532
  this.setAgentState("speaking");
1467
1533
  }
1468
1534
  this.emit("sentence", this.cleanText(sentence), sentence);
1469
- });
1470
- continue;
1535
+ tryPrefetch();
1536
+ }, existingStream);
1537
+ } catch (ttsErr) {
1538
+ if (ttsErr instanceof Error && ttsErr.name === "AbortError") throw ttsErr;
1539
+ log7.warn(`TTS error for sentence (skipping): "${sentence.slice(0, 40)}"`, ttsErr);
1471
1540
  }
1472
- if (producerDone) break;
1473
- await new Promise((resolve) => {
1474
- wakeConsumer = resolve;
1475
- });
1476
- wakeConsumer = null;
1477
1541
  }
1478
1542
  } finally {
1479
1543
  if (!signal.aborted) {
@@ -1526,7 +1590,7 @@ var Pipeline = class extends import_events.EventEmitter {
1526
1590
  return;
1527
1591
  }
1528
1592
  this._processing = true;
1529
- await this._warmupPromise;
1593
+ await this._ttsWarmupPromise;
1530
1594
  log7.info(`say(): "${text.slice(0, 60)}"`);
1531
1595
  try {
1532
1596
  const signal = this.bargeIn.startCycle();
@@ -1563,7 +1627,7 @@ var Pipeline = class extends import_events.EventEmitter {
1563
1627
  }
1564
1628
  }
1565
1629
  }
1566
- async synthesizeAndPlay(text, signal, onFirstAudio) {
1630
+ async synthesizeAndPlay(text, signal, onFirstAudio, existingStream) {
1567
1631
  if (!this.tts || signal.aborted) {
1568
1632
  log7.info(`[Agent says]: ${text}`);
1569
1633
  return;
@@ -1572,7 +1636,7 @@ var Pipeline = class extends import_events.EventEmitter {
1572
1636
  const ttsStart = performance.now();
1573
1637
  let firstChunk = true;
1574
1638
  let ttsChunkCount = 0;
1575
- const ttsStream = this.tts.synthesize(text, signal);
1639
+ const ttsStream = existingStream ?? this.tts.synthesize(text, signal);
1576
1640
  const measuredStream = async function* () {
1577
1641
  for await (const chunk of ttsStream) {
1578
1642
  ttsChunkCount++;
@@ -1681,12 +1745,14 @@ var VoiceAgent = class extends import_events2.EventEmitter {
1681
1745
  agentName: this.config.agentName,
1682
1746
  nameVariants: this.config.nameVariants,
1683
1747
  memory: this.memory ?? void 0,
1684
- maxContextTokens: this.config.maxContextTokens
1748
+ maxContextTokens: this.config.maxContextTokens,
1749
+ tools: this.config.tools
1685
1750
  });
1686
1751
  this.pipeline.on("transcription", (result) => this.emit("transcription", result));
1687
1752
  this.pipeline.on("sentence", (text, raw) => this.emit("sentence", text, raw));
1688
1753
  this.pipeline.on("response", (text) => this.emit("response", text));
1689
1754
  this.pipeline.on("agentState", (state) => this.emit("agentState", state));
1755
+ this.pipeline.on("toolCall", (tc) => this.emit("toolCall", tc));
1690
1756
  this.pipeline.on("error", (error) => this.emit("error", error));
1691
1757
  for (const participant of this.connection.room.remoteParticipants.values()) {
1692
1758
  for (const [, pub] of participant.trackPublications) {