@dtelecom/agents-js 0.1.15 → 0.1.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -23,6 +23,14 @@ declare class VoiceAgent extends EventEmitter {
23
23
  * Use for greetings or announcements. Supports barge-in.
24
24
  */
25
25
  say(text: string): Promise<void>;
26
+ /**
27
+ * Switch STT language on all active streams.
28
+ * Use for bilingual lessons — e.g. switch to 'es' with forceWhisper when
29
+ * expecting Spanish, back to 'auto' for Parakeet auto-detect otherwise.
30
+ */
31
+ setSTTLanguage(language: string, options?: {
32
+ forceWhisper?: boolean;
33
+ }): void;
26
34
  /** Start the agent — connect to room and begin listening. */
27
35
  start(options: AgentStartOptions): Promise<void>;
28
36
  /** Stop the agent — disconnect and clean up. */
@@ -124,6 +132,10 @@ declare class Pipeline extends EventEmitter {
124
132
  removeParticipant(identity: string): Promise<void>;
125
133
  stop(): Promise<void>;
126
134
  getContextManager(): ContextManager;
135
+ /** Switch STT language on all active streams (e.g. for bilingual lessons). */
136
+ setSTTLanguage(language: string, options?: {
137
+ forceWhisper?: boolean;
138
+ }): void;
127
139
  private lastFinalAt;
128
140
  private lastSttDuration;
129
141
  private handleTranscription;
package/dist/index.d.ts CHANGED
@@ -23,6 +23,14 @@ declare class VoiceAgent extends EventEmitter {
23
23
  * Use for greetings or announcements. Supports barge-in.
24
24
  */
25
25
  say(text: string): Promise<void>;
26
+ /**
27
+ * Switch STT language on all active streams.
28
+ * Use for bilingual lessons — e.g. switch to 'es' with forceWhisper when
29
+ * expecting Spanish, back to 'auto' for Parakeet auto-detect otherwise.
30
+ */
31
+ setSTTLanguage(language: string, options?: {
32
+ forceWhisper?: boolean;
33
+ }): void;
26
34
  /** Start the agent — connect to room and begin listening. */
27
35
  start(options: AgentStartOptions): Promise<void>;
28
36
  /** Stop the agent — disconnect and clean up. */
@@ -124,6 +132,10 @@ declare class Pipeline extends EventEmitter {
124
132
  removeParticipant(identity: string): Promise<void>;
125
133
  stop(): Promise<void>;
126
134
  getContextManager(): ContextManager;
135
+ /** Switch STT language on all active streams (e.g. for bilingual lessons). */
136
+ setSTTLanguage(language: string, options?: {
137
+ forceWhisper?: boolean;
138
+ }): void;
127
139
  private lastFinalAt;
128
140
  private lastSttDuration;
129
141
  private handleTranscription;
package/dist/index.js CHANGED
@@ -1274,11 +1274,20 @@ var Pipeline = class extends import_events.EventEmitter {
1274
1274
  getContextManager() {
1275
1275
  return this.context;
1276
1276
  }
1277
+ /** Switch STT language on all active streams (e.g. for bilingual lessons). */
1278
+ setSTTLanguage(language, options) {
1279
+ for (const [identity, stream] of this.sttStreams) {
1280
+ if (stream.setLanguage) {
1281
+ stream.setLanguage(language, options);
1282
+ log7.info(`STT language \u2192 ${language}${options?.forceWhisper ? " (whisper)" : ""} for "${identity}"`);
1283
+ }
1284
+ }
1285
+ }
1277
1286
  lastFinalAt = 0;
1278
1287
  lastSttDuration = 0;
1279
1288
  async handleTranscription(speaker, result) {
1280
1289
  this.emit("transcription", { ...result, speaker });
1281
- if (!result.isFinal && result.text.trim()) {
1290
+ if (!result.isFinal) {
1282
1291
  this.setAgentState("listening");
1283
1292
  }
1284
1293
  if (this.audioOutput.playing && result.text.trim().length > 0) {
@@ -1363,60 +1372,69 @@ var Pipeline = class extends import_events.EventEmitter {
1363
1372
  sentenceQueue.push(text2);
1364
1373
  wake();
1365
1374
  };
1375
+ const MAX_LLM_RETRIES = 2;
1366
1376
  const producer = async () => {
1367
- let isFirstChunk = true;
1368
1377
  const defaultLang = this.tts?.defaultLanguage;
1369
- const segBuf = [];
1370
- const flushSegments = () => {
1371
- if (segBuf.length === 0) return;
1372
- const combined = segBuf.map(
1373
- (s) => s.lang !== defaultLang ? `<lang xml:lang="${s.lang}">${s.text}</lang>` : s.text
1374
- ).join(" ");
1375
- segBuf.length = 0;
1376
- pushSentence(combined);
1377
- };
1378
- const llmStream = this.llm.chat(messages, signal);
1379
- try {
1380
- while (!signal.aborted) {
1381
- const { value: chunk, done } = await llmStream.next();
1382
- if (done || !chunk) break;
1383
- if (signal.aborted) break;
1384
- if (chunk.type === "segment" && chunk.segment) {
1385
- if (isFirstChunk) {
1386
- tLlmFirstToken = performance.now();
1387
- isFirstChunk = false;
1388
- log7.info(`llm_first_segment: ${(tLlmFirstToken - tSpeechEnd).toFixed(0)}ms`);
1389
- }
1390
- if (fullResponse) fullResponse += " ";
1391
- fullResponse += chunk.segment.text;
1392
- segBuf.push(chunk.segment);
1393
- if (/[.!?]["'»)]*\s*$/.test(chunk.segment.text)) {
1394
- flushSegments();
1395
- }
1396
- } else if (chunk.type === "token" && chunk.token) {
1397
- if (isFirstChunk) {
1398
- tLlmFirstToken = performance.now();
1399
- isFirstChunk = false;
1400
- log7.info(`llm_first_token: ${(tLlmFirstToken - tSpeechEnd).toFixed(0)}ms`);
1401
- }
1402
- fullResponse += chunk.token;
1403
- const sentences = this.splitter.push(chunk.token);
1404
- for (const sentence of sentences) {
1405
- pushSentence(sentence);
1378
+ for (let attempt = 0; attempt <= MAX_LLM_RETRIES; attempt++) {
1379
+ if (signal.aborted) break;
1380
+ if (attempt > 0) {
1381
+ log7.warn(`LLM retry ${attempt}/${MAX_LLM_RETRIES}...`);
1382
+ this.splitter.reset();
1383
+ }
1384
+ let isFirstChunk = true;
1385
+ const segBuf = [];
1386
+ const flushSegments = () => {
1387
+ if (segBuf.length === 0) return;
1388
+ const combined = segBuf.map(
1389
+ (s) => s.lang !== defaultLang ? `<lang xml:lang="${s.lang}">${s.text}</lang>` : s.text
1390
+ ).join(" ");
1391
+ segBuf.length = 0;
1392
+ pushSentence(combined);
1393
+ };
1394
+ const llmStream = this.llm.chat(messages, signal);
1395
+ try {
1396
+ while (!signal.aborted) {
1397
+ const { value: chunk, done } = await llmStream.next();
1398
+ if (done || !chunk) break;
1399
+ if (signal.aborted) break;
1400
+ if (chunk.type === "segment" && chunk.segment) {
1401
+ if (isFirstChunk) {
1402
+ tLlmFirstToken = performance.now();
1403
+ isFirstChunk = false;
1404
+ log7.info(`llm_first_segment: ${(tLlmFirstToken - tSpeechEnd).toFixed(0)}ms`);
1405
+ }
1406
+ if (fullResponse) fullResponse += " ";
1407
+ fullResponse += chunk.segment.text;
1408
+ segBuf.push(chunk.segment);
1409
+ if (/[.!?]["'»)]*\s*$/.test(chunk.segment.text)) {
1410
+ flushSegments();
1411
+ }
1412
+ } else if (chunk.type === "token" && chunk.token) {
1413
+ if (isFirstChunk) {
1414
+ tLlmFirstToken = performance.now();
1415
+ isFirstChunk = false;
1416
+ log7.info(`llm_first_token: ${(tLlmFirstToken - tSpeechEnd).toFixed(0)}ms`);
1417
+ }
1418
+ fullResponse += chunk.token;
1419
+ const sentences = this.splitter.push(chunk.token);
1420
+ for (const sentence of sentences) {
1421
+ pushSentence(sentence);
1422
+ }
1406
1423
  }
1407
1424
  }
1425
+ } finally {
1426
+ await llmStream.return(void 0);
1408
1427
  }
1409
- } finally {
1410
- await llmStream.return(void 0);
1411
- }
1412
- if (!signal.aborted) {
1413
- flushSegments();
1414
- const remaining = this.splitter.flush();
1415
- if (remaining) {
1416
- pushSentence(remaining);
1417
- }
1418
- if (!fullResponse.trim()) {
1419
- log7.warn("LLM produced no output (empty response or no segments detected)");
1428
+ if (!signal.aborted) {
1429
+ flushSegments();
1430
+ const remaining = this.splitter.flush();
1431
+ if (remaining) {
1432
+ pushSentence(remaining);
1433
+ }
1434
+ if (fullResponse.trim()) {
1435
+ break;
1436
+ }
1437
+ log7.warn(`LLM produced no output (attempt ${attempt + 1}/${MAX_LLM_RETRIES + 1})`);
1420
1438
  }
1421
1439
  }
1422
1440
  producerDone = true;
@@ -1438,7 +1456,7 @@ var Pipeline = class extends import_events.EventEmitter {
1438
1456
  tFirstAudioPlayed = t;
1439
1457
  this.setAgentState("speaking");
1440
1458
  }
1441
- this.emit("sentence", this.cleanText(sentence));
1459
+ this.emit("sentence", this.cleanText(sentence), sentence);
1442
1460
  });
1443
1461
  continue;
1444
1462
  }
@@ -1507,7 +1525,7 @@ var Pipeline = class extends import_events.EventEmitter {
1507
1525
  this.setAgentState("thinking");
1508
1526
  await this.synthesizeAndPlay(text, signal, () => {
1509
1527
  this.setAgentState("speaking");
1510
- this.emit("sentence", this.cleanText(text));
1528
+ this.emit("sentence", this.cleanText(text), text);
1511
1529
  });
1512
1530
  if (!signal.aborted) {
1513
1531
  await this.audioOutput.writeSilence(40);
@@ -1606,6 +1624,14 @@ var VoiceAgent = class extends import_events2.EventEmitter {
1606
1624
  }
1607
1625
  await this.pipeline.say(text);
1608
1626
  }
1627
+ /**
1628
+ * Switch STT language on all active streams.
1629
+ * Use for bilingual lessons — e.g. switch to 'es' with forceWhisper when
1630
+ * expecting Spanish, back to 'auto' for Parakeet auto-detect otherwise.
1631
+ */
1632
+ setSTTLanguage(language, options) {
1633
+ this.pipeline?.setSTTLanguage(language, options);
1634
+ }
1609
1635
  /** Start the agent — connect to room and begin listening. */
1610
1636
  async start(options) {
1611
1637
  if (this._running) {
@@ -1647,7 +1673,7 @@ var VoiceAgent = class extends import_events2.EventEmitter {
1647
1673
  maxContextTokens: this.config.maxContextTokens
1648
1674
  });
1649
1675
  this.pipeline.on("transcription", (result) => this.emit("transcription", result));
1650
- this.pipeline.on("sentence", (text) => this.emit("sentence", text));
1676
+ this.pipeline.on("sentence", (text, raw) => this.emit("sentence", text, raw));
1651
1677
  this.pipeline.on("response", (text) => this.emit("response", text));
1652
1678
  this.pipeline.on("agentState", (state) => this.emit("agentState", state));
1653
1679
  this.pipeline.on("error", (error) => this.emit("error", error));