@dtelecom/agents-js 0.1.14 → 0.1.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +14 -2
- package/dist/index.d.ts +14 -2
- package/dist/index.js +79 -53
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +79 -53
- package/dist/index.mjs.map +1 -1
- package/dist/memory/index.d.mts +1 -1
- package/dist/memory/index.d.ts +1 -1
- package/dist/providers/index.d.mts +85 -2
- package/dist/providers/index.d.ts +85 -2
- package/dist/providers/index.js +469 -29
- package/dist/providers/index.js.map +1 -1
- package/dist/providers/index.mjs +467 -29
- package/dist/providers/index.mjs.map +1 -1
- package/dist/{types-BVMiP1bW.d.mts → types-MPHcuMhp.d.mts} +4 -0
- package/dist/{types-BVMiP1bW.d.ts → types-MPHcuMhp.d.ts} +4 -0
- package/package.json +1 -1
package/dist/index.d.mts
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import * as _dtelecom_server_sdk_node from '@dtelecom/server-sdk-node';
|
|
2
2
|
import { Room, AudioSource, RemoteAudioTrack, AudioFrame } from '@dtelecom/server-sdk-node';
|
|
3
3
|
import { EventEmitter } from 'events';
|
|
4
|
-
import { A as AgentConfig, a as AgentStartOptions, M as Message, L as LLMPlugin, P as PipelineOptions, b as AgentState, S as STTStream, T as TranscriptionResult } from './types-BVMiP1bW.mjs';
|
|
5
|
-
export { c as AgentEvents, d as AudioOutput, D as DataMessageHandler, e as LLMChunk, f as MemoryConfig, g as PipelineEvents, R as RespondMode, h as STTPlugin, i as STTStreamOptions, j as TTSPlugin } from './types-BVMiP1bW.mjs';
|
|
4
|
+
import { A as AgentConfig, a as AgentStartOptions, M as Message, L as LLMPlugin, P as PipelineOptions, b as AgentState, S as STTStream, T as TranscriptionResult } from './types-MPHcuMhp.mjs';
|
|
5
|
+
export { c as AgentEvents, d as AudioOutput, D as DataMessageHandler, e as LLMChunk, f as MemoryConfig, g as PipelineEvents, R as RespondMode, h as STTPlugin, i as STTStreamOptions, j as TTSPlugin } from './types-MPHcuMhp.mjs';
|
|
6
6
|
|
|
7
7
|
declare class VoiceAgent extends EventEmitter {
|
|
8
8
|
private readonly config;
|
|
@@ -23,6 +23,14 @@ declare class VoiceAgent extends EventEmitter {
|
|
|
23
23
|
* Use for greetings or announcements. Supports barge-in.
|
|
24
24
|
*/
|
|
25
25
|
say(text: string): Promise<void>;
|
|
26
|
+
/**
|
|
27
|
+
* Switch STT language on all active streams.
|
|
28
|
+
* Use for bilingual lessons — e.g. switch to 'es' with forceWhisper when
|
|
29
|
+
* expecting Spanish, back to 'auto' for Parakeet auto-detect otherwise.
|
|
30
|
+
*/
|
|
31
|
+
setSTTLanguage(language: string, options?: {
|
|
32
|
+
forceWhisper?: boolean;
|
|
33
|
+
}): void;
|
|
26
34
|
/** Start the agent — connect to room and begin listening. */
|
|
27
35
|
start(options: AgentStartOptions): Promise<void>;
|
|
28
36
|
/** Stop the agent — disconnect and clean up. */
|
|
@@ -124,6 +132,10 @@ declare class Pipeline extends EventEmitter {
|
|
|
124
132
|
removeParticipant(identity: string): Promise<void>;
|
|
125
133
|
stop(): Promise<void>;
|
|
126
134
|
getContextManager(): ContextManager;
|
|
135
|
+
/** Switch STT language on all active streams (e.g. for bilingual lessons). */
|
|
136
|
+
setSTTLanguage(language: string, options?: {
|
|
137
|
+
forceWhisper?: boolean;
|
|
138
|
+
}): void;
|
|
127
139
|
private lastFinalAt;
|
|
128
140
|
private lastSttDuration;
|
|
129
141
|
private handleTranscription;
|
package/dist/index.d.ts
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import * as _dtelecom_server_sdk_node from '@dtelecom/server-sdk-node';
|
|
2
2
|
import { Room, AudioSource, RemoteAudioTrack, AudioFrame } from '@dtelecom/server-sdk-node';
|
|
3
3
|
import { EventEmitter } from 'events';
|
|
4
|
-
import { A as AgentConfig, a as AgentStartOptions, M as Message, L as LLMPlugin, P as PipelineOptions, b as AgentState, S as STTStream, T as TranscriptionResult } from './types-BVMiP1bW.js';
|
|
5
|
-
export { c as AgentEvents, d as AudioOutput, D as DataMessageHandler, e as LLMChunk, f as MemoryConfig, g as PipelineEvents, R as RespondMode, h as STTPlugin, i as STTStreamOptions, j as TTSPlugin } from './types-BVMiP1bW.js';
|
|
4
|
+
import { A as AgentConfig, a as AgentStartOptions, M as Message, L as LLMPlugin, P as PipelineOptions, b as AgentState, S as STTStream, T as TranscriptionResult } from './types-MPHcuMhp.js';
|
|
5
|
+
export { c as AgentEvents, d as AudioOutput, D as DataMessageHandler, e as LLMChunk, f as MemoryConfig, g as PipelineEvents, R as RespondMode, h as STTPlugin, i as STTStreamOptions, j as TTSPlugin } from './types-MPHcuMhp.js';
|
|
6
6
|
|
|
7
7
|
declare class VoiceAgent extends EventEmitter {
|
|
8
8
|
private readonly config;
|
|
@@ -23,6 +23,14 @@ declare class VoiceAgent extends EventEmitter {
|
|
|
23
23
|
* Use for greetings or announcements. Supports barge-in.
|
|
24
24
|
*/
|
|
25
25
|
say(text: string): Promise<void>;
|
|
26
|
+
/**
|
|
27
|
+
* Switch STT language on all active streams.
|
|
28
|
+
* Use for bilingual lessons — e.g. switch to 'es' with forceWhisper when
|
|
29
|
+
* expecting Spanish, back to 'auto' for Parakeet auto-detect otherwise.
|
|
30
|
+
*/
|
|
31
|
+
setSTTLanguage(language: string, options?: {
|
|
32
|
+
forceWhisper?: boolean;
|
|
33
|
+
}): void;
|
|
26
34
|
/** Start the agent — connect to room and begin listening. */
|
|
27
35
|
start(options: AgentStartOptions): Promise<void>;
|
|
28
36
|
/** Stop the agent — disconnect and clean up. */
|
|
@@ -124,6 +132,10 @@ declare class Pipeline extends EventEmitter {
|
|
|
124
132
|
removeParticipant(identity: string): Promise<void>;
|
|
125
133
|
stop(): Promise<void>;
|
|
126
134
|
getContextManager(): ContextManager;
|
|
135
|
+
/** Switch STT language on all active streams (e.g. for bilingual lessons). */
|
|
136
|
+
setSTTLanguage(language: string, options?: {
|
|
137
|
+
forceWhisper?: boolean;
|
|
138
|
+
}): void;
|
|
127
139
|
private lastFinalAt;
|
|
128
140
|
private lastSttDuration;
|
|
129
141
|
private handleTranscription;
|
package/dist/index.js
CHANGED
|
@@ -1274,11 +1274,20 @@ var Pipeline = class extends import_events.EventEmitter {
|
|
|
1274
1274
|
getContextManager() {
|
|
1275
1275
|
return this.context;
|
|
1276
1276
|
}
|
|
1277
|
+
/** Switch STT language on all active streams (e.g. for bilingual lessons). */
|
|
1278
|
+
setSTTLanguage(language, options) {
|
|
1279
|
+
for (const [identity, stream] of this.sttStreams) {
|
|
1280
|
+
if (stream.setLanguage) {
|
|
1281
|
+
stream.setLanguage(language, options);
|
|
1282
|
+
log7.info(`STT language \u2192 ${language}${options?.forceWhisper ? " (whisper)" : ""} for "${identity}"`);
|
|
1283
|
+
}
|
|
1284
|
+
}
|
|
1285
|
+
}
|
|
1277
1286
|
lastFinalAt = 0;
|
|
1278
1287
|
lastSttDuration = 0;
|
|
1279
1288
|
async handleTranscription(speaker, result) {
|
|
1280
1289
|
this.emit("transcription", { ...result, speaker });
|
|
1281
|
-
if (!result.isFinal
|
|
1290
|
+
if (!result.isFinal) {
|
|
1282
1291
|
this.setAgentState("listening");
|
|
1283
1292
|
}
|
|
1284
1293
|
if (this.audioOutput.playing && result.text.trim().length > 0) {
|
|
@@ -1363,60 +1372,69 @@ var Pipeline = class extends import_events.EventEmitter {
|
|
|
1363
1372
|
sentenceQueue.push(text2);
|
|
1364
1373
|
wake();
|
|
1365
1374
|
};
|
|
1375
|
+
const MAX_LLM_RETRIES = 2;
|
|
1366
1376
|
const producer = async () => {
|
|
1367
|
-
let isFirstChunk = true;
|
|
1368
1377
|
const defaultLang = this.tts?.defaultLanguage;
|
|
1369
|
-
|
|
1370
|
-
|
|
1371
|
-
if (
|
|
1372
|
-
|
|
1373
|
-
(
|
|
1374
|
-
|
|
1375
|
-
|
|
1376
|
-
|
|
1377
|
-
|
|
1378
|
-
|
|
1379
|
-
|
|
1380
|
-
|
|
1381
|
-
|
|
1382
|
-
|
|
1383
|
-
|
|
1384
|
-
|
|
1385
|
-
|
|
1386
|
-
|
|
1387
|
-
|
|
1388
|
-
|
|
1389
|
-
|
|
1390
|
-
if (
|
|
1391
|
-
|
|
1392
|
-
|
|
1393
|
-
|
|
1394
|
-
|
|
1395
|
-
|
|
1396
|
-
|
|
1397
|
-
|
|
1398
|
-
|
|
1399
|
-
|
|
1400
|
-
|
|
1401
|
-
|
|
1402
|
-
|
|
1403
|
-
|
|
1404
|
-
|
|
1405
|
-
|
|
1378
|
+
for (let attempt = 0; attempt <= MAX_LLM_RETRIES; attempt++) {
|
|
1379
|
+
if (signal.aborted) break;
|
|
1380
|
+
if (attempt > 0) {
|
|
1381
|
+
log7.warn(`LLM retry ${attempt}/${MAX_LLM_RETRIES}...`);
|
|
1382
|
+
this.splitter.reset();
|
|
1383
|
+
}
|
|
1384
|
+
let isFirstChunk = true;
|
|
1385
|
+
const segBuf = [];
|
|
1386
|
+
const flushSegments = () => {
|
|
1387
|
+
if (segBuf.length === 0) return;
|
|
1388
|
+
const combined = segBuf.map(
|
|
1389
|
+
(s) => s.lang !== defaultLang ? `<lang xml:lang="${s.lang}">${s.text}</lang>` : s.text
|
|
1390
|
+
).join(" ");
|
|
1391
|
+
segBuf.length = 0;
|
|
1392
|
+
pushSentence(combined);
|
|
1393
|
+
};
|
|
1394
|
+
const llmStream = this.llm.chat(messages, signal);
|
|
1395
|
+
try {
|
|
1396
|
+
while (!signal.aborted) {
|
|
1397
|
+
const { value: chunk, done } = await llmStream.next();
|
|
1398
|
+
if (done || !chunk) break;
|
|
1399
|
+
if (signal.aborted) break;
|
|
1400
|
+
if (chunk.type === "segment" && chunk.segment) {
|
|
1401
|
+
if (isFirstChunk) {
|
|
1402
|
+
tLlmFirstToken = performance.now();
|
|
1403
|
+
isFirstChunk = false;
|
|
1404
|
+
log7.info(`llm_first_segment: ${(tLlmFirstToken - tSpeechEnd).toFixed(0)}ms`);
|
|
1405
|
+
}
|
|
1406
|
+
if (fullResponse) fullResponse += " ";
|
|
1407
|
+
fullResponse += chunk.segment.text;
|
|
1408
|
+
segBuf.push(chunk.segment);
|
|
1409
|
+
if (/[.!?]["'»)]*\s*$/.test(chunk.segment.text)) {
|
|
1410
|
+
flushSegments();
|
|
1411
|
+
}
|
|
1412
|
+
} else if (chunk.type === "token" && chunk.token) {
|
|
1413
|
+
if (isFirstChunk) {
|
|
1414
|
+
tLlmFirstToken = performance.now();
|
|
1415
|
+
isFirstChunk = false;
|
|
1416
|
+
log7.info(`llm_first_token: ${(tLlmFirstToken - tSpeechEnd).toFixed(0)}ms`);
|
|
1417
|
+
}
|
|
1418
|
+
fullResponse += chunk.token;
|
|
1419
|
+
const sentences = this.splitter.push(chunk.token);
|
|
1420
|
+
for (const sentence of sentences) {
|
|
1421
|
+
pushSentence(sentence);
|
|
1422
|
+
}
|
|
1406
1423
|
}
|
|
1407
1424
|
}
|
|
1425
|
+
} finally {
|
|
1426
|
+
await llmStream.return(void 0);
|
|
1408
1427
|
}
|
|
1409
|
-
|
|
1410
|
-
|
|
1411
|
-
|
|
1412
|
-
|
|
1413
|
-
|
|
1414
|
-
|
|
1415
|
-
|
|
1416
|
-
|
|
1417
|
-
|
|
1418
|
-
|
|
1419
|
-
log7.warn("LLM produced no output (empty response or no segments detected)");
|
|
1428
|
+
if (!signal.aborted) {
|
|
1429
|
+
flushSegments();
|
|
1430
|
+
const remaining = this.splitter.flush();
|
|
1431
|
+
if (remaining) {
|
|
1432
|
+
pushSentence(remaining);
|
|
1433
|
+
}
|
|
1434
|
+
if (fullResponse.trim()) {
|
|
1435
|
+
break;
|
|
1436
|
+
}
|
|
1437
|
+
log7.warn(`LLM produced no output (attempt ${attempt + 1}/${MAX_LLM_RETRIES + 1})`);
|
|
1420
1438
|
}
|
|
1421
1439
|
}
|
|
1422
1440
|
producerDone = true;
|
|
@@ -1438,7 +1456,7 @@ var Pipeline = class extends import_events.EventEmitter {
|
|
|
1438
1456
|
tFirstAudioPlayed = t;
|
|
1439
1457
|
this.setAgentState("speaking");
|
|
1440
1458
|
}
|
|
1441
|
-
this.emit("sentence", this.cleanText(sentence));
|
|
1459
|
+
this.emit("sentence", this.cleanText(sentence), sentence);
|
|
1442
1460
|
});
|
|
1443
1461
|
continue;
|
|
1444
1462
|
}
|
|
@@ -1507,7 +1525,7 @@ var Pipeline = class extends import_events.EventEmitter {
|
|
|
1507
1525
|
this.setAgentState("thinking");
|
|
1508
1526
|
await this.synthesizeAndPlay(text, signal, () => {
|
|
1509
1527
|
this.setAgentState("speaking");
|
|
1510
|
-
this.emit("sentence", this.cleanText(text));
|
|
1528
|
+
this.emit("sentence", this.cleanText(text), text);
|
|
1511
1529
|
});
|
|
1512
1530
|
if (!signal.aborted) {
|
|
1513
1531
|
await this.audioOutput.writeSilence(40);
|
|
@@ -1606,6 +1624,14 @@ var VoiceAgent = class extends import_events2.EventEmitter {
|
|
|
1606
1624
|
}
|
|
1607
1625
|
await this.pipeline.say(text);
|
|
1608
1626
|
}
|
|
1627
|
+
/**
|
|
1628
|
+
* Switch STT language on all active streams.
|
|
1629
|
+
* Use for bilingual lessons — e.g. switch to 'es' with forceWhisper when
|
|
1630
|
+
* expecting Spanish, back to 'auto' for Parakeet auto-detect otherwise.
|
|
1631
|
+
*/
|
|
1632
|
+
setSTTLanguage(language, options) {
|
|
1633
|
+
this.pipeline?.setSTTLanguage(language, options);
|
|
1634
|
+
}
|
|
1609
1635
|
/** Start the agent — connect to room and begin listening. */
|
|
1610
1636
|
async start(options) {
|
|
1611
1637
|
if (this._running) {
|
|
@@ -1647,7 +1673,7 @@ var VoiceAgent = class extends import_events2.EventEmitter {
|
|
|
1647
1673
|
maxContextTokens: this.config.maxContextTokens
|
|
1648
1674
|
});
|
|
1649
1675
|
this.pipeline.on("transcription", (result) => this.emit("transcription", result));
|
|
1650
|
-
this.pipeline.on("sentence", (text) => this.emit("sentence", text));
|
|
1676
|
+
this.pipeline.on("sentence", (text, raw) => this.emit("sentence", text, raw));
|
|
1651
1677
|
this.pipeline.on("response", (text) => this.emit("response", text));
|
|
1652
1678
|
this.pipeline.on("agentState", (state) => this.emit("agentState", state));
|
|
1653
1679
|
this.pipeline.on("error", (error) => this.emit("error", error));
|