@dtelecom/agents-js 0.1.14 → 0.1.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +2 -2
- package/dist/index.d.ts +2 -2
- package/dist/memory/index.d.mts +1 -1
- package/dist/memory/index.d.ts +1 -1
- package/dist/providers/index.d.mts +85 -2
- package/dist/providers/index.d.ts +85 -2
- package/dist/providers/index.js +394 -0
- package/dist/providers/index.js.map +1 -1
- package/dist/providers/index.mjs +392 -0
- package/dist/providers/index.mjs.map +1 -1
- package/dist/{types-BVMiP1bW.d.mts → types-MPHcuMhp.d.mts} +4 -0
- package/dist/{types-BVMiP1bW.d.ts → types-MPHcuMhp.d.ts} +4 -0
- package/package.json +2 -1
package/dist/index.d.mts
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import * as _dtelecom_server_sdk_node from '@dtelecom/server-sdk-node';
|
|
2
2
|
import { Room, AudioSource, RemoteAudioTrack, AudioFrame } from '@dtelecom/server-sdk-node';
|
|
3
3
|
import { EventEmitter } from 'events';
|
|
4
|
-
import { A as AgentConfig, a as AgentStartOptions, M as Message, L as LLMPlugin, P as PipelineOptions, b as AgentState, S as STTStream, T as TranscriptionResult } from './types-
|
|
5
|
-
export { c as AgentEvents, d as AudioOutput, D as DataMessageHandler, e as LLMChunk, f as MemoryConfig, g as PipelineEvents, R as RespondMode, h as STTPlugin, i as STTStreamOptions, j as TTSPlugin } from './types-
|
|
4
|
+
import { A as AgentConfig, a as AgentStartOptions, M as Message, L as LLMPlugin, P as PipelineOptions, b as AgentState, S as STTStream, T as TranscriptionResult } from './types-MPHcuMhp.mjs';
|
|
5
|
+
export { c as AgentEvents, d as AudioOutput, D as DataMessageHandler, e as LLMChunk, f as MemoryConfig, g as PipelineEvents, R as RespondMode, h as STTPlugin, i as STTStreamOptions, j as TTSPlugin } from './types-MPHcuMhp.mjs';
|
|
6
6
|
|
|
7
7
|
declare class VoiceAgent extends EventEmitter {
|
|
8
8
|
private readonly config;
|
package/dist/index.d.ts
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import * as _dtelecom_server_sdk_node from '@dtelecom/server-sdk-node';
|
|
2
2
|
import { Room, AudioSource, RemoteAudioTrack, AudioFrame } from '@dtelecom/server-sdk-node';
|
|
3
3
|
import { EventEmitter } from 'events';
|
|
4
|
-
import { A as AgentConfig, a as AgentStartOptions, M as Message, L as LLMPlugin, P as PipelineOptions, b as AgentState, S as STTStream, T as TranscriptionResult } from './types-
|
|
5
|
-
export { c as AgentEvents, d as AudioOutput, D as DataMessageHandler, e as LLMChunk, f as MemoryConfig, g as PipelineEvents, R as RespondMode, h as STTPlugin, i as STTStreamOptions, j as TTSPlugin } from './types-
|
|
4
|
+
import { A as AgentConfig, a as AgentStartOptions, M as Message, L as LLMPlugin, P as PipelineOptions, b as AgentState, S as STTStream, T as TranscriptionResult } from './types-MPHcuMhp.js';
|
|
5
|
+
export { c as AgentEvents, d as AudioOutput, D as DataMessageHandler, e as LLMChunk, f as MemoryConfig, g as PipelineEvents, R as RespondMode, h as STTPlugin, i as STTStreamOptions, j as TTSPlugin } from './types-MPHcuMhp.js';
|
|
6
6
|
|
|
7
7
|
declare class VoiceAgent extends EventEmitter {
|
|
8
8
|
private readonly config;
|
package/dist/memory/index.d.mts
CHANGED
package/dist/memory/index.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { h as STTPlugin, i as STTStreamOptions, S as STTStream, L as LLMPlugin, M as Message, e as LLMChunk, j as TTSPlugin } from '../types-
|
|
1
|
+
import { h as STTPlugin, i as STTStreamOptions, S as STTStream, L as LLMPlugin, M as Message, e as LLMChunk, j as TTSPlugin } from '../types-MPHcuMhp.mjs';
|
|
2
2
|
import '@dtelecom/server-sdk-node';
|
|
3
3
|
|
|
4
4
|
/**
|
|
@@ -200,4 +200,87 @@ declare class DeepgramTTS implements TTSPlugin {
|
|
|
200
200
|
private ensureConnection;
|
|
201
201
|
}
|
|
202
202
|
|
|
203
|
-
|
|
203
|
+
/**
|
|
204
|
+
* DtelecomSTT — real-time streaming STT via dTelecom STT server (realtime-stt-m2).
|
|
205
|
+
*
|
|
206
|
+
* Protocol:
|
|
207
|
+
* - Connect to ws://<server>:<port> (address from options, no API key)
|
|
208
|
+
* - Send config: {"type":"config","language":"en"} (or "auto" for Parakeet auto-detect)
|
|
209
|
+
* - Wait for ready: {"type":"ready","client_id":"...","language":"en"}
|
|
210
|
+
* - Send audio as binary PCM16 16kHz mono frames
|
|
211
|
+
* - Receive transcriptions: {"type":"transcription","text":"...","is_final":true,"latency_ms":N}
|
|
212
|
+
* - Receive VAD events: {"type":"vad_event","event":"speech_start"|"speech_end"}
|
|
213
|
+
* - Keepalive via {"type":"ping"} / {"type":"pong"}
|
|
214
|
+
* - Mid-session reconfigure: send {"type":"config","language":"es","model":"whisper"} at any time
|
|
215
|
+
*/
|
|
216
|
+
|
|
217
|
+
/**
 * Construction options for {@link DtelecomSTT}.
 */
interface DtelecomSTTOptions {
    /**
     * Address of the STT WebSocket server, for example "ws://192.168.1.100:8765".
     * Required: the DtelecomSTT constructor throws when it is empty.
     */
    serverUrl: string;
    /**
     * Language sent in the initial config message.
     * When neither this nor the per-stream language is given, "auto" is used.
     */
    language?: string;
    /**
     * When true, the initial config message asks for the "whisper" model
     * explicitly instead of leaving the model choice to the server.
     */
    forceWhisper?: boolean;
}
|
|
225
|
+
/**
 * STT plugin backed by a dTelecom STT WebSocket server.
 * Each call to createStream() opens its own streaming session.
 */
declare class DtelecomSTT implements STTPlugin {
    /** Options captured at construction and handed to every stream. */
    private readonly options;
    /**
     * @param options Server address and optional language/model defaults.
     * @throws Error when `options.serverUrl` is empty.
     */
    constructor(options: DtelecomSTTOptions);
    /**
     * Open a new streaming session.
     * Language precedence: `options.language`, then the plugin-level language, then "auto".
     */
    createStream(options?: STTStreamOptions): STTStream;
}
|
|
230
|
+
|
|
231
|
+
/**
|
|
232
|
+
* DtelecomTTS — real-time streaming TTS via dTelecom TTS server (realtime-tts-m2).
|
|
233
|
+
*
|
|
234
|
+
* Protocol:
|
|
235
|
+
* - Connect to ws://<server>:<port> (address from options, no API key)
|
|
236
|
+
* - Send config: {"config":{"voice":"af_heart","lang_code":"a","speed":1.0}}
|
|
237
|
+
* - Send text: {"text":"Hello world"} — uses config defaults
|
|
238
|
+
* - Send text with per-message override: {"text":"Hola","voice":"bf_emma","lang_code":"b","speed":1.0}
|
|
239
|
+
* - Receive: {"type":"generating","text":"..."} then binary PCM16 24kHz chunks, then {"type":"done"}
|
|
240
|
+
* - Cancel: {"type":"clear"} → {"type":"cleared"}
|
|
241
|
+
*
|
|
242
|
+
* Key differences from DeepgramTTS:
|
|
243
|
+
* - Single WebSocket connection (not per-language pool)
|
|
244
|
+
* - Per-message voice/language switching instead of separate connections
|
|
245
|
+
* - Server outputs 24kHz PCM16, we resample to 48kHz for the pipeline
|
|
246
|
+
* - Uses SSML <lang> tags to route text segments to correct voice (same as DeepgramTTS)
|
|
247
|
+
*/
|
|
248
|
+
|
|
249
|
+
/**
 * Voice selection for one language, as sent to the dTelecom TTS server.
 */
interface VoiceConfig {
    /** Voice identifier, sent as the "voice" field (e.g. "af_heart"). */
    voice: string;
    /** Server-side language code, sent as the "lang_code" field (e.g. "a"). */
    langCode: string;
}
|
|
253
|
+
/**
 * Construction options for {@link DtelecomTTS}.
 */
interface DtelecomTTSOptions {
    /**
     * WebSocket server URL, e.g. "ws://192.168.1.100:8766".
     * Required: the DtelecomTTS constructor throws when it is empty.
     */
    serverUrl: string;
    /**
     * Voice config per language, keyed by language code:
     * { en: { voice: "af_heart", langCode: "a" }, es: { voice: "bf_emma", langCode: "b" } }.
     * Must contain at least one entry; the constructor throws otherwise.
     */
    voices: Record<string, VoiceConfig>;
    /**
     * Language used for untagged text and for segments whose language has no
     * entry in `voices`. Defaults to the first key of `voices` (not a fixed "en").
     */
    defaultLanguage?: string;
    /** Speech speed multiplier (default: 1.0) */
    speed?: number;
}
|
|
263
|
+
/**
 * TTS plugin that streams synthesized speech from a dTelecom TTS server over
 * one shared WebSocket, choosing the voice per text segment.
 */
declare class DtelecomTTS implements TTSPlugin {
    /** Address the WebSocket connects to. */
    private readonly serverUrl;
    /** Copy of the per-language voice table given at construction. */
    private readonly voices;
    /** Language applied to untagged text and to unknown segment languages. */
    private readonly defaultLang;
    /** Speed multiplier sent with the config and with each text message. */
    private readonly speed;
    /** Current socket, or null when not connected. */
    private ws;
    /** In-flight connection attempt shared by concurrent callers. */
    private connectPromise;
    /** Receive state of the segment currently being synthesized. */
    private flushState;
    /** Language code applied to text that carries no language tag (e.g. 'en'). */
    get defaultLanguage(): string;
    /**
     * @throws Error when `serverUrl` is empty or `voices` has no entries.
     */
    constructor(options: DtelecomTTSOptions);
    /** Open the WebSocket ahead of first use; failures are logged, not thrown. */
    warmup(): Promise<void>;
    /** Close the WebSocket and drop connection state. */
    close(): void;
    /** Return the text with language tags removed and whitespace collapsed. */
    cleanText(text: string): string;
    /**
     * Synthesize `text` segment by segment and yield audio buffers as they arrive.
     * A 200 ms silence buffer is yielded between segments whose language differs.
     * Stops early once `signal` is aborted.
     */
    synthesize(text: string, signal?: AbortSignal): AsyncGenerator<Buffer>;
    /** Synthesize one single-language segment. */
    private synthesizeSegment;
    /** Resolve once a WebSocket connection exists and is open. */
    private ensureConnection;
}
|
|
285
|
+
|
|
286
|
+
export { CartesiaTTS, type CartesiaTTSOptions, DeepgramSTT, type DeepgramSTTOptions, DeepgramTTS, type DeepgramTTSOptions, DtelecomSTT, type DtelecomSTTOptions, DtelecomTTS, type DtelecomTTSOptions, type VoiceConfig as DtelecomVoiceConfig, OpenRouterLLM, type OpenRouterLLMOptions };
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { h as STTPlugin, i as STTStreamOptions, S as STTStream, L as LLMPlugin, M as Message, e as LLMChunk, j as TTSPlugin } from '../types-
|
|
1
|
+
import { h as STTPlugin, i as STTStreamOptions, S as STTStream, L as LLMPlugin, M as Message, e as LLMChunk, j as TTSPlugin } from '../types-MPHcuMhp.js';
|
|
2
2
|
import '@dtelecom/server-sdk-node';
|
|
3
3
|
|
|
4
4
|
/**
|
|
@@ -200,4 +200,87 @@ declare class DeepgramTTS implements TTSPlugin {
|
|
|
200
200
|
private ensureConnection;
|
|
201
201
|
}
|
|
202
202
|
|
|
203
|
-
|
|
203
|
+
/**
|
|
204
|
+
* DtelecomSTT — real-time streaming STT via dTelecom STT server (realtime-stt-m2).
|
|
205
|
+
*
|
|
206
|
+
* Protocol:
|
|
207
|
+
* - Connect to ws://<server>:<port> (address from options, no API key)
|
|
208
|
+
* - Send config: {"type":"config","language":"en"} (or "auto" for Parakeet auto-detect)
|
|
209
|
+
* - Wait for ready: {"type":"ready","client_id":"...","language":"en"}
|
|
210
|
+
* - Send audio as binary PCM16 16kHz mono frames
|
|
211
|
+
* - Receive transcriptions: {"type":"transcription","text":"...","is_final":true,"latency_ms":N}
|
|
212
|
+
* - Receive VAD events: {"type":"vad_event","event":"speech_start"|"speech_end"}
|
|
213
|
+
* - Keepalive via {"type":"ping"} / {"type":"pong"}
|
|
214
|
+
* - Mid-session reconfigure: send {"type":"config","language":"es","model":"whisper"} at any time
|
|
215
|
+
*/
|
|
216
|
+
|
|
217
|
+
/**
 * Construction options for {@link DtelecomSTT}.
 */
interface DtelecomSTTOptions {
    /**
     * Address of the STT WebSocket server, for example "ws://192.168.1.100:8765".
     * Required: the DtelecomSTT constructor throws when it is empty.
     */
    serverUrl: string;
    /**
     * Language sent in the initial config message.
     * When neither this nor the per-stream language is given, "auto" is used.
     */
    language?: string;
    /**
     * When true, the initial config message asks for the "whisper" model
     * explicitly instead of leaving the model choice to the server.
     */
    forceWhisper?: boolean;
}
|
|
225
|
+
/**
 * STT plugin backed by a dTelecom STT WebSocket server.
 * Each call to createStream() opens its own streaming session.
 */
declare class DtelecomSTT implements STTPlugin {
    /** Options captured at construction and handed to every stream. */
    private readonly options;
    /**
     * @param options Server address and optional language/model defaults.
     * @throws Error when `options.serverUrl` is empty.
     */
    constructor(options: DtelecomSTTOptions);
    /**
     * Open a new streaming session.
     * Language precedence: `options.language`, then the plugin-level language, then "auto".
     */
    createStream(options?: STTStreamOptions): STTStream;
}
|
|
230
|
+
|
|
231
|
+
/**
|
|
232
|
+
* DtelecomTTS — real-time streaming TTS via dTelecom TTS server (realtime-tts-m2).
|
|
233
|
+
*
|
|
234
|
+
* Protocol:
|
|
235
|
+
* - Connect to ws://<server>:<port> (address from options, no API key)
|
|
236
|
+
* - Send config: {"config":{"voice":"af_heart","lang_code":"a","speed":1.0}}
|
|
237
|
+
* - Send text: {"text":"Hello world"} — uses config defaults
|
|
238
|
+
* - Send text with per-message override: {"text":"Hola","voice":"bf_emma","lang_code":"b","speed":1.0}
|
|
239
|
+
* - Receive: {"type":"generating","text":"..."} then binary PCM16 24kHz chunks, then {"type":"done"}
|
|
240
|
+
* - Cancel: {"type":"clear"} → {"type":"cleared"}
|
|
241
|
+
*
|
|
242
|
+
* Key differences from DeepgramTTS:
|
|
243
|
+
* - Single WebSocket connection (not per-language pool)
|
|
244
|
+
* - Per-message voice/language switching instead of separate connections
|
|
245
|
+
* - Server outputs 24kHz PCM16, we resample to 48kHz for the pipeline
|
|
246
|
+
* - Uses SSML <lang> tags to route text segments to correct voice (same as DeepgramTTS)
|
|
247
|
+
*/
|
|
248
|
+
|
|
249
|
+
/**
 * Voice selection for one language, as sent to the dTelecom TTS server.
 */
interface VoiceConfig {
    /** Voice identifier, sent as the "voice" field (e.g. "af_heart"). */
    voice: string;
    /** Server-side language code, sent as the "lang_code" field (e.g. "a"). */
    langCode: string;
}
|
|
253
|
+
/**
 * Construction options for {@link DtelecomTTS}.
 */
interface DtelecomTTSOptions {
    /**
     * WebSocket server URL, e.g. "ws://192.168.1.100:8766".
     * Required: the DtelecomTTS constructor throws when it is empty.
     */
    serverUrl: string;
    /**
     * Voice config per language, keyed by language code:
     * { en: { voice: "af_heart", langCode: "a" }, es: { voice: "bf_emma", langCode: "b" } }.
     * Must contain at least one entry; the constructor throws otherwise.
     */
    voices: Record<string, VoiceConfig>;
    /**
     * Language used for untagged text and for segments whose language has no
     * entry in `voices`. Defaults to the first key of `voices` (not a fixed "en").
     */
    defaultLanguage?: string;
    /** Speech speed multiplier (default: 1.0) */
    speed?: number;
}
|
|
263
|
+
/**
 * TTS plugin that streams synthesized speech from a dTelecom TTS server over
 * one shared WebSocket, choosing the voice per text segment.
 */
declare class DtelecomTTS implements TTSPlugin {
    /** Address the WebSocket connects to. */
    private readonly serverUrl;
    /** Copy of the per-language voice table given at construction. */
    private readonly voices;
    /** Language applied to untagged text and to unknown segment languages. */
    private readonly defaultLang;
    /** Speed multiplier sent with the config and with each text message. */
    private readonly speed;
    /** Current socket, or null when not connected. */
    private ws;
    /** In-flight connection attempt shared by concurrent callers. */
    private connectPromise;
    /** Receive state of the segment currently being synthesized. */
    private flushState;
    /** Language code applied to text that carries no language tag (e.g. 'en'). */
    get defaultLanguage(): string;
    /**
     * @throws Error when `serverUrl` is empty or `voices` has no entries.
     */
    constructor(options: DtelecomTTSOptions);
    /** Open the WebSocket ahead of first use; failures are logged, not thrown. */
    warmup(): Promise<void>;
    /** Close the WebSocket and drop connection state. */
    close(): void;
    /** Return the text with language tags removed and whitespace collapsed. */
    cleanText(text: string): string;
    /**
     * Synthesize `text` segment by segment and yield audio buffers as they arrive.
     * A 200 ms silence buffer is yielded between segments whose language differs.
     * Stops early once `signal` is aborted.
     */
    synthesize(text: string, signal?: AbortSignal): AsyncGenerator<Buffer>;
    /** Synthesize one single-language segment. */
    private synthesizeSegment;
    /** Resolve once a WebSocket connection exists and is open. */
    private ensureConnection;
}
|
|
285
|
+
|
|
286
|
+
export { CartesiaTTS, type CartesiaTTSOptions, DeepgramSTT, type DeepgramSTTOptions, DeepgramTTS, type DeepgramTTSOptions, DtelecomSTT, type DtelecomSTTOptions, DtelecomTTS, type DtelecomTTSOptions, type VoiceConfig as DtelecomVoiceConfig, OpenRouterLLM, type OpenRouterLLMOptions };
|
package/dist/providers/index.js
CHANGED
|
@@ -33,6 +33,8 @@ __export(providers_exports, {
|
|
|
33
33
|
CartesiaTTS: () => CartesiaTTS,
|
|
34
34
|
DeepgramSTT: () => DeepgramSTT,
|
|
35
35
|
DeepgramTTS: () => DeepgramTTS,
|
|
36
|
+
DtelecomSTT: () => DtelecomSTT,
|
|
37
|
+
DtelecomTTS: () => DtelecomTTS,
|
|
36
38
|
OpenRouterLLM: () => OpenRouterLLM
|
|
37
39
|
});
|
|
38
40
|
module.exports = __toCommonJS(providers_exports);
|
|
@@ -904,11 +906,403 @@ var DeepgramTTS = class {
|
|
|
904
906
|
return promise;
|
|
905
907
|
}
|
|
906
908
|
};
|
|
909
|
+
|
|
910
|
+
// src/providers/dtelecom-stt.ts
|
|
911
|
+
var import_ws4 = __toESM(require("ws"));
|
|
912
|
+
var log5 = createLogger("DtelecomSTT");
|
|
913
|
+
var KEEPALIVE_INTERVAL_MS2 = 5e3;
|
|
914
|
+
var DtelecomSTT = class {
|
|
915
|
+
options;
|
|
916
|
+
constructor(options) {
|
|
917
|
+
if (!options.serverUrl) {
|
|
918
|
+
throw new Error("DtelecomSTT requires a serverUrl");
|
|
919
|
+
}
|
|
920
|
+
this.options = options;
|
|
921
|
+
}
|
|
922
|
+
createStream(options) {
|
|
923
|
+
const language = options?.language ?? this.options.language ?? "auto";
|
|
924
|
+
return new DtelecomSTTStream(this.options, language);
|
|
925
|
+
}
|
|
926
|
+
};
|
|
927
|
+
var DtelecomSTTStream = class extends BaseSTTStream {
|
|
928
|
+
ws = null;
|
|
929
|
+
serverUrl;
|
|
930
|
+
forceWhisper;
|
|
931
|
+
_ready = false;
|
|
932
|
+
_closed = false;
|
|
933
|
+
pendingAudio = [];
|
|
934
|
+
keepAliveTimer = null;
|
|
935
|
+
language;
|
|
936
|
+
constructor(options, language) {
|
|
937
|
+
super();
|
|
938
|
+
this.serverUrl = options.serverUrl;
|
|
939
|
+
this.language = language;
|
|
940
|
+
this.forceWhisper = options.forceWhisper ?? false;
|
|
941
|
+
this.connect();
|
|
942
|
+
}
|
|
943
|
+
sendAudio(pcm16) {
|
|
944
|
+
if (this._closed) return;
|
|
945
|
+
if (!this._ready) {
|
|
946
|
+
this.pendingAudio.push(pcm16);
|
|
947
|
+
return;
|
|
948
|
+
}
|
|
949
|
+
if (this.ws?.readyState === import_ws4.default.OPEN) {
|
|
950
|
+
this.ws.send(pcm16);
|
|
951
|
+
}
|
|
952
|
+
}
|
|
953
|
+
/**
|
|
954
|
+
* Switch language mid-session.
|
|
955
|
+
* Sends a reconfigure message to the server; clears buffers and updates model routing.
|
|
956
|
+
*/
|
|
957
|
+
setLanguage(language, options) {
|
|
958
|
+
if (this._closed) return;
|
|
959
|
+
this.language = language;
|
|
960
|
+
const config = { type: "config", language };
|
|
961
|
+
if (options?.forceWhisper) {
|
|
962
|
+
config.model = "whisper";
|
|
963
|
+
}
|
|
964
|
+
if (this.ws?.readyState === import_ws4.default.OPEN) {
|
|
965
|
+
this.ws.send(JSON.stringify(config));
|
|
966
|
+
log5.info(`Reconfiguring STT: language=${language}${options?.forceWhisper ? ", model=whisper" : ""}`);
|
|
967
|
+
}
|
|
968
|
+
}
|
|
969
|
+
async close() {
|
|
970
|
+
if (this._closed) return;
|
|
971
|
+
this._closed = true;
|
|
972
|
+
this._ready = false;
|
|
973
|
+
this.pendingAudio = [];
|
|
974
|
+
this.stopKeepAlive();
|
|
975
|
+
if (this.ws) {
|
|
976
|
+
this.ws.close();
|
|
977
|
+
this.ws = null;
|
|
978
|
+
}
|
|
979
|
+
log5.debug("DtelecomSTT stream closed");
|
|
980
|
+
}
|
|
981
|
+
connect() {
|
|
982
|
+
log5.debug(`Connecting to dTelecom STT: ${this.serverUrl}`);
|
|
983
|
+
this.ws = new import_ws4.default(this.serverUrl);
|
|
984
|
+
this.ws.on("open", () => {
|
|
985
|
+
log5.info("dTelecom STT WebSocket connected");
|
|
986
|
+
const config = { type: "config", language: this.language };
|
|
987
|
+
if (this.forceWhisper) {
|
|
988
|
+
config.model = "whisper";
|
|
989
|
+
}
|
|
990
|
+
this.ws.send(JSON.stringify(config));
|
|
991
|
+
});
|
|
992
|
+
this.ws.on("message", (data, isBinary) => {
|
|
993
|
+
if (isBinary) return;
|
|
994
|
+
try {
|
|
995
|
+
const msg = JSON.parse(data.toString());
|
|
996
|
+
this.handleMessage(msg);
|
|
997
|
+
} catch (err) {
|
|
998
|
+
log5.error("Failed to parse dTelecom STT message:", err);
|
|
999
|
+
}
|
|
1000
|
+
});
|
|
1001
|
+
this.ws.on("error", (err) => {
|
|
1002
|
+
log5.error("dTelecom STT WebSocket error:", err);
|
|
1003
|
+
this.emit("error", err instanceof Error ? err : new Error(String(err)));
|
|
1004
|
+
});
|
|
1005
|
+
this.ws.on("close", (code, reason) => {
|
|
1006
|
+
log5.debug(`dTelecom STT WebSocket closed: ${code} ${reason.toString()}`);
|
|
1007
|
+
this._ready = false;
|
|
1008
|
+
this.stopKeepAlive();
|
|
1009
|
+
if (!this._closed) {
|
|
1010
|
+
log5.info("dTelecom STT connection lost, reconnecting in 1s...");
|
|
1011
|
+
setTimeout(() => {
|
|
1012
|
+
if (!this._closed) this.connect();
|
|
1013
|
+
}, 1e3);
|
|
1014
|
+
}
|
|
1015
|
+
});
|
|
1016
|
+
}
|
|
1017
|
+
handleMessage(msg) {
|
|
1018
|
+
const type = msg.type;
|
|
1019
|
+
if (type === "ready") {
|
|
1020
|
+
this.handleReady(msg);
|
|
1021
|
+
} else if (type === "transcription") {
|
|
1022
|
+
this.handleTranscription(msg);
|
|
1023
|
+
} else if (type === "vad_event") {
|
|
1024
|
+
this.handleVadEvent(msg);
|
|
1025
|
+
} else if (type === "pong") {
|
|
1026
|
+
} else if (type === "error") {
|
|
1027
|
+
const errorMsg = msg.message || msg.error || "Unknown STT error";
|
|
1028
|
+
log5.error(`dTelecom STT error: ${errorMsg}`);
|
|
1029
|
+
this.emit("error", new Error(errorMsg));
|
|
1030
|
+
}
|
|
1031
|
+
}
|
|
1032
|
+
handleReady(msg) {
|
|
1033
|
+
const clientId = msg.client_id;
|
|
1034
|
+
const lang = msg.language;
|
|
1035
|
+
log5.info(`dTelecom STT ready: client_id=${clientId}, language=${lang}`);
|
|
1036
|
+
this._ready = true;
|
|
1037
|
+
for (const buf of this.pendingAudio) {
|
|
1038
|
+
if (this.ws?.readyState === import_ws4.default.OPEN) {
|
|
1039
|
+
this.ws.send(buf);
|
|
1040
|
+
}
|
|
1041
|
+
}
|
|
1042
|
+
this.pendingAudio = [];
|
|
1043
|
+
this.startKeepAlive();
|
|
1044
|
+
}
|
|
1045
|
+
handleTranscription(msg) {
|
|
1046
|
+
const text = msg.text ?? "";
|
|
1047
|
+
const isFinal = msg.is_final ?? false;
|
|
1048
|
+
const language = msg.language;
|
|
1049
|
+
const latencyMs = msg.latency_ms;
|
|
1050
|
+
if (!text) return;
|
|
1051
|
+
if (isFinal && latencyMs !== void 0) {
|
|
1052
|
+
log5.info(`stt_final: ${latencyMs.toFixed(0)}ms "${text.slice(0, 50)}"`);
|
|
1053
|
+
}
|
|
1054
|
+
this.emit("transcription", {
|
|
1055
|
+
text,
|
|
1056
|
+
isFinal,
|
|
1057
|
+
language,
|
|
1058
|
+
sttDuration: isFinal ? latencyMs : void 0
|
|
1059
|
+
});
|
|
1060
|
+
}
|
|
1061
|
+
handleVadEvent(msg) {
|
|
1062
|
+
const event = msg.event;
|
|
1063
|
+
log5.debug(`VAD event: ${event}`);
|
|
1064
|
+
if (event === "speech_start") {
|
|
1065
|
+
this.emit("transcription", {
|
|
1066
|
+
text: "",
|
|
1067
|
+
isFinal: false
|
|
1068
|
+
});
|
|
1069
|
+
}
|
|
1070
|
+
}
|
|
1071
|
+
startKeepAlive() {
|
|
1072
|
+
this.stopKeepAlive();
|
|
1073
|
+
this.keepAliveTimer = setInterval(() => {
|
|
1074
|
+
if (this.ws?.readyState === import_ws4.default.OPEN) {
|
|
1075
|
+
this.ws.send(JSON.stringify({ type: "ping" }));
|
|
1076
|
+
}
|
|
1077
|
+
}, KEEPALIVE_INTERVAL_MS2);
|
|
1078
|
+
}
|
|
1079
|
+
stopKeepAlive() {
|
|
1080
|
+
if (this.keepAliveTimer) {
|
|
1081
|
+
clearInterval(this.keepAliveTimer);
|
|
1082
|
+
this.keepAliveTimer = null;
|
|
1083
|
+
}
|
|
1084
|
+
}
|
|
1085
|
+
};
|
|
1086
|
+
|
|
1087
|
+
// src/providers/dtelecom-tts.ts
|
|
1088
|
+
var import_ws5 = __toESM(require("ws"));
|
|
1089
|
+
var import_wave_resampler = require("wave-resampler");
|
|
1090
|
+
var log6 = createLogger("DtelecomTTS");
|
|
1091
|
+
function resample24to48(input) {
|
|
1092
|
+
const samples = new Int16Array(input.buffer, input.byteOffset, input.length / 2);
|
|
1093
|
+
if (samples.length === 0) return Buffer.alloc(0);
|
|
1094
|
+
const resampled = (0, import_wave_resampler.resample)(samples, 24e3, 48e3, { method: "sinc", LPF: false });
|
|
1095
|
+
const output = new Int16Array(resampled.length);
|
|
1096
|
+
for (let i = 0; i < resampled.length; i++) {
|
|
1097
|
+
output[i] = Math.round(resampled[i]);
|
|
1098
|
+
}
|
|
1099
|
+
return Buffer.from(output.buffer, output.byteOffset, output.byteLength);
|
|
1100
|
+
}
|
|
1101
|
+
var DtelecomTTS = class {
|
|
1102
|
+
serverUrl;
|
|
1103
|
+
voices;
|
|
1104
|
+
defaultLang;
|
|
1105
|
+
speed;
|
|
1106
|
+
ws = null;
|
|
1107
|
+
connectPromise = null;
|
|
1108
|
+
flushState = null;
|
|
1109
|
+
/** Default language code for untagged text (e.g. 'en'). */
|
|
1110
|
+
get defaultLanguage() {
|
|
1111
|
+
return this.defaultLang;
|
|
1112
|
+
}
|
|
1113
|
+
constructor(options) {
|
|
1114
|
+
if (!options.serverUrl) {
|
|
1115
|
+
throw new Error("DtelecomTTS requires a serverUrl");
|
|
1116
|
+
}
|
|
1117
|
+
if (!options.voices || Object.keys(options.voices).length === 0) {
|
|
1118
|
+
throw new Error("DtelecomTTS requires at least one voice config");
|
|
1119
|
+
}
|
|
1120
|
+
this.serverUrl = options.serverUrl;
|
|
1121
|
+
this.voices = { ...options.voices };
|
|
1122
|
+
this.defaultLang = options.defaultLanguage ?? Object.keys(this.voices)[0];
|
|
1123
|
+
this.speed = options.speed ?? 1;
|
|
1124
|
+
}
|
|
1125
|
+
/** Pre-connect WebSocket to TTS server. */
|
|
1126
|
+
async warmup() {
|
|
1127
|
+
log6.info("Warming up TTS connection...");
|
|
1128
|
+
const start = performance.now();
|
|
1129
|
+
try {
|
|
1130
|
+
await this.ensureConnection();
|
|
1131
|
+
log6.info(`TTS warmup complete in ${(performance.now() - start).toFixed(0)}ms`);
|
|
1132
|
+
} catch (err) {
|
|
1133
|
+
log6.warn("TTS warmup failed (non-fatal):", err);
|
|
1134
|
+
}
|
|
1135
|
+
}
|
|
1136
|
+
/** Close WebSocket connection. */
|
|
1137
|
+
close() {
|
|
1138
|
+
if (this.ws) {
|
|
1139
|
+
log6.debug("Closing TTS WebSocket");
|
|
1140
|
+
this.ws.close();
|
|
1141
|
+
this.ws = null;
|
|
1142
|
+
}
|
|
1143
|
+
this.connectPromise = null;
|
|
1144
|
+
this.flushState = null;
|
|
1145
|
+
}
|
|
1146
|
+
/** Strip SSML lang tags from text for display/events. */
|
|
1147
|
+
cleanText(text) {
|
|
1148
|
+
return parseLangSegments(text, this.defaultLang).map((s) => s.text).join(" ").replace(/\s+/g, " ").trim();
|
|
1149
|
+
}
|
|
1150
|
+
async *synthesize(text, signal) {
|
|
1151
|
+
if (signal?.aborted) return;
|
|
1152
|
+
const segments = parseLangSegments(text, this.defaultLang);
|
|
1153
|
+
const silenceBytes = Math.round(48e3 * 0.2) * 2;
|
|
1154
|
+
const silence = Buffer.alloc(silenceBytes);
|
|
1155
|
+
let prevLang = null;
|
|
1156
|
+
for (const segment of segments) {
|
|
1157
|
+
if (signal?.aborted) break;
|
|
1158
|
+
if (!segment.text.trim()) continue;
|
|
1159
|
+
const lang = this.voices[segment.lang] ? segment.lang : this.defaultLang;
|
|
1160
|
+
if (prevLang !== null && lang !== prevLang) {
|
|
1161
|
+
yield silence;
|
|
1162
|
+
}
|
|
1163
|
+
prevLang = lang;
|
|
1164
|
+
yield* this.synthesizeSegment(lang, segment.text, signal);
|
|
1165
|
+
}
|
|
1166
|
+
}
|
|
1167
|
+
async *synthesizeSegment(lang, text, signal) {
|
|
1168
|
+
log6.debug(`Synthesizing [${lang}]: "${text.slice(0, 60)}"`);
|
|
1169
|
+
await this.ensureConnection();
|
|
1170
|
+
const ws = this.ws;
|
|
1171
|
+
if (!ws || ws.readyState !== import_ws5.default.OPEN) {
|
|
1172
|
+
throw new Error("dTelecom TTS WebSocket not connected");
|
|
1173
|
+
}
|
|
1174
|
+
const state = { chunks: [], done: false, cleared: false, error: null, wake: null };
|
|
1175
|
+
this.flushState = state;
|
|
1176
|
+
const onAbort = () => {
|
|
1177
|
+
state.done = true;
|
|
1178
|
+
state.wake?.();
|
|
1179
|
+
if (ws.readyState === import_ws5.default.OPEN) {
|
|
1180
|
+
try {
|
|
1181
|
+
ws.send(JSON.stringify({ type: "clear" }));
|
|
1182
|
+
} catch {
|
|
1183
|
+
}
|
|
1184
|
+
}
|
|
1185
|
+
};
|
|
1186
|
+
signal?.addEventListener("abort", onAbort, { once: true });
|
|
1187
|
+
const voiceConfig = this.voices[lang];
|
|
1188
|
+
const msg = { text };
|
|
1189
|
+
if (voiceConfig) {
|
|
1190
|
+
msg.voice = voiceConfig.voice;
|
|
1191
|
+
msg.lang_code = voiceConfig.langCode;
|
|
1192
|
+
msg.speed = this.speed;
|
|
1193
|
+
}
|
|
1194
|
+
ws.send(JSON.stringify(msg));
|
|
1195
|
+
try {
|
|
1196
|
+
while (true) {
|
|
1197
|
+
if (signal?.aborted) break;
|
|
1198
|
+
if (state.error) throw state.error;
|
|
1199
|
+
if (state.chunks.length > 0) {
|
|
1200
|
+
yield state.chunks.shift();
|
|
1201
|
+
continue;
|
|
1202
|
+
}
|
|
1203
|
+
if (state.done) break;
|
|
1204
|
+
await new Promise((resolve) => {
|
|
1205
|
+
state.wake = resolve;
|
|
1206
|
+
});
|
|
1207
|
+
state.wake = null;
|
|
1208
|
+
}
|
|
1209
|
+
while (state.chunks.length > 0) {
|
|
1210
|
+
yield state.chunks.shift();
|
|
1211
|
+
}
|
|
1212
|
+
} finally {
|
|
1213
|
+
signal?.removeEventListener("abort", onAbort);
|
|
1214
|
+
this.flushState = null;
|
|
1215
|
+
}
|
|
1216
|
+
}
|
|
1217
|
+
/** Ensure a WebSocket connection exists and is open. */
|
|
1218
|
+
ensureConnection() {
|
|
1219
|
+
if (this.ws && this.ws.readyState === import_ws5.default.OPEN) {
|
|
1220
|
+
return Promise.resolve();
|
|
1221
|
+
}
|
|
1222
|
+
if (this.connectPromise) return this.connectPromise;
|
|
1223
|
+
this.connectPromise = new Promise((resolve, reject) => {
|
|
1224
|
+
log6.debug(`Connecting to dTelecom TTS: ${this.serverUrl}`);
|
|
1225
|
+
const ws = new import_ws5.default(this.serverUrl);
|
|
1226
|
+
ws.on("open", () => {
|
|
1227
|
+
this.ws = ws;
|
|
1228
|
+
this.connectPromise = null;
|
|
1229
|
+
const defaultVoice = this.voices[this.defaultLang];
|
|
1230
|
+
if (defaultVoice) {
|
|
1231
|
+
ws.send(JSON.stringify({
|
|
1232
|
+
config: {
|
|
1233
|
+
voice: defaultVoice.voice,
|
|
1234
|
+
lang_code: defaultVoice.langCode,
|
|
1235
|
+
speed: this.speed
|
|
1236
|
+
}
|
|
1237
|
+
}));
|
|
1238
|
+
}
|
|
1239
|
+
log6.info("dTelecom TTS WebSocket connected");
|
|
1240
|
+
resolve();
|
|
1241
|
+
});
|
|
1242
|
+
ws.on("message", (data, isBinary) => {
|
|
1243
|
+
const state = this.flushState;
|
|
1244
|
+
if (!state) return;
|
|
1245
|
+
if (isBinary) {
|
|
1246
|
+
const buf = Buffer.isBuffer(data) ? data : Buffer.from(data);
|
|
1247
|
+
const resampled = resample24to48(buf);
|
|
1248
|
+
state.chunks.push(resampled);
|
|
1249
|
+
state.wake?.();
|
|
1250
|
+
} else {
|
|
1251
|
+
try {
|
|
1252
|
+
const msg = JSON.parse(data.toString());
|
|
1253
|
+
if (msg.type === "done") {
|
|
1254
|
+
state.done = true;
|
|
1255
|
+
state.wake?.();
|
|
1256
|
+
} else if (msg.type === "cleared") {
|
|
1257
|
+
state.cleared = true;
|
|
1258
|
+
state.done = true;
|
|
1259
|
+
state.wake?.();
|
|
1260
|
+
} else if (msg.type === "generating") {
|
|
1261
|
+
log6.debug(`TTS generating: "${msg.text?.slice(0, 40)}"`);
|
|
1262
|
+
} else if (msg.type === "error") {
|
|
1263
|
+
const errorMsg = msg.message || "Unknown TTS error";
|
|
1264
|
+
log6.error(`dTelecom TTS error: ${errorMsg}`);
|
|
1265
|
+
state.error = new Error(errorMsg);
|
|
1266
|
+
state.wake?.();
|
|
1267
|
+
}
|
|
1268
|
+
} catch {
|
|
1269
|
+
log6.warn("Failed to parse dTelecom TTS message");
|
|
1270
|
+
}
|
|
1271
|
+
}
|
|
1272
|
+
});
|
|
1273
|
+
ws.on("error", (err) => {
|
|
1274
|
+
const error = err instanceof Error ? err : new Error(String(err));
|
|
1275
|
+
log6.error("dTelecom TTS WebSocket error:", error);
|
|
1276
|
+
const state = this.flushState;
|
|
1277
|
+
if (state) {
|
|
1278
|
+
state.error = error;
|
|
1279
|
+
state.wake?.();
|
|
1280
|
+
}
|
|
1281
|
+
this.ws = null;
|
|
1282
|
+
this.connectPromise = null;
|
|
1283
|
+
reject(error);
|
|
1284
|
+
});
|
|
1285
|
+
ws.on("close", (code, reason) => {
|
|
1286
|
+
log6.debug(`dTelecom TTS WebSocket closed: ${code} ${reason.toString()}`);
|
|
1287
|
+
this.ws = null;
|
|
1288
|
+
this.connectPromise = null;
|
|
1289
|
+
const state = this.flushState;
|
|
1290
|
+
if (state) {
|
|
1291
|
+
state.done = true;
|
|
1292
|
+
state.wake?.();
|
|
1293
|
+
}
|
|
1294
|
+
});
|
|
1295
|
+
});
|
|
1296
|
+
return this.connectPromise;
|
|
1297
|
+
}
|
|
1298
|
+
};
|
|
907
1299
|
// Annotate the CommonJS export names for ESM import in node:
|
|
908
1300
|
0 && (module.exports = {
|
|
909
1301
|
CartesiaTTS,
|
|
910
1302
|
DeepgramSTT,
|
|
911
1303
|
DeepgramTTS,
|
|
1304
|
+
DtelecomSTT,
|
|
1305
|
+
DtelecomTTS,
|
|
912
1306
|
OpenRouterLLM
|
|
913
1307
|
});
|
|
914
1308
|
//# sourceMappingURL=index.js.map
|