@livekit/agents-plugin-baseten 1.0.36 → 1.0.38
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/stt.cjs +27 -16
- package/dist/stt.cjs.map +1 -1
- package/dist/stt.d.ts.map +1 -1
- package/dist/stt.js +27 -16
- package/dist/stt.js.map +1 -1
- package/dist/types.cjs.map +1 -1
- package/dist/types.d.cts +4 -1
- package/dist/types.d.ts +4 -1
- package/dist/types.d.ts.map +1 -1
- package/package.json +5 -5
- package/src/stt.ts +38 -17
- package/src/types.ts +4 -1
package/dist/stt.cjs
CHANGED
|
@@ -45,24 +45,27 @@ class STT extends import_agents.stt.STT {
|
|
|
45
45
|
constructor(opts = {}) {
|
|
46
46
|
super({
|
|
47
47
|
streaming: true,
|
|
48
|
-
interimResults: opts.enablePartialTranscripts ?? defaultSTTOptions.enablePartialTranscripts
|
|
48
|
+
interimResults: opts.enablePartialTranscripts ?? defaultSTTOptions.enablePartialTranscripts,
|
|
49
|
+
alignedTranscript: "word"
|
|
49
50
|
});
|
|
50
51
|
const apiKey = opts.apiKey ?? process.env.BASETEN_API_KEY;
|
|
52
|
+
const modelEndpoint = opts.modelEndpoint ?? process.env.BASETEN_MODEL_ENDPOINT;
|
|
51
53
|
const modelId = opts.modelId ?? process.env.BASETEN_STT_MODEL_ID;
|
|
52
54
|
if (!apiKey) {
|
|
53
55
|
throw new Error(
|
|
54
56
|
"Baseten API key is required, either pass it as `apiKey` or set $BASETEN_API_KEY"
|
|
55
57
|
);
|
|
56
58
|
}
|
|
57
|
-
if (!modelId) {
|
|
59
|
+
if (!modelEndpoint && !modelId) {
|
|
58
60
|
throw new Error(
|
|
59
|
-
"Baseten model
|
|
61
|
+
"Baseten model endpoint is required, either pass it as `modelEndpoint` or set $BASETEN_MODEL_ENDPOINT"
|
|
60
62
|
);
|
|
61
63
|
}
|
|
62
64
|
this.#opts = {
|
|
63
65
|
...defaultSTTOptions,
|
|
64
66
|
...opts,
|
|
65
67
|
apiKey,
|
|
68
|
+
modelEndpoint,
|
|
66
69
|
modelId
|
|
67
70
|
};
|
|
68
71
|
}
|
|
@@ -89,6 +92,9 @@ class SpeechStream extends import_agents.stt.SpeechStream {
|
|
|
89
92
|
this.closed = false;
|
|
90
93
|
}
|
|
91
94
|
getWsUrl() {
|
|
95
|
+
if (this.#opts.modelEndpoint) {
|
|
96
|
+
return this.#opts.modelEndpoint;
|
|
97
|
+
}
|
|
92
98
|
return `wss://model-${this.#opts.modelId}.api.baseten.co/environments/${this.#opts.environment}/websocket`;
|
|
93
99
|
}
|
|
94
100
|
async run() {
|
|
@@ -99,7 +105,7 @@ class SpeechStream extends import_agents.stt.SpeechStream {
|
|
|
99
105
|
const headers = {
|
|
100
106
|
Authorization: `Api-Key ${this.#opts.apiKey}`
|
|
101
107
|
};
|
|
102
|
-
const ws = new import_ws.WebSocket(url, { headers });
|
|
108
|
+
const ws = new import_ws.WebSocket(url, { headers, rejectUnauthorized: false });
|
|
103
109
|
try {
|
|
104
110
|
await new Promise((resolve, reject) => {
|
|
105
111
|
ws.on("open", resolve);
|
|
@@ -130,22 +136,17 @@ class SpeechStream extends import_agents.stt.SpeechStream {
|
|
|
130
136
|
async #runWS(ws) {
|
|
131
137
|
let closing = false;
|
|
132
138
|
const metadata = {
|
|
133
|
-
|
|
139
|
+
vad_params: {
|
|
134
140
|
threshold: this.#opts.vadThreshold,
|
|
135
141
|
min_silence_duration_ms: this.#opts.vadMinSilenceDurationMs,
|
|
136
142
|
speech_pad_ms: this.#opts.vadSpeechPadMs
|
|
137
143
|
},
|
|
138
|
-
|
|
144
|
+
streaming_whisper_params: {
|
|
139
145
|
encoding: this.#opts.encoding ?? "pcm_s16le",
|
|
140
146
|
sample_rate: this.#opts.sampleRate ?? 16e3,
|
|
141
|
-
enable_partial_transcripts:
|
|
142
|
-
partial_transcript_interval_s: this.#opts.partialTranscriptIntervalS,
|
|
143
|
-
final_transcript_max_duration_s: this.#opts.finalTranscriptMaxDurationS
|
|
144
|
-
},
|
|
145
|
-
whisper_params: {
|
|
146
|
-
prompt: this.#opts.prompt,
|
|
147
|
+
enable_partial_transcripts: false,
|
|
147
148
|
audio_language: this.#opts.audioLanguage ?? "en",
|
|
148
|
-
|
|
149
|
+
show_word_timestamps: true
|
|
149
150
|
}
|
|
150
151
|
};
|
|
151
152
|
ws.send(JSON.stringify(metadata));
|
|
@@ -213,14 +214,24 @@ class SpeechStream extends import_agents.stt.SpeechStream {
|
|
|
213
214
|
this.#speaking = true;
|
|
214
215
|
this.queue.put({ type: import_agents.stt.SpeechEventType.START_OF_SPEECH });
|
|
215
216
|
}
|
|
216
|
-
const startTime = segments.length > 0 ? segments[0].
|
|
217
|
-
const endTime = segments.length > 0 ? segments[segments.length - 1].
|
|
217
|
+
const startTime = segments.length > 0 ? (segments[0].start_time ?? 0) + this.startTimeOffset : this.startTimeOffset;
|
|
218
|
+
const endTime = segments.length > 0 ? (segments[segments.length - 1].end_time ?? 0) + this.startTimeOffset : this.startTimeOffset;
|
|
219
|
+
const words = segments.map(
|
|
220
|
+
(segment) => ({
|
|
221
|
+
text: segment.text ?? "",
|
|
222
|
+
startTime: (segment.start_time ?? 0) + this.startTimeOffset,
|
|
223
|
+
endTime: (segment.end_time ?? 0) + this.startTimeOffset,
|
|
224
|
+
startTimeOffset: this.startTimeOffset,
|
|
225
|
+
confidence
|
|
226
|
+
})
|
|
227
|
+
);
|
|
218
228
|
const speechData = {
|
|
219
229
|
language: languageCode,
|
|
220
230
|
text: transcript,
|
|
221
231
|
startTime,
|
|
222
232
|
endTime,
|
|
223
|
-
confidence
|
|
233
|
+
confidence,
|
|
234
|
+
words: words.length > 0 ? words : void 0
|
|
224
235
|
};
|
|
225
236
|
if (!isFinal) {
|
|
226
237
|
this.queue.put({
|
package/dist/stt.cjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/stt.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { type AudioBuffer, AudioByteStream, Task, log, stt, waitForAbort } from '@livekit/agents';\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { WebSocket } from 'ws';\nimport type { BasetenSttOptions } from './types.js';\n\nconst defaultSTTOptions: Partial<BasetenSttOptions> = {\n environment: 'production',\n encoding: 'pcm_s16le',\n sampleRate: 16000,\n bufferSizeSeconds: 0.032,\n enablePartialTranscripts: true,\n partialTranscriptIntervalS: 0.5,\n finalTranscriptMaxDurationS: 5,\n audioLanguage: 'en',\n languageDetectionOnly: false,\n vadThreshold: 0.5,\n vadMinSilenceDurationMs: 300,\n vadSpeechPadMs: 30,\n};\n\nexport class STT extends stt.STT {\n #opts: BasetenSttOptions;\n #logger = log();\n label = 'baseten.STT';\n\n constructor(opts: Partial<BasetenSttOptions> = {}) {\n super({\n streaming: true,\n interimResults: opts.enablePartialTranscripts ?? defaultSTTOptions.enablePartialTranscripts!,\n });\n\n const apiKey = opts.apiKey ?? process.env.BASETEN_API_KEY;\n const modelId = opts.modelId ?? process.env.BASETEN_STT_MODEL_ID;\n\n if (!apiKey) {\n throw new Error(\n 'Baseten API key is required, either pass it as `apiKey` or set $BASETEN_API_KEY',\n );\n }\n if (!modelId) {\n throw new Error(\n 'Baseten model ID is required, either pass it as `modelId` or set $BASETEN_STT_MODEL_ID',\n );\n }\n\n this.#opts = {\n ...defaultSTTOptions,\n ...opts,\n apiKey,\n modelId,\n } as BasetenSttOptions;\n }\n\n // eslint-disable-next-line\n async _recognize(_: AudioBuffer): Promise<stt.SpeechEvent> {\n throw new Error('Recognize is not supported on Baseten STT');\n }\n\n updateOptions(opts: Partial<BasetenSttOptions>) {\n this.#opts = { ...this.#opts, ...opts };\n }\n\n stream(): SpeechStream {\n return new SpeechStream(this, this.#opts);\n }\n}\n\nexport class SpeechStream extends stt.SpeechStream {\n #opts: BasetenSttOptions;\n #logger = log();\n #speaking = false;\n #requestId = '';\n label = 'baseten.SpeechStream';\n\n constructor(stt: STT, opts: BasetenSttOptions) {\n super(stt, opts.sampleRate);\n this.#opts = opts;\n this.closed = false;\n }\n\n private getWsUrl(): string {\n return `wss://model-${this.#opts.modelId}.api.baseten.co/environments/${this.#opts.environment}/websocket`;\n }\n\n protected async run() {\n const maxRetry = 32;\n let retries = 0;\n\n while (!this.input.closed && !this.closed) {\n const url = this.getWsUrl();\n const headers = {\n Authorization: `Api-Key ${this.#opts.apiKey}`,\n };\n\n const ws = new WebSocket(url, { headers });\n\n try {\n await new Promise((resolve, reject) => {\n ws.on('open', resolve);\n ws.on('error', (error) => reject(error));\n ws.on('close', (code) => reject(`WebSocket returned ${code}`));\n });\n\n await this.#runWS(ws);\n } catch (e) {\n if (!this.closed && !this.input.closed) {\n if (retries >= maxRetry) {\n throw new Error(`failed to connect to Baseten after ${retries} attempts: ${e}`);\n }\n\n const delay = Math.min(retries * 5, 10);\n retries++;\n\n this.#logger.warn(\n `failed to connect to Baseten, retrying in ${delay} seconds: ${e} (${retries}/${maxRetry})`,\n );\n await new Promise((resolve) => setTimeout(resolve, delay * 1000));\n } else {\n this.#logger.warn(\n `Baseten disconnected, connection is closed: ${e} (inputClosed: ${this.input.closed}, isClosed: ${this.closed})`,\n );\n }\n }\n }\n\n this.closed = true;\n }\n\n async #runWS(ws: WebSocket) {\n let closing = false;\n\n // Send initial metadata\n const metadata = {\n streaming_vad_config: {\n threshold: this.#opts.vadThreshold,\n min_silence_duration_ms: this.#opts.vadMinSilenceDurationMs,\n speech_pad_ms: this.#opts.vadSpeechPadMs,\n },\n streaming_params: {\n encoding: this.#opts.encoding ?? 'pcm_s16le',\n sample_rate: this.#opts.sampleRate ?? 16000,\n enable_partial_transcripts: this.#opts.enablePartialTranscripts,\n partial_transcript_interval_s: this.#opts.partialTranscriptIntervalS,\n final_transcript_max_duration_s: this.#opts.finalTranscriptMaxDurationS,\n },\n whisper_params: {\n prompt: this.#opts.prompt,\n audio_language: this.#opts.audioLanguage ?? 'en',\n language_detection_only: this.#opts.languageDetectionOnly ?? false,\n },\n };\n ws.send(JSON.stringify(metadata));\n\n const sendTask = async () => {\n const sampleRate = this.#opts.sampleRate ?? 16000;\n const samplesPerChunk = sampleRate === 16000 ? 512 : 256;\n const audioByteStream = new AudioByteStream(sampleRate, 1, samplesPerChunk);\n\n try {\n while (!this.closed) {\n const result = await this.input.next();\n if (result.done) {\n break;\n }\n\n const data = result.value;\n\n let frames: AudioFrame[];\n if (data === SpeechStream.FLUSH_SENTINEL) {\n // Flush any remaining buffered audio\n frames = audioByteStream.flush();\n } else {\n if (data.sampleRate !== sampleRate || data.channels !== 1) {\n throw new Error(\n `sample rate or channel count mismatch: expected ${sampleRate}Hz/1ch, got ${data.sampleRate}Hz/${data.channels}ch`,\n );\n }\n frames = audioByteStream.write(data.data.buffer as ArrayBuffer);\n }\n\n for (const frame of frames) {\n const buffer = Buffer.from(\n frame.data.buffer,\n frame.data.byteOffset,\n frame.data.byteLength,\n );\n ws.send(buffer);\n }\n }\n } finally {\n closing = true;\n ws.close();\n }\n };\n\n const listenTask = Task.from(async (controller) => {\n const listenMessage = new Promise<void>((resolve, reject) => {\n ws.on('message', (data) => {\n try {\n let jsonString: string;\n\n if (typeof data === 'string') {\n jsonString = data;\n } else if (data instanceof Buffer) {\n jsonString = data.toString('utf-8');\n } else if (Array.isArray(data)) {\n jsonString = Buffer.concat(data).toString('utf-8');\n } else {\n return;\n }\n\n const msg = JSON.parse(jsonString);\n\n // Parse response format matching Python implementation\n const isFinal = msg.is_final ?? true;\n const segments = msg.segments ?? [];\n const transcript = msg.transcript ?? '';\n const confidence = msg.confidence ?? 0.0;\n const languageCode = msg.language_code ?? this.#opts.audioLanguage;\n\n // Skip if no transcript text\n if (!transcript) {\n this.#logger.debug('Received non-transcript message:', msg);\n return;\n }\n\n // Emit START_OF_SPEECH if not already speaking (only for interim or first final)\n if (!this.#speaking && !isFinal) {\n this.#speaking = true;\n this.queue.put({ type: stt.SpeechEventType.START_OF_SPEECH });\n }\n\n // Extract timing from segments\n const startTime = segments.length > 0 ? segments[0].start ?? 0.0 : 0.0;\n const endTime = segments.length > 0 ? segments[segments.length - 1].end ?? 0.0 : 0.0;\n\n const speechData: stt.SpeechData = {\n language: languageCode!,\n text: transcript,\n startTime,\n endTime,\n confidence,\n };\n\n // Handle interim vs final transcripts (matching Python implementation)\n if (!isFinal) {\n // Interim transcript\n this.queue.put({\n type: stt.SpeechEventType.INTERIM_TRANSCRIPT,\n alternatives: [speechData],\n });\n } else {\n // Final transcript\n this.queue.put({\n type: stt.SpeechEventType.FINAL_TRANSCRIPT,\n alternatives: [speechData],\n });\n\n // Emit END_OF_SPEECH after final transcript\n if (this.#speaking) {\n this.#speaking = false;\n this.queue.put({ type: stt.SpeechEventType.END_OF_SPEECH });\n }\n }\n\n if (this.closed || closing) {\n resolve();\n }\n } catch (err) {\n this.#logger.error(`STT: Error processing message: ${data}`);\n reject(err);\n }\n });\n\n ws.on('error', (err) => {\n if (!closing) {\n reject(err);\n }\n });\n\n ws.on('close', () => {\n if (!closing) {\n resolve();\n }\n });\n });\n\n await Promise.race([listenMessage, waitForAbort(controller.signal)]);\n }, this.abortController);\n\n await Promise.all([sendTask(), listenTask.result]);\n closing = true;\n ws.close();\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAGA,oBAAgF;AAEhF,gBAA0B;AAG1B,MAAM,oBAAgD;AAAA,EACpD,aAAa;AAAA,EACb,UAAU;AAAA,EACV,YAAY;AAAA,EACZ,mBAAmB;AAAA,EACnB,0BAA0B;AAAA,EAC1B,4BAA4B;AAAA,EAC5B,6BAA6B;AAAA,EAC7B,eAAe;AAAA,EACf,uBAAuB;AAAA,EACvB,cAAc;AAAA,EACd,yBAAyB;AAAA,EACzB,gBAAgB;AAClB;AAEO,MAAM,YAAY,kBAAI,IAAI;AAAA,EAC/B;AAAA,EACA,cAAU,mBAAI;AAAA,EACd,QAAQ;AAAA,EAER,YAAY,OAAmC,CAAC,GAAG;AACjD,UAAM;AAAA,MACJ,WAAW;AAAA,MACX,gBAAgB,KAAK,4BAA4B,kBAAkB;AAAA,IACrE,CAAC;AAED,UAAM,SAAS,KAAK,UAAU,QAAQ,IAAI;AAC1C,UAAM,UAAU,KAAK,WAAW,QAAQ,IAAI;AAE5C,QAAI,CAAC,QAAQ;AACX,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,SAAS;AACZ,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AAEA,SAAK,QAAQ;AAAA,MACX,GAAG;AAAA,MACH,GAAG;AAAA,MACH;AAAA,MACA;AAAA,IACF;AAAA,EACF;AAAA;AAAA,EAGA,MAAM,WAAW,GAA0C;AACzD,UAAM,IAAI,MAAM,2CAA2C;AAAA,EAC7D;AAAA,EAEA,cAAc,MAAkC;AAC9C,SAAK,QAAQ,EAAE,GAAG,KAAK,OAAO,GAAG,KAAK;AAAA,EACxC;AAAA,EAEA,SAAuB;AACrB,WAAO,IAAI,aAAa,MAAM,KAAK,KAAK;AAAA,EAC1C;AACF;AAEO,MAAM,qBAAqB,kBAAI,aAAa;AAAA,EACjD;AAAA,EACA,cAAU,mBAAI;AAAA,EACd,YAAY;AAAA,EACZ,aAAa;AAAA,EACb,QAAQ;AAAA,EAER,YAAYA,MAAU,MAAyB;AAC7C,UAAMA,MAAK,KAAK,UAAU;AAC1B,SAAK,QAAQ;AACb,SAAK,SAAS;AAAA,EAChB;AAAA,EAEQ,WAAmB;AACzB,WAAO,eAAe,KAAK,MAAM,OAAO,gCAAgC,KAAK,MAAM,WAAW;AAAA,EAChG;AAAA,EAEA,MAAgB,MAAM;AACpB,UAAM,WAAW;AACjB,QAAI,UAAU;AAEd,WAAO,CAAC,KAAK,MAAM,UAAU,CAAC,KAAK,QAAQ;AACzC,YAAM,MAAM,KAAK,SAAS;AAC1B,YAAM,UAAU;AAAA,QACd,eAAe,WAAW,KAAK,MAAM,MAAM;AAAA,MAC7C;AAEA,YAAM,KAAK,IAAI,oBAAU,KAAK,EAAE,QAAQ,CAAC;AAEzC,UAAI;AACF,cAAM,IAAI,QAAQ,CAAC,SAAS,WAAW;AACrC,aAAG,GAAG,QAAQ,OAAO;AACrB,aAAG,GAAG,SAAS,CAAC,UAAU,OAAO,KAAK,CAAC;AACvC,aAAG,GAAG,SAAS,CAAC,SAAS,OAAO,sBAAsB,IAAI,EAAE,CAAC;AAAA,QAC/D,CAAC;AAED,cAAM,KAAK,OAAO,EAAE;AAAA,MACtB,SAAS,GAAG;AACV,YAAI,CAAC,KAAK,UAAU,CAAC,KAAK,MAAM,QAAQ;AACtC,cAAI,WAAW,UAAU;AACvB,kBAAM,IAAI,MAAM,sCAAsC,OAAO,cAAc,CAAC,EAAE;AAAA,UAChF;AAEA,gBAAM,QAAQ,KAAK,IAAI,UAAU,GAAG,EAAE;AACtC;AAEA,eAAK,QAAQ;AAAA,YACX,6CAA6C,KAAK,aAAa,CAAC,KAAK,OAAO,IAAI,QAAQ;AAAA,UAC1F;AACA,gBAAM,IAAI,QAAQ,CAAC,YAAY,WAAW,SAAS,QAAQ,GAAI,CAAC;AAAA,QAClE,OAAO;AACL,eAAK,QAAQ;AAAA,YACX,+CAA+C,CAAC,kBAAkB,KAAK,MAAM,MAAM,eAAe,KAAK,MAAM;AAAA,UAC/G;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAEA,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,MAAM,OAAO,IAAe;AAC1B,QAAI,UAAU;AAGd,UAAM,WAAW;AAAA,MACf,sBAAsB;AAAA,QACpB,WAAW,KAAK,MAAM;AAAA,QACtB,yBAAyB,KAAK,MAAM;AAAA,QACpC,eAAe,KAAK,MAAM;AAAA,MAC5B;AAAA,MACA,kBAAkB;AAAA,QAChB,UAAU,KAAK,MAAM,YAAY;AAAA,QACjC,aAAa,KAAK,MAAM,cAAc;AAAA,QACtC,4BAA4B,KAAK,MAAM;AAAA,QACvC,+BAA+B,KAAK,MAAM;AAAA,QAC1C,iCAAiC,KAAK,MAAM;AAAA,MAC9C;AAAA,MACA,gBAAgB;AAAA,QACd,QAAQ,KAAK,MAAM;AAAA,QACnB,gBAAgB,KAAK,MAAM,iBAAiB;AAAA,QAC5C,yBAAyB,KAAK,MAAM,yBAAyB;AAAA,MAC/D;AAAA,IACF;AACA,OAAG,KAAK,KAAK,UAAU,QAAQ,CAAC;AAEhC,UAAM,WAAW,YAAY;AAC3B,YAAM,aAAa,KAAK,MAAM,cAAc;AAC5C,YAAM,kBAAkB,eAAe,OAAQ,MAAM;AACrD,YAAM,kBAAkB,IAAI,8BAAgB,YAAY,GAAG,eAAe;AAE1E,UAAI;AACF,eAAO,CAAC,KAAK,QAAQ;AACnB,gBAAM,SAAS,MAAM,KAAK,MAAM,KAAK;AACrC,cAAI,OAAO,MAAM;AACf;AAAA,UACF;AAEA,gBAAM,OAAO,OAAO;AAEpB,cAAI;AACJ,cAAI,SAAS,aAAa,gBAAgB;AAExC,qBAAS,gBAAgB,MAAM;AAAA,UACjC,OAAO;AACL,gBAAI,KAAK,eAAe,cAAc,KAAK,aAAa,GAAG;AACzD,oBAAM,IAAI;AAAA,gBACR,mDAAmD,UAAU,eAAe,KAAK,UAAU,MAAM,KAAK,QAAQ;AAAA,cAChH;AAAA,YACF;AACA,qBAAS,gBAAgB,MAAM,KAAK,KAAK,MAAqB;AAAA,UAChE;AAEA,qBAAW,SAAS,QAAQ;AAC1B,kBAAM,SAAS,OAAO;AAAA,cACpB,MAAM,KAAK;AAAA,cACX,MAAM,KAAK;AAAA,cACX,MAAM,KAAK;AAAA,YACb;AACA,eAAG,KAAK,MAAM;AAAA,UAChB;AAAA,QACF;AAAA,MACF,UAAE;AACA,kBAAU;AACV,WAAG,MAAM;AAAA,MACX;AAAA,IACF;AAEA,UAAM,aAAa,mBAAK,KAAK,OAAO,eAAe;AACjD,YAAM,gBAAgB,IAAI,QAAc,CAAC,SAAS,WAAW;AAC3D,WAAG,GAAG,WAAW,CAAC,SAAS;AACzB,cAAI;AACF,gBAAI;AAEJ,gBAAI,OAAO,SAAS,UAAU;AAC5B,2BAAa;AAAA,YACf,WAAW,gBAAgB,QAAQ;AACjC,2BAAa,KAAK,SAAS,OAAO;AAAA,YACpC,WAAW,MAAM,QAAQ,IAAI,GAAG;AAC9B,2BAAa,OAAO,OAAO,IAAI,EAAE,SAAS,OAAO;AAAA,YACnD,OAAO;AACL;AAAA,YACF;AAEA,kBAAM,MAAM,KAAK,MAAM,UAAU;AAGjC,kBAAM,UAAU,IAAI,YAAY;AAChC,kBAAM,WAAW,IAAI,YAAY,CAAC;AAClC,kBAAM,aAAa,IAAI,cAAc;AACrC,kBAAM,aAAa,IAAI,cAAc;AACrC,kBAAM,eAAe,IAAI,iBAAiB,KAAK,MAAM;AAGrD,gBAAI,CAAC,YAAY;AACf,mBAAK,QAAQ,MAAM,oCAAoC,GAAG;AAC1D;AAAA,YACF;AAGA,gBAAI,CAAC,KAAK,aAAa,CAAC,SAAS;AAC/B,mBAAK,YAAY;AACjB,mBAAK,MAAM,IAAI,EAAE,MAAM,kBAAI,gBAAgB,gBAAgB,CAAC;AAAA,YAC9D;AAGA,kBAAM,YAAY,SAAS,SAAS,IAAI,SAAS,CAAC,EAAE,SAAS,IAAM;AACnE,kBAAM,UAAU,SAAS,SAAS,IAAI,SAAS,SAAS,SAAS,CAAC,EAAE,OAAO,IAAM;AAEjF,kBAAM,aAA6B;AAAA,cACjC,UAAU;AAAA,cACV,MAAM;AAAA,cACN;AAAA,cACA;AAAA,cACA;AAAA,YACF;AAGA,gBAAI,CAAC,SAAS;AAEZ,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,kBAAI,gBAAgB;AAAA,gBAC1B,cAAc,CAAC,UAAU;AAAA,cAC3B,CAAC;AAAA,YACH,OAAO;AAEL,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,kBAAI,gBAAgB;AAAA,gBAC1B,cAAc,CAAC,UAAU;AAAA,cAC3B,CAAC;AAGD,kBAAI,KAAK,WAAW;AAClB,qBAAK,YAAY;AACjB,qBAAK,MAAM,IAAI,EAAE,MAAM,kBAAI,gBAAgB,cAAc,CAAC;AAAA,cAC5D;AAAA,YACF;AAEA,gBAAI,KAAK,UAAU,SAAS;AAC1B,sBAAQ;AAAA,YACV;AAAA,UACF,SAAS,KAAK;AACZ,iBAAK,QAAQ,MAAM,kCAAkC,IAAI,EAAE;AAC3D,mBAAO,GAAG;AAAA,UACZ;AAAA,QACF,CAAC;AAED,WAAG,GAAG,SAAS,CAAC,QAAQ;AACtB,cAAI,CAAC,SAAS;AACZ,mBAAO,GAAG;AAAA,UACZ;AAAA,QACF,CAAC;AAED,WAAG,GAAG,SAAS,MAAM;AACnB,cAAI,CAAC,SAAS;AACZ,oBAAQ;AAAA,UACV;AAAA,QACF,CAAC;AAAA,MACH,CAAC;AAED,YAAM,QAAQ,KAAK,CAAC,mBAAe,4BAAa,WAAW,MAAM,CAAC,CAAC;AAAA,IACrE,GAAG,KAAK,eAAe;AAEvB,UAAM,QAAQ,IAAI,CAAC,SAAS,GAAG,WAAW,MAAM,CAAC;AACjD,cAAU;AACV,OAAG,MAAM;AAAA,EACX;AACF;","names":["stt"]}
|
|
1
|
+
{"version":3,"sources":["../src/stt.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { type AudioBuffer, AudioByteStream, Task, log, stt, waitForAbort } from '@livekit/agents';\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { WebSocket } from 'ws';\nimport type { BasetenSttOptions } from './types.js';\n\nconst defaultSTTOptions: Partial<BasetenSttOptions> = {\n environment: 'production',\n encoding: 'pcm_s16le',\n sampleRate: 16000,\n bufferSizeSeconds: 0.032,\n enablePartialTranscripts: true,\n partialTranscriptIntervalS: 0.5,\n finalTranscriptMaxDurationS: 5,\n audioLanguage: 'en',\n languageDetectionOnly: false,\n vadThreshold: 0.5,\n vadMinSilenceDurationMs: 300,\n vadSpeechPadMs: 30,\n};\n\nexport class STT extends stt.STT {\n #opts: BasetenSttOptions;\n #logger = log();\n label = 'baseten.STT';\n\n constructor(opts: Partial<BasetenSttOptions> = {}) {\n super({\n streaming: true,\n interimResults: opts.enablePartialTranscripts ?? defaultSTTOptions.enablePartialTranscripts!,\n alignedTranscript: 'word',\n });\n\n const apiKey = opts.apiKey ?? process.env.BASETEN_API_KEY;\n const modelEndpoint = opts.modelEndpoint ?? process.env.BASETEN_MODEL_ENDPOINT;\n const modelId = opts.modelId ?? process.env.BASETEN_STT_MODEL_ID;\n\n if (!apiKey) {\n throw new Error(\n 'Baseten API key is required, either pass it as `apiKey` or set $BASETEN_API_KEY',\n );\n }\n if (!modelEndpoint && !modelId) {\n throw new Error(\n 'Baseten model endpoint is required, either pass it as `modelEndpoint` or set $BASETEN_MODEL_ENDPOINT',\n );\n }\n\n this.#opts = {\n ...defaultSTTOptions,\n ...opts,\n apiKey,\n modelEndpoint,\n modelId,\n } as BasetenSttOptions;\n }\n\n // eslint-disable-next-line\n async _recognize(_: AudioBuffer): Promise<stt.SpeechEvent> {\n throw new Error('Recognize is not supported on Baseten STT');\n }\n\n updateOptions(opts: Partial<BasetenSttOptions>) {\n this.#opts = { ...this.#opts, ...opts };\n }\n\n stream(): SpeechStream {\n return new SpeechStream(this, this.#opts);\n }\n}\n\nexport class SpeechStream extends stt.SpeechStream {\n #opts: BasetenSttOptions;\n #logger = log();\n #speaking = false;\n #requestId = '';\n label = 'baseten.SpeechStream';\n\n constructor(stt: STT, opts: BasetenSttOptions) {\n super(stt, opts.sampleRate);\n this.#opts = opts;\n this.closed = false;\n }\n\n private getWsUrl(): string {\n if (this.#opts.modelEndpoint) {\n return this.#opts.modelEndpoint;\n }\n // Fallback to constructing URL from modelId (deprecated)\n return `wss://model-${this.#opts.modelId}.api.baseten.co/environments/${this.#opts.environment}/websocket`;\n }\n\n protected async run() {\n const maxRetry = 32;\n let retries = 0;\n\n while (!this.input.closed && !this.closed) {\n const url = this.getWsUrl();\n const headers = {\n Authorization: `Api-Key ${this.#opts.apiKey}`,\n };\n\n const ws = new WebSocket(url, { headers, rejectUnauthorized: false });\n\n try {\n await new Promise((resolve, reject) => {\n ws.on('open', resolve);\n ws.on('error', (error) => reject(error));\n ws.on('close', (code) => reject(`WebSocket returned ${code}`));\n });\n\n await this.#runWS(ws);\n } catch (e) {\n if (!this.closed && !this.input.closed) {\n if (retries >= maxRetry) {\n throw new Error(`failed to connect to Baseten after ${retries} attempts: ${e}`);\n }\n\n const delay = Math.min(retries * 5, 10);\n retries++;\n\n this.#logger.warn(\n `failed to connect to Baseten, retrying in ${delay} seconds: ${e} (${retries}/${maxRetry})`,\n );\n await new Promise((resolve) => setTimeout(resolve, delay * 1000));\n } else {\n this.#logger.warn(\n `Baseten disconnected, connection is closed: ${e} (inputClosed: ${this.input.closed}, isClosed: ${this.closed})`,\n );\n }\n }\n }\n\n this.closed = true;\n }\n\n async #runWS(ws: WebSocket) {\n let closing = false;\n\n // Send initial metadata\n // Note: Baseten server expects 'vad_params' and 'streaming_whisper_params' field names\n // (not 'streaming_vad_config', 'streaming_params', 'whisper_params' as in older versions)\n const metadata = {\n vad_params: {\n threshold: this.#opts.vadThreshold,\n min_silence_duration_ms: this.#opts.vadMinSilenceDurationMs,\n speech_pad_ms: this.#opts.vadSpeechPadMs,\n },\n streaming_whisper_params: {\n encoding: this.#opts.encoding ?? 'pcm_s16le',\n sample_rate: this.#opts.sampleRate ?? 16000,\n enable_partial_transcripts: false,\n audio_language: this.#opts.audioLanguage ?? 'en',\n show_word_timestamps: true,\n },\n };\n\n ws.send(JSON.stringify(metadata));\n\n const sendTask = async () => {\n const sampleRate = this.#opts.sampleRate ?? 16000;\n const samplesPerChunk = sampleRate === 16000 ? 512 : 256;\n const audioByteStream = new AudioByteStream(sampleRate, 1, samplesPerChunk);\n\n try {\n while (!this.closed) {\n const result = await this.input.next();\n if (result.done) {\n break;\n }\n\n const data = result.value;\n\n let frames: AudioFrame[];\n if (data === SpeechStream.FLUSH_SENTINEL) {\n // Flush any remaining buffered audio\n frames = audioByteStream.flush();\n } else {\n if (data.sampleRate !== sampleRate || data.channels !== 1) {\n throw new Error(\n `sample rate or channel count mismatch: expected ${sampleRate}Hz/1ch, got ${data.sampleRate}Hz/${data.channels}ch`,\n );\n }\n frames = audioByteStream.write(data.data.buffer as ArrayBuffer);\n }\n\n for (const frame of frames) {\n const buffer = Buffer.from(\n frame.data.buffer,\n frame.data.byteOffset,\n frame.data.byteLength,\n );\n ws.send(buffer);\n }\n }\n } finally {\n closing = true;\n ws.close();\n }\n };\n\n const listenTask = Task.from(async (controller) => {\n const listenMessage = new Promise<void>((resolve, reject) => {\n ws.on('message', (data) => {\n try {\n let jsonString: string;\n\n if (typeof data === 'string') {\n jsonString = data;\n } else if (data instanceof Buffer) {\n jsonString = data.toString('utf-8');\n } else if (Array.isArray(data)) {\n jsonString = Buffer.concat(data).toString('utf-8');\n } else {\n return;\n }\n\n const msg = JSON.parse(jsonString);\n const isFinal = msg.is_final ?? true;\n const segments = msg.segments ?? [];\n const transcript = msg.transcript ?? '';\n const confidence = msg.confidence ?? 0.0;\n const languageCode = msg.language_code ?? this.#opts.audioLanguage;\n\n // Skip if no transcript text\n if (!transcript) {\n this.#logger.debug('Received non-transcript message:', msg);\n return;\n }\n\n // Emit START_OF_SPEECH if not already speaking (only for interim or first final)\n if (!this.#speaking && !isFinal) {\n this.#speaking = true;\n this.queue.put({ type: stt.SpeechEventType.START_OF_SPEECH });\n }\n\n // Note: Baseten uses 'start_time' and 'end_time' field names (with underscores)\n const startTime =\n segments.length > 0\n ? (segments[0].start_time ?? 0.0) + this.startTimeOffset\n : this.startTimeOffset;\n const endTime =\n segments.length > 0\n ? (segments[segments.length - 1].end_time ?? 0.0) + this.startTimeOffset\n : this.startTimeOffset;\n\n // Note: Baseten returns segments (chunks) which we treat as words for aligned transcripts\n const words = segments.map(\n (segment: { text?: string; start_time?: number; end_time?: number }) => ({\n text: segment.text ?? '',\n startTime: (segment.start_time ?? 0.0) + this.startTimeOffset,\n endTime: (segment.end_time ?? 0.0) + this.startTimeOffset,\n startTimeOffset: this.startTimeOffset,\n confidence: confidence,\n }),\n );\n\n const speechData: stt.SpeechData = {\n language: languageCode!,\n text: transcript,\n startTime,\n endTime,\n confidence,\n words: words.length > 0 ? words : undefined,\n };\n\n // Handle interim vs final transcripts (matching Python implementation)\n if (!isFinal) {\n // Interim transcript\n this.queue.put({\n type: stt.SpeechEventType.INTERIM_TRANSCRIPT,\n alternatives: [speechData],\n });\n } else {\n // Final transcript\n this.queue.put({\n type: stt.SpeechEventType.FINAL_TRANSCRIPT,\n alternatives: [speechData],\n });\n\n // Emit END_OF_SPEECH after final transcript\n if (this.#speaking) {\n this.#speaking = false;\n this.queue.put({ type: stt.SpeechEventType.END_OF_SPEECH });\n }\n }\n\n if (this.closed || closing) {\n resolve();\n }\n } catch (err) {\n this.#logger.error(`STT: Error processing message: ${data}`);\n reject(err);\n }\n });\n\n ws.on('error', (err) => {\n if (!closing) {\n reject(err);\n }\n });\n\n ws.on('close', () => {\n if (!closing) {\n resolve();\n }\n });\n });\n\n await Promise.race([listenMessage, waitForAbort(controller.signal)]);\n }, this.abortController);\n\n await Promise.all([sendTask(), listenTask.result]);\n closing = true;\n ws.close();\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAGA,oBAAgF;AAEhF,gBAA0B;AAG1B,MAAM,oBAAgD;AAAA,EACpD,aAAa;AAAA,EACb,UAAU;AAAA,EACV,YAAY;AAAA,EACZ,mBAAmB;AAAA,EACnB,0BAA0B;AAAA,EAC1B,4BAA4B;AAAA,EAC5B,6BAA6B;AAAA,EAC7B,eAAe;AAAA,EACf,uBAAuB;AAAA,EACvB,cAAc;AAAA,EACd,yBAAyB;AAAA,EACzB,gBAAgB;AAClB;AAEO,MAAM,YAAY,kBAAI,IAAI;AAAA,EAC/B;AAAA,EACA,cAAU,mBAAI;AAAA,EACd,QAAQ;AAAA,EAER,YAAY,OAAmC,CAAC,GAAG;AACjD,UAAM;AAAA,MACJ,WAAW;AAAA,MACX,gBAAgB,KAAK,4BAA4B,kBAAkB;AAAA,MACnE,mBAAmB;AAAA,IACrB,CAAC;AAED,UAAM,SAAS,KAAK,UAAU,QAAQ,IAAI;AAC1C,UAAM,gBAAgB,KAAK,iBAAiB,QAAQ,IAAI;AACxD,UAAM,UAAU,KAAK,WAAW,QAAQ,IAAI;AAE5C,QAAI,CAAC,QAAQ;AACX,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,iBAAiB,CAAC,SAAS;AAC9B,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AAEA,SAAK,QAAQ;AAAA,MACX,GAAG;AAAA,MACH,GAAG;AAAA,MACH;AAAA,MACA;AAAA,MACA;AAAA,IACF;AAAA,EACF;AAAA;AAAA,EAGA,MAAM,WAAW,GAA0C;AACzD,UAAM,IAAI,MAAM,2CAA2C;AAAA,EAC7D;AAAA,EAEA,cAAc,MAAkC;AAC9C,SAAK,QAAQ,EAAE,GAAG,KAAK,OAAO,GAAG,KAAK;AAAA,EACxC;AAAA,EAEA,SAAuB;AACrB,WAAO,IAAI,aAAa,MAAM,KAAK,KAAK;AAAA,EAC1C;AACF;AAEO,MAAM,qBAAqB,kBAAI,aAAa;AAAA,EACjD;AAAA,EACA,cAAU,mBAAI;AAAA,EACd,YAAY;AAAA,EACZ,aAAa;AAAA,EACb,QAAQ;AAAA,EAER,YAAYA,MAAU,MAAyB;AAC7C,UAAMA,MAAK,KAAK,UAAU;AAC1B,SAAK,QAAQ;AACb,SAAK,SAAS;AAAA,EAChB;AAAA,EAEQ,WAAmB;AACzB,QAAI,KAAK,MAAM,eAAe;AAC5B,aAAO,KAAK,MAAM;AAAA,IACpB;AAEA,WAAO,eAAe,KAAK,MAAM,OAAO,gCAAgC,KAAK,MAAM,WAAW;AAAA,EAChG;AAAA,EAEA,MAAgB,MAAM;AACpB,UAAM,WAAW;AACjB,QAAI,UAAU;AAEd,WAAO,CAAC,KAAK,MAAM,UAAU,CAAC,KAAK,QAAQ;AACzC,YAAM,MAAM,KAAK,SAAS;AAC1B,YAAM,UAAU;AAAA,QACd,eAAe,WAAW,KAAK,MAAM,MAAM;AAAA,MAC7C;AAEA,YAAM,KAAK,IAAI,oBAAU,KAAK,EAAE,SAAS,oBAAoB,MAAM,CAAC;AAEpE,UAAI;AACF,cAAM,IAAI,QAAQ,CAAC,SAAS,WAAW;AACrC,aAAG,GAAG,QAAQ,OAAO;AACrB,aAAG,GAAG,SAAS,CAAC,UAAU,OAAO,KAAK,CAAC;AACvC,aAAG,GAAG,SAAS,CAAC,SAAS,OAAO,sBAAsB,IAAI,EAAE,CAAC;AAAA,QAC/D,CAAC;AAED,cAAM,KAAK,OAAO,EAAE;AAAA,MACtB,SAAS,GAAG;AACV,YAAI,CAAC,KAAK,UAAU,CAAC,KAAK,MAAM,QAAQ;AACtC,cAAI,WAAW,UAAU;AACvB,kBAAM,IAAI,MAAM,sCAAsC,OAAO,cAAc,CAAC,EAAE;AAAA,UAChF;AAEA,gBAAM,QAAQ,KAAK,IAAI,UAAU,GAAG,EAAE;AACtC;AAEA,eAAK,QAAQ;AAAA,YACX,6CAA6C,KAAK,aAAa,CAAC,KAAK,OAAO,IAAI,QAAQ;AAAA,UAC1F;AACA,gBAAM,IAAI,QAAQ,CAAC,YAAY,WAAW,SAAS,QAAQ,GAAI,CAAC;AAAA,QAClE,OAAO;AACL,eAAK,QAAQ;AAAA,YACX,+CAA+C,CAAC,kBAAkB,KAAK,MAAM,MAAM,eAAe,KAAK,MAAM;AAAA,UAC/G;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAEA,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,MAAM,OAAO,IAAe;AAC1B,QAAI,UAAU;AAKd,UAAM,WAAW;AAAA,MACf,YAAY;AAAA,QACV,WAAW,KAAK,MAAM;AAAA,QACtB,yBAAyB,KAAK,MAAM;AAAA,QACpC,eAAe,KAAK,MAAM;AAAA,MAC5B;AAAA,MACA,0BAA0B;AAAA,QACxB,UAAU,KAAK,MAAM,YAAY;AAAA,QACjC,aAAa,KAAK,MAAM,cAAc;AAAA,QACtC,4BAA4B;AAAA,QAC5B,gBAAgB,KAAK,MAAM,iBAAiB;AAAA,QAC5C,sBAAsB;AAAA,MACxB;AAAA,IACF;AAEA,OAAG,KAAK,KAAK,UAAU,QAAQ,CAAC;AAEhC,UAAM,WAAW,YAAY;AAC3B,YAAM,aAAa,KAAK,MAAM,cAAc;AAC5C,YAAM,kBAAkB,eAAe,OAAQ,MAAM;AACrD,YAAM,kBAAkB,IAAI,8BAAgB,YAAY,GAAG,eAAe;AAE1E,UAAI;AACF,eAAO,CAAC,KAAK,QAAQ;AACnB,gBAAM,SAAS,MAAM,KAAK,MAAM,KAAK;AACrC,cAAI,OAAO,MAAM;AACf;AAAA,UACF;AAEA,gBAAM,OAAO,OAAO;AAEpB,cAAI;AACJ,cAAI,SAAS,aAAa,gBAAgB;AAExC,qBAAS,gBAAgB,MAAM;AAAA,UACjC,OAAO;AACL,gBAAI,KAAK,eAAe,cAAc,KAAK,aAAa,GAAG;AACzD,oBAAM,IAAI;AAAA,gBACR,mDAAmD,UAAU,eAAe,KAAK,UAAU,MAAM,KAAK,QAAQ;AAAA,cAChH;AAAA,YACF;AACA,qBAAS,gBAAgB,MAAM,KAAK,KAAK,MAAqB;AAAA,UAChE;AAEA,qBAAW,SAAS,QAAQ;AAC1B,kBAAM,SAAS,OAAO;AAAA,cACpB,MAAM,KAAK;AAAA,cACX,MAAM,KAAK;AAAA,cACX,MAAM,KAAK;AAAA,YACb;AACA,eAAG,KAAK,MAAM;AAAA,UAChB;AAAA,QACF;AAAA,MACF,UAAE;AACA,kBAAU;AACV,WAAG,MAAM;AAAA,MACX;AAAA,IACF;AAEA,UAAM,aAAa,mBAAK,KAAK,OAAO,eAAe;AACjD,YAAM,gBAAgB,IAAI,QAAc,CAAC,SAAS,WAAW;AAC3D,WAAG,GAAG,WAAW,CAAC,SAAS;AACzB,cAAI;AACF,gBAAI;AAEJ,gBAAI,OAAO,SAAS,UAAU;AAC5B,2BAAa;AAAA,YACf,WAAW,gBAAgB,QAAQ;AACjC,2BAAa,KAAK,SAAS,OAAO;AAAA,YACpC,WAAW,MAAM,QAAQ,IAAI,GAAG;AAC9B,2BAAa,OAAO,OAAO,IAAI,EAAE,SAAS,OAAO;AAAA,YACnD,OAAO;AACL;AAAA,YACF;AAEA,kBAAM,MAAM,KAAK,MAAM,UAAU;AACjC,kBAAM,UAAU,IAAI,YAAY;AAChC,kBAAM,WAAW,IAAI,YAAY,CAAC;AAClC,kBAAM,aAAa,IAAI,cAAc;AACrC,kBAAM,aAAa,IAAI,cAAc;AACrC,kBAAM,eAAe,IAAI,iBAAiB,KAAK,MAAM;AAGrD,gBAAI,CAAC,YAAY;AACf,mBAAK,QAAQ,MAAM,oCAAoC,GAAG;AAC1D;AAAA,YACF;AAGA,gBAAI,CAAC,KAAK,aAAa,CAAC,SAAS;AAC/B,mBAAK,YAAY;AACjB,mBAAK,MAAM,IAAI,EAAE,MAAM,kBAAI,gBAAgB,gBAAgB,CAAC;AAAA,YAC9D;AAGA,kBAAM,YACJ,SAAS,SAAS,KACb,SAAS,CAAC,EAAE,cAAc,KAAO,KAAK,kBACvC,KAAK;AACX,kBAAM,UACJ,SAAS,SAAS,KACb,SAAS,SAAS,SAAS,CAAC,EAAE,YAAY,KAAO,KAAK,kBACvD,KAAK;AAGX,kBAAM,QAAQ,SAAS;AAAA,cACrB,CAAC,aAAwE;AAAA,gBACvE,MAAM,QAAQ,QAAQ;AAAA,gBACtB,YAAY,QAAQ,cAAc,KAAO,KAAK;AAAA,gBAC9C,UAAU,QAAQ,YAAY,KAAO,KAAK;AAAA,gBAC1C,iBAAiB,KAAK;AAAA,gBACtB;AAAA,cACF;AAAA,YACF;AAEA,kBAAM,aAA6B;AAAA,cACjC,UAAU;AAAA,cACV,MAAM;AAAA,cACN;AAAA,cACA;AAAA,cACA;AAAA,cACA,OAAO,MAAM,SAAS,IAAI,QAAQ;AAAA,YACpC;AAGA,gBAAI,CAAC,SAAS;AAEZ,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,kBAAI,gBAAgB;AAAA,gBAC1B,cAAc,CAAC,UAAU;AAAA,cAC3B,CAAC;AAAA,YACH,OAAO;AAEL,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,kBAAI,gBAAgB;AAAA,gBAC1B,cAAc,CAAC,UAAU;AAAA,cAC3B,CAAC;AAGD,kBAAI,KAAK,WAAW;AAClB,qBAAK,YAAY;AACjB,qBAAK,MAAM,IAAI,EAAE,MAAM,kBAAI,gBAAgB,cAAc,CAAC;AAAA,cAC5D;AAAA,YACF;AAEA,gBAAI,KAAK,UAAU,SAAS;AAC1B,sBAAQ;AAAA,YACV;AAAA,UACF,SAAS,KAAK;AACZ,iBAAK,QAAQ,MAAM,kCAAkC,IAAI,EAAE;AAC3D,mBAAO,GAAG;AAAA,UACZ;AAAA,QACF,CAAC;AAED,WAAG,GAAG,SAAS,CAAC,QAAQ;AACtB,cAAI,CAAC,SAAS;AACZ,mBAAO,GAAG;AAAA,UACZ;AAAA,QACF,CAAC;AAED,WAAG,GAAG,SAAS,MAAM;AACnB,cAAI,CAAC,SAAS;AACZ,oBAAQ;AAAA,UACV;AAAA,QACF,CAAC;AAAA,MACH,CAAC;AAED,YAAM,QAAQ,KAAK,CAAC,mBAAe,4BAAa,WAAW,MAAM,CAAC,CAAC;AAAA,IACrE,GAAG,KAAK,eAAe;AAEvB,UAAM,QAAQ,IAAI,CAAC,SAAS,GAAG,WAAW,MAAM,CAAC;AACjD,cAAU;AACV,OAAG,MAAM;AAAA,EACX;AACF;","names":["stt"]}
|
package/dist/stt.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"stt.d.ts","sourceRoot":"","sources":["../src/stt.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,KAAK,WAAW,EAA8B,GAAG,EAAgB,MAAM,iBAAiB,CAAC;AAGlG,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,YAAY,CAAC;AAiBpD,qBAAa,GAAI,SAAQ,GAAG,CAAC,GAAG;;IAG9B,KAAK,SAAiB;gBAEV,IAAI,GAAE,OAAO,CAAC,iBAAiB,CAAM;
|
|
1
|
+
{"version":3,"file":"stt.d.ts","sourceRoot":"","sources":["../src/stt.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,KAAK,WAAW,EAA8B,GAAG,EAAgB,MAAM,iBAAiB,CAAC;AAGlG,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,YAAY,CAAC;AAiBpD,qBAAa,GAAI,SAAQ,GAAG,CAAC,GAAG;;IAG9B,KAAK,SAAiB;gBAEV,IAAI,GAAE,OAAO,CAAC,iBAAiB,CAAM;IAgC3C,UAAU,CAAC,CAAC,EAAE,WAAW,GAAG,OAAO,CAAC,GAAG,CAAC,WAAW,CAAC;IAI1D,aAAa,CAAC,IAAI,EAAE,OAAO,CAAC,iBAAiB,CAAC;IAI9C,MAAM,IAAI,YAAY;CAGvB;AAED,qBAAa,YAAa,SAAQ,GAAG,CAAC,YAAY;;IAKhD,KAAK,SAA0B;gBAEnB,GAAG,EAAE,GAAG,EAAE,IAAI,EAAE,iBAAiB;IAM7C,OAAO,CAAC,QAAQ;cAQA,GAAG;CAgOpB"}
|
package/dist/stt.js
CHANGED
|
@@ -21,24 +21,27 @@ class STT extends stt.STT {
|
|
|
21
21
|
constructor(opts = {}) {
|
|
22
22
|
super({
|
|
23
23
|
streaming: true,
|
|
24
|
-
interimResults: opts.enablePartialTranscripts ?? defaultSTTOptions.enablePartialTranscripts
|
|
24
|
+
interimResults: opts.enablePartialTranscripts ?? defaultSTTOptions.enablePartialTranscripts,
|
|
25
|
+
alignedTranscript: "word"
|
|
25
26
|
});
|
|
26
27
|
const apiKey = opts.apiKey ?? process.env.BASETEN_API_KEY;
|
|
28
|
+
const modelEndpoint = opts.modelEndpoint ?? process.env.BASETEN_MODEL_ENDPOINT;
|
|
27
29
|
const modelId = opts.modelId ?? process.env.BASETEN_STT_MODEL_ID;
|
|
28
30
|
if (!apiKey) {
|
|
29
31
|
throw new Error(
|
|
30
32
|
"Baseten API key is required, either pass it as `apiKey` or set $BASETEN_API_KEY"
|
|
31
33
|
);
|
|
32
34
|
}
|
|
33
|
-
if (!modelId) {
|
|
35
|
+
if (!modelEndpoint && !modelId) {
|
|
34
36
|
throw new Error(
|
|
35
|
-
"Baseten model
|
|
37
|
+
"Baseten model endpoint is required, either pass it as `modelEndpoint` or set $BASETEN_MODEL_ENDPOINT"
|
|
36
38
|
);
|
|
37
39
|
}
|
|
38
40
|
this.#opts = {
|
|
39
41
|
...defaultSTTOptions,
|
|
40
42
|
...opts,
|
|
41
43
|
apiKey,
|
|
44
|
+
modelEndpoint,
|
|
42
45
|
modelId
|
|
43
46
|
};
|
|
44
47
|
}
|
|
@@ -65,6 +68,9 @@ class SpeechStream extends stt.SpeechStream {
|
|
|
65
68
|
this.closed = false;
|
|
66
69
|
}
|
|
67
70
|
getWsUrl() {
|
|
71
|
+
if (this.#opts.modelEndpoint) {
|
|
72
|
+
return this.#opts.modelEndpoint;
|
|
73
|
+
}
|
|
68
74
|
return `wss://model-${this.#opts.modelId}.api.baseten.co/environments/${this.#opts.environment}/websocket`;
|
|
69
75
|
}
|
|
70
76
|
async run() {
|
|
@@ -75,7 +81,7 @@ class SpeechStream extends stt.SpeechStream {
|
|
|
75
81
|
const headers = {
|
|
76
82
|
Authorization: `Api-Key ${this.#opts.apiKey}`
|
|
77
83
|
};
|
|
78
|
-
const ws = new WebSocket(url, { headers });
|
|
84
|
+
const ws = new WebSocket(url, { headers, rejectUnauthorized: false });
|
|
79
85
|
try {
|
|
80
86
|
await new Promise((resolve, reject) => {
|
|
81
87
|
ws.on("open", resolve);
|
|
@@ -106,22 +112,17 @@ class SpeechStream extends stt.SpeechStream {
|
|
|
106
112
|
async #runWS(ws) {
|
|
107
113
|
let closing = false;
|
|
108
114
|
const metadata = {
|
|
109
|
-
|
|
115
|
+
vad_params: {
|
|
110
116
|
threshold: this.#opts.vadThreshold,
|
|
111
117
|
min_silence_duration_ms: this.#opts.vadMinSilenceDurationMs,
|
|
112
118
|
speech_pad_ms: this.#opts.vadSpeechPadMs
|
|
113
119
|
},
|
|
114
|
-
|
|
120
|
+
streaming_whisper_params: {
|
|
115
121
|
encoding: this.#opts.encoding ?? "pcm_s16le",
|
|
116
122
|
sample_rate: this.#opts.sampleRate ?? 16e3,
|
|
117
|
-
enable_partial_transcripts:
|
|
118
|
-
partial_transcript_interval_s: this.#opts.partialTranscriptIntervalS,
|
|
119
|
-
final_transcript_max_duration_s: this.#opts.finalTranscriptMaxDurationS
|
|
120
|
-
},
|
|
121
|
-
whisper_params: {
|
|
122
|
-
prompt: this.#opts.prompt,
|
|
123
|
+
enable_partial_transcripts: false,
|
|
123
124
|
audio_language: this.#opts.audioLanguage ?? "en",
|
|
124
|
-
|
|
125
|
+
show_word_timestamps: true
|
|
125
126
|
}
|
|
126
127
|
};
|
|
127
128
|
ws.send(JSON.stringify(metadata));
|
|
@@ -189,14 +190,24 @@ class SpeechStream extends stt.SpeechStream {
|
|
|
189
190
|
this.#speaking = true;
|
|
190
191
|
this.queue.put({ type: stt.SpeechEventType.START_OF_SPEECH });
|
|
191
192
|
}
|
|
192
|
-
const startTime = segments.length > 0 ? segments[0].
|
|
193
|
-
const endTime = segments.length > 0 ? segments[segments.length - 1].
|
|
193
|
+
const startTime = segments.length > 0 ? (segments[0].start_time ?? 0) + this.startTimeOffset : this.startTimeOffset;
|
|
194
|
+
const endTime = segments.length > 0 ? (segments[segments.length - 1].end_time ?? 0) + this.startTimeOffset : this.startTimeOffset;
|
|
195
|
+
const words = segments.map(
|
|
196
|
+
(segment) => ({
|
|
197
|
+
text: segment.text ?? "",
|
|
198
|
+
startTime: (segment.start_time ?? 0) + this.startTimeOffset,
|
|
199
|
+
endTime: (segment.end_time ?? 0) + this.startTimeOffset,
|
|
200
|
+
startTimeOffset: this.startTimeOffset,
|
|
201
|
+
confidence
|
|
202
|
+
})
|
|
203
|
+
);
|
|
194
204
|
const speechData = {
|
|
195
205
|
language: languageCode,
|
|
196
206
|
text: transcript,
|
|
197
207
|
startTime,
|
|
198
208
|
endTime,
|
|
199
|
-
confidence
|
|
209
|
+
confidence,
|
|
210
|
+
words: words.length > 0 ? words : void 0
|
|
200
211
|
};
|
|
201
212
|
if (!isFinal) {
|
|
202
213
|
this.queue.put({
|
package/dist/stt.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/stt.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { type AudioBuffer, AudioByteStream, Task, log, stt, waitForAbort } from '@livekit/agents';\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { WebSocket } from 'ws';\nimport type { BasetenSttOptions } from './types.js';\n\nconst defaultSTTOptions: Partial<BasetenSttOptions> = {\n environment: 'production',\n encoding: 'pcm_s16le',\n sampleRate: 16000,\n bufferSizeSeconds: 0.032,\n enablePartialTranscripts: true,\n partialTranscriptIntervalS: 0.5,\n finalTranscriptMaxDurationS: 5,\n audioLanguage: 'en',\n languageDetectionOnly: false,\n vadThreshold: 0.5,\n vadMinSilenceDurationMs: 300,\n vadSpeechPadMs: 30,\n};\n\nexport class STT extends stt.STT {\n #opts: BasetenSttOptions;\n #logger = log();\n label = 'baseten.STT';\n\n constructor(opts: Partial<BasetenSttOptions> = {}) {\n super({\n streaming: true,\n interimResults: opts.enablePartialTranscripts ?? defaultSTTOptions.enablePartialTranscripts!,\n });\n\n const apiKey = opts.apiKey ?? process.env.BASETEN_API_KEY;\n const modelId = opts.modelId ?? process.env.BASETEN_STT_MODEL_ID;\n\n if (!apiKey) {\n throw new Error(\n 'Baseten API key is required, either pass it as `apiKey` or set $BASETEN_API_KEY',\n );\n }\n if (!modelId) {\n throw new Error(\n 'Baseten model ID is required, either pass it as `modelId` or set $BASETEN_STT_MODEL_ID',\n );\n }\n\n this.#opts = {\n ...defaultSTTOptions,\n ...opts,\n apiKey,\n modelId,\n } as BasetenSttOptions;\n }\n\n // eslint-disable-next-line\n async _recognize(_: AudioBuffer): Promise<stt.SpeechEvent> {\n throw new Error('Recognize is not supported on Baseten STT');\n }\n\n updateOptions(opts: Partial<BasetenSttOptions>) {\n this.#opts = { ...this.#opts, ...opts };\n }\n\n stream(): SpeechStream {\n return new SpeechStream(this, this.#opts);\n }\n}\n\nexport class SpeechStream extends stt.SpeechStream {\n #opts: BasetenSttOptions;\n #logger = log();\n #speaking = false;\n #requestId = '';\n label = 'baseten.SpeechStream';\n\n constructor(stt: STT, opts: BasetenSttOptions) {\n super(stt, opts.sampleRate);\n this.#opts = opts;\n this.closed = false;\n }\n\n private getWsUrl(): string {\n return `wss://model-${this.#opts.modelId}.api.baseten.co/environments/${this.#opts.environment}/websocket`;\n }\n\n protected async run() {\n const maxRetry = 32;\n let retries = 0;\n\n while (!this.input.closed && !this.closed) {\n const url = this.getWsUrl();\n const headers = {\n Authorization: `Api-Key ${this.#opts.apiKey}`,\n };\n\n const ws = new WebSocket(url, { headers });\n\n try {\n await new Promise((resolve, reject) => {\n ws.on('open', resolve);\n ws.on('error', (error) => reject(error));\n ws.on('close', (code) => reject(`WebSocket returned ${code}`));\n });\n\n await this.#runWS(ws);\n } catch (e) {\n if (!this.closed && !this.input.closed) {\n if (retries >= maxRetry) {\n throw new Error(`failed to connect to Baseten after ${retries} attempts: ${e}`);\n }\n\n const delay = Math.min(retries * 5, 10);\n retries++;\n\n this.#logger.warn(\n `failed to connect to Baseten, retrying in ${delay} seconds: ${e} (${retries}/${maxRetry})`,\n );\n await new Promise((resolve) => setTimeout(resolve, delay * 1000));\n } else {\n this.#logger.warn(\n `Baseten disconnected, connection is closed: ${e} (inputClosed: ${this.input.closed}, isClosed: ${this.closed})`,\n );\n }\n }\n }\n\n this.closed = true;\n }\n\n async #runWS(ws: WebSocket) {\n let closing = false;\n\n // Send initial metadata\n const metadata = {\n streaming_vad_config: {\n threshold: this.#opts.vadThreshold,\n min_silence_duration_ms: this.#opts.vadMinSilenceDurationMs,\n speech_pad_ms: this.#opts.vadSpeechPadMs,\n },\n streaming_params: {\n encoding: this.#opts.encoding ?? 'pcm_s16le',\n sample_rate: this.#opts.sampleRate ?? 16000,\n enable_partial_transcripts: this.#opts.enablePartialTranscripts,\n partial_transcript_interval_s: this.#opts.partialTranscriptIntervalS,\n final_transcript_max_duration_s: this.#opts.finalTranscriptMaxDurationS,\n },\n whisper_params: {\n prompt: this.#opts.prompt,\n audio_language: this.#opts.audioLanguage ?? 'en',\n language_detection_only: this.#opts.languageDetectionOnly ?? false,\n },\n };\n ws.send(JSON.stringify(metadata));\n\n const sendTask = async () => {\n const sampleRate = this.#opts.sampleRate ?? 16000;\n const samplesPerChunk = sampleRate === 16000 ? 512 : 256;\n const audioByteStream = new AudioByteStream(sampleRate, 1, samplesPerChunk);\n\n try {\n while (!this.closed) {\n const result = await this.input.next();\n if (result.done) {\n break;\n }\n\n const data = result.value;\n\n let frames: AudioFrame[];\n if (data === SpeechStream.FLUSH_SENTINEL) {\n // Flush any remaining buffered audio\n frames = audioByteStream.flush();\n } else {\n if (data.sampleRate !== sampleRate || data.channels !== 1) {\n throw new Error(\n `sample rate or channel count mismatch: expected ${sampleRate}Hz/1ch, got ${data.sampleRate}Hz/${data.channels}ch`,\n );\n }\n frames = audioByteStream.write(data.data.buffer as ArrayBuffer);\n }\n\n for (const frame of frames) {\n const buffer = Buffer.from(\n frame.data.buffer,\n frame.data.byteOffset,\n frame.data.byteLength,\n );\n ws.send(buffer);\n }\n }\n } finally {\n closing = true;\n ws.close();\n }\n };\n\n const listenTask = Task.from(async (controller) => {\n const listenMessage = new Promise<void>((resolve, reject) => {\n ws.on('message', (data) => {\n try {\n let jsonString: string;\n\n if (typeof data === 'string') {\n jsonString = data;\n } else if (data instanceof Buffer) {\n jsonString = data.toString('utf-8');\n } else if (Array.isArray(data)) {\n jsonString = Buffer.concat(data).toString('utf-8');\n } else {\n return;\n }\n\n const msg = JSON.parse(jsonString);\n\n // Parse response format matching Python implementation\n const isFinal = msg.is_final ?? true;\n const segments = msg.segments ?? [];\n const transcript = msg.transcript ?? '';\n const confidence = msg.confidence ?? 0.0;\n const languageCode = msg.language_code ?? this.#opts.audioLanguage;\n\n // Skip if no transcript text\n if (!transcript) {\n this.#logger.debug('Received non-transcript message:', msg);\n return;\n }\n\n // Emit START_OF_SPEECH if not already speaking (only for interim or first final)\n if (!this.#speaking && !isFinal) {\n this.#speaking = true;\n this.queue.put({ type: stt.SpeechEventType.START_OF_SPEECH });\n }\n\n // Extract timing from segments\n const startTime = segments.length > 0 ? segments[0].start ?? 0.0 : 0.0;\n const endTime = segments.length > 0 ? segments[segments.length - 1].end ?? 0.0 : 0.0;\n\n const speechData: stt.SpeechData = {\n language: languageCode!,\n text: transcript,\n startTime,\n endTime,\n confidence,\n };\n\n // Handle interim vs final transcripts (matching Python implementation)\n if (!isFinal) {\n // Interim transcript\n this.queue.put({\n type: stt.SpeechEventType.INTERIM_TRANSCRIPT,\n alternatives: [speechData],\n });\n } else {\n // Final transcript\n this.queue.put({\n type: stt.SpeechEventType.FINAL_TRANSCRIPT,\n alternatives: [speechData],\n });\n\n // Emit END_OF_SPEECH after final transcript\n if (this.#speaking) {\n this.#speaking = false;\n this.queue.put({ type: stt.SpeechEventType.END_OF_SPEECH });\n }\n }\n\n if (this.closed || closing) {\n resolve();\n }\n } catch (err) {\n this.#logger.error(`STT: Error processing message: ${data}`);\n reject(err);\n }\n });\n\n ws.on('error', (err) => {\n if (!closing) {\n reject(err);\n }\n });\n\n ws.on('close', () => {\n if (!closing) {\n resolve();\n }\n });\n });\n\n await Promise.race([listenMessage, waitForAbort(controller.signal)]);\n }, this.abortController);\n\n await Promise.all([sendTask(), listenTask.result]);\n closing = true;\n ws.close();\n }\n}\n"],"mappings":"AAGA,SAA2B,iBAAiB,MAAM,KAAK,KAAK,oBAAoB;AAEhF,SAAS,iBAAiB;AAG1B,MAAM,oBAAgD;AAAA,EACpD,aAAa;AAAA,EACb,UAAU;AAAA,EACV,YAAY;AAAA,EACZ,mBAAmB;AAAA,EACnB,0BAA0B;AAAA,EAC1B,4BAA4B;AAAA,EAC5B,6BAA6B;AAAA,EAC7B,eAAe;AAAA,EACf,uBAAuB;AAAA,EACvB,cAAc;AAAA,EACd,yBAAyB;AAAA,EACzB,gBAAgB;AAClB;AAEO,MAAM,YAAY,IAAI,IAAI;AAAA,EAC/B;AAAA,EACA,UAAU,IAAI;AAAA,EACd,QAAQ;AAAA,EAER,YAAY,OAAmC,CAAC,GAAG;AACjD,UAAM;AAAA,MACJ,WAAW;AAAA,MACX,gBAAgB,KAAK,4BAA4B,kBAAkB;AAAA,IACrE,CAAC;AAED,UAAM,SAAS,KAAK,UAAU,QAAQ,IAAI;AAC1C,UAAM,UAAU,KAAK,WAAW,QAAQ,IAAI;AAE5C,QAAI,CAAC,QAAQ;AACX,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,SAAS;AACZ,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AAEA,SAAK,QAAQ;AAAA,MACX,GAAG;AAAA,MACH,GAAG;AAAA,MACH;AAAA,MACA;AAAA,IACF;AAAA,EACF;AAAA;AAAA,EAGA,MAAM,WAAW,GAA0C;AACzD,UAAM,IAAI,MAAM,2CAA2C;AAAA,EAC7D;AAAA,EAEA,cAAc,MAAkC;AAC9C,SAAK,QAAQ,EAAE,GAAG,KAAK,OAAO,GAAG,KAAK;AAAA,EACxC;AAAA,EAEA,SAAuB;AACrB,WAAO,IAAI,aAAa,MAAM,KAAK,KAAK;AAAA,EAC1C;AACF;AAEO,MAAM,qBAAqB,IAAI,aAAa;AAAA,EACjD;AAAA,EACA,UAAU,IAAI;AAAA,EACd,YAAY;AAAA,EACZ,aAAa;AAAA,EACb,QAAQ;AAAA,EAER,YAAYA,MAAU,MAAyB;AAC7C,UAAMA,MAAK,KAAK,UAAU;AAC1B,SAAK,QAAQ;AACb,SAAK,SAAS;AAAA,EAChB;AAAA,EAEQ,WAAmB;AACzB,WAAO,eAAe,KAAK,MAAM,OAAO,gCAAgC,KAAK,MAAM,WAAW;AAAA,EAChG;AAAA,EAEA,MAAgB,MAAM;AACpB,UAAM,WAAW;AACjB,QAAI,UAAU;AAEd,WAAO,CAAC,KAAK,MAAM,UAAU,CAAC,KAAK,QAAQ;AACzC,YAAM,MAAM,KAAK,SAAS;AAC1B,YAAM,UAAU;AAAA,QACd,eAAe,WAAW,KAAK,MAAM,MAAM;AAAA,MAC7C;AAEA,YAAM,KAAK,IAAI,UAAU,KAAK,EAAE,QAAQ,CAAC;AAEzC,UAAI;AACF,cAAM,IAAI,QAAQ,CAAC,SAAS,WAAW;AACrC,aAAG,GAAG,QAAQ,OAAO;AACrB,aAAG,GAAG,SAAS,CAAC,UAAU,OAAO,KAAK,CAAC;AACvC,aAAG,GAAG,SAAS,CAAC,SAAS,OAAO,sBAAsB,IAAI,EAAE,CAAC;AAAA,QAC/D,CAAC;AAED,cAAM,KAAK,OAAO,EAAE;AAAA,MACtB,SAAS,GAAG;AACV,YAAI,CAAC,KAAK,UAAU,CAAC,KAAK,MAAM,QAAQ;AACtC,cAAI,WAAW,UAAU;AACvB,kBAAM,IAAI,MAAM,sCAAsC,OAAO,cAAc,CAAC,EAAE;AAAA,UAChF;AAEA,gBAAM,QAAQ,KAAK,IAAI,UAAU,GAAG,EAAE;AACtC;AAEA,eAAK,QAAQ;AAAA,YACX,6CAA6C,KAAK,aAAa,CAAC,KAAK,OAAO,IAAI,QAAQ;AAAA,UAC1F;AACA,gBAAM,IAAI,QAAQ,CAAC,YAAY,WAAW,SAAS,QAAQ,GAAI,CAAC;AAAA,QAClE,OAAO;AACL,eAAK,QAAQ;AAAA,YACX,+CAA+C,CAAC,kBAAkB,KAAK,MAAM,MAAM,eAAe,KAAK,MAAM;AAAA,UAC/G;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAEA,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,MAAM,OAAO,IAAe;AAC1B,QAAI,UAAU;AAGd,UAAM,WAAW;AAAA,MACf,sBAAsB;AAAA,QACpB,WAAW,KAAK,MAAM;AAAA,QACtB,yBAAyB,KAAK,MAAM;AAAA,QACpC,eAAe,KAAK,MAAM;AAAA,MAC5B;AAAA,MACA,kBAAkB;AAAA,QAChB,UAAU,KAAK,MAAM,YAAY;AAAA,QACjC,aAAa,KAAK,MAAM,cAAc;AAAA,QACtC,4BAA4B,KAAK,MAAM;AAAA,QACvC,+BAA+B,KAAK,MAAM;AAAA,QAC1C,iCAAiC,KAAK,MAAM;AAAA,MAC9C;AAAA,MACA,gBAAgB;AAAA,QACd,QAAQ,KAAK,MAAM;AAAA,QACnB,gBAAgB,KAAK,MAAM,iBAAiB;AAAA,QAC5C,yBAAyB,KAAK,MAAM,yBAAyB;AAAA,MAC/D;AAAA,IACF;AACA,OAAG,KAAK,KAAK,UAAU,QAAQ,CAAC;AAEhC,UAAM,WAAW,YAAY;AAC3B,YAAM,aAAa,KAAK,MAAM,cAAc;AAC5C,YAAM,kBAAkB,eAAe,OAAQ,MAAM;AACrD,YAAM,kBAAkB,IAAI,gBAAgB,YAAY,GAAG,eAAe;AAE1E,UAAI;AACF,eAAO,CAAC,KAAK,QAAQ;AACnB,gBAAM,SAAS,MAAM,KAAK,MAAM,KAAK;AACrC,cAAI,OAAO,MAAM;AACf;AAAA,UACF;AAEA,gBAAM,OAAO,OAAO;AAEpB,cAAI;AACJ,cAAI,SAAS,aAAa,gBAAgB;AAExC,qBAAS,gBAAgB,MAAM;AAAA,UACjC,OAAO;AACL,gBAAI,KAAK,eAAe,cAAc,KAAK,aAAa,GAAG;AACzD,oBAAM,IAAI;AAAA,gBACR,mDAAmD,UAAU,eAAe,KAAK,UAAU,MAAM,KAAK,QAAQ;AAAA,cAChH;AAAA,YACF;AACA,qBAAS,gBAAgB,MAAM,KAAK,KAAK,MAAqB;AAAA,UAChE;AAEA,qBAAW,SAAS,QAAQ;AAC1B,kBAAM,SAAS,OAAO;AAAA,cACpB,MAAM,KAAK;AAAA,cACX,MAAM,KAAK;AAAA,cACX,MAAM,KAAK;AAAA,YACb;AACA,eAAG,KAAK,MAAM;AAAA,UAChB;AAAA,QACF;AAAA,MACF,UAAE;AACA,kBAAU;AACV,WAAG,MAAM;AAAA,MACX;AAAA,IACF;AAEA,UAAM,aAAa,KAAK,KAAK,OAAO,eAAe;AACjD,YAAM,gBAAgB,IAAI,QAAc,CAAC,SAAS,WAAW;AAC3D,WAAG,GAAG,WAAW,CAAC,SAAS;AACzB,cAAI;AACF,gBAAI;AAEJ,gBAAI,OAAO,SAAS,UAAU;AAC5B,2BAAa;AAAA,YACf,WAAW,gBAAgB,QAAQ;AACjC,2BAAa,KAAK,SAAS,OAAO;AAAA,YACpC,WAAW,MAAM,QAAQ,IAAI,GAAG;AAC9B,2BAAa,OAAO,OAAO,IAAI,EAAE,SAAS,OAAO;AAAA,YACnD,OAAO;AACL;AAAA,YACF;AAEA,kBAAM,MAAM,KAAK,MAAM,UAAU;AAGjC,kBAAM,UAAU,IAAI,YAAY;AAChC,kBAAM,WAAW,IAAI,YAAY,CAAC;AAClC,kBAAM,aAAa,IAAI,cAAc;AACrC,kBAAM,aAAa,IAAI,cAAc;AACrC,kBAAM,eAAe,IAAI,iBAAiB,KAAK,MAAM;AAGrD,gBAAI,CAAC,YAAY;AACf,mBAAK,QAAQ,MAAM,oCAAoC,GAAG;AAC1D;AAAA,YACF;AAGA,gBAAI,CAAC,KAAK,aAAa,CAAC,SAAS;AAC/B,mBAAK,YAAY;AACjB,mBAAK,MAAM,IAAI,EAAE,MAAM,IAAI,gBAAgB,gBAAgB,CAAC;AAAA,YAC9D;AAGA,kBAAM,YAAY,SAAS,SAAS,IAAI,SAAS,CAAC,EAAE,SAAS,IAAM;AACnE,kBAAM,UAAU,SAAS,SAAS,IAAI,SAAS,SAAS,SAAS,CAAC,EAAE,OAAO,IAAM;AAEjF,kBAAM,aAA6B;AAAA,cACjC,UAAU;AAAA,cACV,MAAM;AAAA,cACN;AAAA,cACA;AAAA,cACA;AAAA,YACF;AAGA,gBAAI,CAAC,SAAS;AAEZ,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,IAAI,gBAAgB;AAAA,gBAC1B,cAAc,CAAC,UAAU;AAAA,cAC3B,CAAC;AAAA,YACH,OAAO;AAEL,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,IAAI,gBAAgB;AAAA,gBAC1B,cAAc,CAAC,UAAU;AAAA,cAC3B,CAAC;AAGD,kBAAI,KAAK,WAAW;AAClB,qBAAK,YAAY;AACjB,qBAAK,MAAM,IAAI,EAAE,MAAM,IAAI,gBAAgB,cAAc,CAAC;AAAA,cAC5D;AAAA,YACF;AAEA,gBAAI,KAAK,UAAU,SAAS;AAC1B,sBAAQ;AAAA,YACV;AAAA,UACF,SAAS,KAAK;AACZ,iBAAK,QAAQ,MAAM,kCAAkC,IAAI,EAAE;AAC3D,mBAAO,GAAG;AAAA,UACZ;AAAA,QACF,CAAC;AAED,WAAG,GAAG,SAAS,CAAC,QAAQ;AACtB,cAAI,CAAC,SAAS;AACZ,mBAAO,GAAG;AAAA,UACZ;AAAA,QACF,CAAC;AAED,WAAG,GAAG,SAAS,MAAM;AACnB,cAAI,CAAC,SAAS;AACZ,oBAAQ;AAAA,UACV;AAAA,QACF,CAAC;AAAA,MACH,CAAC;AAED,YAAM,QAAQ,KAAK,CAAC,eAAe,aAAa,WAAW,MAAM,CAAC,CAAC;AAAA,IACrE,GAAG,KAAK,eAAe;AAEvB,UAAM,QAAQ,IAAI,CAAC,SAAS,GAAG,WAAW,MAAM,CAAC;AACjD,cAAU;AACV,OAAG,MAAM;AAAA,EACX;AACF;","names":["stt"]}
|
|
1
|
+
{"version":3,"sources":["../src/stt.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { type AudioBuffer, AudioByteStream, Task, log, stt, waitForAbort } from '@livekit/agents';\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { WebSocket } from 'ws';\nimport type { BasetenSttOptions } from './types.js';\n\nconst defaultSTTOptions: Partial<BasetenSttOptions> = {\n environment: 'production',\n encoding: 'pcm_s16le',\n sampleRate: 16000,\n bufferSizeSeconds: 0.032,\n enablePartialTranscripts: true,\n partialTranscriptIntervalS: 0.5,\n finalTranscriptMaxDurationS: 5,\n audioLanguage: 'en',\n languageDetectionOnly: false,\n vadThreshold: 0.5,\n vadMinSilenceDurationMs: 300,\n vadSpeechPadMs: 30,\n};\n\nexport class STT extends stt.STT {\n #opts: BasetenSttOptions;\n #logger = log();\n label = 'baseten.STT';\n\n constructor(opts: Partial<BasetenSttOptions> = {}) {\n super({\n streaming: true,\n interimResults: opts.enablePartialTranscripts ?? defaultSTTOptions.enablePartialTranscripts!,\n alignedTranscript: 'word',\n });\n\n const apiKey = opts.apiKey ?? process.env.BASETEN_API_KEY;\n const modelEndpoint = opts.modelEndpoint ?? process.env.BASETEN_MODEL_ENDPOINT;\n const modelId = opts.modelId ?? process.env.BASETEN_STT_MODEL_ID;\n\n if (!apiKey) {\n throw new Error(\n 'Baseten API key is required, either pass it as `apiKey` or set $BASETEN_API_KEY',\n );\n }\n if (!modelEndpoint && !modelId) {\n throw new Error(\n 'Baseten model endpoint is required, either pass it as `modelEndpoint` or set $BASETEN_MODEL_ENDPOINT',\n );\n }\n\n this.#opts = {\n ...defaultSTTOptions,\n ...opts,\n apiKey,\n modelEndpoint,\n modelId,\n } as BasetenSttOptions;\n }\n\n // eslint-disable-next-line\n async _recognize(_: AudioBuffer): Promise<stt.SpeechEvent> {\n throw new Error('Recognize is not supported on Baseten STT');\n }\n\n updateOptions(opts: Partial<BasetenSttOptions>) {\n this.#opts = { ...this.#opts, ...opts };\n }\n\n stream(): SpeechStream {\n return new SpeechStream(this, this.#opts);\n }\n}\n\nexport class SpeechStream extends stt.SpeechStream {\n #opts: BasetenSttOptions;\n #logger = log();\n #speaking = false;\n #requestId = '';\n label = 'baseten.SpeechStream';\n\n constructor(stt: STT, opts: BasetenSttOptions) {\n super(stt, opts.sampleRate);\n this.#opts = opts;\n this.closed = false;\n }\n\n private getWsUrl(): string {\n if (this.#opts.modelEndpoint) {\n return this.#opts.modelEndpoint;\n }\n // Fallback to constructing URL from modelId (deprecated)\n return `wss://model-${this.#opts.modelId}.api.baseten.co/environments/${this.#opts.environment}/websocket`;\n }\n\n protected async run() {\n const maxRetry = 32;\n let retries = 0;\n\n while (!this.input.closed && !this.closed) {\n const url = this.getWsUrl();\n const headers = {\n Authorization: `Api-Key ${this.#opts.apiKey}`,\n };\n\n const ws = new WebSocket(url, { headers, rejectUnauthorized: false });\n\n try {\n await new Promise((resolve, reject) => {\n ws.on('open', resolve);\n ws.on('error', (error) => reject(error));\n ws.on('close', (code) => reject(`WebSocket returned ${code}`));\n });\n\n await this.#runWS(ws);\n } catch (e) {\n if (!this.closed && !this.input.closed) {\n if (retries >= maxRetry) {\n throw new Error(`failed to connect to Baseten after ${retries} attempts: ${e}`);\n }\n\n const delay = Math.min(retries * 5, 10);\n retries++;\n\n this.#logger.warn(\n `failed to connect to Baseten, retrying in ${delay} seconds: ${e} (${retries}/${maxRetry})`,\n );\n await new Promise((resolve) => setTimeout(resolve, delay * 1000));\n } else {\n this.#logger.warn(\n `Baseten disconnected, connection is closed: ${e} (inputClosed: ${this.input.closed}, isClosed: ${this.closed})`,\n );\n }\n }\n }\n\n this.closed = true;\n }\n\n async #runWS(ws: WebSocket) {\n let closing = false;\n\n // Send initial metadata\n // Note: Baseten server expects 'vad_params' and 'streaming_whisper_params' field names\n // (not 'streaming_vad_config', 'streaming_params', 'whisper_params' as in older versions)\n const metadata = {\n vad_params: {\n threshold: this.#opts.vadThreshold,\n min_silence_duration_ms: this.#opts.vadMinSilenceDurationMs,\n speech_pad_ms: this.#opts.vadSpeechPadMs,\n },\n streaming_whisper_params: {\n encoding: this.#opts.encoding ?? 'pcm_s16le',\n sample_rate: this.#opts.sampleRate ?? 16000,\n enable_partial_transcripts: false,\n audio_language: this.#opts.audioLanguage ?? 'en',\n show_word_timestamps: true,\n },\n };\n\n ws.send(JSON.stringify(metadata));\n\n const sendTask = async () => {\n const sampleRate = this.#opts.sampleRate ?? 16000;\n const samplesPerChunk = sampleRate === 16000 ? 512 : 256;\n const audioByteStream = new AudioByteStream(sampleRate, 1, samplesPerChunk);\n\n try {\n while (!this.closed) {\n const result = await this.input.next();\n if (result.done) {\n break;\n }\n\n const data = result.value;\n\n let frames: AudioFrame[];\n if (data === SpeechStream.FLUSH_SENTINEL) {\n // Flush any remaining buffered audio\n frames = audioByteStream.flush();\n } else {\n if (data.sampleRate !== sampleRate || data.channels !== 1) {\n throw new Error(\n `sample rate or channel count mismatch: expected ${sampleRate}Hz/1ch, got ${data.sampleRate}Hz/${data.channels}ch`,\n );\n }\n frames = audioByteStream.write(data.data.buffer as ArrayBuffer);\n }\n\n for (const frame of frames) {\n const buffer = Buffer.from(\n frame.data.buffer,\n frame.data.byteOffset,\n frame.data.byteLength,\n );\n ws.send(buffer);\n }\n }\n } finally {\n closing = true;\n ws.close();\n }\n };\n\n const listenTask = Task.from(async (controller) => {\n const listenMessage = new Promise<void>((resolve, reject) => {\n ws.on('message', (data) => {\n try {\n let jsonString: string;\n\n if (typeof data === 'string') {\n jsonString = data;\n } else if (data instanceof Buffer) {\n jsonString = data.toString('utf-8');\n } else if (Array.isArray(data)) {\n jsonString = Buffer.concat(data).toString('utf-8');\n } else {\n return;\n }\n\n const msg = JSON.parse(jsonString);\n const isFinal = msg.is_final ?? true;\n const segments = msg.segments ?? [];\n const transcript = msg.transcript ?? '';\n const confidence = msg.confidence ?? 0.0;\n const languageCode = msg.language_code ?? this.#opts.audioLanguage;\n\n // Skip if no transcript text\n if (!transcript) {\n this.#logger.debug('Received non-transcript message:', msg);\n return;\n }\n\n // Emit START_OF_SPEECH if not already speaking (only for interim or first final)\n if (!this.#speaking && !isFinal) {\n this.#speaking = true;\n this.queue.put({ type: stt.SpeechEventType.START_OF_SPEECH });\n }\n\n // Note: Baseten uses 'start_time' and 'end_time' field names (with underscores)\n const startTime =\n segments.length > 0\n ? (segments[0].start_time ?? 0.0) + this.startTimeOffset\n : this.startTimeOffset;\n const endTime =\n segments.length > 0\n ? (segments[segments.length - 1].end_time ?? 0.0) + this.startTimeOffset\n : this.startTimeOffset;\n\n // Note: Baseten returns segments (chunks) which we treat as words for aligned transcripts\n const words = segments.map(\n (segment: { text?: string; start_time?: number; end_time?: number }) => ({\n text: segment.text ?? '',\n startTime: (segment.start_time ?? 0.0) + this.startTimeOffset,\n endTime: (segment.end_time ?? 0.0) + this.startTimeOffset,\n startTimeOffset: this.startTimeOffset,\n confidence: confidence,\n }),\n );\n\n const speechData: stt.SpeechData = {\n language: languageCode!,\n text: transcript,\n startTime,\n endTime,\n confidence,\n words: words.length > 0 ? words : undefined,\n };\n\n // Handle interim vs final transcripts (matching Python implementation)\n if (!isFinal) {\n // Interim transcript\n this.queue.put({\n type: stt.SpeechEventType.INTERIM_TRANSCRIPT,\n alternatives: [speechData],\n });\n } else {\n // Final transcript\n this.queue.put({\n type: stt.SpeechEventType.FINAL_TRANSCRIPT,\n alternatives: [speechData],\n });\n\n // Emit END_OF_SPEECH after final transcript\n if (this.#speaking) {\n this.#speaking = false;\n this.queue.put({ type: stt.SpeechEventType.END_OF_SPEECH });\n }\n }\n\n if (this.closed || closing) {\n resolve();\n }\n } catch (err) {\n this.#logger.error(`STT: Error processing message: ${data}`);\n reject(err);\n }\n });\n\n ws.on('error', (err) => {\n if (!closing) {\n reject(err);\n }\n });\n\n ws.on('close', () => {\n if (!closing) {\n resolve();\n }\n });\n });\n\n await Promise.race([listenMessage, waitForAbort(controller.signal)]);\n }, this.abortController);\n\n await Promise.all([sendTask(), listenTask.result]);\n closing = true;\n ws.close();\n }\n}\n"],"mappings":"AAGA,SAA2B,iBAAiB,MAAM,KAAK,KAAK,oBAAoB;AAEhF,SAAS,iBAAiB;AAG1B,MAAM,oBAAgD;AAAA,EACpD,aAAa;AAAA,EACb,UAAU;AAAA,EACV,YAAY;AAAA,EACZ,mBAAmB;AAAA,EACnB,0BAA0B;AAAA,EAC1B,4BAA4B;AAAA,EAC5B,6BAA6B;AAAA,EAC7B,eAAe;AAAA,EACf,uBAAuB;AAAA,EACvB,cAAc;AAAA,EACd,yBAAyB;AAAA,EACzB,gBAAgB;AAClB;AAEO,MAAM,YAAY,IAAI,IAAI;AAAA,EAC/B;AAAA,EACA,UAAU,IAAI;AAAA,EACd,QAAQ;AAAA,EAER,YAAY,OAAmC,CAAC,GAAG;AACjD,UAAM;AAAA,MACJ,WAAW;AAAA,MACX,gBAAgB,KAAK,4BAA4B,kBAAkB;AAAA,MACnE,mBAAmB;AAAA,IACrB,CAAC;AAED,UAAM,SAAS,KAAK,UAAU,QAAQ,IAAI;AAC1C,UAAM,gBAAgB,KAAK,iBAAiB,QAAQ,IAAI;AACxD,UAAM,UAAU,KAAK,WAAW,QAAQ,IAAI;AAE5C,QAAI,CAAC,QAAQ;AACX,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,iBAAiB,CAAC,SAAS;AAC9B,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AAEA,SAAK,QAAQ;AAAA,MACX,GAAG;AAAA,MACH,GAAG;AAAA,MACH;AAAA,MACA;AAAA,MACA;AAAA,IACF;AAAA,EACF;AAAA;AAAA,EAGA,MAAM,WAAW,GAA0C;AACzD,UAAM,IAAI,MAAM,2CAA2C;AAAA,EAC7D;AAAA,EAEA,cAAc,MAAkC;AAC9C,SAAK,QAAQ,EAAE,GAAG,KAAK,OAAO,GAAG,KAAK;AAAA,EACxC;AAAA,EAEA,SAAuB;AACrB,WAAO,IAAI,aAAa,MAAM,KAAK,KAAK;AAAA,EAC1C;AACF;AAEO,MAAM,qBAAqB,IAAI,aAAa;AAAA,EACjD;AAAA,EACA,UAAU,IAAI;AAAA,EACd,YAAY;AAAA,EACZ,aAAa;AAAA,EACb,QAAQ;AAAA,EAER,YAAYA,MAAU,MAAyB;AAC7C,UAAMA,MAAK,KAAK,UAAU;AAC1B,SAAK,QAAQ;AACb,SAAK,SAAS;AAAA,EAChB;AAAA,EAEQ,WAAmB;AACzB,QAAI,KAAK,MAAM,eAAe;AAC5B,aAAO,KAAK,MAAM;AAAA,IACpB;AAEA,WAAO,eAAe,KAAK,MAAM,OAAO,gCAAgC,KAAK,MAAM,WAAW;AAAA,EAChG;AAAA,EAEA,MAAgB,MAAM;AACpB,UAAM,WAAW;AACjB,QAAI,UAAU;AAEd,WAAO,CAAC,KAAK,MAAM,UAAU,CAAC,KAAK,QAAQ;AACzC,YAAM,MAAM,KAAK,SAAS;AAC1B,YAAM,UAAU;AAAA,QACd,eAAe,WAAW,KAAK,MAAM,MAAM;AAAA,MAC7C;AAEA,YAAM,KAAK,IAAI,UAAU,KAAK,EAAE,SAAS,oBAAoB,MAAM,CAAC;AAEpE,UAAI;AACF,cAAM,IAAI,QAAQ,CAAC,SAAS,WAAW;AACrC,aAAG,GAAG,QAAQ,OAAO;AACrB,aAAG,GAAG,SAAS,CAAC,UAAU,OAAO,KAAK,CAAC;AACvC,aAAG,GAAG,SAAS,CAAC,SAAS,OAAO,sBAAsB,IAAI,EAAE,CAAC;AAAA,QAC/D,CAAC;AAED,cAAM,KAAK,OAAO,EAAE;AAAA,MACtB,SAAS,GAAG;AACV,YAAI,CAAC,KAAK,UAAU,CAAC,KAAK,MAAM,QAAQ;AACtC,cAAI,WAAW,UAAU;AACvB,kBAAM,IAAI,MAAM,sCAAsC,OAAO,cAAc,CAAC,EAAE;AAAA,UAChF;AAEA,gBAAM,QAAQ,KAAK,IAAI,UAAU,GAAG,EAAE;AACtC;AAEA,eAAK,QAAQ;AAAA,YACX,6CAA6C,KAAK,aAAa,CAAC,KAAK,OAAO,IAAI,QAAQ;AAAA,UAC1F;AACA,gBAAM,IAAI,QAAQ,CAAC,YAAY,WAAW,SAAS,QAAQ,GAAI,CAAC;AAAA,QAClE,OAAO;AACL,eAAK,QAAQ;AAAA,YACX,+CAA+C,CAAC,kBAAkB,KAAK,MAAM,MAAM,eAAe,KAAK,MAAM;AAAA,UAC/G;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAEA,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,MAAM,OAAO,IAAe;AAC1B,QAAI,UAAU;AAKd,UAAM,WAAW;AAAA,MACf,YAAY;AAAA,QACV,WAAW,KAAK,MAAM;AAAA,QACtB,yBAAyB,KAAK,MAAM;AAAA,QACpC,eAAe,KAAK,MAAM;AAAA,MAC5B;AAAA,MACA,0BAA0B;AAAA,QACxB,UAAU,KAAK,MAAM,YAAY;AAAA,QACjC,aAAa,KAAK,MAAM,cAAc;AAAA,QACtC,4BAA4B;AAAA,QAC5B,gBAAgB,KAAK,MAAM,iBAAiB;AAAA,QAC5C,sBAAsB;AAAA,MACxB;AAAA,IACF;AAEA,OAAG,KAAK,KAAK,UAAU,QAAQ,CAAC;AAEhC,UAAM,WAAW,YAAY;AAC3B,YAAM,aAAa,KAAK,MAAM,cAAc;AAC5C,YAAM,kBAAkB,eAAe,OAAQ,MAAM;AACrD,YAAM,kBAAkB,IAAI,gBAAgB,YAAY,GAAG,eAAe;AAE1E,UAAI;AACF,eAAO,CAAC,KAAK,QAAQ;AACnB,gBAAM,SAAS,MAAM,KAAK,MAAM,KAAK;AACrC,cAAI,OAAO,MAAM;AACf;AAAA,UACF;AAEA,gBAAM,OAAO,OAAO;AAEpB,cAAI;AACJ,cAAI,SAAS,aAAa,gBAAgB;AAExC,qBAAS,gBAAgB,MAAM;AAAA,UACjC,OAAO;AACL,gBAAI,KAAK,eAAe,cAAc,KAAK,aAAa,GAAG;AACzD,oBAAM,IAAI;AAAA,gBACR,mDAAmD,UAAU,eAAe,KAAK,UAAU,MAAM,KAAK,QAAQ;AAAA,cAChH;AAAA,YACF;AACA,qBAAS,gBAAgB,MAAM,KAAK,KAAK,MAAqB;AAAA,UAChE;AAEA,qBAAW,SAAS,QAAQ;AAC1B,kBAAM,SAAS,OAAO;AAAA,cACpB,MAAM,KAAK;AAAA,cACX,MAAM,KAAK;AAAA,cACX,MAAM,KAAK;AAAA,YACb;AACA,eAAG,KAAK,MAAM;AAAA,UAChB;AAAA,QACF;AAAA,MACF,UAAE;AACA,kBAAU;AACV,WAAG,MAAM;AAAA,MACX;AAAA,IACF;AAEA,UAAM,aAAa,KAAK,KAAK,OAAO,eAAe;AACjD,YAAM,gBAAgB,IAAI,QAAc,CAAC,SAAS,WAAW;AAC3D,WAAG,GAAG,WAAW,CAAC,SAAS;AACzB,cAAI;AACF,gBAAI;AAEJ,gBAAI,OAAO,SAAS,UAAU;AAC5B,2BAAa;AAAA,YACf,WAAW,gBAAgB,QAAQ;AACjC,2BAAa,KAAK,SAAS,OAAO;AAAA,YACpC,WAAW,MAAM,QAAQ,IAAI,GAAG;AAC9B,2BAAa,OAAO,OAAO,IAAI,EAAE,SAAS,OAAO;AAAA,YACnD,OAAO;AACL;AAAA,YACF;AAEA,kBAAM,MAAM,KAAK,MAAM,UAAU;AACjC,kBAAM,UAAU,IAAI,YAAY;AAChC,kBAAM,WAAW,IAAI,YAAY,CAAC;AAClC,kBAAM,aAAa,IAAI,cAAc;AACrC,kBAAM,aAAa,IAAI,cAAc;AACrC,kBAAM,eAAe,IAAI,iBAAiB,KAAK,MAAM;AAGrD,gBAAI,CAAC,YAAY;AACf,mBAAK,QAAQ,MAAM,oCAAoC,GAAG;AAC1D;AAAA,YACF;AAGA,gBAAI,CAAC,KAAK,aAAa,CAAC,SAAS;AAC/B,mBAAK,YAAY;AACjB,mBAAK,MAAM,IAAI,EAAE,MAAM,IAAI,gBAAgB,gBAAgB,CAAC;AAAA,YAC9D;AAGA,kBAAM,YACJ,SAAS,SAAS,KACb,SAAS,CAAC,EAAE,cAAc,KAAO,KAAK,kBACvC,KAAK;AACX,kBAAM,UACJ,SAAS,SAAS,KACb,SAAS,SAAS,SAAS,CAAC,EAAE,YAAY,KAAO,KAAK,kBACvD,KAAK;AAGX,kBAAM,QAAQ,SAAS;AAAA,cACrB,CAAC,aAAwE;AAAA,gBACvE,MAAM,QAAQ,QAAQ;AAAA,gBACtB,YAAY,QAAQ,cAAc,KAAO,KAAK;AAAA,gBAC9C,UAAU,QAAQ,YAAY,KAAO,KAAK;AAAA,gBAC1C,iBAAiB,KAAK;AAAA,gBACtB;AAAA,cACF;AAAA,YACF;AAEA,kBAAM,aAA6B;AAAA,cACjC,UAAU;AAAA,cACV,MAAM;AAAA,cACN;AAAA,cACA;AAAA,cACA;AAAA,cACA,OAAO,MAAM,SAAS,IAAI,QAAQ;AAAA,YACpC;AAGA,gBAAI,CAAC,SAAS;AAEZ,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,IAAI,gBAAgB;AAAA,gBAC1B,cAAc,CAAC,UAAU;AAAA,cAC3B,CAAC;AAAA,YACH,OAAO;AAEL,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,IAAI,gBAAgB;AAAA,gBAC1B,cAAc,CAAC,UAAU;AAAA,cAC3B,CAAC;AAGD,kBAAI,KAAK,WAAW;AAClB,qBAAK,YAAY;AACjB,qBAAK,MAAM,IAAI,EAAE,MAAM,IAAI,gBAAgB,cAAc,CAAC;AAAA,cAC5D;AAAA,YACF;AAEA,gBAAI,KAAK,UAAU,SAAS;AAC1B,sBAAQ;AAAA,YACV;AAAA,UACF,SAAS,KAAK;AACZ,iBAAK,QAAQ,MAAM,kCAAkC,IAAI,EAAE;AAC3D,mBAAO,GAAG;AAAA,UACZ;AAAA,QACF,CAAC;AAED,WAAG,GAAG,SAAS,CAAC,QAAQ;AACtB,cAAI,CAAC,SAAS;AACZ,mBAAO,GAAG;AAAA,UACZ;AAAA,QACF,CAAC;AAED,WAAG,GAAG,SAAS,MAAM;AACnB,cAAI,CAAC,SAAS;AACZ,oBAAQ;AAAA,UACV;AAAA,QACF,CAAC;AAAA,MACH,CAAC;AAED,YAAM,QAAQ,KAAK,CAAC,eAAe,aAAa,WAAW,MAAM,CAAC,CAAC;AAAA,IACrE,GAAG,KAAK,eAAe;AAEvB,UAAM,QAAQ,IAAI,CAAC,SAAS,GAAG,WAAW,MAAM,CAAC;AACjD,cAAU;AACV,OAAG,MAAM;AAAA,EACX;AACF;","names":["stt"]}
|
package/dist/types.cjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/types.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\n\n/**\n * Baseten plugin types and interfaces\n */\n\n/**\n * Options for configuring the Baseten LLM\n * Since Baseten provides an OpenAI-compatible API, these options\n * map to standard OpenAI parameters.\n */\nexport interface BasetenLLMOptions {\n apiKey?: string;\n model: string;\n temperature?: number;\n maxTokens?: number;\n user?: string;\n toolChoice?: 'none' | 'auto' | 'required' | { type: 'function'; function: { name: string } };\n parallelToolCalls?: boolean;\n}\n\n/**\n * Options for configuring the Baseten STT service\n */\nexport interface BasetenSttOptions {\n apiKey: string;\n modelId
|
|
1
|
+
{"version":3,"sources":["../src/types.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\n\n/**\n * Baseten plugin types and interfaces\n */\n\n/**\n * Options for configuring the Baseten LLM\n * Since Baseten provides an OpenAI-compatible API, these options\n * map to standard OpenAI parameters.\n */\nexport interface BasetenLLMOptions {\n apiKey?: string;\n model: string;\n temperature?: number;\n maxTokens?: number;\n user?: string;\n toolChoice?: 'none' | 'auto' | 'required' | { type: 'function'; function: { name: string } };\n parallelToolCalls?: boolean;\n}\n\n/**\n * Options for configuring the Baseten STT service\n */\nexport interface BasetenSttOptions {\n apiKey: string;\n /** @deprecated Use modelEndpoint instead */\n modelId?: string;\n /** Full WebSocket endpoint URL (e.g., from Baseten dashboard). Takes priority over modelId. */\n modelEndpoint?: string;\n environment?: string;\n encoding?: string;\n sampleRate?: number;\n bufferSizeSeconds?: number;\n vadThreshold?: number;\n vadMinSilenceDurationMs?: number;\n vadSpeechPadMs?: number;\n enablePartialTranscripts?: boolean;\n partialTranscriptIntervalS?: number;\n finalTranscriptMaxDurationS?: number;\n audioLanguage?: string;\n prompt?: string;\n languageDetectionOnly?: boolean;\n}\n\n/**\n * Options for configuring the Baseten TTS service\n */\nexport interface BasetenTTSOptions {\n apiKey: string;\n modelEndpoint: string;\n voice?: string;\n language?: string;\n temperature?: number;\n maxTokens?: number;\n}\n"],"mappings":";;;;;;;;;;;;;;AAAA;AAAA;","names":[]}
|
package/dist/types.d.cts
CHANGED
|
@@ -25,7 +25,10 @@ export interface BasetenLLMOptions {
|
|
|
25
25
|
*/
|
|
26
26
|
export interface BasetenSttOptions {
|
|
27
27
|
apiKey: string;
|
|
28
|
-
|
|
28
|
+
/** @deprecated Use modelEndpoint instead */
|
|
29
|
+
modelId?: string;
|
|
30
|
+
/** Full WebSocket endpoint URL (e.g., from Baseten dashboard). Takes priority over modelId. */
|
|
31
|
+
modelEndpoint?: string;
|
|
29
32
|
environment?: string;
|
|
30
33
|
encoding?: string;
|
|
31
34
|
sampleRate?: number;
|
package/dist/types.d.ts
CHANGED
|
@@ -25,7 +25,10 @@ export interface BasetenLLMOptions {
|
|
|
25
25
|
*/
|
|
26
26
|
export interface BasetenSttOptions {
|
|
27
27
|
apiKey: string;
|
|
28
|
-
|
|
28
|
+
/** @deprecated Use modelEndpoint instead */
|
|
29
|
+
modelId?: string;
|
|
30
|
+
/** Full WebSocket endpoint URL (e.g., from Baseten dashboard). Takes priority over modelId. */
|
|
31
|
+
modelEndpoint?: string;
|
|
29
32
|
environment?: string;
|
|
30
33
|
encoding?: string;
|
|
31
34
|
sampleRate?: number;
|
package/dist/types.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAIA;;GAEG;AAEH;;;;GAIG;AACH,MAAM,WAAW,iBAAiB;IAChC,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,UAAU,CAAC,EAAE,MAAM,GAAG,MAAM,GAAG,UAAU,GAAG;QAAE,IAAI,EAAE,UAAU,CAAC;QAAC,QAAQ,EAAE;YAAE,IAAI,EAAE,MAAM,CAAA;SAAE,CAAA;KAAE,CAAC;IAC7F,iBAAiB,CAAC,EAAE,OAAO,CAAC;CAC7B;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,MAAM,CAAC;
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAIA;;GAEG;AAEH;;;;GAIG;AACH,MAAM,WAAW,iBAAiB;IAChC,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,UAAU,CAAC,EAAE,MAAM,GAAG,MAAM,GAAG,UAAU,GAAG;QAAE,IAAI,EAAE,UAAU,CAAC;QAAC,QAAQ,EAAE;YAAE,IAAI,EAAE,MAAM,CAAA;SAAE,CAAA;KAAE,CAAC;IAC7F,iBAAiB,CAAC,EAAE,OAAO,CAAC;CAC7B;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,MAAM,EAAE,MAAM,CAAC;IACf,4CAA4C;IAC5C,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,+FAA+F;IAC/F,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,uBAAuB,CAAC,EAAE,MAAM,CAAC;IACjC,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,wBAAwB,CAAC,EAAE,OAAO,CAAC;IACnC,0BAA0B,CAAC,EAAE,MAAM,CAAC;IACpC,2BAA2B,CAAC,EAAE,MAAM,CAAC;IACrC,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,qBAAqB,CAAC,EAAE,OAAO,CAAC;CACjC;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,MAAM,EAAE,MAAM,CAAC;IACf,aAAa,EAAE,MAAM,CAAC;IACtB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@livekit/agents-plugin-baseten",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.38",
|
|
4
4
|
"description": "Baseten plugin for LiveKit Node Agents",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"require": "dist/index.cjs",
|
|
@@ -40,9 +40,9 @@
|
|
|
40
40
|
"@types/ws": "^8.5.8",
|
|
41
41
|
"tsx": "^4.7.0",
|
|
42
42
|
"typescript": "^5.9.3",
|
|
43
|
-
"@livekit/agents": "1.0.
|
|
44
|
-
"@livekit/agents-plugin-silero": "1.0.
|
|
45
|
-
"@livekit/agents-plugins-test": "1.0.
|
|
43
|
+
"@livekit/agents": "1.0.38",
|
|
44
|
+
"@livekit/agents-plugin-silero": "1.0.38",
|
|
45
|
+
"@livekit/agents-plugins-test": "1.0.38"
|
|
46
46
|
},
|
|
47
47
|
"dependencies": {
|
|
48
48
|
"dotenv": "^17.2.3",
|
|
@@ -51,7 +51,7 @@
|
|
|
51
51
|
},
|
|
52
52
|
"peerDependencies": {
|
|
53
53
|
"@livekit/rtc-node": "^0.13.24",
|
|
54
|
-
"@livekit/agents": "1.0.
|
|
54
|
+
"@livekit/agents": "1.0.38"
|
|
55
55
|
},
|
|
56
56
|
"scripts": {
|
|
57
57
|
"build": "tsup --onSuccess \"pnpm build:types\"",
|
package/src/stt.ts
CHANGED
|
@@ -30,9 +30,11 @@ export class STT extends stt.STT {
|
|
|
30
30
|
super({
|
|
31
31
|
streaming: true,
|
|
32
32
|
interimResults: opts.enablePartialTranscripts ?? defaultSTTOptions.enablePartialTranscripts!,
|
|
33
|
+
alignedTranscript: 'word',
|
|
33
34
|
});
|
|
34
35
|
|
|
35
36
|
const apiKey = opts.apiKey ?? process.env.BASETEN_API_KEY;
|
|
37
|
+
const modelEndpoint = opts.modelEndpoint ?? process.env.BASETEN_MODEL_ENDPOINT;
|
|
36
38
|
const modelId = opts.modelId ?? process.env.BASETEN_STT_MODEL_ID;
|
|
37
39
|
|
|
38
40
|
if (!apiKey) {
|
|
@@ -40,9 +42,9 @@ export class STT extends stt.STT {
|
|
|
40
42
|
'Baseten API key is required, either pass it as `apiKey` or set $BASETEN_API_KEY',
|
|
41
43
|
);
|
|
42
44
|
}
|
|
43
|
-
if (!modelId) {
|
|
45
|
+
if (!modelEndpoint && !modelId) {
|
|
44
46
|
throw new Error(
|
|
45
|
-
'Baseten model
|
|
47
|
+
'Baseten model endpoint is required, either pass it as `modelEndpoint` or set $BASETEN_MODEL_ENDPOINT',
|
|
46
48
|
);
|
|
47
49
|
}
|
|
48
50
|
|
|
@@ -50,6 +52,7 @@ export class STT extends stt.STT {
|
|
|
50
52
|
...defaultSTTOptions,
|
|
51
53
|
...opts,
|
|
52
54
|
apiKey,
|
|
55
|
+
modelEndpoint,
|
|
53
56
|
modelId,
|
|
54
57
|
} as BasetenSttOptions;
|
|
55
58
|
}
|
|
@@ -82,6 +85,10 @@ export class SpeechStream extends stt.SpeechStream {
|
|
|
82
85
|
}
|
|
83
86
|
|
|
84
87
|
private getWsUrl(): string {
|
|
88
|
+
if (this.#opts.modelEndpoint) {
|
|
89
|
+
return this.#opts.modelEndpoint;
|
|
90
|
+
}
|
|
91
|
+
// Fallback to constructing URL from modelId (deprecated)
|
|
85
92
|
return `wss://model-${this.#opts.modelId}.api.baseten.co/environments/${this.#opts.environment}/websocket`;
|
|
86
93
|
}
|
|
87
94
|
|
|
@@ -95,7 +102,7 @@ export class SpeechStream extends stt.SpeechStream {
|
|
|
95
102
|
Authorization: `Api-Key ${this.#opts.apiKey}`,
|
|
96
103
|
};
|
|
97
104
|
|
|
98
|
-
const ws = new WebSocket(url, { headers });
|
|
105
|
+
const ws = new WebSocket(url, { headers, rejectUnauthorized: false });
|
|
99
106
|
|
|
100
107
|
try {
|
|
101
108
|
await new Promise((resolve, reject) => {
|
|
@@ -133,25 +140,23 @@ export class SpeechStream extends stt.SpeechStream {
|
|
|
133
140
|
let closing = false;
|
|
134
141
|
|
|
135
142
|
// Send initial metadata
|
|
143
|
+
// Note: Baseten server expects 'vad_params' and 'streaming_whisper_params' field names
|
|
144
|
+
// (not 'streaming_vad_config', 'streaming_params', 'whisper_params' as in older versions)
|
|
136
145
|
const metadata = {
|
|
137
|
-
|
|
146
|
+
vad_params: {
|
|
138
147
|
threshold: this.#opts.vadThreshold,
|
|
139
148
|
min_silence_duration_ms: this.#opts.vadMinSilenceDurationMs,
|
|
140
149
|
speech_pad_ms: this.#opts.vadSpeechPadMs,
|
|
141
150
|
},
|
|
142
|
-
|
|
151
|
+
streaming_whisper_params: {
|
|
143
152
|
encoding: this.#opts.encoding ?? 'pcm_s16le',
|
|
144
153
|
sample_rate: this.#opts.sampleRate ?? 16000,
|
|
145
|
-
enable_partial_transcripts:
|
|
146
|
-
partial_transcript_interval_s: this.#opts.partialTranscriptIntervalS,
|
|
147
|
-
final_transcript_max_duration_s: this.#opts.finalTranscriptMaxDurationS,
|
|
148
|
-
},
|
|
149
|
-
whisper_params: {
|
|
150
|
-
prompt: this.#opts.prompt,
|
|
154
|
+
enable_partial_transcripts: false,
|
|
151
155
|
audio_language: this.#opts.audioLanguage ?? 'en',
|
|
152
|
-
|
|
156
|
+
show_word_timestamps: true,
|
|
153
157
|
},
|
|
154
158
|
};
|
|
159
|
+
|
|
155
160
|
ws.send(JSON.stringify(metadata));
|
|
156
161
|
|
|
157
162
|
const sendTask = async () => {
|
|
@@ -213,8 +218,6 @@ export class SpeechStream extends stt.SpeechStream {
|
|
|
213
218
|
}
|
|
214
219
|
|
|
215
220
|
const msg = JSON.parse(jsonString);
|
|
216
|
-
|
|
217
|
-
// Parse response format matching Python implementation
|
|
218
221
|
const isFinal = msg.is_final ?? true;
|
|
219
222
|
const segments = msg.segments ?? [];
|
|
220
223
|
const transcript = msg.transcript ?? '';
|
|
@@ -233,9 +236,26 @@ export class SpeechStream extends stt.SpeechStream {
|
|
|
233
236
|
this.queue.put({ type: stt.SpeechEventType.START_OF_SPEECH });
|
|
234
237
|
}
|
|
235
238
|
|
|
236
|
-
//
|
|
237
|
-
const startTime =
|
|
238
|
-
|
|
239
|
+
// Note: Baseten uses 'start_time' and 'end_time' field names (with underscores)
|
|
240
|
+
const startTime =
|
|
241
|
+
segments.length > 0
|
|
242
|
+
? (segments[0].start_time ?? 0.0) + this.startTimeOffset
|
|
243
|
+
: this.startTimeOffset;
|
|
244
|
+
const endTime =
|
|
245
|
+
segments.length > 0
|
|
246
|
+
? (segments[segments.length - 1].end_time ?? 0.0) + this.startTimeOffset
|
|
247
|
+
: this.startTimeOffset;
|
|
248
|
+
|
|
249
|
+
// Note: Baseten returns segments (chunks) which we treat as words for aligned transcripts
|
|
250
|
+
const words = segments.map(
|
|
251
|
+
(segment: { text?: string; start_time?: number; end_time?: number }) => ({
|
|
252
|
+
text: segment.text ?? '',
|
|
253
|
+
startTime: (segment.start_time ?? 0.0) + this.startTimeOffset,
|
|
254
|
+
endTime: (segment.end_time ?? 0.0) + this.startTimeOffset,
|
|
255
|
+
startTimeOffset: this.startTimeOffset,
|
|
256
|
+
confidence: confidence,
|
|
257
|
+
}),
|
|
258
|
+
);
|
|
239
259
|
|
|
240
260
|
const speechData: stt.SpeechData = {
|
|
241
261
|
language: languageCode!,
|
|
@@ -243,6 +263,7 @@ export class SpeechStream extends stt.SpeechStream {
|
|
|
243
263
|
startTime,
|
|
244
264
|
endTime,
|
|
245
265
|
confidence,
|
|
266
|
+
words: words.length > 0 ? words : undefined,
|
|
246
267
|
};
|
|
247
268
|
|
|
248
269
|
// Handle interim vs final transcripts (matching Python implementation)
|
package/src/types.ts
CHANGED
|
@@ -26,7 +26,10 @@ export interface BasetenLLMOptions {
|
|
|
26
26
|
*/
|
|
27
27
|
export interface BasetenSttOptions {
|
|
28
28
|
apiKey: string;
|
|
29
|
-
|
|
29
|
+
/** @deprecated Use modelEndpoint instead */
|
|
30
|
+
modelId?: string;
|
|
31
|
+
/** Full WebSocket endpoint URL (e.g., from Baseten dashboard). Takes priority over modelId. */
|
|
32
|
+
modelEndpoint?: string;
|
|
30
33
|
environment?: string;
|
|
31
34
|
encoding?: string;
|
|
32
35
|
sampleRate?: number;
|