@livekit/agents-plugin-baseten 1.0.37 → 1.0.39

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/stt.cjs CHANGED
@@ -45,24 +45,27 @@ class STT extends import_agents.stt.STT {
45
45
  constructor(opts = {}) {
46
46
  super({
47
47
  streaming: true,
48
- interimResults: opts.enablePartialTranscripts ?? defaultSTTOptions.enablePartialTranscripts
48
+ interimResults: opts.enablePartialTranscripts ?? defaultSTTOptions.enablePartialTranscripts,
49
+ alignedTranscript: "word"
49
50
  });
50
51
  const apiKey = opts.apiKey ?? process.env.BASETEN_API_KEY;
52
+ const modelEndpoint = opts.modelEndpoint ?? process.env.BASETEN_MODEL_ENDPOINT;
51
53
  const modelId = opts.modelId ?? process.env.BASETEN_STT_MODEL_ID;
52
54
  if (!apiKey) {
53
55
  throw new Error(
54
56
  "Baseten API key is required, either pass it as `apiKey` or set $BASETEN_API_KEY"
55
57
  );
56
58
  }
57
- if (!modelId) {
59
+ if (!modelEndpoint && !modelId) {
58
60
  throw new Error(
59
- "Baseten model ID is required, either pass it as `modelId` or set $BASETEN_STT_MODEL_ID"
61
+ "Baseten model endpoint is required, either pass it as `modelEndpoint` or set $BASETEN_MODEL_ENDPOINT"
60
62
  );
61
63
  }
62
64
  this.#opts = {
63
65
  ...defaultSTTOptions,
64
66
  ...opts,
65
67
  apiKey,
68
+ modelEndpoint,
66
69
  modelId
67
70
  };
68
71
  }
@@ -89,6 +92,9 @@ class SpeechStream extends import_agents.stt.SpeechStream {
89
92
  this.closed = false;
90
93
  }
91
94
  getWsUrl() {
95
+ if (this.#opts.modelEndpoint) {
96
+ return this.#opts.modelEndpoint;
97
+ }
92
98
  return `wss://model-${this.#opts.modelId}.api.baseten.co/environments/${this.#opts.environment}/websocket`;
93
99
  }
94
100
  async run() {
@@ -99,7 +105,7 @@ class SpeechStream extends import_agents.stt.SpeechStream {
99
105
  const headers = {
100
106
  Authorization: `Api-Key ${this.#opts.apiKey}`
101
107
  };
102
- const ws = new import_ws.WebSocket(url, { headers });
108
+ const ws = new import_ws.WebSocket(url, { headers, rejectUnauthorized: false });
103
109
  try {
104
110
  await new Promise((resolve, reject) => {
105
111
  ws.on("open", resolve);
@@ -130,22 +136,17 @@ class SpeechStream extends import_agents.stt.SpeechStream {
130
136
  async #runWS(ws) {
131
137
  let closing = false;
132
138
  const metadata = {
133
- streaming_vad_config: {
139
+ vad_params: {
134
140
  threshold: this.#opts.vadThreshold,
135
141
  min_silence_duration_ms: this.#opts.vadMinSilenceDurationMs,
136
142
  speech_pad_ms: this.#opts.vadSpeechPadMs
137
143
  },
138
- streaming_params: {
144
+ streaming_whisper_params: {
139
145
  encoding: this.#opts.encoding ?? "pcm_s16le",
140
146
  sample_rate: this.#opts.sampleRate ?? 16e3,
141
- enable_partial_transcripts: this.#opts.enablePartialTranscripts,
142
- partial_transcript_interval_s: this.#opts.partialTranscriptIntervalS,
143
- final_transcript_max_duration_s: this.#opts.finalTranscriptMaxDurationS
144
- },
145
- whisper_params: {
146
- prompt: this.#opts.prompt,
147
+ enable_partial_transcripts: false,
147
148
  audio_language: this.#opts.audioLanguage ?? "en",
148
- language_detection_only: this.#opts.languageDetectionOnly ?? false
149
+ show_word_timestamps: true
149
150
  }
150
151
  };
151
152
  ws.send(JSON.stringify(metadata));
@@ -213,14 +214,24 @@ class SpeechStream extends import_agents.stt.SpeechStream {
213
214
  this.#speaking = true;
214
215
  this.queue.put({ type: import_agents.stt.SpeechEventType.START_OF_SPEECH });
215
216
  }
216
- const startTime = segments.length > 0 ? segments[0].start ?? 0 : 0;
217
- const endTime = segments.length > 0 ? segments[segments.length - 1].end ?? 0 : 0;
217
+ const startTime = segments.length > 0 ? (segments[0].start_time ?? 0) + this.startTimeOffset : this.startTimeOffset;
218
+ const endTime = segments.length > 0 ? (segments[segments.length - 1].end_time ?? 0) + this.startTimeOffset : this.startTimeOffset;
219
+ const words = segments.map(
220
+ (segment) => ({
221
+ text: segment.text ?? "",
222
+ startTime: (segment.start_time ?? 0) + this.startTimeOffset,
223
+ endTime: (segment.end_time ?? 0) + this.startTimeOffset,
224
+ startTimeOffset: this.startTimeOffset,
225
+ confidence
226
+ })
227
+ );
218
228
  const speechData = {
219
229
  language: languageCode,
220
230
  text: transcript,
221
231
  startTime,
222
232
  endTime,
223
- confidence
233
+ confidence,
234
+ words: words.length > 0 ? words : void 0
224
235
  };
225
236
  if (!isFinal) {
226
237
  this.queue.put({
package/dist/stt.cjs.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/stt.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { type AudioBuffer, AudioByteStream, Task, log, stt, waitForAbort } from '@livekit/agents';\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { WebSocket } from 'ws';\nimport type { BasetenSttOptions } from './types.js';\n\nconst defaultSTTOptions: Partial<BasetenSttOptions> = {\n environment: 'production',\n encoding: 'pcm_s16le',\n sampleRate: 16000,\n bufferSizeSeconds: 0.032,\n enablePartialTranscripts: true,\n partialTranscriptIntervalS: 0.5,\n finalTranscriptMaxDurationS: 5,\n audioLanguage: 'en',\n languageDetectionOnly: false,\n vadThreshold: 0.5,\n vadMinSilenceDurationMs: 300,\n vadSpeechPadMs: 30,\n};\n\nexport class STT extends stt.STT {\n #opts: BasetenSttOptions;\n #logger = log();\n label = 'baseten.STT';\n\n constructor(opts: Partial<BasetenSttOptions> = {}) {\n super({\n streaming: true,\n interimResults: opts.enablePartialTranscripts ?? defaultSTTOptions.enablePartialTranscripts!,\n });\n\n const apiKey = opts.apiKey ?? process.env.BASETEN_API_KEY;\n const modelId = opts.modelId ?? process.env.BASETEN_STT_MODEL_ID;\n\n if (!apiKey) {\n throw new Error(\n 'Baseten API key is required, either pass it as `apiKey` or set $BASETEN_API_KEY',\n );\n }\n if (!modelId) {\n throw new Error(\n 'Baseten model ID is required, either pass it as `modelId` or set $BASETEN_STT_MODEL_ID',\n );\n }\n\n this.#opts = {\n ...defaultSTTOptions,\n ...opts,\n apiKey,\n modelId,\n } as BasetenSttOptions;\n }\n\n // eslint-disable-next-line\n async _recognize(_: AudioBuffer): Promise<stt.SpeechEvent> {\n throw new Error('Recognize is not supported on Baseten STT');\n }\n\n updateOptions(opts: Partial<BasetenSttOptions>) {\n this.#opts = { ...this.#opts, ...opts };\n }\n\n stream(): SpeechStream {\n return new SpeechStream(this, this.#opts);\n }\n}\n\nexport class SpeechStream extends stt.SpeechStream {\n #opts: BasetenSttOptions;\n #logger = log();\n #speaking = false;\n #requestId = '';\n label = 'baseten.SpeechStream';\n\n constructor(stt: STT, opts: BasetenSttOptions) {\n super(stt, opts.sampleRate);\n this.#opts = opts;\n this.closed = false;\n }\n\n private getWsUrl(): string {\n return `wss://model-${this.#opts.modelId}.api.baseten.co/environments/${this.#opts.environment}/websocket`;\n }\n\n protected async run() {\n const maxRetry = 32;\n let retries = 0;\n\n while (!this.input.closed && !this.closed) {\n const url = this.getWsUrl();\n const headers = {\n Authorization: `Api-Key ${this.#opts.apiKey}`,\n };\n\n const ws = new WebSocket(url, { headers });\n\n try {\n await new Promise((resolve, reject) => {\n ws.on('open', resolve);\n ws.on('error', (error) => reject(error));\n ws.on('close', (code) => reject(`WebSocket returned ${code}`));\n });\n\n await this.#runWS(ws);\n } catch (e) {\n if (!this.closed && !this.input.closed) {\n if (retries >= maxRetry) {\n throw new Error(`failed to connect to Baseten after ${retries} attempts: ${e}`);\n }\n\n const delay = Math.min(retries * 5, 10);\n retries++;\n\n this.#logger.warn(\n `failed to connect to Baseten, retrying in ${delay} seconds: ${e} (${retries}/${maxRetry})`,\n );\n await new Promise((resolve) => setTimeout(resolve, delay * 1000));\n } else {\n this.#logger.warn(\n `Baseten disconnected, connection is closed: ${e} (inputClosed: ${this.input.closed}, isClosed: ${this.closed})`,\n );\n }\n }\n }\n\n this.closed = true;\n }\n\n async #runWS(ws: WebSocket) {\n let closing = false;\n\n // Send initial metadata\n const metadata = {\n streaming_vad_config: {\n threshold: this.#opts.vadThreshold,\n min_silence_duration_ms: this.#opts.vadMinSilenceDurationMs,\n speech_pad_ms: this.#opts.vadSpeechPadMs,\n },\n streaming_params: {\n encoding: this.#opts.encoding ?? 'pcm_s16le',\n sample_rate: this.#opts.sampleRate ?? 16000,\n enable_partial_transcripts: this.#opts.enablePartialTranscripts,\n partial_transcript_interval_s: this.#opts.partialTranscriptIntervalS,\n final_transcript_max_duration_s: this.#opts.finalTranscriptMaxDurationS,\n },\n whisper_params: {\n prompt: this.#opts.prompt,\n audio_language: this.#opts.audioLanguage ?? 'en',\n language_detection_only: this.#opts.languageDetectionOnly ?? false,\n },\n };\n ws.send(JSON.stringify(metadata));\n\n const sendTask = async () => {\n const sampleRate = this.#opts.sampleRate ?? 16000;\n const samplesPerChunk = sampleRate === 16000 ? 512 : 256;\n const audioByteStream = new AudioByteStream(sampleRate, 1, samplesPerChunk);\n\n try {\n while (!this.closed) {\n const result = await this.input.next();\n if (result.done) {\n break;\n }\n\n const data = result.value;\n\n let frames: AudioFrame[];\n if (data === SpeechStream.FLUSH_SENTINEL) {\n // Flush any remaining buffered audio\n frames = audioByteStream.flush();\n } else {\n if (data.sampleRate !== sampleRate || data.channels !== 1) {\n throw new Error(\n `sample rate or channel count mismatch: expected ${sampleRate}Hz/1ch, got ${data.sampleRate}Hz/${data.channels}ch`,\n );\n }\n frames = audioByteStream.write(data.data.buffer as ArrayBuffer);\n }\n\n for (const frame of frames) {\n const buffer = Buffer.from(\n frame.data.buffer,\n frame.data.byteOffset,\n frame.data.byteLength,\n );\n ws.send(buffer);\n }\n }\n } finally {\n closing = true;\n ws.close();\n }\n };\n\n const listenTask = Task.from(async (controller) => {\n const listenMessage = new Promise<void>((resolve, reject) => {\n ws.on('message', (data) => {\n try {\n let jsonString: string;\n\n if (typeof data === 'string') {\n jsonString = data;\n } else if (data instanceof Buffer) {\n jsonString = data.toString('utf-8');\n } else if (Array.isArray(data)) {\n jsonString = Buffer.concat(data).toString('utf-8');\n } else {\n return;\n }\n\n const msg = JSON.parse(jsonString);\n\n // Parse response format matching Python implementation\n const isFinal = msg.is_final ?? true;\n const segments = msg.segments ?? [];\n const transcript = msg.transcript ?? '';\n const confidence = msg.confidence ?? 0.0;\n const languageCode = msg.language_code ?? this.#opts.audioLanguage;\n\n // Skip if no transcript text\n if (!transcript) {\n this.#logger.debug('Received non-transcript message:', msg);\n return;\n }\n\n // Emit START_OF_SPEECH if not already speaking (only for interim or first final)\n if (!this.#speaking && !isFinal) {\n this.#speaking = true;\n this.queue.put({ type: stt.SpeechEventType.START_OF_SPEECH });\n }\n\n // Extract timing from segments\n const startTime = segments.length > 0 ? segments[0].start ?? 0.0 : 0.0;\n const endTime = segments.length > 0 ? segments[segments.length - 1].end ?? 0.0 : 0.0;\n\n const speechData: stt.SpeechData = {\n language: languageCode!,\n text: transcript,\n startTime,\n endTime,\n confidence,\n };\n\n // Handle interim vs final transcripts (matching Python implementation)\n if (!isFinal) {\n // Interim transcript\n this.queue.put({\n type: stt.SpeechEventType.INTERIM_TRANSCRIPT,\n alternatives: [speechData],\n });\n } else {\n // Final transcript\n this.queue.put({\n type: stt.SpeechEventType.FINAL_TRANSCRIPT,\n alternatives: [speechData],\n });\n\n // Emit END_OF_SPEECH after final transcript\n if (this.#speaking) {\n this.#speaking = false;\n this.queue.put({ type: stt.SpeechEventType.END_OF_SPEECH });\n }\n }\n\n if (this.closed || closing) {\n resolve();\n }\n } catch (err) {\n this.#logger.error(`STT: Error processing message: ${data}`);\n reject(err);\n }\n });\n\n ws.on('error', (err) => {\n if (!closing) {\n reject(err);\n }\n });\n\n ws.on('close', () => {\n if (!closing) {\n resolve();\n }\n });\n });\n\n await Promise.race([listenMessage, waitForAbort(controller.signal)]);\n }, this.abortController);\n\n await Promise.all([sendTask(), listenTask.result]);\n closing = true;\n ws.close();\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAGA,oBAAgF;AAEhF,gBAA0B;AAG1B,MAAM,oBAAgD;AAAA,EACpD,aAAa;AAAA,EACb,UAAU;AAAA,EACV,YAAY;AAAA,EACZ,mBAAmB;AAAA,EACnB,0BAA0B;AAAA,EAC1B,4BAA4B;AAAA,EAC5B,6BAA6B;AAAA,EAC7B,eAAe;AAAA,EACf,uBAAuB;AAAA,EACvB,cAAc;AAAA,EACd,yBAAyB;AAAA,EACzB,gBAAgB;AAClB;AAEO,MAAM,YAAY,kBAAI,IAAI;AAAA,EAC/B;AAAA,EACA,cAAU,mBAAI;AAAA,EACd,QAAQ;AAAA,EAER,YAAY,OAAmC,CAAC,GAAG;AACjD,UAAM;AAAA,MACJ,WAAW;AAAA,MACX,gBAAgB,KAAK,4BAA4B,kBAAkB;AAAA,IACrE,CAAC;AAED,UAAM,SAAS,KAAK,UAAU,QAAQ,IAAI;AAC1C,UAAM,UAAU,KAAK,WAAW,QAAQ,IAAI;AAE5C,QAAI,CAAC,QAAQ;AACX,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,SAAS;AACZ,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AAEA,SAAK,QAAQ;AAAA,MACX,GAAG;AAAA,MACH,GAAG;AAAA,MACH;AAAA,MACA;AAAA,IACF;AAAA,EACF;AAAA;AAAA,EAGA,MAAM,WAAW,GAA0C;AACzD,UAAM,IAAI,MAAM,2CAA2C;AAAA,EAC7D;AAAA,EAEA,cAAc,MAAkC;AAC9C,SAAK,QAAQ,EAAE,GAAG,KAAK,OAAO,GAAG,KAAK;AAAA,EACxC;AAAA,EAEA,SAAuB;AACrB,WAAO,IAAI,aAAa,MAAM,KAAK,KAAK;AAAA,EAC1C;AACF;AAEO,MAAM,qBAAqB,kBAAI,aAAa;AAAA,EACjD;AAAA,EACA,cAAU,mBAAI;AAAA,EACd,YAAY;AAAA,EACZ,aAAa;AAAA,EACb,QAAQ;AAAA,EAER,YAAYA,MAAU,MAAyB;AAC7C,UAAMA,MAAK,KAAK,UAAU;AAC1B,SAAK,QAAQ;AACb,SAAK,SAAS;AAAA,EAChB;AAAA,EAEQ,WAAmB;AACzB,WAAO,eAAe,KAAK,MAAM,OAAO,gCAAgC,KAAK,MAAM,WAAW;AAAA,EAChG;AAAA,EAEA,MAAgB,MAAM;AACpB,UAAM,WAAW;AACjB,QAAI,UAAU;AAEd,WAAO,CAAC,KAAK,MAAM,UAAU,CAAC,KAAK,QAAQ;AACzC,YAAM,MAAM,KAAK,SAAS;AAC1B,YAAM,UAAU;AAAA,QACd,eAAe,WAAW,KAAK,MAAM,MAAM;AAAA,MAC7C;AAEA,YAAM,KAAK,IAAI,oBAAU,KAAK,EAAE,QAAQ,CAAC;AAEzC,UAAI;AACF,cAAM,IAAI,QAAQ,CAAC,SAAS,WAAW;AACrC,aAAG,GAAG,QAAQ,OAAO;AACrB,aAAG,GAAG,SAAS,CAAC,UAAU,OAAO,KAAK,CAAC;AACvC,aAAG,GAAG,SAAS,CAAC,SAAS,OAAO,sBAAsB,IAAI,EAAE,CAAC;AAAA,QAC/D,CAAC;AAED,cAAM,KAAK,OAAO,EAAE;AAAA,MACtB,SAAS,GAAG;AACV,YAAI,CAAC,KAAK,UAAU,CAAC,KAAK,MAAM,QAAQ;AACtC,cAAI,WAAW,UAAU;AACvB,kBAAM,IAAI,MAAM,sCAAsC,OAAO,cAAc,CAAC,EAAE;AAAA,UAChF;AAEA,gBAAM,QAAQ,KAAK,IAAI,UAAU,GAAG,EAAE;AACtC;AAEA,eAAK,QAAQ;AAAA,YACX,6CAA6C,KAAK,aAAa,CAAC,KAAK,OAAO,IAAI,QAAQ;AAAA,UAC1F;AACA,gBAAM,IAAI,QAAQ,CAAC,YAAY,WAAW,SAAS,QAAQ,GAAI,CAAC;AAAA,QAClE,OAAO;AACL,eAAK,QAAQ;AAAA,YACX,+CAA+C,CAAC,kBAAkB,KAAK,MAAM,MAAM,eAAe,KAAK,MAAM;AAAA,UAC/G;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAEA,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,MAAM,OAAO,IAAe;AAC1B,QAAI,UAAU;AAGd,UAAM,WAAW;AAAA,MACf,sBAAsB;AAAA,QACpB,WAAW,KAAK,MAAM;AAAA,QACtB,yBAAyB,KAAK,MAAM;AAAA,QACpC,eAAe,KAAK,MAAM;AAAA,MAC5B;AAAA,MACA,kBAAkB;AAAA,QAChB,UAAU,KAAK,MAAM,YAAY;AAAA,QACjC,aAAa,KAAK,MAAM,cAAc;AAAA,QACtC,4BAA4B,KAAK,MAAM;AAAA,QACvC,+BAA+B,KAAK,MAAM;AAAA,QAC1C,iCAAiC,KAAK,MAAM;AAAA,MAC9C;AAAA,MACA,gBAAgB;AAAA,QACd,QAAQ,KAAK,MAAM;AAAA,QACnB,gBAAgB,KAAK,MAAM,iBAAiB;AAAA,QAC5C,yBAAyB,KAAK,MAAM,yBAAyB;AAAA,MAC/D;AAAA,IACF;AACA,OAAG,KAAK,KAAK,UAAU,QAAQ,CAAC;AAEhC,UAAM,WAAW,YAAY;AAC3B,YAAM,aAAa,KAAK,MAAM,cAAc;AAC5C,YAAM,kBAAkB,eAAe,OAAQ,MAAM;AACrD,YAAM,kBAAkB,IAAI,8BAAgB,YAAY,GAAG,eAAe;AAE1E,UAAI;AACF,eAAO,CAAC,KAAK,QAAQ;AACnB,gBAAM,SAAS,MAAM,KAAK,MAAM,KAAK;AACrC,cAAI,OAAO,MAAM;AACf;AAAA,UACF;AAEA,gBAAM,OAAO,OAAO;AAEpB,cAAI;AACJ,cAAI,SAAS,aAAa,gBAAgB;AAExC,qBAAS,gBAAgB,MAAM;AAAA,UACjC,OAAO;AACL,gBAAI,KAAK,eAAe,cAAc,KAAK,aAAa,GAAG;AACzD,oBAAM,IAAI;AAAA,gBACR,mDAAmD,UAAU,eAAe,KAAK,UAAU,MAAM,KAAK,QAAQ;AAAA,cAChH;AAAA,YACF;AACA,qBAAS,gBAAgB,MAAM,KAAK,KAAK,MAAqB;AAAA,UAChE;AAEA,qBAAW,SAAS,QAAQ;AAC1B,kBAAM,SAAS,OAAO;AAAA,cACpB,MAAM,KAAK;AAAA,cACX,MAAM,KAAK;AAAA,cACX,MAAM,KAAK;AAAA,YACb;AACA,eAAG,KAAK,MAAM;AAAA,UAChB;AAAA,QACF;AAAA,MACF,UAAE;AACA,kBAAU;AACV,WAAG,MAAM;AAAA,MACX;AAAA,IACF;AAEA,UAAM,aAAa,mBAAK,KAAK,OAAO,eAAe;AACjD,YAAM,gBAAgB,IAAI,QAAc,CAAC,SAAS,WAAW;AAC3D,WAAG,GAAG,WAAW,CAAC,SAAS;AACzB,cAAI;AACF,gBAAI;AAEJ,gBAAI,OAAO,SAAS,UAAU;AAC5B,2BAAa;AAAA,YACf,WAAW,gBAAgB,QAAQ;AACjC,2BAAa,KAAK,SAAS,OAAO;AAAA,YACpC,WAAW,MAAM,QAAQ,IAAI,GAAG;AAC9B,2BAAa,OAAO,OAAO,IAAI,EAAE,SAAS,OAAO;AAAA,YACnD,OAAO;AACL;AAAA,YACF;AAEA,kBAAM,MAAM,KAAK,MAAM,UAAU;AAGjC,kBAAM,UAAU,IAAI,YAAY;AAChC,kBAAM,WAAW,IAAI,YAAY,CAAC;AAClC,kBAAM,aAAa,IAAI,cAAc;AACrC,kBAAM,aAAa,IAAI,cAAc;AACrC,kBAAM,eAAe,IAAI,iBAAiB,KAAK,MAAM;AAGrD,gBAAI,CAAC,YAAY;AACf,mBAAK,QAAQ,MAAM,oCAAoC,GAAG;AAC1D;AAAA,YACF;AAGA,gBAAI,CAAC,KAAK,aAAa,CAAC,SAAS;AAC/B,mBAAK,YAAY;AACjB,mBAAK,MAAM,IAAI,EAAE,MAAM,kBAAI,gBAAgB,gBAAgB,CAAC;AAAA,YAC9D;AAGA,kBAAM,YAAY,SAAS,SAAS,IAAI,SAAS,CAAC,EAAE,SAAS,IAAM;AACnE,kBAAM,UAAU,SAAS,SAAS,IAAI,SAAS,SAAS,SAAS,CAAC,EAAE,OAAO,IAAM;AAEjF,kBAAM,aAA6B;AAAA,cACjC,UAAU;AAAA,cACV,MAAM;AAAA,cACN;AAAA,cACA;AAAA,cACA;AAAA,YACF;AAGA,gBAAI,CAAC,SAAS;AAEZ,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,kBAAI,gBAAgB;AAAA,gBAC1B,cAAc,CAAC,UAAU;AAAA,cAC3B,CAAC;AAAA,YACH,OAAO;AAEL,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,kBAAI,gBAAgB;AAAA,gBAC1B,cAAc,CAAC,UAAU;AAAA,cAC3B,CAAC;AAGD,kBAAI,KAAK,WAAW;AAClB,qBAAK,YAAY;AACjB,qBAAK,MAAM,IAAI,EAAE,MAAM,kBAAI,gBAAgB,cAAc,CAAC;AAAA,cAC5D;AAAA,YACF;AAEA,gBAAI,KAAK,UAAU,SAAS;AAC1B,sBAAQ;AAAA,YACV;AAAA,UACF,SAAS,KAAK;AACZ,iBAAK,QAAQ,MAAM,kCAAkC,IAAI,EAAE;AAC3D,mBAAO,GAAG;AAAA,UACZ;AAAA,QACF,CAAC;AAED,WAAG,GAAG,SAAS,CAAC,QAAQ;AACtB,cAAI,CAAC,SAAS;AACZ,mBAAO,GAAG;AAAA,UACZ;AAAA,QACF,CAAC;AAED,WAAG,GAAG,SAAS,MAAM;AACnB,cAAI,CAAC,SAAS;AACZ,oBAAQ;AAAA,UACV;AAAA,QACF,CAAC;AAAA,MACH,CAAC;AAED,YAAM,QAAQ,KAAK,CAAC,mBAAe,4BAAa,WAAW,MAAM,CAAC,CAAC;AAAA,IACrE,GAAG,KAAK,eAAe;AAEvB,UAAM,QAAQ,IAAI,CAAC,SAAS,GAAG,WAAW,MAAM,CAAC;AACjD,cAAU;AACV,OAAG,MAAM;AAAA,EACX;AACF;","names":["stt"]}
1
+ {"version":3,"sources":["../src/stt.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { type AudioBuffer, AudioByteStream, Task, log, stt, waitForAbort } from '@livekit/agents';\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { WebSocket } from 'ws';\nimport type { BasetenSttOptions } from './types.js';\n\nconst defaultSTTOptions: Partial<BasetenSttOptions> = {\n environment: 'production',\n encoding: 'pcm_s16le',\n sampleRate: 16000,\n bufferSizeSeconds: 0.032,\n enablePartialTranscripts: true,\n partialTranscriptIntervalS: 0.5,\n finalTranscriptMaxDurationS: 5,\n audioLanguage: 'en',\n languageDetectionOnly: false,\n vadThreshold: 0.5,\n vadMinSilenceDurationMs: 300,\n vadSpeechPadMs: 30,\n};\n\nexport class STT extends stt.STT {\n #opts: BasetenSttOptions;\n #logger = log();\n label = 'baseten.STT';\n\n constructor(opts: Partial<BasetenSttOptions> = {}) {\n super({\n streaming: true,\n interimResults: opts.enablePartialTranscripts ?? defaultSTTOptions.enablePartialTranscripts!,\n alignedTranscript: 'word',\n });\n\n const apiKey = opts.apiKey ?? process.env.BASETEN_API_KEY;\n const modelEndpoint = opts.modelEndpoint ?? process.env.BASETEN_MODEL_ENDPOINT;\n const modelId = opts.modelId ?? process.env.BASETEN_STT_MODEL_ID;\n\n if (!apiKey) {\n throw new Error(\n 'Baseten API key is required, either pass it as `apiKey` or set $BASETEN_API_KEY',\n );\n }\n if (!modelEndpoint && !modelId) {\n throw new Error(\n 'Baseten model endpoint is required, either pass it as `modelEndpoint` or set $BASETEN_MODEL_ENDPOINT',\n );\n }\n\n this.#opts = {\n ...defaultSTTOptions,\n ...opts,\n apiKey,\n modelEndpoint,\n modelId,\n } as BasetenSttOptions;\n }\n\n // eslint-disable-next-line\n async _recognize(_: AudioBuffer): Promise<stt.SpeechEvent> {\n throw new Error('Recognize is not supported on Baseten STT');\n }\n\n updateOptions(opts: Partial<BasetenSttOptions>) {\n this.#opts = { ...this.#opts, ...opts };\n }\n\n stream(): SpeechStream {\n return new SpeechStream(this, this.#opts);\n }\n}\n\nexport class SpeechStream extends stt.SpeechStream {\n #opts: BasetenSttOptions;\n #logger = log();\n #speaking = false;\n #requestId = '';\n label = 'baseten.SpeechStream';\n\n constructor(stt: STT, opts: BasetenSttOptions) {\n super(stt, opts.sampleRate);\n this.#opts = opts;\n this.closed = false;\n }\n\n private getWsUrl(): string {\n if (this.#opts.modelEndpoint) {\n return this.#opts.modelEndpoint;\n }\n // Fallback to constructing URL from modelId (deprecated)\n return `wss://model-${this.#opts.modelId}.api.baseten.co/environments/${this.#opts.environment}/websocket`;\n }\n\n protected async run() {\n const maxRetry = 32;\n let retries = 0;\n\n while (!this.input.closed && !this.closed) {\n const url = this.getWsUrl();\n const headers = {\n Authorization: `Api-Key ${this.#opts.apiKey}`,\n };\n\n const ws = new WebSocket(url, { headers, rejectUnauthorized: false });\n\n try {\n await new Promise((resolve, reject) => {\n ws.on('open', resolve);\n ws.on('error', (error) => reject(error));\n ws.on('close', (code) => reject(`WebSocket returned ${code}`));\n });\n\n await this.#runWS(ws);\n } catch (e) {\n if (!this.closed && !this.input.closed) {\n if (retries >= maxRetry) {\n throw new Error(`failed to connect to Baseten after ${retries} attempts: ${e}`);\n }\n\n const delay = Math.min(retries * 5, 10);\n retries++;\n\n this.#logger.warn(\n `failed to connect to Baseten, retrying in ${delay} seconds: ${e} (${retries}/${maxRetry})`,\n );\n await new Promise((resolve) => setTimeout(resolve, delay * 1000));\n } else {\n this.#logger.warn(\n `Baseten disconnected, connection is closed: ${e} (inputClosed: ${this.input.closed}, isClosed: ${this.closed})`,\n );\n }\n }\n }\n\n this.closed = true;\n }\n\n async #runWS(ws: WebSocket) {\n let closing = false;\n\n // Send initial metadata\n // Note: Baseten server expects 'vad_params' and 'streaming_whisper_params' field names\n // (not 'streaming_vad_config', 'streaming_params', 'whisper_params' as in older versions)\n const metadata = {\n vad_params: {\n threshold: this.#opts.vadThreshold,\n min_silence_duration_ms: this.#opts.vadMinSilenceDurationMs,\n speech_pad_ms: this.#opts.vadSpeechPadMs,\n },\n streaming_whisper_params: {\n encoding: this.#opts.encoding ?? 'pcm_s16le',\n sample_rate: this.#opts.sampleRate ?? 16000,\n enable_partial_transcripts: false,\n audio_language: this.#opts.audioLanguage ?? 'en',\n show_word_timestamps: true,\n },\n };\n\n ws.send(JSON.stringify(metadata));\n\n const sendTask = async () => {\n const sampleRate = this.#opts.sampleRate ?? 16000;\n const samplesPerChunk = sampleRate === 16000 ? 512 : 256;\n const audioByteStream = new AudioByteStream(sampleRate, 1, samplesPerChunk);\n\n try {\n while (!this.closed) {\n const result = await this.input.next();\n if (result.done) {\n break;\n }\n\n const data = result.value;\n\n let frames: AudioFrame[];\n if (data === SpeechStream.FLUSH_SENTINEL) {\n // Flush any remaining buffered audio\n frames = audioByteStream.flush();\n } else {\n if (data.sampleRate !== sampleRate || data.channels !== 1) {\n throw new Error(\n `sample rate or channel count mismatch: expected ${sampleRate}Hz/1ch, got ${data.sampleRate}Hz/${data.channels}ch`,\n );\n }\n frames = audioByteStream.write(data.data.buffer as ArrayBuffer);\n }\n\n for (const frame of frames) {\n const buffer = Buffer.from(\n frame.data.buffer,\n frame.data.byteOffset,\n frame.data.byteLength,\n );\n ws.send(buffer);\n }\n }\n } finally {\n closing = true;\n ws.close();\n }\n };\n\n const listenTask = Task.from(async (controller) => {\n const listenMessage = new Promise<void>((resolve, reject) => {\n ws.on('message', (data) => {\n try {\n let jsonString: string;\n\n if (typeof data === 'string') {\n jsonString = data;\n } else if (data instanceof Buffer) {\n jsonString = data.toString('utf-8');\n } else if (Array.isArray(data)) {\n jsonString = Buffer.concat(data).toString('utf-8');\n } else {\n return;\n }\n\n const msg = JSON.parse(jsonString);\n const isFinal = msg.is_final ?? true;\n const segments = msg.segments ?? [];\n const transcript = msg.transcript ?? '';\n const confidence = msg.confidence ?? 0.0;\n const languageCode = msg.language_code ?? this.#opts.audioLanguage;\n\n // Skip if no transcript text\n if (!transcript) {\n this.#logger.debug('Received non-transcript message:', msg);\n return;\n }\n\n // Emit START_OF_SPEECH if not already speaking (only for interim or first final)\n if (!this.#speaking && !isFinal) {\n this.#speaking = true;\n this.queue.put({ type: stt.SpeechEventType.START_OF_SPEECH });\n }\n\n // Note: Baseten uses 'start_time' and 'end_time' field names (with underscores)\n const startTime =\n segments.length > 0\n ? (segments[0].start_time ?? 0.0) + this.startTimeOffset\n : this.startTimeOffset;\n const endTime =\n segments.length > 0\n ? (segments[segments.length - 1].end_time ?? 0.0) + this.startTimeOffset\n : this.startTimeOffset;\n\n // Note: Baseten returns segments (chunks) which we treat as words for aligned transcripts\n const words = segments.map(\n (segment: { text?: string; start_time?: number; end_time?: number }) => ({\n text: segment.text ?? '',\n startTime: (segment.start_time ?? 0.0) + this.startTimeOffset,\n endTime: (segment.end_time ?? 0.0) + this.startTimeOffset,\n startTimeOffset: this.startTimeOffset,\n confidence: confidence,\n }),\n );\n\n const speechData: stt.SpeechData = {\n language: languageCode!,\n text: transcript,\n startTime,\n endTime,\n confidence,\n words: words.length > 0 ? words : undefined,\n };\n\n // Handle interim vs final transcripts (matching Python implementation)\n if (!isFinal) {\n // Interim transcript\n this.queue.put({\n type: stt.SpeechEventType.INTERIM_TRANSCRIPT,\n alternatives: [speechData],\n });\n } else {\n // Final transcript\n this.queue.put({\n type: stt.SpeechEventType.FINAL_TRANSCRIPT,\n alternatives: [speechData],\n });\n\n // Emit END_OF_SPEECH after final transcript\n if (this.#speaking) {\n this.#speaking = false;\n this.queue.put({ type: stt.SpeechEventType.END_OF_SPEECH });\n }\n }\n\n if (this.closed || closing) {\n resolve();\n }\n } catch (err) {\n this.#logger.error(`STT: Error processing message: ${data}`);\n reject(err);\n }\n });\n\n ws.on('error', (err) => {\n if (!closing) {\n reject(err);\n }\n });\n\n ws.on('close', () => {\n if (!closing) {\n resolve();\n }\n });\n });\n\n await Promise.race([listenMessage, waitForAbort(controller.signal)]);\n }, this.abortController);\n\n await Promise.all([sendTask(), listenTask.result]);\n closing = true;\n ws.close();\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAGA,oBAAgF;AAEhF,gBAA0B;AAG1B,MAAM,oBAAgD;AAAA,EACpD,aAAa;AAAA,EACb,UAAU;AAAA,EACV,YAAY;AAAA,EACZ,mBAAmB;AAAA,EACnB,0BAA0B;AAAA,EAC1B,4BAA4B;AAAA,EAC5B,6BAA6B;AAAA,EAC7B,eAAe;AAAA,EACf,uBAAuB;AAAA,EACvB,cAAc;AAAA,EACd,yBAAyB;AAAA,EACzB,gBAAgB;AAClB;AAEO,MAAM,YAAY,kBAAI,IAAI;AAAA,EAC/B;AAAA,EACA,cAAU,mBAAI;AAAA,EACd,QAAQ;AAAA,EAER,YAAY,OAAmC,CAAC,GAAG;AACjD,UAAM;AAAA,MACJ,WAAW;AAAA,MACX,gBAAgB,KAAK,4BAA4B,kBAAkB;AAAA,MACnE,mBAAmB;AAAA,IACrB,CAAC;AAED,UAAM,SAAS,KAAK,UAAU,QAAQ,IAAI;AAC1C,UAAM,gBAAgB,KAAK,iBAAiB,QAAQ,IAAI;AACxD,UAAM,UAAU,KAAK,WAAW,QAAQ,IAAI;AAE5C,QAAI,CAAC,QAAQ;AACX,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,iBAAiB,CAAC,SAAS;AAC9B,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AAEA,SAAK,QAAQ;AAAA,MACX,GAAG;AAAA,MACH,GAAG;AAAA,MACH;AAAA,MACA;AAAA,MACA;AAAA,IACF;AAAA,EACF;AAAA;AAAA,EAGA,MAAM,WAAW,GAA0C;AACzD,UAAM,IAAI,MAAM,2CAA2C;AAAA,EAC7D;AAAA,EAEA,cAAc,MAAkC;AAC9C,SAAK,QAAQ,EAAE,GAAG,KAAK,OAAO,GAAG,KAAK;AAAA,EACxC;AAAA,EAEA,SAAuB;AACrB,WAAO,IAAI,aAAa,MAAM,KAAK,KAAK;AAAA,EAC1C;AACF;AAEO,MAAM,qBAAqB,kBAAI,aAAa;AAAA,EACjD;AAAA,EACA,cAAU,mBAAI;AAAA,EACd,YAAY;AAAA,EACZ,aAAa;AAAA,EACb,QAAQ;AAAA,EAER,YAAYA,MAAU,MAAyB;AAC7C,UAAMA,MAAK,KAAK,UAAU;AAC1B,SAAK,QAAQ;AACb,SAAK,SAAS;AAAA,EAChB;AAAA,EAEQ,WAAmB;AACzB,QAAI,KAAK,MAAM,eAAe;AAC5B,aAAO,KAAK,MAAM;AAAA,IACpB;AAEA,WAAO,eAAe,KAAK,MAAM,OAAO,gCAAgC,KAAK,MAAM,WAAW;AAAA,EAChG;AAAA,EAEA,MAAgB,MAAM;AACpB,UAAM,WAAW;AACjB,QAAI,UAAU;AAEd,WAAO,CAAC,KAAK,MAAM,UAAU,CAAC,KAAK,QAAQ;AACzC,YAAM,MAAM,KAAK,SAAS;AAC1B,YAAM,UAAU;AAAA,QACd,eAAe,WAAW,KAAK,MAAM,MAAM;AAAA,MAC7C;AAEA,YAAM,KAAK,IAAI,oBAAU,KAAK,EAAE,SAAS,oBAAoB,MAAM,CAAC;AAEpE,UAAI;AACF,cAAM,IAAI,QAAQ,CAAC,SAAS,WAAW;AACrC,aAAG,GAAG,QAAQ,OAAO;AACrB,aAAG,GAAG,SAAS,CAAC,UAAU,OAAO,KAAK,CAAC;AACvC,aAAG,GAAG,SAAS,CAAC,SAAS,OAAO,sBAAsB,IAAI,EAAE,CAAC;AAAA,QAC/D,CAAC;AAED,cAAM,KAAK,OAAO,EAAE;AAAA,MACtB,SAAS,GAAG;AACV,YAAI,CAAC,KAAK,UAAU,CAAC,KAAK,MAAM,QAAQ;AACtC,cAAI,WAAW,UAAU;AACvB,kBAAM,IAAI,MAAM,sCAAsC,OAAO,cAAc,CAAC,EAAE;AAAA,UAChF;AAEA,gBAAM,QAAQ,KAAK,IAAI,UAAU,GAAG,EAAE;AACtC;AAEA,eAAK,QAAQ;AAAA,YACX,6CAA6C,KAAK,aAAa,CAAC,KAAK,OAAO,IAAI,QAAQ;AAAA,UAC1F;AACA,gBAAM,IAAI,QAAQ,CAAC,YAAY,WAAW,SAAS,QAAQ,GAAI,CAAC;AAAA,QAClE,OAAO;AACL,eAAK,QAAQ;AAAA,YACX,+CAA+C,CAAC,kBAAkB,KAAK,MAAM,MAAM,eAAe,KAAK,MAAM;AAAA,UAC/G;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAEA,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,MAAM,OAAO,IAAe;AAC1B,QAAI,UAAU;AAKd,UAAM,WAAW;AAAA,MACf,YAAY;AAAA,QACV,WAAW,KAAK,MAAM;AAAA,QACtB,yBAAyB,KAAK,MAAM;AAAA,QACpC,eAAe,KAAK,MAAM;AAAA,MAC5B;AAAA,MACA,0BAA0B;AAAA,QACxB,UAAU,KAAK,MAAM,YAAY;AAAA,QACjC,aAAa,KAAK,MAAM,cAAc;AAAA,QACtC,4BAA4B;AAAA,QAC5B,gBAAgB,KAAK,MAAM,iBAAiB;AAAA,QAC5C,sBAAsB;AAAA,MACxB;AAAA,IACF;AAEA,OAAG,KAAK,KAAK,UAAU,QAAQ,CAAC;AAEhC,UAAM,WAAW,YAAY;AAC3B,YAAM,aAAa,KAAK,MAAM,cAAc;AAC5C,YAAM,kBAAkB,eAAe,OAAQ,MAAM;AACrD,YAAM,kBAAkB,IAAI,8BAAgB,YAAY,GAAG,eAAe;AAE1E,UAAI;AACF,eAAO,CAAC,KAAK,QAAQ;AACnB,gBAAM,SAAS,MAAM,KAAK,MAAM,KAAK;AACrC,cAAI,OAAO,MAAM;AACf;AAAA,UACF;AAEA,gBAAM,OAAO,OAAO;AAEpB,cAAI;AACJ,cAAI,SAAS,aAAa,gBAAgB;AAExC,qBAAS,gBAAgB,MAAM;AAAA,UACjC,OAAO;AACL,gBAAI,KAAK,eAAe,cAAc,KAAK,aAAa,GAAG;AACzD,oBAAM,IAAI;AAAA,gBACR,mDAAmD,UAAU,eAAe,KAAK,UAAU,MAAM,KAAK,QAAQ;AAAA,cAChH;AAAA,YACF;AACA,qBAAS,gBAAgB,MAAM,KAAK,KAAK,MAAqB;AAAA,UAChE;AAEA,qBAAW,SAAS,QAAQ;AAC1B,kBAAM,SAAS,OAAO;AAAA,cACpB,MAAM,KAAK;AAAA,cACX,MAAM,KAAK;AAAA,cACX,MAAM,KAAK;AAAA,YACb;AACA,eAAG,KAAK,MAAM;AAAA,UAChB;AAAA,QACF;AAAA,MACF,UAAE;AACA,kBAAU;AACV,WAAG,MAAM;AAAA,MACX;AAAA,IACF;AAEA,UAAM,aAAa,mBAAK,KAAK,OAAO,eAAe;AACjD,YAAM,gBAAgB,IAAI,QAAc,CAAC,SAAS,WAAW;AAC3D,WAAG,GAAG,WAAW,CAAC,SAAS;AACzB,cAAI;AACF,gBAAI;AAEJ,gBAAI,OAAO,SAAS,UAAU;AAC5B,2BAAa;AAAA,YACf,WAAW,gBAAgB,QAAQ;AACjC,2BAAa,KAAK,SAAS,OAAO;AAAA,YACpC,WAAW,MAAM,QAAQ,IAAI,GAAG;AAC9B,2BAAa,OAAO,OAAO,IAAI,EAAE,SAAS,OAAO;AAAA,YACnD,OAAO;AACL;AAAA,YACF;AAEA,kBAAM,MAAM,KAAK,MAAM,UAAU;AACjC,kBAAM,UAAU,IAAI,YAAY;AAChC,kBAAM,WAAW,IAAI,YAAY,CAAC;AAClC,kBAAM,aAAa,IAAI,cAAc;AACrC,kBAAM,aAAa,IAAI,cAAc;AACrC,kBAAM,eAAe,IAAI,iBAAiB,KAAK,MAAM;AAGrD,gBAAI,CAAC,YAAY;AACf,mBAAK,QAAQ,MAAM,oCAAoC,GAAG;AAC1D;AAAA,YACF;AAGA,gBAAI,CAAC,KAAK,aAAa,CAAC,SAAS;AAC/B,mBAAK,YAAY;AACjB,mBAAK,MAAM,IAAI,EAAE,MAAM,kBAAI,gBAAgB,gBAAgB,CAAC;AAAA,YAC9D;AAGA,kBAAM,YACJ,SAAS,SAAS,KACb,SAAS,CAAC,EAAE,cAAc,KAAO,KAAK,kBACvC,KAAK;AACX,kBAAM,UACJ,SAAS,SAAS,KACb,SAAS,SAAS,SAAS,CAAC,EAAE,YAAY,KAAO,KAAK,kBACvD,KAAK;AAGX,kBAAM,QAAQ,SAAS;AAAA,cACrB,CAAC,aAAwE;AAAA,gBACvE,MAAM,QAAQ,QAAQ;AAAA,gBACtB,YAAY,QAAQ,cAAc,KAAO,KAAK;AAAA,gBAC9C,UAAU,QAAQ,YAAY,KAAO,KAAK;AAAA,gBAC1C,iBAAiB,KAAK;AAAA,gBACtB;AAAA,cACF;AAAA,YACF;AAEA,kBAAM,aAA6B;AAAA,cACjC,UAAU;AAAA,cACV,MAAM;AAAA,cACN;AAAA,cACA;AAAA,cACA;AAAA,cACA,OAAO,MAAM,SAAS,IAAI,QAAQ;AAAA,YACpC;AAGA,gBAAI,CAAC,SAAS;AAEZ,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,kBAAI,gBAAgB;AAAA,gBAC1B,cAAc,CAAC,UAAU;AAAA,cAC3B,CAAC;AAAA,YACH,OAAO;AAEL,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,kBAAI,gBAAgB;AAAA,gBAC1B,cAAc,CAAC,UAAU;AAAA,cAC3B,CAAC;AAGD,kBAAI,KAAK,WAAW;AAClB,qBAAK,YAAY;AACjB,qBAAK,MAAM,IAAI,EAAE,MAAM,kBAAI,gBAAgB,cAAc,CAAC;AAAA,cAC5D;AAAA,YACF;AAEA,gBAAI,KAAK,UAAU,SAAS;AAC1B,sBAAQ;AAAA,YACV;AAAA,UACF,SAAS,KAAK;AACZ,iBAAK,QAAQ,MAAM,kCAAkC,IAAI,EAAE;AAC3D,mBAAO,GAAG;AAAA,UACZ;AAAA,QACF,CAAC;AAED,WAAG,GAAG,SAAS,CAAC,QAAQ;AACtB,cAAI,CAAC,SAAS;AACZ,mBAAO,GAAG;AAAA,UACZ;AAAA,QACF,CAAC;AAED,WAAG,GAAG,SAAS,MAAM;AACnB,cAAI,CAAC,SAAS;AACZ,oBAAQ;AAAA,UACV;AAAA,QACF,CAAC;AAAA,MACH,CAAC;AAED,YAAM,QAAQ,KAAK,CAAC,mBAAe,4BAAa,WAAW,MAAM,CAAC,CAAC;AAAA,IACrE,GAAG,KAAK,eAAe;AAEvB,UAAM,QAAQ,IAAI,CAAC,SAAS,GAAG,WAAW,MAAM,CAAC;AACjD,cAAU;AACV,OAAG,MAAM;AAAA,EACX;AACF;","names":["stt"]}
package/dist/stt.d.ts.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"file":"stt.d.ts","sourceRoot":"","sources":["../src/stt.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,KAAK,WAAW,EAA8B,GAAG,EAAgB,MAAM,iBAAiB,CAAC;AAGlG,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,YAAY,CAAC;AAiBpD,qBAAa,GAAI,SAAQ,GAAG,CAAC,GAAG;;IAG9B,KAAK,SAAiB;gBAEV,IAAI,GAAE,OAAO,CAAC,iBAAiB,CAAM;IA6B3C,UAAU,CAAC,CAAC,EAAE,WAAW,GAAG,OAAO,CAAC,GAAG,CAAC,WAAW,CAAC;IAI1D,aAAa,CAAC,IAAI,EAAE,OAAO,CAAC,iBAAiB,CAAC;IAI9C,MAAM,IAAI,YAAY;CAGvB;AAED,qBAAa,YAAa,SAAQ,GAAG,CAAC,YAAY;;IAKhD,KAAK,SAA0B;gBAEnB,GAAG,EAAE,GAAG,EAAE,IAAI,EAAE,iBAAiB;IAM7C,OAAO,CAAC,QAAQ;cAIA,GAAG;CAkNpB"}
1
+ {"version":3,"file":"stt.d.ts","sourceRoot":"","sources":["../src/stt.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,KAAK,WAAW,EAA8B,GAAG,EAAgB,MAAM,iBAAiB,CAAC;AAGlG,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,YAAY,CAAC;AAiBpD,qBAAa,GAAI,SAAQ,GAAG,CAAC,GAAG;;IAG9B,KAAK,SAAiB;gBAEV,IAAI,GAAE,OAAO,CAAC,iBAAiB,CAAM;IAgC3C,UAAU,CAAC,CAAC,EAAE,WAAW,GAAG,OAAO,CAAC,GAAG,CAAC,WAAW,CAAC;IAI1D,aAAa,CAAC,IAAI,EAAE,OAAO,CAAC,iBAAiB,CAAC;IAI9C,MAAM,IAAI,YAAY;CAGvB;AAED,qBAAa,YAAa,SAAQ,GAAG,CAAC,YAAY;;IAKhD,KAAK,SAA0B;gBAEnB,GAAG,EAAE,GAAG,EAAE,IAAI,EAAE,iBAAiB;IAM7C,OAAO,CAAC,QAAQ;cAQA,GAAG;CAgOpB"}
package/dist/stt.js CHANGED
@@ -21,24 +21,27 @@ class STT extends stt.STT {
21
21
  constructor(opts = {}) {
22
22
  super({
23
23
  streaming: true,
24
- interimResults: opts.enablePartialTranscripts ?? defaultSTTOptions.enablePartialTranscripts
24
+ interimResults: opts.enablePartialTranscripts ?? defaultSTTOptions.enablePartialTranscripts,
25
+ alignedTranscript: "word"
25
26
  });
26
27
  const apiKey = opts.apiKey ?? process.env.BASETEN_API_KEY;
28
+ const modelEndpoint = opts.modelEndpoint ?? process.env.BASETEN_MODEL_ENDPOINT;
27
29
  const modelId = opts.modelId ?? process.env.BASETEN_STT_MODEL_ID;
28
30
  if (!apiKey) {
29
31
  throw new Error(
30
32
  "Baseten API key is required, either pass it as `apiKey` or set $BASETEN_API_KEY"
31
33
  );
32
34
  }
33
- if (!modelId) {
35
+ if (!modelEndpoint && !modelId) {
34
36
  throw new Error(
35
- "Baseten model ID is required, either pass it as `modelId` or set $BASETEN_STT_MODEL_ID"
37
+ "Baseten model endpoint is required, either pass it as `modelEndpoint` or set $BASETEN_MODEL_ENDPOINT"
36
38
  );
37
39
  }
38
40
  this.#opts = {
39
41
  ...defaultSTTOptions,
40
42
  ...opts,
41
43
  apiKey,
44
+ modelEndpoint,
42
45
  modelId
43
46
  };
44
47
  }
@@ -65,6 +68,9 @@ class SpeechStream extends stt.SpeechStream {
65
68
  this.closed = false;
66
69
  }
67
70
  getWsUrl() {
71
+ if (this.#opts.modelEndpoint) {
72
+ return this.#opts.modelEndpoint;
73
+ }
68
74
  return `wss://model-${this.#opts.modelId}.api.baseten.co/environments/${this.#opts.environment}/websocket`;
69
75
  }
70
76
  async run() {
@@ -75,7 +81,7 @@ class SpeechStream extends stt.SpeechStream {
75
81
  const headers = {
76
82
  Authorization: `Api-Key ${this.#opts.apiKey}`
77
83
  };
78
- const ws = new WebSocket(url, { headers });
84
+ const ws = new WebSocket(url, { headers, rejectUnauthorized: false });
79
85
  try {
80
86
  await new Promise((resolve, reject) => {
81
87
  ws.on("open", resolve);
@@ -106,22 +112,17 @@ class SpeechStream extends stt.SpeechStream {
106
112
  async #runWS(ws) {
107
113
  let closing = false;
108
114
  const metadata = {
109
- streaming_vad_config: {
115
+ vad_params: {
110
116
  threshold: this.#opts.vadThreshold,
111
117
  min_silence_duration_ms: this.#opts.vadMinSilenceDurationMs,
112
118
  speech_pad_ms: this.#opts.vadSpeechPadMs
113
119
  },
114
- streaming_params: {
120
+ streaming_whisper_params: {
115
121
  encoding: this.#opts.encoding ?? "pcm_s16le",
116
122
  sample_rate: this.#opts.sampleRate ?? 16e3,
117
- enable_partial_transcripts: this.#opts.enablePartialTranscripts,
118
- partial_transcript_interval_s: this.#opts.partialTranscriptIntervalS,
119
- final_transcript_max_duration_s: this.#opts.finalTranscriptMaxDurationS
120
- },
121
- whisper_params: {
122
- prompt: this.#opts.prompt,
123
+ enable_partial_transcripts: false,
123
124
  audio_language: this.#opts.audioLanguage ?? "en",
124
- language_detection_only: this.#opts.languageDetectionOnly ?? false
125
+ show_word_timestamps: true
125
126
  }
126
127
  };
127
128
  ws.send(JSON.stringify(metadata));
@@ -189,14 +190,24 @@ class SpeechStream extends stt.SpeechStream {
189
190
  this.#speaking = true;
190
191
  this.queue.put({ type: stt.SpeechEventType.START_OF_SPEECH });
191
192
  }
192
- const startTime = segments.length > 0 ? segments[0].start ?? 0 : 0;
193
- const endTime = segments.length > 0 ? segments[segments.length - 1].end ?? 0 : 0;
193
+ const startTime = segments.length > 0 ? (segments[0].start_time ?? 0) + this.startTimeOffset : this.startTimeOffset;
194
+ const endTime = segments.length > 0 ? (segments[segments.length - 1].end_time ?? 0) + this.startTimeOffset : this.startTimeOffset;
195
+ const words = segments.map(
196
+ (segment) => ({
197
+ text: segment.text ?? "",
198
+ startTime: (segment.start_time ?? 0) + this.startTimeOffset,
199
+ endTime: (segment.end_time ?? 0) + this.startTimeOffset,
200
+ startTimeOffset: this.startTimeOffset,
201
+ confidence
202
+ })
203
+ );
194
204
  const speechData = {
195
205
  language: languageCode,
196
206
  text: transcript,
197
207
  startTime,
198
208
  endTime,
199
- confidence
209
+ confidence,
210
+ words: words.length > 0 ? words : void 0
200
211
  };
201
212
  if (!isFinal) {
202
213
  this.queue.put({
package/dist/stt.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/stt.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { type AudioBuffer, AudioByteStream, Task, log, stt, waitForAbort } from '@livekit/agents';\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { WebSocket } from 'ws';\nimport type { BasetenSttOptions } from './types.js';\n\nconst defaultSTTOptions: Partial<BasetenSttOptions> = {\n environment: 'production',\n encoding: 'pcm_s16le',\n sampleRate: 16000,\n bufferSizeSeconds: 0.032,\n enablePartialTranscripts: true,\n partialTranscriptIntervalS: 0.5,\n finalTranscriptMaxDurationS: 5,\n audioLanguage: 'en',\n languageDetectionOnly: false,\n vadThreshold: 0.5,\n vadMinSilenceDurationMs: 300,\n vadSpeechPadMs: 30,\n};\n\nexport class STT extends stt.STT {\n #opts: BasetenSttOptions;\n #logger = log();\n label = 'baseten.STT';\n\n constructor(opts: Partial<BasetenSttOptions> = {}) {\n super({\n streaming: true,\n interimResults: opts.enablePartialTranscripts ?? defaultSTTOptions.enablePartialTranscripts!,\n });\n\n const apiKey = opts.apiKey ?? process.env.BASETEN_API_KEY;\n const modelId = opts.modelId ?? process.env.BASETEN_STT_MODEL_ID;\n\n if (!apiKey) {\n throw new Error(\n 'Baseten API key is required, either pass it as `apiKey` or set $BASETEN_API_KEY',\n );\n }\n if (!modelId) {\n throw new Error(\n 'Baseten model ID is required, either pass it as `modelId` or set $BASETEN_STT_MODEL_ID',\n );\n }\n\n this.#opts = {\n ...defaultSTTOptions,\n ...opts,\n apiKey,\n modelId,\n } as BasetenSttOptions;\n }\n\n // eslint-disable-next-line\n async _recognize(_: AudioBuffer): Promise<stt.SpeechEvent> {\n throw new Error('Recognize is not supported on Baseten STT');\n }\n\n updateOptions(opts: Partial<BasetenSttOptions>) {\n this.#opts = { ...this.#opts, ...opts };\n }\n\n stream(): SpeechStream {\n return new SpeechStream(this, this.#opts);\n }\n}\n\nexport class SpeechStream extends stt.SpeechStream {\n #opts: BasetenSttOptions;\n #logger = log();\n #speaking = false;\n #requestId = '';\n label = 'baseten.SpeechStream';\n\n constructor(stt: STT, opts: BasetenSttOptions) {\n super(stt, opts.sampleRate);\n this.#opts = opts;\n this.closed = false;\n }\n\n private getWsUrl(): string {\n return `wss://model-${this.#opts.modelId}.api.baseten.co/environments/${this.#opts.environment}/websocket`;\n }\n\n protected async run() {\n const maxRetry = 32;\n let retries = 0;\n\n while (!this.input.closed && !this.closed) {\n const url = this.getWsUrl();\n const headers = {\n Authorization: `Api-Key ${this.#opts.apiKey}`,\n };\n\n const ws = new WebSocket(url, { headers });\n\n try {\n await new Promise((resolve, reject) => {\n ws.on('open', resolve);\n ws.on('error', (error) => reject(error));\n ws.on('close', (code) => reject(`WebSocket returned ${code}`));\n });\n\n await this.#runWS(ws);\n } catch (e) {\n if (!this.closed && !this.input.closed) {\n if (retries >= maxRetry) {\n throw new Error(`failed to connect to Baseten after ${retries} attempts: ${e}`);\n }\n\n const delay = Math.min(retries * 5, 10);\n retries++;\n\n this.#logger.warn(\n `failed to connect to Baseten, retrying in ${delay} seconds: ${e} (${retries}/${maxRetry})`,\n );\n await new Promise((resolve) => setTimeout(resolve, delay * 1000));\n } else {\n this.#logger.warn(\n `Baseten disconnected, connection is closed: ${e} (inputClosed: ${this.input.closed}, isClosed: ${this.closed})`,\n );\n }\n }\n }\n\n this.closed = true;\n }\n\n async #runWS(ws: WebSocket) {\n let closing = false;\n\n // Send initial metadata\n const metadata = {\n streaming_vad_config: {\n threshold: this.#opts.vadThreshold,\n min_silence_duration_ms: this.#opts.vadMinSilenceDurationMs,\n speech_pad_ms: this.#opts.vadSpeechPadMs,\n },\n streaming_params: {\n encoding: this.#opts.encoding ?? 'pcm_s16le',\n sample_rate: this.#opts.sampleRate ?? 16000,\n enable_partial_transcripts: this.#opts.enablePartialTranscripts,\n partial_transcript_interval_s: this.#opts.partialTranscriptIntervalS,\n final_transcript_max_duration_s: this.#opts.finalTranscriptMaxDurationS,\n },\n whisper_params: {\n prompt: this.#opts.prompt,\n audio_language: this.#opts.audioLanguage ?? 'en',\n language_detection_only: this.#opts.languageDetectionOnly ?? false,\n },\n };\n ws.send(JSON.stringify(metadata));\n\n const sendTask = async () => {\n const sampleRate = this.#opts.sampleRate ?? 16000;\n const samplesPerChunk = sampleRate === 16000 ? 512 : 256;\n const audioByteStream = new AudioByteStream(sampleRate, 1, samplesPerChunk);\n\n try {\n while (!this.closed) {\n const result = await this.input.next();\n if (result.done) {\n break;\n }\n\n const data = result.value;\n\n let frames: AudioFrame[];\n if (data === SpeechStream.FLUSH_SENTINEL) {\n // Flush any remaining buffered audio\n frames = audioByteStream.flush();\n } else {\n if (data.sampleRate !== sampleRate || data.channels !== 1) {\n throw new Error(\n `sample rate or channel count mismatch: expected ${sampleRate}Hz/1ch, got ${data.sampleRate}Hz/${data.channels}ch`,\n );\n }\n frames = audioByteStream.write(data.data.buffer as ArrayBuffer);\n }\n\n for (const frame of frames) {\n const buffer = Buffer.from(\n frame.data.buffer,\n frame.data.byteOffset,\n frame.data.byteLength,\n );\n ws.send(buffer);\n }\n }\n } finally {\n closing = true;\n ws.close();\n }\n };\n\n const listenTask = Task.from(async (controller) => {\n const listenMessage = new Promise<void>((resolve, reject) => {\n ws.on('message', (data) => {\n try {\n let jsonString: string;\n\n if (typeof data === 'string') {\n jsonString = data;\n } else if (data instanceof Buffer) {\n jsonString = data.toString('utf-8');\n } else if (Array.isArray(data)) {\n jsonString = Buffer.concat(data).toString('utf-8');\n } else {\n return;\n }\n\n const msg = JSON.parse(jsonString);\n\n // Parse response format matching Python implementation\n const isFinal = msg.is_final ?? true;\n const segments = msg.segments ?? [];\n const transcript = msg.transcript ?? '';\n const confidence = msg.confidence ?? 0.0;\n const languageCode = msg.language_code ?? this.#opts.audioLanguage;\n\n // Skip if no transcript text\n if (!transcript) {\n this.#logger.debug('Received non-transcript message:', msg);\n return;\n }\n\n // Emit START_OF_SPEECH if not already speaking (only for interim or first final)\n if (!this.#speaking && !isFinal) {\n this.#speaking = true;\n this.queue.put({ type: stt.SpeechEventType.START_OF_SPEECH });\n }\n\n // Extract timing from segments\n const startTime = segments.length > 0 ? segments[0].start ?? 0.0 : 0.0;\n const endTime = segments.length > 0 ? segments[segments.length - 1].end ?? 0.0 : 0.0;\n\n const speechData: stt.SpeechData = {\n language: languageCode!,\n text: transcript,\n startTime,\n endTime,\n confidence,\n };\n\n // Handle interim vs final transcripts (matching Python implementation)\n if (!isFinal) {\n // Interim transcript\n this.queue.put({\n type: stt.SpeechEventType.INTERIM_TRANSCRIPT,\n alternatives: [speechData],\n });\n } else {\n // Final transcript\n this.queue.put({\n type: stt.SpeechEventType.FINAL_TRANSCRIPT,\n alternatives: [speechData],\n });\n\n // Emit END_OF_SPEECH after final transcript\n if (this.#speaking) {\n this.#speaking = false;\n this.queue.put({ type: stt.SpeechEventType.END_OF_SPEECH });\n }\n }\n\n if (this.closed || closing) {\n resolve();\n }\n } catch (err) {\n this.#logger.error(`STT: Error processing message: ${data}`);\n reject(err);\n }\n });\n\n ws.on('error', (err) => {\n if (!closing) {\n reject(err);\n }\n });\n\n ws.on('close', () => {\n if (!closing) {\n resolve();\n }\n });\n });\n\n await Promise.race([listenMessage, waitForAbort(controller.signal)]);\n }, this.abortController);\n\n await Promise.all([sendTask(), listenTask.result]);\n closing = true;\n ws.close();\n }\n}\n"],"mappings":"AAGA,SAA2B,iBAAiB,MAAM,KAAK,KAAK,oBAAoB;AAEhF,SAAS,iBAAiB;AAG1B,MAAM,oBAAgD;AAAA,EACpD,aAAa;AAAA,EACb,UAAU;AAAA,EACV,YAAY;AAAA,EACZ,mBAAmB;AAAA,EACnB,0BAA0B;AAAA,EAC1B,4BAA4B;AAAA,EAC5B,6BAA6B;AAAA,EAC7B,eAAe;AAAA,EACf,uBAAuB;AAAA,EACvB,cAAc;AAAA,EACd,yBAAyB;AAAA,EACzB,gBAAgB;AAClB;AAEO,MAAM,YAAY,IAAI,IAAI;AAAA,EAC/B;AAAA,EACA,UAAU,IAAI;AAAA,EACd,QAAQ;AAAA,EAER,YAAY,OAAmC,CAAC,GAAG;AACjD,UAAM;AAAA,MACJ,WAAW;AAAA,MACX,gBAAgB,KAAK,4BAA4B,kBAAkB;AAAA,IACrE,CAAC;AAED,UAAM,SAAS,KAAK,UAAU,QAAQ,IAAI;AAC1C,UAAM,UAAU,KAAK,WAAW,QAAQ,IAAI;AAE5C,QAAI,CAAC,QAAQ;AACX,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,SAAS;AACZ,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AAEA,SAAK,QAAQ;AAAA,MACX,GAAG;AAAA,MACH,GAAG;AAAA,MACH;AAAA,MACA;AAAA,IACF;AAAA,EACF;AAAA;AAAA,EAGA,MAAM,WAAW,GAA0C;AACzD,UAAM,IAAI,MAAM,2CAA2C;AAAA,EAC7D;AAAA,EAEA,cAAc,MAAkC;AAC9C,SAAK,QAAQ,EAAE,GAAG,KAAK,OAAO,GAAG,KAAK;AAAA,EACxC;AAAA,EAEA,SAAuB;AACrB,WAAO,IAAI,aAAa,MAAM,KAAK,KAAK;AAAA,EAC1C;AACF;AAEO,MAAM,qBAAqB,IAAI,aAAa;AAAA,EACjD;AAAA,EACA,UAAU,IAAI;AAAA,EACd,YAAY;AAAA,EACZ,aAAa;AAAA,EACb,QAAQ;AAAA,EAER,YAAYA,MAAU,MAAyB;AAC7C,UAAMA,MAAK,KAAK,UAAU;AAC1B,SAAK,QAAQ;AACb,SAAK,SAAS;AAAA,EAChB;AAAA,EAEQ,WAAmB;AACzB,WAAO,eAAe,KAAK,MAAM,OAAO,gCAAgC,KAAK,MAAM,WAAW;AAAA,EAChG;AAAA,EAEA,MAAgB,MAAM;AACpB,UAAM,WAAW;AACjB,QAAI,UAAU;AAEd,WAAO,CAAC,KAAK,MAAM,UAAU,CAAC,KAAK,QAAQ;AACzC,YAAM,MAAM,KAAK,SAAS;AAC1B,YAAM,UAAU;AAAA,QACd,eAAe,WAAW,KAAK,MAAM,MAAM;AAAA,MAC7C;AAEA,YAAM,KAAK,IAAI,UAAU,KAAK,EAAE,QAAQ,CAAC;AAEzC,UAAI;AACF,cAAM,IAAI,QAAQ,CAAC,SAAS,WAAW;AACrC,aAAG,GAAG,QAAQ,OAAO;AACrB,aAAG,GAAG,SAAS,CAAC,UAAU,OAAO,KAAK,CAAC;AACvC,aAAG,GAAG,SAAS,CAAC,SAAS,OAAO,sBAAsB,IAAI,EAAE,CAAC;AAAA,QAC/D,CAAC;AAED,cAAM,KAAK,OAAO,EAAE;AAAA,MACtB,SAAS,GAAG;AACV,YAAI,CAAC,KAAK,UAAU,CAAC,KAAK,MAAM,QAAQ;AACtC,cAAI,WAAW,UAAU;AACvB,kBAAM,IAAI,MAAM,sCAAsC,OAAO,cAAc,CAAC,EAAE;AAAA,UAChF;AAEA,gBAAM,QAAQ,KAAK,IAAI,UAAU,GAAG,EAAE;AACtC;AAEA,eAAK,QAAQ;AAAA,YACX,6CAA6C,KAAK,aAAa,CAAC,KAAK,OAAO,IAAI,QAAQ;AAAA,UAC1F;AACA,gBAAM,IAAI,QAAQ,CAAC,YAAY,WAAW,SAAS,QAAQ,GAAI,CAAC;AAAA,QAClE,OAAO;AACL,eAAK,QAAQ;AAAA,YACX,+CAA+C,CAAC,kBAAkB,KAAK,MAAM,MAAM,eAAe,KAAK,MAAM;AAAA,UAC/G;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAEA,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,MAAM,OAAO,IAAe;AAC1B,QAAI,UAAU;AAGd,UAAM,WAAW;AAAA,MACf,sBAAsB;AAAA,QACpB,WAAW,KAAK,MAAM;AAAA,QACtB,yBAAyB,KAAK,MAAM;AAAA,QACpC,eAAe,KAAK,MAAM;AAAA,MAC5B;AAAA,MACA,kBAAkB;AAAA,QAChB,UAAU,KAAK,MAAM,YAAY;AAAA,QACjC,aAAa,KAAK,MAAM,cAAc;AAAA,QACtC,4BAA4B,KAAK,MAAM;AAAA,QACvC,+BAA+B,KAAK,MAAM;AAAA,QAC1C,iCAAiC,KAAK,MAAM;AAAA,MAC9C;AAAA,MACA,gBAAgB;AAAA,QACd,QAAQ,KAAK,MAAM;AAAA,QACnB,gBAAgB,KAAK,MAAM,iBAAiB;AAAA,QAC5C,yBAAyB,KAAK,MAAM,yBAAyB;AAAA,MAC/D;AAAA,IACF;AACA,OAAG,KAAK,KAAK,UAAU,QAAQ,CAAC;AAEhC,UAAM,WAAW,YAAY;AAC3B,YAAM,aAAa,KAAK,MAAM,cAAc;AAC5C,YAAM,kBAAkB,eAAe,OAAQ,MAAM;AACrD,YAAM,kBAAkB,IAAI,gBAAgB,YAAY,GAAG,eAAe;AAE1E,UAAI;AACF,eAAO,CAAC,KAAK,QAAQ;AACnB,gBAAM,SAAS,MAAM,KAAK,MAAM,KAAK;AACrC,cAAI,OAAO,MAAM;AACf;AAAA,UACF;AAEA,gBAAM,OAAO,OAAO;AAEpB,cAAI;AACJ,cAAI,SAAS,aAAa,gBAAgB;AAExC,qBAAS,gBAAgB,MAAM;AAAA,UACjC,OAAO;AACL,gBAAI,KAAK,eAAe,cAAc,KAAK,aAAa,GAAG;AACzD,oBAAM,IAAI;AAAA,gBACR,mDAAmD,UAAU,eAAe,KAAK,UAAU,MAAM,KAAK,QAAQ;AAAA,cAChH;AAAA,YACF;AACA,qBAAS,gBAAgB,MAAM,KAAK,KAAK,MAAqB;AAAA,UAChE;AAEA,qBAAW,SAAS,QAAQ;AAC1B,kBAAM,SAAS,OAAO;AAAA,cACpB,MAAM,KAAK;AAAA,cACX,MAAM,KAAK;AAAA,cACX,MAAM,KAAK;AAAA,YACb;AACA,eAAG,KAAK,MAAM;AAAA,UAChB;AAAA,QACF;AAAA,MACF,UAAE;AACA,kBAAU;AACV,WAAG,MAAM;AAAA,MACX;AAAA,IACF;AAEA,UAAM,aAAa,KAAK,KAAK,OAAO,eAAe;AACjD,YAAM,gBAAgB,IAAI,QAAc,CAAC,SAAS,WAAW;AAC3D,WAAG,GAAG,WAAW,CAAC,SAAS;AACzB,cAAI;AACF,gBAAI;AAEJ,gBAAI,OAAO,SAAS,UAAU;AAC5B,2BAAa;AAAA,YACf,WAAW,gBAAgB,QAAQ;AACjC,2BAAa,KAAK,SAAS,OAAO;AAAA,YACpC,WAAW,MAAM,QAAQ,IAAI,GAAG;AAC9B,2BAAa,OAAO,OAAO,IAAI,EAAE,SAAS,OAAO;AAAA,YACnD,OAAO;AACL;AAAA,YACF;AAEA,kBAAM,MAAM,KAAK,MAAM,UAAU;AAGjC,kBAAM,UAAU,IAAI,YAAY;AAChC,kBAAM,WAAW,IAAI,YAAY,CAAC;AAClC,kBAAM,aAAa,IAAI,cAAc;AACrC,kBAAM,aAAa,IAAI,cAAc;AACrC,kBAAM,eAAe,IAAI,iBAAiB,KAAK,MAAM;AAGrD,gBAAI,CAAC,YAAY;AACf,mBAAK,QAAQ,MAAM,oCAAoC,GAAG;AAC1D;AAAA,YACF;AAGA,gBAAI,CAAC,KAAK,aAAa,CAAC,SAAS;AAC/B,mBAAK,YAAY;AACjB,mBAAK,MAAM,IAAI,EAAE,MAAM,IAAI,gBAAgB,gBAAgB,CAAC;AAAA,YAC9D;AAGA,kBAAM,YAAY,SAAS,SAAS,IAAI,SAAS,CAAC,EAAE,SAAS,IAAM;AACnE,kBAAM,UAAU,SAAS,SAAS,IAAI,SAAS,SAAS,SAAS,CAAC,EAAE,OAAO,IAAM;AAEjF,kBAAM,aAA6B;AAAA,cACjC,UAAU;AAAA,cACV,MAAM;AAAA,cACN;AAAA,cACA;AAAA,cACA;AAAA,YACF;AAGA,gBAAI,CAAC,SAAS;AAEZ,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,IAAI,gBAAgB;AAAA,gBAC1B,cAAc,CAAC,UAAU;AAAA,cAC3B,CAAC;AAAA,YACH,OAAO;AAEL,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,IAAI,gBAAgB;AAAA,gBAC1B,cAAc,CAAC,UAAU;AAAA,cAC3B,CAAC;AAGD,kBAAI,KAAK,WAAW;AAClB,qBAAK,YAAY;AACjB,qBAAK,MAAM,IAAI,EAAE,MAAM,IAAI,gBAAgB,cAAc,CAAC;AAAA,cAC5D;AAAA,YACF;AAEA,gBAAI,KAAK,UAAU,SAAS;AAC1B,sBAAQ;AAAA,YACV;AAAA,UACF,SAAS,KAAK;AACZ,iBAAK,QAAQ,MAAM,kCAAkC,IAAI,EAAE;AAC3D,mBAAO,GAAG;AAAA,UACZ;AAAA,QACF,CAAC;AAED,WAAG,GAAG,SAAS,CAAC,QAAQ;AACtB,cAAI,CAAC,SAAS;AACZ,mBAAO,GAAG;AAAA,UACZ;AAAA,QACF,CAAC;AAED,WAAG,GAAG,SAAS,MAAM;AACnB,cAAI,CAAC,SAAS;AACZ,oBAAQ;AAAA,UACV;AAAA,QACF,CAAC;AAAA,MACH,CAAC;AAED,YAAM,QAAQ,KAAK,CAAC,eAAe,aAAa,WAAW,MAAM,CAAC,CAAC;AAAA,IACrE,GAAG,KAAK,eAAe;AAEvB,UAAM,QAAQ,IAAI,CAAC,SAAS,GAAG,WAAW,MAAM,CAAC;AACjD,cAAU;AACV,OAAG,MAAM;AAAA,EACX;AACF;","names":["stt"]}
1
+ {"version":3,"sources":["../src/stt.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { type AudioBuffer, AudioByteStream, Task, log, stt, waitForAbort } from '@livekit/agents';\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { WebSocket } from 'ws';\nimport type { BasetenSttOptions } from './types.js';\n\nconst defaultSTTOptions: Partial<BasetenSttOptions> = {\n environment: 'production',\n encoding: 'pcm_s16le',\n sampleRate: 16000,\n bufferSizeSeconds: 0.032,\n enablePartialTranscripts: true,\n partialTranscriptIntervalS: 0.5,\n finalTranscriptMaxDurationS: 5,\n audioLanguage: 'en',\n languageDetectionOnly: false,\n vadThreshold: 0.5,\n vadMinSilenceDurationMs: 300,\n vadSpeechPadMs: 30,\n};\n\nexport class STT extends stt.STT {\n #opts: BasetenSttOptions;\n #logger = log();\n label = 'baseten.STT';\n\n constructor(opts: Partial<BasetenSttOptions> = {}) {\n super({\n streaming: true,\n interimResults: opts.enablePartialTranscripts ?? defaultSTTOptions.enablePartialTranscripts!,\n alignedTranscript: 'word',\n });\n\n const apiKey = opts.apiKey ?? process.env.BASETEN_API_KEY;\n const modelEndpoint = opts.modelEndpoint ?? process.env.BASETEN_MODEL_ENDPOINT;\n const modelId = opts.modelId ?? process.env.BASETEN_STT_MODEL_ID;\n\n if (!apiKey) {\n throw new Error(\n 'Baseten API key is required, either pass it as `apiKey` or set $BASETEN_API_KEY',\n );\n }\n if (!modelEndpoint && !modelId) {\n throw new Error(\n 'Baseten model endpoint is required, either pass it as `modelEndpoint` or set $BASETEN_MODEL_ENDPOINT',\n );\n }\n\n this.#opts = {\n ...defaultSTTOptions,\n ...opts,\n apiKey,\n modelEndpoint,\n modelId,\n } as BasetenSttOptions;\n }\n\n // eslint-disable-next-line\n async _recognize(_: AudioBuffer): Promise<stt.SpeechEvent> {\n throw new Error('Recognize is not supported on Baseten STT');\n }\n\n updateOptions(opts: Partial<BasetenSttOptions>) {\n this.#opts = { ...this.#opts, ...opts };\n }\n\n stream(): SpeechStream {\n return new SpeechStream(this, this.#opts);\n }\n}\n\nexport class SpeechStream extends stt.SpeechStream {\n #opts: BasetenSttOptions;\n #logger = log();\n #speaking = false;\n #requestId = '';\n label = 'baseten.SpeechStream';\n\n constructor(stt: STT, opts: BasetenSttOptions) {\n super(stt, opts.sampleRate);\n this.#opts = opts;\n this.closed = false;\n }\n\n private getWsUrl(): string {\n if (this.#opts.modelEndpoint) {\n return this.#opts.modelEndpoint;\n }\n // Fallback to constructing URL from modelId (deprecated)\n return `wss://model-${this.#opts.modelId}.api.baseten.co/environments/${this.#opts.environment}/websocket`;\n }\n\n protected async run() {\n const maxRetry = 32;\n let retries = 0;\n\n while (!this.input.closed && !this.closed) {\n const url = this.getWsUrl();\n const headers = {\n Authorization: `Api-Key ${this.#opts.apiKey}`,\n };\n\n const ws = new WebSocket(url, { headers, rejectUnauthorized: false });\n\n try {\n await new Promise((resolve, reject) => {\n ws.on('open', resolve);\n ws.on('error', (error) => reject(error));\n ws.on('close', (code) => reject(`WebSocket returned ${code}`));\n });\n\n await this.#runWS(ws);\n } catch (e) {\n if (!this.closed && !this.input.closed) {\n if (retries >= maxRetry) {\n throw new Error(`failed to connect to Baseten after ${retries} attempts: ${e}`);\n }\n\n const delay = Math.min(retries * 5, 10);\n retries++;\n\n this.#logger.warn(\n `failed to connect to Baseten, retrying in ${delay} seconds: ${e} (${retries}/${maxRetry})`,\n );\n await new Promise((resolve) => setTimeout(resolve, delay * 1000));\n } else {\n this.#logger.warn(\n `Baseten disconnected, connection is closed: ${e} (inputClosed: ${this.input.closed}, isClosed: ${this.closed})`,\n );\n }\n }\n }\n\n this.closed = true;\n }\n\n async #runWS(ws: WebSocket) {\n let closing = false;\n\n // Send initial metadata\n // Note: Baseten server expects 'vad_params' and 'streaming_whisper_params' field names\n // (not 'streaming_vad_config', 'streaming_params', 'whisper_params' as in older versions)\n const metadata = {\n vad_params: {\n threshold: this.#opts.vadThreshold,\n min_silence_duration_ms: this.#opts.vadMinSilenceDurationMs,\n speech_pad_ms: this.#opts.vadSpeechPadMs,\n },\n streaming_whisper_params: {\n encoding: this.#opts.encoding ?? 'pcm_s16le',\n sample_rate: this.#opts.sampleRate ?? 16000,\n enable_partial_transcripts: false,\n audio_language: this.#opts.audioLanguage ?? 'en',\n show_word_timestamps: true,\n },\n };\n\n ws.send(JSON.stringify(metadata));\n\n const sendTask = async () => {\n const sampleRate = this.#opts.sampleRate ?? 16000;\n const samplesPerChunk = sampleRate === 16000 ? 512 : 256;\n const audioByteStream = new AudioByteStream(sampleRate, 1, samplesPerChunk);\n\n try {\n while (!this.closed) {\n const result = await this.input.next();\n if (result.done) {\n break;\n }\n\n const data = result.value;\n\n let frames: AudioFrame[];\n if (data === SpeechStream.FLUSH_SENTINEL) {\n // Flush any remaining buffered audio\n frames = audioByteStream.flush();\n } else {\n if (data.sampleRate !== sampleRate || data.channels !== 1) {\n throw new Error(\n `sample rate or channel count mismatch: expected ${sampleRate}Hz/1ch, got ${data.sampleRate}Hz/${data.channels}ch`,\n );\n }\n frames = audioByteStream.write(data.data.buffer as ArrayBuffer);\n }\n\n for (const frame of frames) {\n const buffer = Buffer.from(\n frame.data.buffer,\n frame.data.byteOffset,\n frame.data.byteLength,\n );\n ws.send(buffer);\n }\n }\n } finally {\n closing = true;\n ws.close();\n }\n };\n\n const listenTask = Task.from(async (controller) => {\n const listenMessage = new Promise<void>((resolve, reject) => {\n ws.on('message', (data) => {\n try {\n let jsonString: string;\n\n if (typeof data === 'string') {\n jsonString = data;\n } else if (data instanceof Buffer) {\n jsonString = data.toString('utf-8');\n } else if (Array.isArray(data)) {\n jsonString = Buffer.concat(data).toString('utf-8');\n } else {\n return;\n }\n\n const msg = JSON.parse(jsonString);\n const isFinal = msg.is_final ?? true;\n const segments = msg.segments ?? [];\n const transcript = msg.transcript ?? '';\n const confidence = msg.confidence ?? 0.0;\n const languageCode = msg.language_code ?? this.#opts.audioLanguage;\n\n // Skip if no transcript text\n if (!transcript) {\n this.#logger.debug('Received non-transcript message:', msg);\n return;\n }\n\n // Emit START_OF_SPEECH if not already speaking (only for interim or first final)\n if (!this.#speaking && !isFinal) {\n this.#speaking = true;\n this.queue.put({ type: stt.SpeechEventType.START_OF_SPEECH });\n }\n\n // Note: Baseten uses 'start_time' and 'end_time' field names (with underscores)\n const startTime =\n segments.length > 0\n ? (segments[0].start_time ?? 0.0) + this.startTimeOffset\n : this.startTimeOffset;\n const endTime =\n segments.length > 0\n ? (segments[segments.length - 1].end_time ?? 0.0) + this.startTimeOffset\n : this.startTimeOffset;\n\n // Note: Baseten returns segments (chunks) which we treat as words for aligned transcripts\n const words = segments.map(\n (segment: { text?: string; start_time?: number; end_time?: number }) => ({\n text: segment.text ?? '',\n startTime: (segment.start_time ?? 0.0) + this.startTimeOffset,\n endTime: (segment.end_time ?? 0.0) + this.startTimeOffset,\n startTimeOffset: this.startTimeOffset,\n confidence: confidence,\n }),\n );\n\n const speechData: stt.SpeechData = {\n language: languageCode!,\n text: transcript,\n startTime,\n endTime,\n confidence,\n words: words.length > 0 ? words : undefined,\n };\n\n // Handle interim vs final transcripts (matching Python implementation)\n if (!isFinal) {\n // Interim transcript\n this.queue.put({\n type: stt.SpeechEventType.INTERIM_TRANSCRIPT,\n alternatives: [speechData],\n });\n } else {\n // Final transcript\n this.queue.put({\n type: stt.SpeechEventType.FINAL_TRANSCRIPT,\n alternatives: [speechData],\n });\n\n // Emit END_OF_SPEECH after final transcript\n if (this.#speaking) {\n this.#speaking = false;\n this.queue.put({ type: stt.SpeechEventType.END_OF_SPEECH });\n }\n }\n\n if (this.closed || closing) {\n resolve();\n }\n } catch (err) {\n this.#logger.error(`STT: Error processing message: ${data}`);\n reject(err);\n }\n });\n\n ws.on('error', (err) => {\n if (!closing) {\n reject(err);\n }\n });\n\n ws.on('close', () => {\n if (!closing) {\n resolve();\n }\n });\n });\n\n await Promise.race([listenMessage, waitForAbort(controller.signal)]);\n }, this.abortController);\n\n await Promise.all([sendTask(), listenTask.result]);\n closing = true;\n ws.close();\n }\n}\n"],"mappings":"AAGA,SAA2B,iBAAiB,MAAM,KAAK,KAAK,oBAAoB;AAEhF,SAAS,iBAAiB;AAG1B,MAAM,oBAAgD;AAAA,EACpD,aAAa;AAAA,EACb,UAAU;AAAA,EACV,YAAY;AAAA,EACZ,mBAAmB;AAAA,EACnB,0BAA0B;AAAA,EAC1B,4BAA4B;AAAA,EAC5B,6BAA6B;AAAA,EAC7B,eAAe;AAAA,EACf,uBAAuB;AAAA,EACvB,cAAc;AAAA,EACd,yBAAyB;AAAA,EACzB,gBAAgB;AAClB;AAEO,MAAM,YAAY,IAAI,IAAI;AAAA,EAC/B;AAAA,EACA,UAAU,IAAI;AAAA,EACd,QAAQ;AAAA,EAER,YAAY,OAAmC,CAAC,GAAG;AACjD,UAAM;AAAA,MACJ,WAAW;AAAA,MACX,gBAAgB,KAAK,4BAA4B,kBAAkB;AAAA,MACnE,mBAAmB;AAAA,IACrB,CAAC;AAED,UAAM,SAAS,KAAK,UAAU,QAAQ,IAAI;AAC1C,UAAM,gBAAgB,KAAK,iBAAiB,QAAQ,IAAI;AACxD,UAAM,UAAU,KAAK,WAAW,QAAQ,IAAI;AAE5C,QAAI,CAAC,QAAQ;AACX,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,iBAAiB,CAAC,SAAS;AAC9B,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AAEA,SAAK,QAAQ;AAAA,MACX,GAAG;AAAA,MACH,GAAG;AAAA,MACH;AAAA,MACA;AAAA,MACA;AAAA,IACF;AAAA,EACF;AAAA;AAAA,EAGA,MAAM,WAAW,GAA0C;AACzD,UAAM,IAAI,MAAM,2CAA2C;AAAA,EAC7D;AAAA,EAEA,cAAc,MAAkC;AAC9C,SAAK,QAAQ,EAAE,GAAG,KAAK,OAAO,GAAG,KAAK;AAAA,EACxC;AAAA,EAEA,SAAuB;AACrB,WAAO,IAAI,aAAa,MAAM,KAAK,KAAK;AAAA,EAC1C;AACF;AAEO,MAAM,qBAAqB,IAAI,aAAa;AAAA,EACjD;AAAA,EACA,UAAU,IAAI;AAAA,EACd,YAAY;AAAA,EACZ,aAAa;AAAA,EACb,QAAQ;AAAA,EAER,YAAYA,MAAU,MAAyB;AAC7C,UAAMA,MAAK,KAAK,UAAU;AAC1B,SAAK,QAAQ;AACb,SAAK,SAAS;AAAA,EAChB;AAAA,EAEQ,WAAmB;AACzB,QAAI,KAAK,MAAM,eAAe;AAC5B,aAAO,KAAK,MAAM;AAAA,IACpB;AAEA,WAAO,eAAe,KAAK,MAAM,OAAO,gCAAgC,KAAK,MAAM,WAAW;AAAA,EAChG;AAAA,EAEA,MAAgB,MAAM;AACpB,UAAM,WAAW;AACjB,QAAI,UAAU;AAEd,WAAO,CAAC,KAAK,MAAM,UAAU,CAAC,KAAK,QAAQ;AACzC,YAAM,MAAM,KAAK,SAAS;AAC1B,YAAM,UAAU;AAAA,QACd,eAAe,WAAW,KAAK,MAAM,MAAM;AAAA,MAC7C;AAEA,YAAM,KAAK,IAAI,UAAU,KAAK,EAAE,SAAS,oBAAoB,MAAM,CAAC;AAEpE,UAAI;AACF,cAAM,IAAI,QAAQ,CAAC,SAAS,WAAW;AACrC,aAAG,GAAG,QAAQ,OAAO;AACrB,aAAG,GAAG,SAAS,CAAC,UAAU,OAAO,KAAK,CAAC;AACvC,aAAG,GAAG,SAAS,CAAC,SAAS,OAAO,sBAAsB,IAAI,EAAE,CAAC;AAAA,QAC/D,CAAC;AAED,cAAM,KAAK,OAAO,EAAE;AAAA,MACtB,SAAS,GAAG;AACV,YAAI,CAAC,KAAK,UAAU,CAAC,KAAK,MAAM,QAAQ;AACtC,cAAI,WAAW,UAAU;AACvB,kBAAM,IAAI,MAAM,sCAAsC,OAAO,cAAc,CAAC,EAAE;AAAA,UAChF;AAEA,gBAAM,QAAQ,KAAK,IAAI,UAAU,GAAG,EAAE;AACtC;AAEA,eAAK,QAAQ;AAAA,YACX,6CAA6C,KAAK,aAAa,CAAC,KAAK,OAAO,IAAI,QAAQ;AAAA,UAC1F;AACA,gBAAM,IAAI,QAAQ,CAAC,YAAY,WAAW,SAAS,QAAQ,GAAI,CAAC;AAAA,QAClE,OAAO;AACL,eAAK,QAAQ;AAAA,YACX,+CAA+C,CAAC,kBAAkB,KAAK,MAAM,MAAM,eAAe,KAAK,MAAM;AAAA,UAC/G;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAEA,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,MAAM,OAAO,IAAe;AAC1B,QAAI,UAAU;AAKd,UAAM,WAAW;AAAA,MACf,YAAY;AAAA,QACV,WAAW,KAAK,MAAM;AAAA,QACtB,yBAAyB,KAAK,MAAM;AAAA,QACpC,eAAe,KAAK,MAAM;AAAA,MAC5B;AAAA,MACA,0BAA0B;AAAA,QACxB,UAAU,KAAK,MAAM,YAAY;AAAA,QACjC,aAAa,KAAK,MAAM,cAAc;AAAA,QACtC,4BAA4B;AAAA,QAC5B,gBAAgB,KAAK,MAAM,iBAAiB;AAAA,QAC5C,sBAAsB;AAAA,MACxB;AAAA,IACF;AAEA,OAAG,KAAK,KAAK,UAAU,QAAQ,CAAC;AAEhC,UAAM,WAAW,YAAY;AAC3B,YAAM,aAAa,KAAK,MAAM,cAAc;AAC5C,YAAM,kBAAkB,eAAe,OAAQ,MAAM;AACrD,YAAM,kBAAkB,IAAI,gBAAgB,YAAY,GAAG,eAAe;AAE1E,UAAI;AACF,eAAO,CAAC,KAAK,QAAQ;AACnB,gBAAM,SAAS,MAAM,KAAK,MAAM,KAAK;AACrC,cAAI,OAAO,MAAM;AACf;AAAA,UACF;AAEA,gBAAM,OAAO,OAAO;AAEpB,cAAI;AACJ,cAAI,SAAS,aAAa,gBAAgB;AAExC,qBAAS,gBAAgB,MAAM;AAAA,UACjC,OAAO;AACL,gBAAI,KAAK,eAAe,cAAc,KAAK,aAAa,GAAG;AACzD,oBAAM,IAAI;AAAA,gBACR,mDAAmD,UAAU,eAAe,KAAK,UAAU,MAAM,KAAK,QAAQ;AAAA,cAChH;AAAA,YACF;AACA,qBAAS,gBAAgB,MAAM,KAAK,KAAK,MAAqB;AAAA,UAChE;AAEA,qBAAW,SAAS,QAAQ;AAC1B,kBAAM,SAAS,OAAO;AAAA,cACpB,MAAM,KAAK;AAAA,cACX,MAAM,KAAK;AAAA,cACX,MAAM,KAAK;AAAA,YACb;AACA,eAAG,KAAK,MAAM;AAAA,UAChB;AAAA,QACF;AAAA,MACF,UAAE;AACA,kBAAU;AACV,WAAG,MAAM;AAAA,MACX;AAAA,IACF;AAEA,UAAM,aAAa,KAAK,KAAK,OAAO,eAAe;AACjD,YAAM,gBAAgB,IAAI,QAAc,CAAC,SAAS,WAAW;AAC3D,WAAG,GAAG,WAAW,CAAC,SAAS;AACzB,cAAI;AACF,gBAAI;AAEJ,gBAAI,OAAO,SAAS,UAAU;AAC5B,2BAAa;AAAA,YACf,WAAW,gBAAgB,QAAQ;AACjC,2BAAa,KAAK,SAAS,OAAO;AAAA,YACpC,WAAW,MAAM,QAAQ,IAAI,GAAG;AAC9B,2BAAa,OAAO,OAAO,IAAI,EAAE,SAAS,OAAO;AAAA,YACnD,OAAO;AACL;AAAA,YACF;AAEA,kBAAM,MAAM,KAAK,MAAM,UAAU;AACjC,kBAAM,UAAU,IAAI,YAAY;AAChC,kBAAM,WAAW,IAAI,YAAY,CAAC;AAClC,kBAAM,aAAa,IAAI,cAAc;AACrC,kBAAM,aAAa,IAAI,cAAc;AACrC,kBAAM,eAAe,IAAI,iBAAiB,KAAK,MAAM;AAGrD,gBAAI,CAAC,YAAY;AACf,mBAAK,QAAQ,MAAM,oCAAoC,GAAG;AAC1D;AAAA,YACF;AAGA,gBAAI,CAAC,KAAK,aAAa,CAAC,SAAS;AAC/B,mBAAK,YAAY;AACjB,mBAAK,MAAM,IAAI,EAAE,MAAM,IAAI,gBAAgB,gBAAgB,CAAC;AAAA,YAC9D;AAGA,kBAAM,YACJ,SAAS,SAAS,KACb,SAAS,CAAC,EAAE,cAAc,KAAO,KAAK,kBACvC,KAAK;AACX,kBAAM,UACJ,SAAS,SAAS,KACb,SAAS,SAAS,SAAS,CAAC,EAAE,YAAY,KAAO,KAAK,kBACvD,KAAK;AAGX,kBAAM,QAAQ,SAAS;AAAA,cACrB,CAAC,aAAwE;AAAA,gBACvE,MAAM,QAAQ,QAAQ;AAAA,gBACtB,YAAY,QAAQ,cAAc,KAAO,KAAK;AAAA,gBAC9C,UAAU,QAAQ,YAAY,KAAO,KAAK;AAAA,gBAC1C,iBAAiB,KAAK;AAAA,gBACtB;AAAA,cACF;AAAA,YACF;AAEA,kBAAM,aAA6B;AAAA,cACjC,UAAU;AAAA,cACV,MAAM;AAAA,cACN;AAAA,cACA;AAAA,cACA;AAAA,cACA,OAAO,MAAM,SAAS,IAAI,QAAQ;AAAA,YACpC;AAGA,gBAAI,CAAC,SAAS;AAEZ,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,IAAI,gBAAgB;AAAA,gBAC1B,cAAc,CAAC,UAAU;AAAA,cAC3B,CAAC;AAAA,YACH,OAAO;AAEL,mBAAK,MAAM,IAAI;AAAA,gBACb,MAAM,IAAI,gBAAgB;AAAA,gBAC1B,cAAc,CAAC,UAAU;AAAA,cAC3B,CAAC;AAGD,kBAAI,KAAK,WAAW;AAClB,qBAAK,YAAY;AACjB,qBAAK,MAAM,IAAI,EAAE,MAAM,IAAI,gBAAgB,cAAc,CAAC;AAAA,cAC5D;AAAA,YACF;AAEA,gBAAI,KAAK,UAAU,SAAS;AAC1B,sBAAQ;AAAA,YACV;AAAA,UACF,SAAS,KAAK;AACZ,iBAAK,QAAQ,MAAM,kCAAkC,IAAI,EAAE;AAC3D,mBAAO,GAAG;AAAA,UACZ;AAAA,QACF,CAAC;AAED,WAAG,GAAG,SAAS,CAAC,QAAQ;AACtB,cAAI,CAAC,SAAS;AACZ,mBAAO,GAAG;AAAA,UACZ;AAAA,QACF,CAAC;AAED,WAAG,GAAG,SAAS,MAAM;AACnB,cAAI,CAAC,SAAS;AACZ,oBAAQ;AAAA,UACV;AAAA,QACF,CAAC;AAAA,MACH,CAAC;AAED,YAAM,QAAQ,KAAK,CAAC,eAAe,aAAa,WAAW,MAAM,CAAC,CAAC;AAAA,IACrE,GAAG,KAAK,eAAe;AAEvB,UAAM,QAAQ,IAAI,CAAC,SAAS,GAAG,WAAW,MAAM,CAAC;AACjD,cAAU;AACV,OAAG,MAAM;AAAA,EACX;AACF;","names":["stt"]}
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/types.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\n\n/**\n * Baseten plugin types and interfaces\n */\n\n/**\n * Options for configuring the Baseten LLM\n * Since Baseten provides an OpenAI-compatible API, these options\n * map to standard OpenAI parameters.\n */\nexport interface BasetenLLMOptions {\n apiKey?: string;\n model: string;\n temperature?: number;\n maxTokens?: number;\n user?: string;\n toolChoice?: 'none' | 'auto' | 'required' | { type: 'function'; function: { name: string } };\n parallelToolCalls?: boolean;\n}\n\n/**\n * Options for configuring the Baseten STT service\n */\nexport interface BasetenSttOptions {\n apiKey: string;\n modelId: string;\n environment?: string;\n encoding?: string;\n sampleRate?: number;\n bufferSizeSeconds?: number;\n vadThreshold?: number;\n vadMinSilenceDurationMs?: number;\n vadSpeechPadMs?: number;\n enablePartialTranscripts?: boolean;\n partialTranscriptIntervalS?: number;\n finalTranscriptMaxDurationS?: number;\n audioLanguage?: string;\n prompt?: string;\n languageDetectionOnly?: boolean;\n}\n\n/**\n * Options for configuring the Baseten TTS service\n */\nexport interface BasetenTTSOptions {\n apiKey: string;\n modelEndpoint: string;\n voice?: string;\n language?: string;\n temperature?: number;\n maxTokens?: number;\n}\n"],"mappings":";;;;;;;;;;;;;;AAAA;AAAA;","names":[]}
1
+ {"version":3,"sources":["../src/types.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\n\n/**\n * Baseten plugin types and interfaces\n */\n\n/**\n * Options for configuring the Baseten LLM\n * Since Baseten provides an OpenAI-compatible API, these options\n * map to standard OpenAI parameters.\n */\nexport interface BasetenLLMOptions {\n apiKey?: string;\n model: string;\n temperature?: number;\n maxTokens?: number;\n user?: string;\n toolChoice?: 'none' | 'auto' | 'required' | { type: 'function'; function: { name: string } };\n parallelToolCalls?: boolean;\n}\n\n/**\n * Options for configuring the Baseten STT service\n */\nexport interface BasetenSttOptions {\n apiKey: string;\n /** @deprecated Use modelEndpoint instead */\n modelId?: string;\n /** Full WebSocket endpoint URL (e.g., from Baseten dashboard). Takes priority over modelId. */\n modelEndpoint?: string;\n environment?: string;\n encoding?: string;\n sampleRate?: number;\n bufferSizeSeconds?: number;\n vadThreshold?: number;\n vadMinSilenceDurationMs?: number;\n vadSpeechPadMs?: number;\n enablePartialTranscripts?: boolean;\n partialTranscriptIntervalS?: number;\n finalTranscriptMaxDurationS?: number;\n audioLanguage?: string;\n prompt?: string;\n languageDetectionOnly?: boolean;\n}\n\n/**\n * Options for configuring the Baseten TTS service\n */\nexport interface BasetenTTSOptions {\n apiKey: string;\n modelEndpoint: string;\n voice?: string;\n language?: string;\n temperature?: number;\n maxTokens?: number;\n}\n"],"mappings":";;;;;;;;;;;;;;AAAA;AAAA;","names":[]}
package/dist/types.d.cts CHANGED
@@ -25,7 +25,10 @@ export interface BasetenLLMOptions {
25
25
  */
26
26
  export interface BasetenSttOptions {
27
27
  apiKey: string;
28
- modelId: string;
28
+ /** @deprecated Use modelEndpoint instead */
29
+ modelId?: string;
30
+ /** Full WebSocket endpoint URL (e.g., from Baseten dashboard). Takes priority over modelId. */
31
+ modelEndpoint?: string;
29
32
  environment?: string;
30
33
  encoding?: string;
31
34
  sampleRate?: number;
package/dist/types.d.ts CHANGED
@@ -25,7 +25,10 @@ export interface BasetenLLMOptions {
25
25
  */
26
26
  export interface BasetenSttOptions {
27
27
  apiKey: string;
28
- modelId: string;
28
+ /** @deprecated Use modelEndpoint instead */
29
+ modelId?: string;
30
+ /** Full WebSocket endpoint URL (e.g., from Baseten dashboard). Takes priority over modelId. */
31
+ modelEndpoint?: string;
29
32
  environment?: string;
30
33
  encoding?: string;
31
34
  sampleRate?: number;
@@ -1 +1 @@
1
- {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAIA;;GAEG;AAEH;;;;GAIG;AACH,MAAM,WAAW,iBAAiB;IAChC,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,UAAU,CAAC,EAAE,MAAM,GAAG,MAAM,GAAG,UAAU,GAAG;QAAE,IAAI,EAAE,UAAU,CAAC;QAAC,QAAQ,EAAE;YAAE,IAAI,EAAE,MAAM,CAAA;SAAE,CAAA;KAAE,CAAC;IAC7F,iBAAiB,CAAC,EAAE,OAAO,CAAC;CAC7B;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,MAAM,CAAC;IAChB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,uBAAuB,CAAC,EAAE,MAAM,CAAC;IACjC,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,wBAAwB,CAAC,EAAE,OAAO,CAAC;IACnC,0BAA0B,CAAC,EAAE,MAAM,CAAC;IACpC,2BAA2B,CAAC,EAAE,MAAM,CAAC;IACrC,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,qBAAqB,CAAC,EAAE,OAAO,CAAC;CACjC;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,MAAM,EAAE,MAAM,CAAC;IACf,aAAa,EAAE,MAAM,CAAC;IACtB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB"}
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAIA;;GAEG;AAEH;;;;GAIG;AACH,MAAM,WAAW,iBAAiB;IAChC,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,UAAU,CAAC,EAAE,MAAM,GAAG,MAAM,GAAG,UAAU,GAAG;QAAE,IAAI,EAAE,UAAU,CAAC;QAAC,QAAQ,EAAE;YAAE,IAAI,EAAE,MAAM,CAAA;SAAE,CAAA;KAAE,CAAC;IAC7F,iBAAiB,CAAC,EAAE,OAAO,CAAC;CAC7B;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,MAAM,EAAE,MAAM,CAAC;IACf,4CAA4C;IAC5C,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,+FAA+F;IAC/F,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,uBAAuB,CAAC,EAAE,MAAM,CAAC;IACjC,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,wBAAwB,CAAC,EAAE,OAAO,CAAC;IACnC,0BAA0B,CAAC,EAAE,MAAM,CAAC;IACpC,2BAA2B,CAAC,EAAE,MAAM,CAAC;IACrC,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,qBAAqB,CAAC,EAAE,OAAO,CAAC;CACjC;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,MAAM,EAAE,MAAM,CAAC;IACf,aAAa,EAAE,MAAM,CAAC;IACtB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@livekit/agents-plugin-baseten",
3
- "version": "1.0.37",
3
+ "version": "1.0.39",
4
4
  "description": "Baseten plugin for LiveKit Node Agents",
5
5
  "main": "dist/index.js",
6
6
  "require": "dist/index.cjs",
@@ -40,9 +40,9 @@
40
40
  "@types/ws": "^8.5.8",
41
41
  "tsx": "^4.7.0",
42
42
  "typescript": "^5.9.3",
43
- "@livekit/agents": "1.0.37",
44
- "@livekit/agents-plugin-silero": "1.0.37",
45
- "@livekit/agents-plugins-test": "1.0.37"
43
+ "@livekit/agents": "1.0.39",
44
+ "@livekit/agents-plugin-silero": "1.0.39",
45
+ "@livekit/agents-plugins-test": "1.0.39"
46
46
  },
47
47
  "dependencies": {
48
48
  "dotenv": "^17.2.3",
@@ -51,7 +51,7 @@
51
51
  },
52
52
  "peerDependencies": {
53
53
  "@livekit/rtc-node": "^0.13.24",
54
- "@livekit/agents": "1.0.37"
54
+ "@livekit/agents": "1.0.39"
55
55
  },
56
56
  "scripts": {
57
57
  "build": "tsup --onSuccess \"pnpm build:types\"",
package/src/stt.ts CHANGED
@@ -30,9 +30,11 @@ export class STT extends stt.STT {
30
30
  super({
31
31
  streaming: true,
32
32
  interimResults: opts.enablePartialTranscripts ?? defaultSTTOptions.enablePartialTranscripts!,
33
+ alignedTranscript: 'word',
33
34
  });
34
35
 
35
36
  const apiKey = opts.apiKey ?? process.env.BASETEN_API_KEY;
37
+ const modelEndpoint = opts.modelEndpoint ?? process.env.BASETEN_MODEL_ENDPOINT;
36
38
  const modelId = opts.modelId ?? process.env.BASETEN_STT_MODEL_ID;
37
39
 
38
40
  if (!apiKey) {
@@ -40,9 +42,9 @@ export class STT extends stt.STT {
40
42
  'Baseten API key is required, either pass it as `apiKey` or set $BASETEN_API_KEY',
41
43
  );
42
44
  }
43
- if (!modelId) {
45
+ if (!modelEndpoint && !modelId) {
44
46
  throw new Error(
45
- 'Baseten model ID is required, either pass it as `modelId` or set $BASETEN_STT_MODEL_ID',
47
+ 'Baseten model endpoint is required, either pass it as `modelEndpoint` or set $BASETEN_MODEL_ENDPOINT',
46
48
  );
47
49
  }
48
50
 
@@ -50,6 +52,7 @@ export class STT extends stt.STT {
50
52
  ...defaultSTTOptions,
51
53
  ...opts,
52
54
  apiKey,
55
+ modelEndpoint,
53
56
  modelId,
54
57
  } as BasetenSttOptions;
55
58
  }
@@ -82,6 +85,10 @@ export class SpeechStream extends stt.SpeechStream {
82
85
  }
83
86
 
84
87
  private getWsUrl(): string {
88
+ if (this.#opts.modelEndpoint) {
89
+ return this.#opts.modelEndpoint;
90
+ }
91
+ // Fallback to constructing URL from modelId (deprecated)
85
92
  return `wss://model-${this.#opts.modelId}.api.baseten.co/environments/${this.#opts.environment}/websocket`;
86
93
  }
87
94
 
@@ -95,7 +102,7 @@ export class SpeechStream extends stt.SpeechStream {
95
102
  Authorization: `Api-Key ${this.#opts.apiKey}`,
96
103
  };
97
104
 
98
- const ws = new WebSocket(url, { headers });
105
+ const ws = new WebSocket(url, { headers, rejectUnauthorized: false });
99
106
 
100
107
  try {
101
108
  await new Promise((resolve, reject) => {
@@ -133,25 +140,23 @@ export class SpeechStream extends stt.SpeechStream {
133
140
  let closing = false;
134
141
 
135
142
  // Send initial metadata
143
+ // Note: Baseten server expects 'vad_params' and 'streaming_whisper_params' field names
144
+ // (not 'streaming_vad_config', 'streaming_params', 'whisper_params' as in older versions)
136
145
  const metadata = {
137
- streaming_vad_config: {
146
+ vad_params: {
138
147
  threshold: this.#opts.vadThreshold,
139
148
  min_silence_duration_ms: this.#opts.vadMinSilenceDurationMs,
140
149
  speech_pad_ms: this.#opts.vadSpeechPadMs,
141
150
  },
142
- streaming_params: {
151
+ streaming_whisper_params: {
143
152
  encoding: this.#opts.encoding ?? 'pcm_s16le',
144
153
  sample_rate: this.#opts.sampleRate ?? 16000,
145
- enable_partial_transcripts: this.#opts.enablePartialTranscripts,
146
- partial_transcript_interval_s: this.#opts.partialTranscriptIntervalS,
147
- final_transcript_max_duration_s: this.#opts.finalTranscriptMaxDurationS,
148
- },
149
- whisper_params: {
150
- prompt: this.#opts.prompt,
154
+ enable_partial_transcripts: false,
151
155
  audio_language: this.#opts.audioLanguage ?? 'en',
152
- language_detection_only: this.#opts.languageDetectionOnly ?? false,
156
+ show_word_timestamps: true,
153
157
  },
154
158
  };
159
+
155
160
  ws.send(JSON.stringify(metadata));
156
161
 
157
162
  const sendTask = async () => {
@@ -213,8 +218,6 @@ export class SpeechStream extends stt.SpeechStream {
213
218
  }
214
219
 
215
220
  const msg = JSON.parse(jsonString);
216
-
217
- // Parse response format matching Python implementation
218
221
  const isFinal = msg.is_final ?? true;
219
222
  const segments = msg.segments ?? [];
220
223
  const transcript = msg.transcript ?? '';
@@ -233,9 +236,26 @@ export class SpeechStream extends stt.SpeechStream {
233
236
  this.queue.put({ type: stt.SpeechEventType.START_OF_SPEECH });
234
237
  }
235
238
 
236
- // Extract timing from segments
237
- const startTime = segments.length > 0 ? segments[0].start ?? 0.0 : 0.0;
238
- const endTime = segments.length > 0 ? segments[segments.length - 1].end ?? 0.0 : 0.0;
239
+ // Note: Baseten uses 'start_time' and 'end_time' field names (with underscores)
240
+ const startTime =
241
+ segments.length > 0
242
+ ? (segments[0].start_time ?? 0.0) + this.startTimeOffset
243
+ : this.startTimeOffset;
244
+ const endTime =
245
+ segments.length > 0
246
+ ? (segments[segments.length - 1].end_time ?? 0.0) + this.startTimeOffset
247
+ : this.startTimeOffset;
248
+
249
+ // Note: Baseten returns segments (chunks) which we treat as words for aligned transcripts
250
+ const words = segments.map(
251
+ (segment: { text?: string; start_time?: number; end_time?: number }) => ({
252
+ text: segment.text ?? '',
253
+ startTime: (segment.start_time ?? 0.0) + this.startTimeOffset,
254
+ endTime: (segment.end_time ?? 0.0) + this.startTimeOffset,
255
+ startTimeOffset: this.startTimeOffset,
256
+ confidence: confidence,
257
+ }),
258
+ );
239
259
 
240
260
  const speechData: stt.SpeechData = {
241
261
  language: languageCode!,
@@ -243,6 +263,7 @@ export class SpeechStream extends stt.SpeechStream {
243
263
  startTime,
244
264
  endTime,
245
265
  confidence,
266
+ words: words.length > 0 ? words : undefined,
246
267
  };
247
268
 
248
269
  // Handle interim vs final transcripts (matching Python implementation)
package/src/types.ts CHANGED
@@ -26,7 +26,10 @@ export interface BasetenLLMOptions {
26
26
  */
27
27
  export interface BasetenSttOptions {
28
28
  apiKey: string;
29
- modelId: string;
29
+ /** @deprecated Use modelEndpoint instead */
30
+ modelId?: string;
31
+ /** Full WebSocket endpoint URL (e.g., from Baseten dashboard). Takes priority over modelId. */
32
+ modelEndpoint?: string;
30
33
  environment?: string;
31
34
  encoding?: string;
32
35
  sampleRate?: number;