voice-router-dev 0.8.6 → 0.8.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +65 -0
- package/dist/{field-configs-D1RCJSmr.d.mts → field-configs-BtR4uR2N.d.mts} +166 -166
- package/dist/{field-configs-D1RCJSmr.d.ts → field-configs-BtR4uR2N.d.ts} +166 -166
- package/dist/field-configs.d.mts +1 -1
- package/dist/field-configs.d.ts +1 -1
- package/dist/index.d.mts +522 -474
- package/dist/index.d.ts +522 -474
- package/dist/index.js +479 -66
- package/dist/index.mjs +479 -66
- package/dist/{provider-metadata-BnkedpXm.d.mts → provider-metadata-BJ29OPW1.d.mts} +2 -2
- package/dist/{provider-metadata-DbsSGAO7.d.ts → provider-metadata-D1d-9cng.d.ts} +2 -2
- package/dist/provider-metadata.d.mts +1 -1
- package/dist/provider-metadata.d.ts +1 -1
- package/dist/provider-metadata.js +1 -1
- package/dist/provider-metadata.mjs +1 -1
- package/dist/{speechToTextChunkResponseModel-BZSxrijj.d.ts → speechToTextChunkResponseModel-B4kVoFc3.d.ts} +97 -6
- package/dist/{speechToTextChunkResponseModel-DK61nDc5.d.mts → speechToTextChunkResponseModel-DmajV4F-.d.mts} +97 -6
- package/dist/webhooks.d.mts +2 -2
- package/dist/webhooks.d.ts +2 -2
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -8270,6 +8270,7 @@ var AzureSTTAdapter = class extends BaseAdapter {
|
|
|
8270
8270
|
id: String(speakerId),
|
|
8271
8271
|
label: `Speaker ${speakerId}`
|
|
8272
8272
|
})) : void 0;
|
|
8273
|
+
const utterances = words.length > 0 ? buildUtterancesFromWords(words) : void 0;
|
|
8273
8274
|
const transcriptionId = transcription.self?.split("/").pop() || "";
|
|
8274
8275
|
return {
|
|
8275
8276
|
success: true,
|
|
@@ -8283,6 +8284,7 @@ var AzureSTTAdapter = class extends BaseAdapter {
|
|
|
8283
8284
|
duration: transcriptionData.duration ? transcriptionData.duration / 1e7 : void 0,
|
|
8284
8285
|
speakers,
|
|
8285
8286
|
words: words.length > 0 ? words : void 0,
|
|
8287
|
+
utterances: utterances && utterances.length > 0 ? utterances : void 0,
|
|
8286
8288
|
createdAt: transcription.createdDateTime,
|
|
8287
8289
|
completedAt: transcription.lastActionDateTime
|
|
8288
8290
|
},
|
|
@@ -8918,6 +8920,7 @@ function createOpenAIWhisperAdapter(config) {
|
|
|
8918
8920
|
|
|
8919
8921
|
// src/adapters/speechmatics-adapter.ts
|
|
8920
8922
|
var import_axios8 = __toESM(require("axios"));
|
|
8923
|
+
var import_ws5 = __toESM(require("ws"));
|
|
8921
8924
|
|
|
8922
8925
|
// src/generated/speechmatics/schema/notificationConfigContentsItem.ts
|
|
8923
8926
|
var NotificationConfigContentsItem = {
|
|
@@ -8967,8 +8970,7 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
|
|
|
8967
8970
|
super(...arguments);
|
|
8968
8971
|
this.name = "speechmatics";
|
|
8969
8972
|
this.capabilities = {
|
|
8970
|
-
streaming:
|
|
8971
|
-
// Batch only (streaming available via separate WebSocket API)
|
|
8973
|
+
streaming: true,
|
|
8972
8974
|
diarization: true,
|
|
8973
8975
|
wordTimestamps: true,
|
|
8974
8976
|
languageDetection: false,
|
|
@@ -9103,13 +9105,16 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
|
|
|
9103
9105
|
jobConfig.fetch_data = {
|
|
9104
9106
|
url: audio.url
|
|
9105
9107
|
};
|
|
9106
|
-
|
|
9107
|
-
|
|
9108
|
+
const formData = new FormData();
|
|
9109
|
+
formData.append("config", JSON.stringify(jobConfig));
|
|
9110
|
+
requestBody = formData;
|
|
9111
|
+
headers = { "Content-Type": "multipart/form-data" };
|
|
9108
9112
|
} else if (audio.type === "file") {
|
|
9109
|
-
|
|
9110
|
-
|
|
9111
|
-
|
|
9112
|
-
|
|
9113
|
+
const formData = new FormData();
|
|
9114
|
+
formData.append("config", JSON.stringify(jobConfig));
|
|
9115
|
+
const audioBlob = audio.file instanceof Blob ? audio.file : new Blob([audio.file], { type: audio.mimeType || "audio/wav" });
|
|
9116
|
+
formData.append("data_file", audioBlob, audio.filename || "audio.wav");
|
|
9117
|
+
requestBody = formData;
|
|
9113
9118
|
headers = { "Content-Type": "multipart/form-data" };
|
|
9114
9119
|
} else {
|
|
9115
9120
|
return {
|
|
@@ -9214,6 +9219,381 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
|
|
|
9214
9219
|
throw error;
|
|
9215
9220
|
}
|
|
9216
9221
|
}
|
|
9222
|
+
/**
|
|
9223
|
+
* Build WebSocket URL for real-time streaming
|
|
9224
|
+
*
|
|
9225
|
+
* Note: Real-time API uses a different host from the batch API:
|
|
9226
|
+
* - Batch: {region}.asr.api.speechmatics.com
|
|
9227
|
+
* - Real-time: {region}.rt.speechmatics.com
|
|
9228
|
+
*
|
|
9229
|
+
* @param region - Regional endpoint identifier
|
|
9230
|
+
* @returns WebSocket URL for real-time API
|
|
9231
|
+
*/
|
|
9232
|
+
getRegionalWsUrl(region) {
|
|
9233
|
+
if (this.config?.wsBaseUrl) {
|
|
9234
|
+
return this.config.wsBaseUrl;
|
|
9235
|
+
}
|
|
9236
|
+
const regionPrefix = region || "eu1";
|
|
9237
|
+
return `wss://${regionPrefix}.rt.speechmatics.com/v2`;
|
|
9238
|
+
}
|
|
9239
|
+
/**
|
|
9240
|
+
* Stream audio for real-time transcription via WebSocket
|
|
9241
|
+
*
|
|
9242
|
+
* Connects to Speechmatics' real-time API and sends audio chunks
|
|
9243
|
+
* for transcription with results returned via callbacks.
|
|
9244
|
+
*
|
|
9245
|
+
* @param options - Streaming configuration options
|
|
9246
|
+
* @param callbacks - Event callbacks for transcription results
|
|
9247
|
+
* @returns Promise that resolves with a StreamingSession
|
|
9248
|
+
*
|
|
9249
|
+
* @example Basic streaming
|
|
9250
|
+
* ```typescript
|
|
9251
|
+
* const session = await adapter.transcribeStream({
|
|
9252
|
+
* language: 'en',
|
|
9253
|
+
* speechmaticsStreaming: {
|
|
9254
|
+
* enablePartials: true,
|
|
9255
|
+
* operatingPoint: 'enhanced'
|
|
9256
|
+
* }
|
|
9257
|
+
* }, {
|
|
9258
|
+
* onTranscript: (event) => console.log(event.text),
|
|
9259
|
+
* onUtterance: (utt) => console.log(`[${utt.speaker}]: ${utt.text}`),
|
|
9260
|
+
* onError: (error) => console.error(error)
|
|
9261
|
+
* });
|
|
9262
|
+
*
|
|
9263
|
+
* await session.sendAudio({ data: audioBuffer });
|
|
9264
|
+
* await session.close();
|
|
9265
|
+
* ```
|
|
9266
|
+
*/
|
|
9267
|
+
async transcribeStream(options, callbacks) {
|
|
9268
|
+
this.validateConfig();
|
|
9269
|
+
const smOpts = options?.speechmaticsStreaming || {};
|
|
9270
|
+
const region = smOpts.region || this.config?.region;
|
|
9271
|
+
const wsUrl = this.getRegionalWsUrl(region);
|
|
9272
|
+
const ws = new import_ws5.default(wsUrl, {
|
|
9273
|
+
headers: {
|
|
9274
|
+
Authorization: `Bearer ${this.config.apiKey}`
|
|
9275
|
+
}
|
|
9276
|
+
});
|
|
9277
|
+
let sessionStatus = "connecting";
|
|
9278
|
+
const sessionId = `speechmatics-${Date.now()}-${Math.random().toString(36).substring(7)}`;
|
|
9279
|
+
let seqNo = 0;
|
|
9280
|
+
let utteranceResults = [];
|
|
9281
|
+
const sessionReady = new Promise((resolve, reject) => {
|
|
9282
|
+
const timeout = setTimeout(() => {
|
|
9283
|
+
reject(new Error("WebSocket connection timeout"));
|
|
9284
|
+
}, 1e4);
|
|
9285
|
+
let wsOpen = false;
|
|
9286
|
+
ws.once("error", (error) => {
|
|
9287
|
+
clearTimeout(timeout);
|
|
9288
|
+
reject(error);
|
|
9289
|
+
});
|
|
9290
|
+
ws.once("open", () => {
|
|
9291
|
+
wsOpen = true;
|
|
9292
|
+
const encoding = smOpts.encoding || options?.encoding || "pcm_s16le";
|
|
9293
|
+
const sampleRate = smOpts.sampleRate || options?.sampleRate || 16e3;
|
|
9294
|
+
const startMsg = {
|
|
9295
|
+
message: "StartRecognition",
|
|
9296
|
+
audio_format: {
|
|
9297
|
+
type: "raw",
|
|
9298
|
+
encoding,
|
|
9299
|
+
sample_rate: sampleRate
|
|
9300
|
+
},
|
|
9301
|
+
transcription_config: {
|
|
9302
|
+
language: smOpts.language || options?.language || "en",
|
|
9303
|
+
enable_partials: smOpts.enablePartials ?? options?.interimResults ?? true
|
|
9304
|
+
}
|
|
9305
|
+
};
|
|
9306
|
+
const txConfig = startMsg.transcription_config;
|
|
9307
|
+
if (smOpts.domain) txConfig.domain = smOpts.domain;
|
|
9308
|
+
if (smOpts.operatingPoint) txConfig.operating_point = smOpts.operatingPoint;
|
|
9309
|
+
if (smOpts.maxDelay !== void 0) txConfig.max_delay = smOpts.maxDelay;
|
|
9310
|
+
if (smOpts.maxDelayMode) txConfig.max_delay_mode = smOpts.maxDelayMode;
|
|
9311
|
+
if (smOpts.enableEntities !== void 0) txConfig.enable_entities = smOpts.enableEntities;
|
|
9312
|
+
if (smOpts.diarization === "speaker" || options?.diarization) {
|
|
9313
|
+
txConfig.diarization = "speaker";
|
|
9314
|
+
if (smOpts.maxSpeakers) {
|
|
9315
|
+
txConfig.speaker_diarization_config = {
|
|
9316
|
+
max_speakers: smOpts.maxSpeakers
|
|
9317
|
+
};
|
|
9318
|
+
} else if (options?.speakersExpected) {
|
|
9319
|
+
txConfig.speaker_diarization_config = {
|
|
9320
|
+
max_speakers: options.speakersExpected
|
|
9321
|
+
};
|
|
9322
|
+
}
|
|
9323
|
+
}
|
|
9324
|
+
if (smOpts.additionalVocab && smOpts.additionalVocab.length > 0) {
|
|
9325
|
+
txConfig.additional_vocab = smOpts.additionalVocab.map((word) => ({
|
|
9326
|
+
content: word
|
|
9327
|
+
}));
|
|
9328
|
+
} else if (options?.customVocabulary && options.customVocabulary.length > 0) {
|
|
9329
|
+
txConfig.additional_vocab = options.customVocabulary.map((word) => ({
|
|
9330
|
+
content: word
|
|
9331
|
+
}));
|
|
9332
|
+
}
|
|
9333
|
+
if (smOpts.conversationConfig) {
|
|
9334
|
+
txConfig.conversation_config = {
|
|
9335
|
+
end_of_utterance_silence_trigger: smOpts.conversationConfig.endOfUtteranceSilenceTrigger
|
|
9336
|
+
};
|
|
9337
|
+
}
|
|
9338
|
+
const startPayload = JSON.stringify(startMsg);
|
|
9339
|
+
if (callbacks?.onRawMessage) {
|
|
9340
|
+
callbacks.onRawMessage({
|
|
9341
|
+
provider: "speechmatics",
|
|
9342
|
+
direction: "outgoing",
|
|
9343
|
+
timestamp: Date.now(),
|
|
9344
|
+
payload: startPayload,
|
|
9345
|
+
messageType: "StartRecognition"
|
|
9346
|
+
});
|
|
9347
|
+
}
|
|
9348
|
+
ws.send(startPayload);
|
|
9349
|
+
});
|
|
9350
|
+
const onMessage = (data) => {
|
|
9351
|
+
const rawPayload = data.toString();
|
|
9352
|
+
try {
|
|
9353
|
+
const msg = JSON.parse(rawPayload);
|
|
9354
|
+
if (msg.message === "RecognitionStarted") {
|
|
9355
|
+
clearTimeout(timeout);
|
|
9356
|
+
ws.removeListener("message", onMessage);
|
|
9357
|
+
ws.emit("message", data);
|
|
9358
|
+
resolve();
|
|
9359
|
+
} else if (msg.message === "Error") {
|
|
9360
|
+
clearTimeout(timeout);
|
|
9361
|
+
ws.removeListener("message", onMessage);
|
|
9362
|
+
reject(new Error(msg.reason || "Recognition failed to start"));
|
|
9363
|
+
}
|
|
9364
|
+
} catch {
|
|
9365
|
+
}
|
|
9366
|
+
};
|
|
9367
|
+
ws.on("message", onMessage);
|
|
9368
|
+
});
|
|
9369
|
+
ws.on("message", (data) => {
|
|
9370
|
+
const rawPayload = data.toString();
|
|
9371
|
+
try {
|
|
9372
|
+
const message = JSON.parse(rawPayload);
|
|
9373
|
+
if (callbacks?.onRawMessage) {
|
|
9374
|
+
callbacks.onRawMessage({
|
|
9375
|
+
provider: "speechmatics",
|
|
9376
|
+
direction: "incoming",
|
|
9377
|
+
timestamp: Date.now(),
|
|
9378
|
+
payload: rawPayload,
|
|
9379
|
+
messageType: message.message
|
|
9380
|
+
});
|
|
9381
|
+
}
|
|
9382
|
+
this.handleStreamingMessage(message, callbacks, utteranceResults);
|
|
9383
|
+
} catch (error) {
|
|
9384
|
+
if (callbacks?.onRawMessage) {
|
|
9385
|
+
callbacks.onRawMessage({
|
|
9386
|
+
provider: "speechmatics",
|
|
9387
|
+
direction: "incoming",
|
|
9388
|
+
timestamp: Date.now(),
|
|
9389
|
+
payload: rawPayload,
|
|
9390
|
+
messageType: "parse_error"
|
|
9391
|
+
});
|
|
9392
|
+
}
|
|
9393
|
+
callbacks?.onError?.({
|
|
9394
|
+
code: "PARSE_ERROR",
|
|
9395
|
+
message: "Failed to parse WebSocket message",
|
|
9396
|
+
details: error
|
|
9397
|
+
});
|
|
9398
|
+
}
|
|
9399
|
+
});
|
|
9400
|
+
ws.on("error", (error) => {
|
|
9401
|
+
callbacks?.onError?.({
|
|
9402
|
+
code: "WEBSOCKET_ERROR",
|
|
9403
|
+
message: error.message,
|
|
9404
|
+
details: error
|
|
9405
|
+
});
|
|
9406
|
+
});
|
|
9407
|
+
ws.on("close", (code, reason) => {
|
|
9408
|
+
sessionStatus = "closed";
|
|
9409
|
+
callbacks?.onClose?.(code, reason.toString());
|
|
9410
|
+
});
|
|
9411
|
+
await sessionReady;
|
|
9412
|
+
sessionStatus = "open";
|
|
9413
|
+
callbacks?.onOpen?.();
|
|
9414
|
+
return {
|
|
9415
|
+
id: sessionId,
|
|
9416
|
+
provider: this.name,
|
|
9417
|
+
createdAt: /* @__PURE__ */ new Date(),
|
|
9418
|
+
getStatus: () => sessionStatus,
|
|
9419
|
+
sendAudio: async (chunk) => {
|
|
9420
|
+
if (sessionStatus !== "open") {
|
|
9421
|
+
throw new Error(`Cannot send audio: session is ${sessionStatus}`);
|
|
9422
|
+
}
|
|
9423
|
+
if (ws.readyState !== import_ws5.default.OPEN) {
|
|
9424
|
+
throw new Error("WebSocket is not open");
|
|
9425
|
+
}
|
|
9426
|
+
if (callbacks?.onRawMessage) {
|
|
9427
|
+
const audioPayload = chunk.data instanceof ArrayBuffer ? chunk.data : chunk.data.buffer.slice(
|
|
9428
|
+
chunk.data.byteOffset,
|
|
9429
|
+
chunk.data.byteOffset + chunk.data.byteLength
|
|
9430
|
+
);
|
|
9431
|
+
callbacks.onRawMessage({
|
|
9432
|
+
provider: this.name,
|
|
9433
|
+
direction: "outgoing",
|
|
9434
|
+
timestamp: Date.now(),
|
|
9435
|
+
payload: audioPayload,
|
|
9436
|
+
messageType: "audio"
|
|
9437
|
+
});
|
|
9438
|
+
}
|
|
9439
|
+
ws.send(chunk.data);
|
|
9440
|
+
seqNo++;
|
|
9441
|
+
if (chunk.isLast) {
|
|
9442
|
+
const endMsg = JSON.stringify({
|
|
9443
|
+
message: "EndOfStream",
|
|
9444
|
+
last_seq_no: seqNo
|
|
9445
|
+
});
|
|
9446
|
+
if (callbacks?.onRawMessage) {
|
|
9447
|
+
callbacks.onRawMessage({
|
|
9448
|
+
provider: this.name,
|
|
9449
|
+
direction: "outgoing",
|
|
9450
|
+
timestamp: Date.now(),
|
|
9451
|
+
payload: endMsg,
|
|
9452
|
+
messageType: "EndOfStream"
|
|
9453
|
+
});
|
|
9454
|
+
}
|
|
9455
|
+
ws.send(endMsg);
|
|
9456
|
+
}
|
|
9457
|
+
},
|
|
9458
|
+
close: async () => {
|
|
9459
|
+
if (sessionStatus === "closed" || sessionStatus === "closing") {
|
|
9460
|
+
return;
|
|
9461
|
+
}
|
|
9462
|
+
sessionStatus = "closing";
|
|
9463
|
+
if (ws.readyState === import_ws5.default.OPEN) {
|
|
9464
|
+
seqNo++;
|
|
9465
|
+
ws.send(
|
|
9466
|
+
JSON.stringify({
|
|
9467
|
+
message: "EndOfStream",
|
|
9468
|
+
last_seq_no: seqNo
|
|
9469
|
+
})
|
|
9470
|
+
);
|
|
9471
|
+
}
|
|
9472
|
+
return new Promise((resolve) => {
|
|
9473
|
+
const timeout = setTimeout(() => {
|
|
9474
|
+
ws.terminate();
|
|
9475
|
+
sessionStatus = "closed";
|
|
9476
|
+
resolve();
|
|
9477
|
+
}, 5e3);
|
|
9478
|
+
const onMsg = (data) => {
|
|
9479
|
+
try {
|
|
9480
|
+
const msg = JSON.parse(data.toString());
|
|
9481
|
+
if (msg.message === "EndOfTranscript") {
|
|
9482
|
+
ws.removeListener("message", onMsg);
|
|
9483
|
+
clearTimeout(timeout);
|
|
9484
|
+
ws.close();
|
|
9485
|
+
}
|
|
9486
|
+
} catch {
|
|
9487
|
+
}
|
|
9488
|
+
};
|
|
9489
|
+
ws.on("message", onMsg);
|
|
9490
|
+
ws.once("close", () => {
|
|
9491
|
+
clearTimeout(timeout);
|
|
9492
|
+
sessionStatus = "closed";
|
|
9493
|
+
resolve();
|
|
9494
|
+
});
|
|
9495
|
+
});
|
|
9496
|
+
}
|
|
9497
|
+
};
|
|
9498
|
+
}
|
|
9499
|
+
/**
|
|
9500
|
+
* Handle incoming Speechmatics real-time WebSocket messages
|
|
9501
|
+
*/
|
|
9502
|
+
handleStreamingMessage(message, callbacks, utteranceResults) {
|
|
9503
|
+
switch (message.message) {
|
|
9504
|
+
case "RecognitionStarted": {
|
|
9505
|
+
break;
|
|
9506
|
+
}
|
|
9507
|
+
case "AddPartialTranscript": {
|
|
9508
|
+
const results = message.results || [];
|
|
9509
|
+
const text = buildTextFromSpeechmaticsResults(results);
|
|
9510
|
+
if (text) {
|
|
9511
|
+
callbacks?.onTranscript?.({
|
|
9512
|
+
type: "transcript",
|
|
9513
|
+
text,
|
|
9514
|
+
isFinal: false,
|
|
9515
|
+
words: this.extractWordsFromResults(results),
|
|
9516
|
+
data: message
|
|
9517
|
+
});
|
|
9518
|
+
}
|
|
9519
|
+
break;
|
|
9520
|
+
}
|
|
9521
|
+
case "AddTranscript": {
|
|
9522
|
+
const results = message.results || [];
|
|
9523
|
+
const text = buildTextFromSpeechmaticsResults(results);
|
|
9524
|
+
if (utteranceResults) {
|
|
9525
|
+
utteranceResults.push(...results);
|
|
9526
|
+
}
|
|
9527
|
+
if (text) {
|
|
9528
|
+
callbacks?.onTranscript?.({
|
|
9529
|
+
type: "transcript",
|
|
9530
|
+
text,
|
|
9531
|
+
isFinal: true,
|
|
9532
|
+
words: this.extractWordsFromResults(results),
|
|
9533
|
+
data: message
|
|
9534
|
+
});
|
|
9535
|
+
}
|
|
9536
|
+
break;
|
|
9537
|
+
}
|
|
9538
|
+
case "EndOfUtterance": {
|
|
9539
|
+
if (utteranceResults && utteranceResults.length > 0) {
|
|
9540
|
+
const text = buildTextFromSpeechmaticsResults(utteranceResults);
|
|
9541
|
+
const words = this.extractWordsFromResults(utteranceResults);
|
|
9542
|
+
const utterances = buildUtterancesFromWords(words);
|
|
9543
|
+
if (utterances.length > 0) {
|
|
9544
|
+
for (const utt of utterances) {
|
|
9545
|
+
callbacks?.onUtterance?.(utt);
|
|
9546
|
+
}
|
|
9547
|
+
} else if (text) {
|
|
9548
|
+
callbacks?.onUtterance?.({
|
|
9549
|
+
text,
|
|
9550
|
+
start: words.length > 0 ? words[0].start : 0,
|
|
9551
|
+
end: words.length > 0 ? words[words.length - 1].end : 0,
|
|
9552
|
+
words
|
|
9553
|
+
});
|
|
9554
|
+
}
|
|
9555
|
+
utteranceResults.length = 0;
|
|
9556
|
+
}
|
|
9557
|
+
break;
|
|
9558
|
+
}
|
|
9559
|
+
case "AudioAdded": {
|
|
9560
|
+
break;
|
|
9561
|
+
}
|
|
9562
|
+
case "EndOfTranscript": {
|
|
9563
|
+
break;
|
|
9564
|
+
}
|
|
9565
|
+
case "Info":
|
|
9566
|
+
case "Warning": {
|
|
9567
|
+
callbacks?.onMetadata?.(message);
|
|
9568
|
+
break;
|
|
9569
|
+
}
|
|
9570
|
+
case "Error": {
|
|
9571
|
+
const errMsg = message;
|
|
9572
|
+
callbacks?.onError?.({
|
|
9573
|
+
code: errMsg.type || "SPEECHMATICS_ERROR",
|
|
9574
|
+
message: errMsg.reason || "Unknown error",
|
|
9575
|
+
details: message
|
|
9576
|
+
});
|
|
9577
|
+
break;
|
|
9578
|
+
}
|
|
9579
|
+
default: {
|
|
9580
|
+
callbacks?.onMetadata?.(message);
|
|
9581
|
+
break;
|
|
9582
|
+
}
|
|
9583
|
+
}
|
|
9584
|
+
}
|
|
9585
|
+
/**
|
|
9586
|
+
* Extract unified Word[] from Speechmatics recognition results
|
|
9587
|
+
*/
|
|
9588
|
+
extractWordsFromResults(results) {
|
|
9589
|
+
return results.filter((r) => r.type === "word" && r.start_time !== void 0 && r.end_time !== void 0).map((result) => ({
|
|
9590
|
+
word: result.alternatives?.[0]?.content || "",
|
|
9591
|
+
start: result.start_time,
|
|
9592
|
+
end: result.end_time,
|
|
9593
|
+
confidence: result.alternatives?.[0]?.confidence,
|
|
9594
|
+
speaker: result.alternatives?.[0]?.speaker
|
|
9595
|
+
}));
|
|
9596
|
+
}
|
|
9217
9597
|
/**
|
|
9218
9598
|
* Normalize Speechmatics status to unified status
|
|
9219
9599
|
* Uses generated JobDetailsStatus enum values
|
|
@@ -9432,26 +9812,13 @@ var SonioxAdapter = class extends BaseAdapter {
|
|
|
9432
9812
|
} else if (audio.type === "file") {
|
|
9433
9813
|
const formData = new FormData();
|
|
9434
9814
|
const audioBlob = audio.file instanceof Blob ? audio.file : new Blob([audio.file], { type: audio.mimeType || "audio/wav" });
|
|
9435
|
-
formData.append("
|
|
9436
|
-
|
|
9437
|
-
if (options?.language) {
|
|
9438
|
-
formData.append("language_hints", JSON.stringify([options.language]));
|
|
9439
|
-
}
|
|
9440
|
-
if (options?.diarization) {
|
|
9441
|
-
formData.append("enable_speaker_diarization", "true");
|
|
9442
|
-
}
|
|
9443
|
-
if (options?.languageDetection) {
|
|
9444
|
-
formData.append("enable_language_identification", "true");
|
|
9445
|
-
}
|
|
9446
|
-
if (options?.customVocabulary) {
|
|
9447
|
-
formData.append("context", JSON.stringify({ terms: options.customVocabulary }));
|
|
9448
|
-
}
|
|
9449
|
-
const response2 = await this.client.post("/speech/transcribe", formData, {
|
|
9815
|
+
formData.append("file", audioBlob, audio.filename || "audio.wav");
|
|
9816
|
+
const uploadResponse = await this.client.post("/files", formData, {
|
|
9450
9817
|
headers: {
|
|
9451
9818
|
"Content-Type": "multipart/form-data"
|
|
9452
9819
|
}
|
|
9453
9820
|
});
|
|
9454
|
-
|
|
9821
|
+
requestBody.file_id = uploadResponse.data.id;
|
|
9455
9822
|
} else {
|
|
9456
9823
|
return {
|
|
9457
9824
|
success: false,
|
|
@@ -9476,8 +9843,9 @@ var SonioxAdapter = class extends BaseAdapter {
|
|
|
9476
9843
|
terms: options.customVocabulary
|
|
9477
9844
|
};
|
|
9478
9845
|
}
|
|
9479
|
-
const response = await this.client.post("/
|
|
9480
|
-
|
|
9846
|
+
const response = await this.client.post("/transcriptions", requestBody);
|
|
9847
|
+
const transcriptionId = response.data.id;
|
|
9848
|
+
return await this.pollForCompletion(transcriptionId);
|
|
9481
9849
|
} catch (error) {
|
|
9482
9850
|
return this.createErrorResponse(error);
|
|
9483
9851
|
}
|
|
@@ -9485,8 +9853,9 @@ var SonioxAdapter = class extends BaseAdapter {
|
|
|
9485
9853
|
/**
|
|
9486
9854
|
* Get transcription result by ID
|
|
9487
9855
|
*
|
|
9488
|
-
*
|
|
9489
|
-
*
|
|
9856
|
+
* Checks job status via GET /v1/transcriptions/{id}, then fetches
|
|
9857
|
+
* the full transcript via GET /v1/transcriptions/{id}/transcript
|
|
9858
|
+
* when completed.
|
|
9490
9859
|
*
|
|
9491
9860
|
* @param transcriptId - Transcript ID
|
|
9492
9861
|
* @returns Transcription response
|
|
@@ -9494,8 +9863,39 @@ var SonioxAdapter = class extends BaseAdapter {
|
|
|
9494
9863
|
async getTranscript(transcriptId) {
|
|
9495
9864
|
this.validateConfig();
|
|
9496
9865
|
try {
|
|
9497
|
-
const
|
|
9498
|
-
|
|
9866
|
+
const statusResponse = await this.client.get(`/transcriptions/${transcriptId}`);
|
|
9867
|
+
const job = statusResponse.data;
|
|
9868
|
+
if (job.status === "error") {
|
|
9869
|
+
return {
|
|
9870
|
+
success: false,
|
|
9871
|
+
provider: this.name,
|
|
9872
|
+
error: {
|
|
9873
|
+
code: "TRANSCRIPTION_ERROR",
|
|
9874
|
+
message: job.error_message || "Transcription failed"
|
|
9875
|
+
}
|
|
9876
|
+
};
|
|
9877
|
+
}
|
|
9878
|
+
if (job.status !== "completed") {
|
|
9879
|
+
return {
|
|
9880
|
+
success: true,
|
|
9881
|
+
provider: this.name,
|
|
9882
|
+
data: {
|
|
9883
|
+
id: job.id,
|
|
9884
|
+
text: "",
|
|
9885
|
+
status: job.status
|
|
9886
|
+
},
|
|
9887
|
+
raw: job
|
|
9888
|
+
};
|
|
9889
|
+
}
|
|
9890
|
+
const transcriptResponse = await this.client.get(
|
|
9891
|
+
`/transcriptions/${transcriptId}/transcript`
|
|
9892
|
+
);
|
|
9893
|
+
return this.normalizeResponse({
|
|
9894
|
+
...transcriptResponse.data,
|
|
9895
|
+
// Carry over job metadata
|
|
9896
|
+
id: job.id,
|
|
9897
|
+
audio_duration_ms: job.audio_duration_ms
|
|
9898
|
+
});
|
|
9499
9899
|
} catch (error) {
|
|
9500
9900
|
return this.createErrorResponse(error);
|
|
9501
9901
|
}
|
|
@@ -9515,50 +9915,51 @@ var SonioxAdapter = class extends BaseAdapter {
|
|
|
9515
9915
|
const sessionId = `soniox_${Date.now()}_${Math.random().toString(36).substring(7)}`;
|
|
9516
9916
|
const createdAt = /* @__PURE__ */ new Date();
|
|
9517
9917
|
const wsBase = this.config?.wsBaseUrl || (this.config?.baseUrl ? this.deriveWsUrl(this.config.baseUrl) : `wss://${this.getRegionalWsHost()}`);
|
|
9518
|
-
const wsUrl =
|
|
9519
|
-
|
|
9520
|
-
const
|
|
9521
|
-
|
|
9522
|
-
|
|
9918
|
+
const wsUrl = `${wsBase}/transcribe-websocket`;
|
|
9919
|
+
const modelId = options?.sonioxStreaming?.model || options?.model || "stt-rt-v4";
|
|
9920
|
+
const sonioxOpts = options?.sonioxStreaming;
|
|
9921
|
+
const initMessage = {
|
|
9922
|
+
api_key: this.config.apiKey,
|
|
9923
|
+
model: modelId
|
|
9924
|
+
};
|
|
9925
|
+
if (sonioxOpts?.audioFormat) {
|
|
9926
|
+
initMessage.audio_format = sonioxOpts.audioFormat;
|
|
9927
|
+
} else if (options?.encoding) {
|
|
9523
9928
|
const encodingMap = {
|
|
9524
9929
|
linear16: "pcm_s16le",
|
|
9525
9930
|
pcm: "pcm_s16le",
|
|
9526
9931
|
mulaw: "mulaw",
|
|
9527
9932
|
alaw: "alaw"
|
|
9528
9933
|
};
|
|
9529
|
-
|
|
9934
|
+
initMessage.audio_format = encodingMap[options.encoding] || options.encoding;
|
|
9530
9935
|
}
|
|
9531
|
-
if (options?.sampleRate) {
|
|
9532
|
-
|
|
9936
|
+
if (sonioxOpts?.sampleRate || options?.sampleRate) {
|
|
9937
|
+
initMessage.sample_rate = sonioxOpts?.sampleRate || options?.sampleRate;
|
|
9533
9938
|
}
|
|
9534
|
-
if (options?.channels) {
|
|
9535
|
-
|
|
9939
|
+
if (sonioxOpts?.numChannels || options?.channels) {
|
|
9940
|
+
initMessage.num_channels = sonioxOpts?.numChannels || options?.channels;
|
|
9536
9941
|
}
|
|
9537
|
-
const sonioxOpts = options?.sonioxStreaming;
|
|
9538
9942
|
if (sonioxOpts) {
|
|
9539
9943
|
if (sonioxOpts.languageHints && sonioxOpts.languageHints.length > 0) {
|
|
9540
|
-
|
|
9944
|
+
initMessage.language_hints = sonioxOpts.languageHints;
|
|
9541
9945
|
}
|
|
9542
9946
|
if (sonioxOpts.enableLanguageIdentification) {
|
|
9543
|
-
|
|
9947
|
+
initMessage.enable_language_identification = true;
|
|
9544
9948
|
}
|
|
9545
9949
|
if (sonioxOpts.enableEndpointDetection) {
|
|
9546
|
-
|
|
9950
|
+
initMessage.enable_endpoint_detection = true;
|
|
9547
9951
|
}
|
|
9548
9952
|
if (sonioxOpts.enableSpeakerDiarization) {
|
|
9549
|
-
|
|
9953
|
+
initMessage.enable_speaker_diarization = true;
|
|
9550
9954
|
}
|
|
9551
9955
|
if (sonioxOpts.context) {
|
|
9552
|
-
|
|
9553
|
-
"context",
|
|
9554
|
-
typeof sonioxOpts.context === "string" ? sonioxOpts.context : JSON.stringify(sonioxOpts.context)
|
|
9555
|
-
);
|
|
9956
|
+
initMessage.context = typeof sonioxOpts.context === "string" ? sonioxOpts.context : sonioxOpts.context;
|
|
9556
9957
|
}
|
|
9557
9958
|
if (sonioxOpts.translation) {
|
|
9558
|
-
|
|
9959
|
+
initMessage.translation = sonioxOpts.translation;
|
|
9559
9960
|
}
|
|
9560
9961
|
if (sonioxOpts.clientReferenceId) {
|
|
9561
|
-
|
|
9962
|
+
initMessage.client_reference_id = sonioxOpts.clientReferenceId;
|
|
9562
9963
|
}
|
|
9563
9964
|
}
|
|
9564
9965
|
if (!sonioxOpts?.languageHints && options?.language) {
|
|
@@ -9567,24 +9968,33 @@ var SonioxAdapter = class extends BaseAdapter {
|
|
|
9567
9968
|
`[Soniox] Warning: language="multi" is Deepgram-specific and not supported by Soniox. For automatic language detection, use languageDetection: true instead, or specify a language code like 'en'.`
|
|
9568
9969
|
);
|
|
9569
9970
|
}
|
|
9570
|
-
|
|
9971
|
+
initMessage.language_hints = [options.language];
|
|
9571
9972
|
}
|
|
9572
9973
|
if (!sonioxOpts?.enableSpeakerDiarization && options?.diarization) {
|
|
9573
|
-
|
|
9974
|
+
initMessage.enable_speaker_diarization = true;
|
|
9574
9975
|
}
|
|
9575
9976
|
if (!sonioxOpts?.enableLanguageIdentification && options?.languageDetection) {
|
|
9576
|
-
|
|
9577
|
-
}
|
|
9578
|
-
if (options?.interimResults !== false) {
|
|
9977
|
+
initMessage.enable_language_identification = true;
|
|
9579
9978
|
}
|
|
9580
9979
|
let status = "connecting";
|
|
9581
9980
|
let openedAt = null;
|
|
9582
9981
|
let receivedData = false;
|
|
9583
9982
|
const WebSocketImpl = typeof WebSocket !== "undefined" ? WebSocket : require("ws");
|
|
9584
|
-
const ws = new WebSocketImpl(wsUrl
|
|
9983
|
+
const ws = new WebSocketImpl(wsUrl);
|
|
9585
9984
|
ws.onopen = () => {
|
|
9586
|
-
status = "open";
|
|
9587
9985
|
openedAt = Date.now();
|
|
9986
|
+
const initPayload = JSON.stringify(initMessage);
|
|
9987
|
+
if (callbacks?.onRawMessage) {
|
|
9988
|
+
callbacks.onRawMessage({
|
|
9989
|
+
provider: this.name,
|
|
9990
|
+
direction: "outgoing",
|
|
9991
|
+
timestamp: Date.now(),
|
|
9992
|
+
payload: initPayload,
|
|
9993
|
+
messageType: "init"
|
|
9994
|
+
});
|
|
9995
|
+
}
|
|
9996
|
+
ws.send(initPayload);
|
|
9997
|
+
status = "open";
|
|
9588
9998
|
callbacks?.onOpen?.();
|
|
9589
9999
|
};
|
|
9590
10000
|
ws.onmessage = (event) => {
|
|
@@ -9663,10 +10073,10 @@ var SonioxAdapter = class extends BaseAdapter {
|
|
|
9663
10073
|
ws.onclose = (event) => {
|
|
9664
10074
|
status = "closed";
|
|
9665
10075
|
const timeSinceOpen = openedAt ? Date.now() - openedAt : null;
|
|
9666
|
-
const
|
|
9667
|
-
if (
|
|
10076
|
+
const isEarlyClose = timeSinceOpen !== null && timeSinceOpen < 5e3 && !receivedData;
|
|
10077
|
+
if (isEarlyClose && event.code === 1e3) {
|
|
9668
10078
|
const errorMessage = [
|
|
9669
|
-
"Soniox closed connection
|
|
10079
|
+
"Soniox closed connection shortly after opening.",
|
|
9670
10080
|
`Current config: region=${this.region}, model=${modelId}`,
|
|
9671
10081
|
"Likely causes:",
|
|
9672
10082
|
" - Invalid API key or region mismatch (keys are region-specific, current: " + this.region + ")",
|
|
@@ -9787,8 +10197,10 @@ var SonioxAdapter = class extends BaseAdapter {
|
|
|
9787
10197
|
* Normalize Soniox response to unified format
|
|
9788
10198
|
*/
|
|
9789
10199
|
normalizeResponse(response) {
|
|
9790
|
-
const text = response.text || (response.tokens ? response.tokens.filter((t) => t.is_final).map((t) => t.text).join("") : "");
|
|
9791
|
-
const words = response.tokens ? response.tokens.filter(
|
|
10200
|
+
const text = response.text || (response.tokens ? response.tokens.filter((t) => t.is_final !== false).map((t) => t.text).join("") : "");
|
|
10201
|
+
const words = response.tokens ? response.tokens.filter(
|
|
10202
|
+
(t) => t.is_final !== false && t.start_ms !== void 0 && t.end_ms !== void 0
|
|
10203
|
+
).map((token) => ({
|
|
9792
10204
|
word: token.text,
|
|
9793
10205
|
start: token.start_ms / 1e3,
|
|
9794
10206
|
end: token.end_ms / 1e3,
|
|
@@ -9805,7 +10217,8 @@ var SonioxAdapter = class extends BaseAdapter {
|
|
|
9805
10217
|
id,
|
|
9806
10218
|
label: `Speaker ${id}`
|
|
9807
10219
|
})) : void 0;
|
|
9808
|
-
const
|
|
10220
|
+
const tokens = response.tokens ? response.tokens.filter((t) => t.is_final !== false) : [];
|
|
10221
|
+
const utterances = tokens.length > 0 ? this.buildUtterancesFromTokens(tokens) : [];
|
|
9809
10222
|
const language = response.tokens?.find((t) => t.language)?.language;
|
|
9810
10223
|
return {
|
|
9811
10224
|
success: true,
|
|
@@ -9815,7 +10228,7 @@ var SonioxAdapter = class extends BaseAdapter {
|
|
|
9815
10228
|
text,
|
|
9816
10229
|
status: TranscriptionStatus.completed,
|
|
9817
10230
|
language,
|
|
9818
|
-
duration: response.total_audio_proc_ms ? response.total_audio_proc_ms / 1e3 : void 0,
|
|
10231
|
+
duration: response.audio_duration_ms ? response.audio_duration_ms / 1e3 : response.total_audio_proc_ms ? response.total_audio_proc_ms / 1e3 : void 0,
|
|
9819
10232
|
speakers,
|
|
9820
10233
|
words: words.length > 0 ? words : void 0,
|
|
9821
10234
|
utterances: utterances.length > 0 ? utterances : void 0
|
|
@@ -36682,7 +37095,7 @@ var AzureCapabilities = {
|
|
|
36682
37095
|
deleteTranscript: true
|
|
36683
37096
|
};
|
|
36684
37097
|
var SpeechmaticsCapabilities = {
|
|
36685
|
-
streaming:
|
|
37098
|
+
streaming: true,
|
|
36686
37099
|
diarization: true,
|
|
36687
37100
|
wordTimestamps: true,
|
|
36688
37101
|
languageDetection: false,
|