voice-router-dev 0.8.6 → 0.8.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +65 -0
- package/dist/{field-configs-D1RCJSmr.d.mts → field-configs-BtR4uR2N.d.mts} +166 -166
- package/dist/{field-configs-D1RCJSmr.d.ts → field-configs-BtR4uR2N.d.ts} +166 -166
- package/dist/field-configs.d.mts +1 -1
- package/dist/field-configs.d.ts +1 -1
- package/dist/index.d.mts +522 -474
- package/dist/index.d.ts +522 -474
- package/dist/index.js +479 -66
- package/dist/index.mjs +479 -66
- package/dist/{provider-metadata-BnkedpXm.d.mts → provider-metadata-BJ29OPW1.d.mts} +2 -2
- package/dist/{provider-metadata-DbsSGAO7.d.ts → provider-metadata-D1d-9cng.d.ts} +2 -2
- package/dist/provider-metadata.d.mts +1 -1
- package/dist/provider-metadata.d.ts +1 -1
- package/dist/provider-metadata.js +1 -1
- package/dist/provider-metadata.mjs +1 -1
- package/dist/{speechToTextChunkResponseModel-BZSxrijj.d.ts → speechToTextChunkResponseModel-B4kVoFc3.d.ts} +97 -6
- package/dist/{speechToTextChunkResponseModel-DK61nDc5.d.mts → speechToTextChunkResponseModel-DmajV4F-.d.mts} +97 -6
- package/dist/webhooks.d.mts +2 -2
- package/dist/webhooks.d.ts +2 -2
- package/package.json +1 -1
package/dist/index.mjs
CHANGED
```diff
@@ -8041,6 +8041,7 @@ var AzureSTTAdapter = class extends BaseAdapter {
       id: String(speakerId),
       label: `Speaker ${speakerId}`
     })) : void 0;
+    const utterances = words.length > 0 ? buildUtterancesFromWords(words) : void 0;
     const transcriptionId = transcription.self?.split("/").pop() || "";
     return {
       success: true,
```

```diff
@@ -8054,6 +8055,7 @@ var AzureSTTAdapter = class extends BaseAdapter {
         duration: transcriptionData.duration ? transcriptionData.duration / 1e7 : void 0,
         speakers,
         words: words.length > 0 ? words : void 0,
+        utterances: utterances && utterances.length > 0 ? utterances : void 0,
         createdAt: transcription.createdDateTime,
         completedAt: transcription.lastActionDateTime
       },
```
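
These two Azure hunks land together: the adapter now derives utterances from the word list via `buildUtterancesFromWords` and exposes them on the batch result. A minimal consumption sketch, assuming only the result shape visible in this diff (the interface names below are illustrative, not the package's exported types):

```typescript
// Illustrative types; the package's real exports may differ.
interface Utterance {
  text: string;
  start: number; // seconds
  end: number;   // seconds
  speaker?: string;
}

function printUtterances(result: { success: boolean; data?: { utterances?: Utterance[] } }): void {
  if (!result.success || !result.data?.utterances) return;
  for (const utt of result.data.utterances) {
    // Utterances are only present when word timestamps were returned.
    console.log(`[${utt.start.toFixed(2)}s-${utt.end.toFixed(2)}s] ${utt.speaker ?? "unknown"}: ${utt.text}`);
  }
}
```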

```diff
@@ -8689,6 +8691,7 @@ function createOpenAIWhisperAdapter(config) {
 
 // src/adapters/speechmatics-adapter.ts
 import axios8 from "axios";
+import WebSocket6 from "ws";
 
 // src/generated/speechmatics/schema/notificationConfigContentsItem.ts
 var NotificationConfigContentsItem = {
```

```diff
@@ -8738,8 +8741,7 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
     super(...arguments);
     this.name = "speechmatics";
     this.capabilities = {
-      streaming: false,
-      // Batch only (streaming available via separate WebSocket API)
+      streaming: true,
       diarization: true,
       wordTimestamps: true,
       languageDetection: false,
```
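
The capability flag flips because real-time support is now implemented in the adapter itself (see the large `transcribeStream` hunk below) instead of requiring a separate WebSocket client. Callers that gate on capabilities pick this up automatically; a sketch of such a gate, using only the field names visible in this diff (the container type is an assumption):

```typescript
// Field names match the capabilities object shown in this diff.
interface AdapterCapabilities {
  streaming: boolean;
  diarization: boolean;
  wordTimestamps: boolean;
  languageDetection: boolean;
}

function canLiveCaption(caps: AdapterCapabilities): boolean {
  // Live captions need streaming; word timestamps enable word-level highlighting.
  return caps.streaming && caps.wordTimestamps;
}
```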

```diff
@@ -8874,13 +8876,16 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
       jobConfig.fetch_data = {
         url: audio.url
       };
-
-
+      const formData = new FormData();
+      formData.append("config", JSON.stringify(jobConfig));
+      requestBody = formData;
+      headers = { "Content-Type": "multipart/form-data" };
     } else if (audio.type === "file") {
-
-
-
-
+      const formData = new FormData();
+      formData.append("config", JSON.stringify(jobConfig));
+      const audioBlob = audio.file instanceof Blob ? audio.file : new Blob([audio.file], { type: audio.mimeType || "audio/wav" });
+      formData.append("data_file", audioBlob, audio.filename || "audio.wav");
+      requestBody = formData;
       headers = { "Content-Type": "multipart/form-data" };
     } else {
       return {
```
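
Both branches now submit the batch job as multipart form data, with the job configuration serialized into a `config` part and, for direct uploads, the audio attached as `data_file`. A standalone sketch of that request shape; the endpoint URL and any config keys beyond those visible in the diff are assumptions:

```typescript
// Sketch only: submits a Speechmatics batch job the way the adapter now
// builds it. Endpoint URL and extra config keys are assumptions.
async function submitBatchJob(apiKey: string, audio: Blob): Promise<unknown> {
  const jobConfig = {
    type: "transcription",
    transcription_config: { language: "en" },
  };
  const form = new FormData();
  form.append("config", JSON.stringify(jobConfig)); // JSON config as a form part
  form.append("data_file", audio, "audio.wav");     // audio payload
  const res = await fetch("https://asr.api.speechmatics.com/v2/jobs", {
    method: "POST",
    headers: { Authorization: `Bearer ${apiKey}` }, // multipart boundary is set by fetch
    body: form,
  });
  return res.json();
}
```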

````diff
@@ -8985,6 +8990,381 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
       throw error;
     }
   }
+  /**
+   * Build WebSocket URL for real-time streaming
+   *
+   * Note: Real-time API uses a different host from the batch API:
+   * - Batch: {region}.asr.api.speechmatics.com
+   * - Real-time: {region}.rt.speechmatics.com
+   *
+   * @param region - Regional endpoint identifier
+   * @returns WebSocket URL for real-time API
+   */
+  getRegionalWsUrl(region) {
+    if (this.config?.wsBaseUrl) {
+      return this.config.wsBaseUrl;
+    }
+    const regionPrefix = region || "eu1";
+    return `wss://${regionPrefix}.rt.speechmatics.com/v2`;
+  }
+  /**
+   * Stream audio for real-time transcription via WebSocket
+   *
+   * Connects to Speechmatics' real-time API and sends audio chunks
+   * for transcription with results returned via callbacks.
+   *
+   * @param options - Streaming configuration options
+   * @param callbacks - Event callbacks for transcription results
+   * @returns Promise that resolves with a StreamingSession
+   *
+   * @example Basic streaming
+   * ```typescript
+   * const session = await adapter.transcribeStream({
+   *   language: 'en',
+   *   speechmaticsStreaming: {
+   *     enablePartials: true,
+   *     operatingPoint: 'enhanced'
+   *   }
+   * }, {
+   *   onTranscript: (event) => console.log(event.text),
+   *   onUtterance: (utt) => console.log(`[${utt.speaker}]: ${utt.text}`),
+   *   onError: (error) => console.error(error)
+   * });
+   *
+   * await session.sendAudio({ data: audioBuffer });
+   * await session.close();
+   * ```
+   */
+  async transcribeStream(options, callbacks) {
+    this.validateConfig();
+    const smOpts = options?.speechmaticsStreaming || {};
+    const region = smOpts.region || this.config?.region;
+    const wsUrl = this.getRegionalWsUrl(region);
+    const ws = new WebSocket6(wsUrl, {
+      headers: {
+        Authorization: `Bearer ${this.config.apiKey}`
+      }
+    });
+    let sessionStatus = "connecting";
+    const sessionId = `speechmatics-${Date.now()}-${Math.random().toString(36).substring(7)}`;
+    let seqNo = 0;
+    let utteranceResults = [];
+    const sessionReady = new Promise((resolve, reject) => {
+      const timeout = setTimeout(() => {
+        reject(new Error("WebSocket connection timeout"));
+      }, 1e4);
+      let wsOpen = false;
+      ws.once("error", (error) => {
+        clearTimeout(timeout);
+        reject(error);
+      });
+      ws.once("open", () => {
+        wsOpen = true;
+        const encoding = smOpts.encoding || options?.encoding || "pcm_s16le";
+        const sampleRate = smOpts.sampleRate || options?.sampleRate || 16e3;
+        const startMsg = {
+          message: "StartRecognition",
+          audio_format: {
+            type: "raw",
+            encoding,
+            sample_rate: sampleRate
+          },
+          transcription_config: {
+            language: smOpts.language || options?.language || "en",
+            enable_partials: smOpts.enablePartials ?? options?.interimResults ?? true
+          }
+        };
+        const txConfig = startMsg.transcription_config;
+        if (smOpts.domain) txConfig.domain = smOpts.domain;
+        if (smOpts.operatingPoint) txConfig.operating_point = smOpts.operatingPoint;
+        if (smOpts.maxDelay !== void 0) txConfig.max_delay = smOpts.maxDelay;
+        if (smOpts.maxDelayMode) txConfig.max_delay_mode = smOpts.maxDelayMode;
+        if (smOpts.enableEntities !== void 0) txConfig.enable_entities = smOpts.enableEntities;
+        if (smOpts.diarization === "speaker" || options?.diarization) {
+          txConfig.diarization = "speaker";
+          if (smOpts.maxSpeakers) {
+            txConfig.speaker_diarization_config = {
+              max_speakers: smOpts.maxSpeakers
+            };
+          } else if (options?.speakersExpected) {
+            txConfig.speaker_diarization_config = {
+              max_speakers: options.speakersExpected
+            };
+          }
+        }
+        if (smOpts.additionalVocab && smOpts.additionalVocab.length > 0) {
+          txConfig.additional_vocab = smOpts.additionalVocab.map((word) => ({
+            content: word
+          }));
+        } else if (options?.customVocabulary && options.customVocabulary.length > 0) {
+          txConfig.additional_vocab = options.customVocabulary.map((word) => ({
+            content: word
+          }));
+        }
+        if (smOpts.conversationConfig) {
+          txConfig.conversation_config = {
+            end_of_utterance_silence_trigger: smOpts.conversationConfig.endOfUtteranceSilenceTrigger
+          };
+        }
+        const startPayload = JSON.stringify(startMsg);
+        if (callbacks?.onRawMessage) {
+          callbacks.onRawMessage({
+            provider: "speechmatics",
+            direction: "outgoing",
+            timestamp: Date.now(),
+            payload: startPayload,
+            messageType: "StartRecognition"
+          });
+        }
+        ws.send(startPayload);
+      });
+      const onMessage = (data) => {
+        const rawPayload = data.toString();
+        try {
+          const msg = JSON.parse(rawPayload);
+          if (msg.message === "RecognitionStarted") {
+            clearTimeout(timeout);
+            ws.removeListener("message", onMessage);
+            ws.emit("message", data);
+            resolve();
+          } else if (msg.message === "Error") {
+            clearTimeout(timeout);
+            ws.removeListener("message", onMessage);
+            reject(new Error(msg.reason || "Recognition failed to start"));
+          }
+        } catch {
+        }
+      };
+      ws.on("message", onMessage);
+    });
+    ws.on("message", (data) => {
+      const rawPayload = data.toString();
+      try {
+        const message = JSON.parse(rawPayload);
+        if (callbacks?.onRawMessage) {
+          callbacks.onRawMessage({
+            provider: "speechmatics",
+            direction: "incoming",
+            timestamp: Date.now(),
+            payload: rawPayload,
+            messageType: message.message
+          });
+        }
+        this.handleStreamingMessage(message, callbacks, utteranceResults);
+      } catch (error) {
+        if (callbacks?.onRawMessage) {
+          callbacks.onRawMessage({
+            provider: "speechmatics",
+            direction: "incoming",
+            timestamp: Date.now(),
+            payload: rawPayload,
+            messageType: "parse_error"
+          });
+        }
+        callbacks?.onError?.({
+          code: "PARSE_ERROR",
+          message: "Failed to parse WebSocket message",
+          details: error
+        });
+      }
+    });
+    ws.on("error", (error) => {
+      callbacks?.onError?.({
+        code: "WEBSOCKET_ERROR",
+        message: error.message,
+        details: error
+      });
+    });
+    ws.on("close", (code, reason) => {
+      sessionStatus = "closed";
+      callbacks?.onClose?.(code, reason.toString());
+    });
+    await sessionReady;
+    sessionStatus = "open";
+    callbacks?.onOpen?.();
+    return {
+      id: sessionId,
+      provider: this.name,
+      createdAt: /* @__PURE__ */ new Date(),
+      getStatus: () => sessionStatus,
+      sendAudio: async (chunk) => {
+        if (sessionStatus !== "open") {
+          throw new Error(`Cannot send audio: session is ${sessionStatus}`);
+        }
+        if (ws.readyState !== WebSocket6.OPEN) {
+          throw new Error("WebSocket is not open");
+        }
+        if (callbacks?.onRawMessage) {
+          const audioPayload = chunk.data instanceof ArrayBuffer ? chunk.data : chunk.data.buffer.slice(
+            chunk.data.byteOffset,
+            chunk.data.byteOffset + chunk.data.byteLength
+          );
+          callbacks.onRawMessage({
+            provider: this.name,
+            direction: "outgoing",
+            timestamp: Date.now(),
+            payload: audioPayload,
+            messageType: "audio"
+          });
+        }
+        ws.send(chunk.data);
+        seqNo++;
+        if (chunk.isLast) {
+          const endMsg = JSON.stringify({
+            message: "EndOfStream",
+            last_seq_no: seqNo
+          });
+          if (callbacks?.onRawMessage) {
+            callbacks.onRawMessage({
+              provider: this.name,
+              direction: "outgoing",
+              timestamp: Date.now(),
+              payload: endMsg,
+              messageType: "EndOfStream"
+            });
+          }
+          ws.send(endMsg);
+        }
+      },
+      close: async () => {
+        if (sessionStatus === "closed" || sessionStatus === "closing") {
+          return;
+        }
+        sessionStatus = "closing";
+        if (ws.readyState === WebSocket6.OPEN) {
+          seqNo++;
+          ws.send(
+            JSON.stringify({
+              message: "EndOfStream",
+              last_seq_no: seqNo
+            })
+          );
+        }
+        return new Promise((resolve) => {
+          const timeout = setTimeout(() => {
+            ws.terminate();
+            sessionStatus = "closed";
+            resolve();
+          }, 5e3);
+          const onMsg = (data) => {
+            try {
+              const msg = JSON.parse(data.toString());
+              if (msg.message === "EndOfTranscript") {
+                ws.removeListener("message", onMsg);
+                clearTimeout(timeout);
+                ws.close();
+              }
+            } catch {
+            }
+          };
+          ws.on("message", onMsg);
+          ws.once("close", () => {
+            clearTimeout(timeout);
+            sessionStatus = "closed";
+            resolve();
+          });
+        });
+      }
+    };
+  }
+  /**
+   * Handle incoming Speechmatics real-time WebSocket messages
+   */
+  handleStreamingMessage(message, callbacks, utteranceResults) {
+    switch (message.message) {
+      case "RecognitionStarted": {
+        break;
+      }
+      case "AddPartialTranscript": {
+        const results = message.results || [];
+        const text = buildTextFromSpeechmaticsResults(results);
+        if (text) {
+          callbacks?.onTranscript?.({
+            type: "transcript",
+            text,
+            isFinal: false,
+            words: this.extractWordsFromResults(results),
+            data: message
+          });
+        }
+        break;
+      }
+      case "AddTranscript": {
+        const results = message.results || [];
+        const text = buildTextFromSpeechmaticsResults(results);
+        if (utteranceResults) {
+          utteranceResults.push(...results);
+        }
+        if (text) {
+          callbacks?.onTranscript?.({
+            type: "transcript",
+            text,
+            isFinal: true,
+            words: this.extractWordsFromResults(results),
+            data: message
+          });
+        }
+        break;
+      }
+      case "EndOfUtterance": {
+        if (utteranceResults && utteranceResults.length > 0) {
+          const text = buildTextFromSpeechmaticsResults(utteranceResults);
+          const words = this.extractWordsFromResults(utteranceResults);
+          const utterances = buildUtterancesFromWords(words);
+          if (utterances.length > 0) {
+            for (const utt of utterances) {
+              callbacks?.onUtterance?.(utt);
+            }
+          } else if (text) {
+            callbacks?.onUtterance?.({
+              text,
+              start: words.length > 0 ? words[0].start : 0,
+              end: words.length > 0 ? words[words.length - 1].end : 0,
+              words
+            });
+          }
+          utteranceResults.length = 0;
+        }
+        break;
+      }
+      case "AudioAdded": {
+        break;
+      }
+      case "EndOfTranscript": {
+        break;
+      }
+      case "Info":
+      case "Warning": {
+        callbacks?.onMetadata?.(message);
+        break;
+      }
+      case "Error": {
+        const errMsg = message;
+        callbacks?.onError?.({
+          code: errMsg.type || "SPEECHMATICS_ERROR",
+          message: errMsg.reason || "Unknown error",
+          details: message
+        });
+        break;
+      }
+      default: {
+        callbacks?.onMetadata?.(message);
+        break;
+      }
+    }
+  }
+  /**
+   * Extract unified Word[] from Speechmatics recognition results
+   */
+  extractWordsFromResults(results) {
+    return results.filter((r) => r.type === "word" && r.start_time !== void 0 && r.end_time !== void 0).map((result) => ({
+      word: result.alternatives?.[0]?.content || "",
+      start: result.start_time,
+      end: result.end_time,
+      confidence: result.alternatives?.[0]?.confidence,
+      speaker: result.alternatives?.[0]?.speaker
+    }));
+  }
   /**
    * Normalize Speechmatics status to unified status
    * Uses generated JobDetailsStatus enum values
````
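
This hunk is the bulk of the release: a full real-time implementation for the Speechmatics adapter. The protocol sequence it drives is visible in the code itself: send `StartRecognition` with `audio_format` and `transcription_config`, wait for `RecognitionStarted`, stream binary audio frames, then send `EndOfStream` with `last_seq_no` and wait for `EndOfTranscript`. A usage sketch against the session surface defined above, feeding raw PCM from a file (adapter construction is elided and typed loosely; only methods and callback names shown in the diff are used):

```typescript
import { createReadStream } from "node:fs";

// Usage sketch for the new transcribeStream(); `adapter` construction elided.
async function streamFile(adapter: any, path: string): Promise<void> {
  const session = await adapter.transcribeStream(
    { language: "en", interimResults: true, diarization: true },
    {
      onTranscript: (e: { text: string; isFinal: boolean }) =>
        console.log(e.isFinal ? `final: ${e.text}` : `partial: ${e.text}`),
      onUtterance: (u: { speaker?: string; text: string }) =>
        console.log(`[${u.speaker ?? "?"}] ${u.text}`),
      onError: (err: unknown) => console.error(err),
    }
  );
  // Feed raw PCM chunks; the adapter counts frames and sends
  // EndOfStream with last_seq_no when isLast is set or on close().
  for await (const chunk of createReadStream(path, { highWaterMark: 8192 })) {
    await session.sendAudio({ data: chunk as Buffer });
  }
  await session.close(); // sends EndOfStream and waits for EndOfTranscript
}
```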

```diff
@@ -9203,26 +9583,13 @@ var SonioxAdapter = class extends BaseAdapter {
     } else if (audio.type === "file") {
       const formData = new FormData();
       const audioBlob = audio.file instanceof Blob ? audio.file : new Blob([audio.file], { type: audio.mimeType || "audio/wav" });
-      formData.append("
-
-      if (options?.language) {
-        formData.append("language_hints", JSON.stringify([options.language]));
-      }
-      if (options?.diarization) {
-        formData.append("enable_speaker_diarization", "true");
-      }
-      if (options?.languageDetection) {
-        formData.append("enable_language_identification", "true");
-      }
-      if (options?.customVocabulary) {
-        formData.append("context", JSON.stringify({ terms: options.customVocabulary }));
-      }
-      const response2 = await this.client.post("/speech/transcribe", formData, {
+      formData.append("file", audioBlob, audio.filename || "audio.wav");
+      const uploadResponse = await this.client.post("/files", formData, {
         headers: {
           "Content-Type": "multipart/form-data"
         }
       });
-
+      requestBody.file_id = uploadResponse.data.id;
     } else {
       return {
         success: false,
```
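
The Soniox file branch moves from a single multipart transcribe call to the two-step async flow: upload the file, then create a transcription that references the returned `file_id`. A standalone sketch of the same flow; the base URL and the async model id are placeholders, and only `/files`, `/transcriptions`, and `file_id` are confirmed by the diff:

```typescript
async function transcribeViaUpload(apiKey: string, audio: Blob): Promise<string> {
  const base = "https://api.soniox.com/v1"; // assumed base URL
  const auth = { Authorization: `Bearer ${apiKey}` };

  // Step 1: upload the audio file.
  const form = new FormData();
  form.append("file", audio, "audio.wav");
  const fileRes = await fetch(`${base}/files`, { method: "POST", headers: auth, body: form });
  const { id: fileId } = await fileRes.json() as { id: string };

  // Step 2: create a transcription job referencing the upload.
  const jobRes = await fetch(`${base}/transcriptions`, {
    method: "POST",
    headers: { ...auth, "Content-Type": "application/json" },
    body: JSON.stringify({ file_id: fileId, model: "<async-model-id>" }), // placeholder model id
  });
  const { id } = await jobRes.json() as { id: string };
  return id; // poll this id until the job completes
}
```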

```diff
@@ -9247,8 +9614,9 @@ var SonioxAdapter = class extends BaseAdapter {
           terms: options.customVocabulary
         };
       }
-      const response = await this.client.post("/
-
+      const response = await this.client.post("/transcriptions", requestBody);
+      const transcriptionId = response.data.id;
+      return await this.pollForCompletion(transcriptionId);
     } catch (error) {
       return this.createErrorResponse(error);
     }
```
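
`transcribe` now resolves only after `pollForCompletion` does; that helper's body is not part of this diff. A generic loop of the shape it implies, using the status values that do appear in the diff ("completed" and "error"):

```typescript
// Generic poll loop; not the package's implementation, just the shape
// implied by pollForCompletion(id) and the status strings in this diff.
async function pollUntilDone(
  getStatus: (id: string) => Promise<{ status: string }>,
  id: string,
  intervalMs = 2000,
  timeoutMs = 10 * 60 * 1000
): Promise<void> {
  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    const { status } = await getStatus(id);
    if (status === "completed") return;
    if (status === "error") throw new Error(`transcription ${id} failed`);
    await new Promise((r) => setTimeout(r, intervalMs)); // back off between checks
  }
  throw new Error(`transcription ${id} timed out`);
}
```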

```diff
@@ -9256,8 +9624,9 @@ var SonioxAdapter = class extends BaseAdapter {
   /**
    * Get transcription result by ID
    *
-   *
-   *
+   * Checks job status via GET /v1/transcriptions/{id}, then fetches
+   * the full transcript via GET /v1/transcriptions/{id}/transcript
+   * when completed.
    *
    * @param transcriptId - Transcript ID
    * @returns Transcription response
```

```diff
@@ -9265,8 +9634,39 @@ var SonioxAdapter = class extends BaseAdapter {
   async getTranscript(transcriptId) {
     this.validateConfig();
     try {
-      const
-
+      const statusResponse = await this.client.get(`/transcriptions/${transcriptId}`);
+      const job = statusResponse.data;
+      if (job.status === "error") {
+        return {
+          success: false,
+          provider: this.name,
+          error: {
+            code: "TRANSCRIPTION_ERROR",
+            message: job.error_message || "Transcription failed"
+          }
+        };
+      }
+      if (job.status !== "completed") {
+        return {
+          success: true,
+          provider: this.name,
+          data: {
+            id: job.id,
+            text: "",
+            status: job.status
+          },
+          raw: job
+        };
+      }
+      const transcriptResponse = await this.client.get(
+        `/transcriptions/${transcriptId}/transcript`
+      );
+      return this.normalizeResponse({
+        ...transcriptResponse.data,
+        // Carry over job metadata
+        id: job.id,
+        audio_duration_ms: job.audio_duration_ms
+      });
     } catch (error) {
       return this.createErrorResponse(error);
     }
```
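
`getTranscript` now splits into three outcomes: a failed job surfaces as an error response, an in-flight job returns `success: true` with empty text plus the raw status, and a completed job triggers a second GET for the transcript body, merged with job metadata before normalization. Caller-side handling, sketched against those branches (the "completed" status string for the normalized result is an assumption):

```typescript
// Sketch: handling the three outcomes getTranscript() can now return.
async function fetchWhenReady(adapter: any, id: string): Promise<string | null> {
  const res = await adapter.getTranscript(id);
  if (!res.success) {
    throw new Error(res.error?.message ?? "transcription failed"); // job.status === "error"
  }
  if (res.data?.status && res.data.status !== "completed") {
    return null; // still queued/processing; retry later
  }
  return res.data.text; // completed: normalized transcript text
}
```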

```diff
@@ -9286,50 +9686,51 @@ var SonioxAdapter = class extends BaseAdapter {
     const sessionId = `soniox_${Date.now()}_${Math.random().toString(36).substring(7)}`;
     const createdAt = /* @__PURE__ */ new Date();
     const wsBase = this.config?.wsBaseUrl || (this.config?.baseUrl ? this.deriveWsUrl(this.config.baseUrl) : `wss://${this.getRegionalWsHost()}`);
-    const wsUrl =
-
-    const
-
-
+    const wsUrl = `${wsBase}/transcribe-websocket`;
+    const modelId = options?.sonioxStreaming?.model || options?.model || "stt-rt-v4";
+    const sonioxOpts = options?.sonioxStreaming;
+    const initMessage = {
+      api_key: this.config.apiKey,
+      model: modelId
+    };
+    if (sonioxOpts?.audioFormat) {
+      initMessage.audio_format = sonioxOpts.audioFormat;
+    } else if (options?.encoding) {
       const encodingMap = {
         linear16: "pcm_s16le",
         pcm: "pcm_s16le",
         mulaw: "mulaw",
         alaw: "alaw"
       };
-
+      initMessage.audio_format = encodingMap[options.encoding] || options.encoding;
     }
-    if (options?.sampleRate) {
-
+    if (sonioxOpts?.sampleRate || options?.sampleRate) {
+      initMessage.sample_rate = sonioxOpts?.sampleRate || options?.sampleRate;
     }
-    if (options?.channels) {
-
+    if (sonioxOpts?.numChannels || options?.channels) {
+      initMessage.num_channels = sonioxOpts?.numChannels || options?.channels;
     }
-    const sonioxOpts = options?.sonioxStreaming;
     if (sonioxOpts) {
       if (sonioxOpts.languageHints && sonioxOpts.languageHints.length > 0) {
-
+        initMessage.language_hints = sonioxOpts.languageHints;
       }
       if (sonioxOpts.enableLanguageIdentification) {
-
+        initMessage.enable_language_identification = true;
       }
       if (sonioxOpts.enableEndpointDetection) {
-
+        initMessage.enable_endpoint_detection = true;
       }
       if (sonioxOpts.enableSpeakerDiarization) {
-
+        initMessage.enable_speaker_diarization = true;
       }
       if (sonioxOpts.context) {
-
-          "context",
-          typeof sonioxOpts.context === "string" ? sonioxOpts.context : JSON.stringify(sonioxOpts.context)
-        );
+        initMessage.context = typeof sonioxOpts.context === "string" ? sonioxOpts.context : sonioxOpts.context;
       }
       if (sonioxOpts.translation) {
-
+        initMessage.translation = sonioxOpts.translation;
       }
       if (sonioxOpts.clientReferenceId) {
-
+        initMessage.client_reference_id = sonioxOpts.clientReferenceId;
       }
     }
     if (!sonioxOpts?.languageHints && options?.language) {
```
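
Connection parameters that previously went into the request URL now travel as a single JSON init frame, the first message sent once the socket opens (see the `ws.onopen` hunk below). Assembled from the fields this hunk sets, a representative frame looks like:

```typescript
// Representative init frame; every key below is set somewhere in this hunk.
// Values are illustrative.
const initMessage = {
  api_key: "<SONIOX_API_KEY>",
  model: "stt-rt-v4",        // default model id from the diff
  audio_format: "pcm_s16le", // e.g. mapped from encoding "linear16"
  sample_rate: 16000,
  num_channels: 1,
  language_hints: ["en"],
  enable_speaker_diarization: true,
};
// First frame after open, before any binary audio:
// ws.send(JSON.stringify(initMessage));
```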

```diff
@@ -9338,24 +9739,33 @@ var SonioxAdapter = class extends BaseAdapter {
         `[Soniox] Warning: language="multi" is Deepgram-specific and not supported by Soniox. For automatic language detection, use languageDetection: true instead, or specify a language code like 'en'.`
       );
     }
-
+      initMessage.language_hints = [options.language];
     }
     if (!sonioxOpts?.enableSpeakerDiarization && options?.diarization) {
-
+      initMessage.enable_speaker_diarization = true;
     }
     if (!sonioxOpts?.enableLanguageIdentification && options?.languageDetection) {
-
-    }
-    if (options?.interimResults !== false) {
+      initMessage.enable_language_identification = true;
     }
     let status = "connecting";
     let openedAt = null;
     let receivedData = false;
     const WebSocketImpl = typeof WebSocket !== "undefined" ? WebSocket : __require("ws");
-    const ws = new WebSocketImpl(wsUrl
+    const ws = new WebSocketImpl(wsUrl);
     ws.onopen = () => {
-      status = "open";
       openedAt = Date.now();
+      const initPayload = JSON.stringify(initMessage);
+      if (callbacks?.onRawMessage) {
+        callbacks.onRawMessage({
+          provider: this.name,
+          direction: "outgoing",
+          timestamp: Date.now(),
+          payload: initPayload,
+          messageType: "init"
+        });
+      }
+      ws.send(initPayload);
+      status = "open";
       callbacks?.onOpen?.();
     };
     ws.onmessage = (event) => {
```

```diff
@@ -9434,10 +9844,10 @@ var SonioxAdapter = class extends BaseAdapter {
     ws.onclose = (event) => {
       status = "closed";
       const timeSinceOpen = openedAt ? Date.now() - openedAt : null;
-      const
-      if (
+      const isEarlyClose = timeSinceOpen !== null && timeSinceOpen < 5e3 && !receivedData;
+      if (isEarlyClose && event.code === 1e3) {
         const errorMessage = [
-          "Soniox closed connection
+          "Soniox closed connection shortly after opening.",
           `Current config: region=${this.region}, model=${modelId}`,
           "Likely causes:",
           " - Invalid API key or region mismatch (keys are region-specific, current: " + this.region + ")",
```
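
The close handler now names its heuristic: a clean close (code 1000) within five seconds of opening, with no transcript data received, is treated as a rejected init (bad key, wrong region, unknown model) rather than a normal end of session. The predicate in isolation:

```typescript
// The heuristic this hunk adds, extracted for clarity. Mirrors the diff's
// isEarlyClose check; 1000 is the WebSocket "normal closure" code.
function looksLikeRejectedInit(
  openedAt: number | null,
  receivedData: boolean,
  closeCode: number
): boolean {
  const timeSinceOpen = openedAt ? Date.now() - openedAt : null;
  const isEarlyClose = timeSinceOpen !== null && timeSinceOpen < 5000 && !receivedData;
  return isEarlyClose && closeCode === 1000;
}
```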

```diff
@@ -9558,8 +9968,10 @@ var SonioxAdapter = class extends BaseAdapter {
    * Normalize Soniox response to unified format
    */
   normalizeResponse(response) {
-    const text = response.text || (response.tokens ? response.tokens.filter((t) => t.is_final).map((t) => t.text).join("") : "");
-    const words = response.tokens ? response.tokens.filter(
+    const text = response.text || (response.tokens ? response.tokens.filter((t) => t.is_final !== false).map((t) => t.text).join("") : "");
+    const words = response.tokens ? response.tokens.filter(
+      (t) => t.is_final !== false && t.start_ms !== void 0 && t.end_ms !== void 0
+    ).map((token) => ({
       word: token.text,
       start: token.start_ms / 1e3,
       end: token.end_ms / 1e3,
```
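
The token filter loosens from a truthy `is_final` check to `is_final !== false`, so tokens that omit the flag entirely (as async transcript output can) are kept instead of dropped; only tokens explicitly marked non-final are excluded. The difference in one snippet:

```typescript
// Why `is_final !== false` matters: tokens without the flag were dropped
// by the old truthy check.
const tokens = [
  { text: "hello", is_final: true },
  { text: " world" },                  // no is_final field (e.g. batch output)
  { text: " maybe", is_final: false }, // genuinely non-final
];
const oldKept = tokens.filter((t: any) => t.is_final);           // "hello"
const newKept = tokens.filter((t: any) => t.is_final !== false); // "hello world"
console.log(oldKept.map((t: any) => t.text).join(""), "|", newKept.map((t: any) => t.text).join(""));
```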

```diff
@@ -9576,7 +9988,8 @@ var SonioxAdapter = class extends BaseAdapter {
       id,
       label: `Speaker ${id}`
     })) : void 0;
-    const
+    const tokens = response.tokens ? response.tokens.filter((t) => t.is_final !== false) : [];
+    const utterances = tokens.length > 0 ? this.buildUtterancesFromTokens(tokens) : [];
     const language = response.tokens?.find((t) => t.language)?.language;
     return {
       success: true,
```

```diff
@@ -9586,7 +9999,7 @@ var SonioxAdapter = class extends BaseAdapter {
         text,
         status: TranscriptionStatus.completed,
         language,
-        duration: response.total_audio_proc_ms ? response.total_audio_proc_ms / 1e3 : void 0,
+        duration: response.audio_duration_ms ? response.audio_duration_ms / 1e3 : response.total_audio_proc_ms ? response.total_audio_proc_ms / 1e3 : void 0,
         speakers,
         words: words.length > 0 ? words : void 0,
         utterances: utterances.length > 0 ? utterances : void 0
```

```diff
@@ -36453,7 +36866,7 @@ var AzureCapabilities = {
   deleteTranscript: true
 };
 var SpeechmaticsCapabilities = {
-  streaming: false,
+  streaming: true,
   diarization: true,
   wordTimestamps: true,
   languageDetection: false,
```