@voice-kit/core 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +2137 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +1466 -4
- package/dist/index.d.ts +1466 -4
- package/dist/index.js +2102 -1
- package/dist/index.js.map +1 -1
- package/package.json +1 -31
- package/dist/audio.cjs +0 -533
- package/dist/audio.cjs.map +0 -1
- package/dist/audio.d.cts +0 -260
- package/dist/audio.d.ts +0 -260
- package/dist/audio.js +0 -514
- package/dist/audio.js.map +0 -1
- package/dist/compliance.cjs +0 -343
- package/dist/compliance.cjs.map +0 -1
- package/dist/compliance.d.cts +0 -163
- package/dist/compliance.d.ts +0 -163
- package/dist/compliance.js +0 -335
- package/dist/compliance.js.map +0 -1
- package/dist/errors.cjs +0 -284
- package/dist/errors.cjs.map +0 -1
- package/dist/errors.d.cts +0 -100
- package/dist/errors.d.ts +0 -100
- package/dist/errors.js +0 -262
- package/dist/errors.js.map +0 -1
- package/dist/index-D3KfRXMP.d.cts +0 -319
- package/dist/index-D3KfRXMP.d.ts +0 -319
- package/dist/memory.cjs +0 -121
- package/dist/memory.cjs.map +0 -1
- package/dist/memory.d.cts +0 -29
- package/dist/memory.d.ts +0 -29
- package/dist/memory.js +0 -115
- package/dist/memory.js.map +0 -1
- package/dist/observability.cjs +0 -229
- package/dist/observability.cjs.map +0 -1
- package/dist/observability.d.cts +0 -122
- package/dist/observability.d.ts +0 -122
- package/dist/observability.js +0 -222
- package/dist/observability.js.map +0 -1
- package/dist/stt.cjs +0 -828
- package/dist/stt.cjs.map +0 -1
- package/dist/stt.d.cts +0 -308
- package/dist/stt.d.ts +0 -308
- package/dist/stt.js +0 -815
- package/dist/stt.js.map +0 -1
- package/dist/telephony.errors-BQYr6-vl.d.cts +0 -80
- package/dist/telephony.errors-C0-nScrF.d.ts +0 -80
- package/dist/tts.cjs +0 -429
- package/dist/tts.cjs.map +0 -1
- package/dist/tts.d.cts +0 -151
- package/dist/tts.d.ts +0 -151
- package/dist/tts.js +0 -418
- package/dist/tts.js.map +0 -1
package/dist/stt.js
DELETED
|
@@ -1,815 +0,0 @@
|
|
|
1
|
-
import { AssemblyAI } from 'assemblyai';
|
|
2
|
-
import pino from 'pino';
|
|
3
|
-
import { DeepgramClient } from '@deepgram/sdk';
|
|
4
|
-
import axios from 'axios';
|
|
5
|
-
import { EventEmitter } from 'events';
|
|
6
|
-
import { createOpenAI } from '@ai-sdk/openai';
|
|
7
|
-
|
|
8
|
-
// src/stt/assembly/index.ts
|
|
9
|
-
|
|
10
|
-
// src/errors/base.ts
|
|
11
|
-
/**
 * Root error type for the package's typed errors.
 *
 * Carries a machine-readable `code` plus optional call/provider context,
 * a `retryable` hint (defaults to `false`) and a `severity` (defaults to
 * `"medium"`). The original failure, if any, is kept on `cause`.
 */
var VoiceKitError = class extends Error {
  code;
  callId;
  provider;
  retryable;
  severity;
  cause;

  /**
   * @param params Object with `message` and `code`, and optionally `callId`,
   *   `provider`, `retryable`, `severity` and `cause`.
   */
  constructor(params) {
    const { message, code, callId, provider, retryable, severity, cause } = params;
    super(message);
    // Report the concrete subclass name rather than the generic "Error".
    this.name = this.constructor.name;
    Object.assign(this, {
      code,
      callId,
      provider,
      retryable: retryable ?? false,
      severity: severity ?? "medium",
      cause
    });
    Object.setPrototypeOf(this, new.target.prototype);
  }

  /**
   * Plain-object view used by `JSON.stringify`.
   * `cause` is not part of the serialized form.
   */
  toJSON() {
    const { name, code, message, callId, provider, retryable, severity } = this;
    return { name, code, message, callId, provider, retryable, severity };
  }
};
|
|
41
|
-
|
|
42
|
-
// src/errors/stt.errors.ts
|
|
43
|
-
/**
 * Base class for speech-to-text errors.
 * Adds the `languageCode` taken from the constructor params.
 */
var STTError = class extends VoiceKitError {
  languageCode;

  /**
   * @param params Same object accepted by `VoiceKitError`, optionally with `languageCode`.
   */
  constructor(params) {
    super(params);
    const { languageCode } = params;
    this.languageCode = languageCode;
  }
};
|
|
50
|
-
/**
 * Raised when a connection to an STT provider cannot be established.
 * Always marked retryable with severity "high".
 */
var STTConnectionError = class extends STTError {
  /**
   * @param provider Provider name, interpolated into the message.
   * @param cause    Underlying failure.
   * @param callId   Optional call identifier.
   */
  constructor(provider, cause, callId) {
    const details = {
      code: "STT_CONNECTION_FAILED",
      message: `Failed to connect to ${provider} STT service`,
      provider,
      callId,
      retryable: true,
      severity: "high",
      cause
    };
    super(details);
  }
};
|
|
63
|
-
/**
 * Raised when an STT request or stream fails.
 * Always marked retryable with severity "medium".
 */
var STTStreamError = class extends STTError {
  /**
   * @param provider Provider name, interpolated into the message.
   * @param cause    Underlying failure.
   * @param callId   Optional call identifier.
   */
  constructor(provider, cause, callId) {
    const details = {
      code: "STT_STREAM_ERROR",
      message: `STT stream error from ${provider}`,
      provider,
      callId,
      retryable: true,
      severity: "medium",
      cause
    };
    super(details);
  }
};
|
|
76
|
-
/**
 * Raised when a provider is asked for a language it does not list as supported.
 * Not retryable; severity "low". The rejected language is kept on `languageCode`.
 */
var STTLanguageNotSupportedError = class extends STTError {
  /**
   * @param provider Provider name, interpolated into the message.
   * @param language The language code that was rejected.
   */
  constructor(provider, language) {
    const details = {
      code: "STT_LANGUAGE_NOT_SUPPORTED",
      message: `Language '${language}' is not supported by ${provider}`,
      provider,
      retryable: false,
      severity: "low",
      languageCode: language
    };
    super(details);
  }
};
|
|
88
|
-
// Logger used by the AssemblyAI provider.
var logger = pino({ name: "@voice-kit/core:stt:assemblyai" });

// Language codes the AssemblyAI provider exposes through `supportedLanguages`.
var SUPPORTED_LANGUAGES = [
  "en", "en_au", "en_uk", "en_us",
  "hi",
  "fr", "de", "es", "it", "pt", "nl",
  "ja", "zh"
];
|
|
104
|
-
/**
 * Speech-to-text provider built on the AssemblyAI SDK client.
 *
 * This provider reports `supportsStreaming = false`: `transcribeStream`
 * buffers the whole input and delegates to `transcribeBatch`.
 */
var AssemblyAISTTProvider = class {
  name = "assemblyai";
  supportsStreaming = false;
  supportedLanguages = SUPPORTED_LANGUAGES;
  client;
  config;

  /**
   * @param config Provider options. `apiKey` falls back to the
   *   `ASSEMBLYAI_API_KEY` environment variable.
   * @throws {STTConnectionError} when no API key is available.
   */
  constructor(config) {
    const apiKey = config.apiKey ?? process.env["ASSEMBLYAI_API_KEY"];
    if (!apiKey) {
      throw new STTConnectionError("assemblyai", new Error("ASSEMBLYAI_API_KEY not set"));
    }
    this.client = new AssemblyAI({ apiKey });

    const language = config.language ?? "en";
    const alternateLanguages = config.alternateLanguages ?? [];
    const model = config.model ?? "best";
    const wordTimestamps = config.wordTimestamps ?? true;
    const smartFormat = config.smartFormat ?? true;
    this.config = {
      language,
      alternateLanguages,
      apiKey,
      model,
      wordTimestamps,
      // Interim results and region are fixed for this provider.
      interimResults: false,
      smartFormat,
      region: ""
    };
  }

  /**
   * Drains `audio`, concatenates the chunks and yields exactly one result
   * produced by `transcribeBatch`.
   *
   * @param audio Async iterable of audio buffers
   */
  async *transcribeStream(audio) {
    const parts = [];
    for await (const part of audio) {
      parts.push(part);
    }
    yield await this.transcribeBatch(Buffer.concat(parts));
  }

  /**
   * Sends a complete audio buffer to AssemblyAI and returns the transcript.
   *
   * @param audio Audio buffer handed to the SDK as-is
   * @returns Result with `isFinal: true`; `words` is present only when word
   *   timestamps are enabled and the response contains words.
   * @throws {STTStreamError} when the transcript status is "error" or the call fails.
   *
   * @example
   * ```ts
   * const stt = createSTT('assemblyai', { wordTimestamps: true })
   * const result = await stt.transcribeBatch(recordingBuffer)
   * console.log(result.words)
   * ```
   */
  async transcribeBatch(audio) {
    const startMs = Date.now();
    try {
      logger.debug({ bytes: audio.length, language: this.config.language }, "AssemblyAI transcription started");

      const request = {
        audio,
        language_code: this.config.language,
        speech_model: this.config.model,
        punctuate: this.config.smartFormat,
        format_text: this.config.smartFormat,
        word_boost: []
      };
      if (this.config.wordTimestamps) {
        request.timestamps = true;
      }
      const transcript = await this.client.transcripts.transcribe(request);

      if (transcript.status === "error") {
        throw new STTStreamError("assemblyai", new Error(transcript.error ?? "Transcription failed"));
      }

      logger.info(
        { id: transcript.id, duration: transcript.audio_duration, latencyMs: Date.now() - startMs },
        "AssemblyAI transcription complete"
      );

      const toWord = (w) => ({
        word: w.text,
        startMs: w.start,
        endMs: w.end,
        confidence: w.confidence
      });
      const includeWords = this.config.wordTimestamps && transcript.words;

      return {
        transcript: transcript.text ?? "",
        isFinal: true,
        // 0.9 is substituted when the response carries no confidence.
        confidence: transcript.confidence ?? 0.9,
        language: this.config.language,
        languageSwitchDetected: false,
        words: includeWords ? transcript.words.map(toWord) : void 0,
        latencyMs: Date.now() - startMs
      };
    } catch (err) {
      // Errors raised above are already typed; everything else gets wrapped.
      if (err instanceof STTStreamError) throw err;
      throw new STTStreamError("assemblyai", err);
    }
  }
};
|
|
190
|
-
// Logger used by the Deepgram provider.
var logger2 = pino({ name: "@voice-kit/core:stt:deepgram" });

// Language codes the Deepgram provider exposes through `supportedLanguages`.
var SUPPORTED_LANGUAGES2 = [
  "en-IN", "hi-IN", "ta-IN", "te-IN", "kn-IN", "mr-IN",
  "en-US", "en-GB", "en-AU"
];

// Reconnect policy: exponential delay starting at `baseMs`, capped at `maxMs`,
// randomized by +/- `jitterPct`, for at most `maxAttempts` reconnects.
var BACKOFF = {
  baseMs: 100,
  maxMs: 5000,
  jitterPct: 0.2,
  maxAttempts: 3
};
|
|
208
|
-
/**
 * Compute the wait before reconnect attempt number `attempt`.
 *
 * The delay is `baseMs * 2^attempt`, capped at `maxMs`, then shifted by a
 * uniformly random amount within +/- `jitterPct` of that capped value.
 *
 * @param {number} attempt Zero-based attempt counter.
 * @param {{ baseMs: number, maxMs: number, jitterPct: number }} [policy=BACKOFF]
 *   Backoff parameters; defaults to the module-level `BACKOFF` policy.
 * @returns {number} Delay in whole milliseconds, never negative.
 */
function backoffDelay(attempt, policy = BACKOFF) {
  const { baseMs, maxMs, jitterPct } = policy;
  const capped = Math.min(baseMs * 2 ** attempt, maxMs);
  // Math.random() * 2 - 1 is uniform in [-1, 1).
  const jitter = capped * jitterPct * (Math.random() * 2 - 1);
  // Clamp so a custom policy with jitterPct > 1 can never yield a negative delay.
  return Math.max(0, Math.round(capped + jitter));
}
|
|
213
|
-
/**
 * Speech-to-text provider built on the Deepgram SDK client.
 *
 * `transcribeStream` streams audio over a live connection and yields interim
 * and final results; `transcribeBatch` transcribes one complete buffer.
 */
var DeepgramSTTProvider = class {
  name = "deepgram";
  supportsStreaming = true;
  supportedLanguages = SUPPORTED_LANGUAGES2;
  client;
  config;

  /**
   * @param config Provider options. `apiKey` falls back to the
   *   `DEEPGRAM_API_KEY` environment variable.
   * @throws {STTConnectionError} when no API key is available.
   */
  constructor(config) {
    const apiKey = config.apiKey ?? process.env["DEEPGRAM_API_KEY"];
    if (!apiKey) throw new STTConnectionError("deepgram", new Error("DEEPGRAM_API_KEY not set"));
    this.client = new DeepgramClient({ apiKey });
    this.config = {
      language: config.language ?? "en-IN",
      alternateLanguages: config.alternateLanguages ?? [],
      apiKey,
      // Model used when the caller does not choose one.
      model: config.model ?? "nova-3",
      wordTimestamps: config.wordTimestamps ?? false,
      interimResults: config.interimResults ?? true,
      smartFormat: config.smartFormat ?? true,
      region: config.region ?? ""
    };
  }

  /**
   * Stream audio to Deepgram and yield transcription results as they arrive.
   *
   * On a stream error the connection is closed and a new one is opened, up to
   * `BACKOFF.maxAttempts` reconnects. The wait between attempts is applied
   * once, inside `connectWithRetry`. Results already received on a failing
   * connection are yielded before reconnecting or throwing.
   *
   * NOTE(review): each attempt iterates the same `audio` iterable again; whether
   * a reconnect can still read audio depends on the iterable the caller passes —
   * confirm against callers.
   *
   * @param audio Async iterable of audio buffers (the connection is opened as
   *   16 kHz mono linear16)
   * @throws {STTStreamError} when the stream fails on the last allowed attempt.
   * @throws {STTConnectionError} when a connection cannot be opened.
   *
   * @example
   * ```ts
   * const stt = createSTT('deepgram', { language: 'hi-IN' })
   * for await (const result of stt.transcribeStream(audioIterable)) {
   *   if (result.isFinal) console.log('User said:', result.transcript)
   * }
   * ```
   */
  async *transcribeStream(audio) {
    let attempt = 0;
    const startMs = Date.now();
    while (attempt <= BACKOFF.maxAttempts) {
      const connection = await this.connectWithRetry(attempt);
      const results = [];
      let done = false;
      let error = null;

      connection.on("message", (data) => {
        if (data.type !== "Results") return;
        const alt = data.channel?.alternatives?.[0];
        if (!alt?.transcript) return;
        // Only `is_final` is surfaced; `speech_final` is not read here.
        const isFinal = data.is_final === true;
        const result = {
          transcript: alt.transcript,
          isFinal,
          confidence: alt.confidence ?? 0,
          // Falls back to the configured language when the response names none.
          language: alt.languages?.[0] ?? this.config.language,
          languageSwitchDetected: false,
          words: this.config.wordTimestamps ? alt.words?.map((w) => ({
            word: w.word ?? "",
            // Converted from seconds to milliseconds.
            startMs: (w.start ?? 0) * 1e3,
            endMs: (w.end ?? 0) * 1e3,
            confidence: w.confidence ?? 0,
            punctuatedWord: w.punctuated_word
          })) : void 0,
          latencyMs: Date.now() - startMs
        };
        results.push(result);
        if (isFinal) {
          logger2.debug(
            { transcript: result.transcript, confidence: result.confidence, language: result.language },
            "Deepgram final transcript"
          );
        }
      });
      connection.on("close", () => {
        done = true;
      });
      connection.on("error", (err) => {
        error = err;
        logger2.warn({ err, attempt }, "Deepgram stream error");
      });

      // Pump audio in the background. Failures are recorded in `error`, so the
      // returned promise never rejects.
      const sendAudio = async () => {
        try {
          for await (const chunk of audio) {
            connection.socket.send(chunk);
          }
          connection.socket.send(JSON.stringify({ type: "Finalize" }));
        } catch (err) {
          error = err instanceof Error ? err : new Error(String(err));
        }
      };
      const sendPromise = sendAudio();

      let resultIndex = 0;
      while (!done || resultIndex < results.length) {
        if (resultIndex < results.length) {
          yield results[resultIndex++];
        } else {
          // Nothing buffered yet: poll again shortly.
          await new Promise((r) => setTimeout(r, 10));
        }
        if (error) {
          // Hand over everything already received before abandoning this connection.
          while (resultIndex < results.length) {
            yield results[resultIndex++];
          }
          try {
            connection.socket.close();
          } catch {
            // Best effort: the socket may already be closed.
          }
          if (attempt < BACKOFF.maxAttempts) break;
          await sendPromise;
          throw new STTStreamError("deepgram", error);
        }
      }

      await sendPromise;
      if (!error) return;
      attempt++;
      // No sleep here: connectWithRetry() already waits backoffDelay(attempt).
      logger2.info({ attempt }, "Deepgram reconnecting...");
    }
    throw new STTStreamError("deepgram", new Error("Max reconnect attempts exceeded"));
  }

  /**
   * Transcribe a complete audio buffer (non-streaming) through
   * `listen.v1.media.transcribeFile`.
   *
   * @param audio Audio buffer handed to the SDK as-is
   * @returns Result with `isFinal: true`; empty transcript and confidence 0
   *   when the response has no alternative.
   * @throws {STTStreamError} when the request fails.
   */
  async transcribeBatch(audio) {
    const startMs = Date.now();
    try {
      const response = await this.client.listen.v1.media.transcribeFile(
        audio,
        {
          model: this.config.model,
          language: this.config.language,
          // Honors the configured smartFormat (default true).
          smart_format: this.config.smartFormat,
          diarize: false
        }
      );
      const alt = response?.results?.channels?.[0]?.alternatives?.[0];
      return {
        transcript: alt?.transcript ?? "",
        isFinal: true,
        confidence: alt?.confidence ?? 0,
        language: this.config.language,
        languageSwitchDetected: false,
        latencyMs: Date.now() - startMs
      };
    } catch (err) {
      if (err instanceof STTStreamError) throw err;
      throw new STTStreamError("deepgram", err instanceof Error ? err : new Error(String(err)));
    }
  }

  /**
   * Create and open a live connection to Deepgram.
   *
   * Steps, as performed below:
   *   1. await listen.v1.connect(options)
   *   2. connection.connect()
   *   3. await connection.waitForOpen(), raced against a 10 s timeout
   *
   * For `attempt > 0` the method first waits `backoffDelay(attempt)`.
   *
   * @param attempt Zero-based attempt counter
   * @throws {STTConnectionError} on timeout or any connection failure.
   * @internal
   */
  async connectWithRetry(attempt) {
    const delay = attempt > 0 ? backoffDelay(attempt) : 0;
    if (delay > 0) await new Promise((r) => setTimeout(r, delay));

    let timeoutHandle;
    try {
      logger2.debug({ attempt, language: this.config.language }, "Connecting to Deepgram");
      const connection = await this.client.listen.v1.connect({
        model: this.config.model,
        language: this.config.language,
        // Boolean-like options are passed as strings on this call.
        smart_format: String(this.config.smartFormat),
        interim_results: String(this.config.interimResults),
        encoding: "linear16",
        sample_rate: 16e3,
        channels: 1,
        utterance_end_ms: "1000",
        ...this.config.alternateLanguages.length > 0 && {
          detect_language: "true",
          // Language is unset whenever detect_language is switched on.
          language: void 0
        },
        Authorization: `Token ${this.config.apiKey}`
      });
      connection.connect();

      const timeout = new Promise((_, reject) => {
        timeoutHandle = setTimeout(
          () => reject(new STTConnectionError("deepgram", new Error("Connection timeout"))),
          1e4
        );
      });
      await Promise.race([connection.waitForOpen(), timeout]);

      logger2.info({ attempt, language: this.config.language }, "Deepgram connected");
      return connection;
    } catch (err) {
      if (err instanceof STTConnectionError) throw err;
      throw new STTConnectionError("deepgram", err instanceof Error ? err : new Error(String(err)));
    } finally {
      // Cancel the timeout so it cannot keep the event loop alive after the race settles.
      clearTimeout(timeoutHandle);
    }
  }
};
|
|
418
|
-
// Logger used by the Sarvam provider.
var logger3 = pino({ name: "@voice-kit/core:stt:sarvam" });

// Base URL for all Sarvam HTTP requests.
var SARVAM_API_BASE = "https://api.sarvam.ai";

// Language codes accepted by the Sarvam provider's constructor.
var SUPPORTED_LANGUAGES3 = [
  "hi-IN", "kn-IN", "ta-IN", "te-IN", "mr-IN",
  "bn-IN", "gu-IN", "pa-IN", "or-IN", "ml-IN"
];

// Default model per language, used when the caller does not set `model`.
var SARVAM_MODELS = {
  "hi-IN": "saarika:v1",
  "kn-IN": "saarika:v1",
  "ta-IN": "saarika:v1",
  "te-IN": "saarika:v1",
  "mr-IN": "saarika:v1",
  "bn-IN": "saarika:v1",
  "gu-IN": "saarika:v1",
  "pa-IN": "saarika:v1",
  "or-IN": "saarika:v1",
  "ml-IN": "saarika:v1"
};
|
|
444
|
-
/**
 * Speech-to-text provider that posts audio to the Sarvam HTTP API.
 *
 * This provider reports `supportsStreaming = false`: `transcribeStream`
 * buffers the whole input and delegates to `transcribeBatch`.
 */
var SarvamSTTProvider = class {
  name = "sarvam";
  supportsStreaming = false;
  supportedLanguages = SUPPORTED_LANGUAGES3;
  http;
  config;

  /**
   * @param config Provider options. `apiKey` falls back to the
   *   `SARVAM_API_KEY` environment variable; `language` defaults to "hi-IN".
   * @throws {STTConnectionError} when no API key is available.
   * @throws {STTLanguageNotSupportedError} when `language` is not in the supported list.
   */
  constructor(config) {
    const apiKey = config.apiKey ?? process.env["SARVAM_API_KEY"];
    if (!apiKey) {
      throw new STTConnectionError("sarvam", new Error("SARVAM_API_KEY not set"));
    }

    const language = config.language ?? "hi-IN";
    const isSupported = SUPPORTED_LANGUAGES3.includes(language);
    if (!isSupported) {
      throw new STTLanguageNotSupportedError("sarvam", language);
    }

    const headers = {
      "API-Subscription-Key": apiKey,
      "Content-Type": "multipart/form-data"
    };
    this.http = axios.create({ baseURL: SARVAM_API_BASE, headers, timeout: 30000 });

    this.config = {
      language,
      alternateLanguages: config.alternateLanguages ?? [],
      apiKey,
      model: config.model ?? SARVAM_MODELS[language] ?? "saarika:v1",
      // Word timestamps and interim results are always off for this provider.
      wordTimestamps: false,
      interimResults: false,
      smartFormat: config.smartFormat ?? true,
      region: config.region ?? ""
    };
  }

  /**
   * Drains `audio`, concatenates the chunks and yields exactly one result
   * produced by `transcribeBatch`.
   *
   * @param audio Async iterable of audio buffers
   */
  async *transcribeStream(audio) {
    const parts = [];
    for await (const part of audio) {
      parts.push(part);
    }
    yield await this.transcribeBatch(Buffer.concat(parts));
  }

  /**
   * Uploads one audio buffer as multipart form data to `/speech-to-text`.
   *
   * @param audio Audio buffer; sent as a file named "audio.wav"
   * @returns Result with `isFinal: true`, using the response's language code
   *   when present.
   * @throws {STTStreamError} on any failure; Axios errors are summarized by
   *   HTTP status and response body.
   *
   * @example
   * ```ts
   * const stt = createSTT('sarvam', { language: 'ta-IN' })
   * const result = await stt.transcribeBatch(tamilAudioBuffer)
   * console.log(result.transcript)
   * ```
   */
  async transcribeBatch(audio) {
    const startMs = Date.now();
    try {
      logger3.debug(
        { language: this.config.language, bytes: audio.length },
        "Sarvam transcription request"
      );

      const upload = new Blob([audio], { type: "audio/wav" });
      const form = new FormData();
      form.append("file", upload, "audio.wav");
      form.append("language_code", this.config.language);
      form.append("model", this.config.model);
      if (this.config.smartFormat) {
        form.append("with_disfluencies", "false");
      }

      const { data } = await this.http.post("/speech-to-text", form);

      logger3.info(
        { language: data.language_code, confidence: data.confidence, latencyMs: Date.now() - startMs },
        "Sarvam transcription complete"
      );

      return {
        transcript: data.transcript,
        isFinal: true,
        // 0.9 is substituted when the response carries no confidence.
        confidence: data.confidence ?? 0.9,
        language: data.language_code ?? this.config.language,
        languageSwitchDetected: false,
        latencyMs: Date.now() - startMs
      };
    } catch (err) {
      const cause = axios.isAxiosError(err)
        ? new Error(`Sarvam API error: ${err.response?.status} ${JSON.stringify(err.response?.data)}`)
        : err;
      throw new STTStreamError("sarvam", cause);
    }
  }
};
|
|
544
|
-
// Logger used by the language-switch detector.
var logger4 = pino({ name: "@voice-kit/core:stt:language-detect" });

// Matches any character in the Unicode block U+0900..U+097F (Devanagari).
var DEVANAGARI_RANGE = /[\u0900-\u097F]/;

// Fewer classifiable words than this yields a confidence of 0.
var MIN_WORDS_FOR_CLASSIFICATION = 2;

// Minimum confidence required before a language switch is reported.
var SWITCH_CONFIDENCE_THRESHOLD = 0.6;

// Tokens dropped during tokenization so they do not influence classification.
var NEUTRAL_TOKENS = /* @__PURE__ */ new Set([
  "ok", "okay",
  "haan", "nahin", "nahi", "kya", "hai", "ho", "na", "toh", "aur", "ya", "matlab", "yani",
  "i", "a", "the", "is", "are", "and", "or"
]);
|
|
571
|
-
/**
 * Tracks the conversation language (Hindi in Devanagari vs. English in Latin
 * script) across transcripts and emits `"language.switched"` when it changes.
 *
 * Each analyzed transcript is classified, the classification is pushed into a
 * short rolling window, and the window majority decides the smoothed language.
 */
var LanguageSwitchDetector = class extends EventEmitter {
  currentLanguage;
  primaryLanguage;
  /** Rolling window of recent language classifications for smoothing. */
  recentClassifications = [];
  windowSize = 5;

  /**
   * @param primaryLanguage Starting language; also what `reset()` restores.
   */
  constructor(primaryLanguage = "en-IN") {
    super();
    this.primaryLanguage = primaryLanguage;
    this.currentLanguage = primaryLanguage;
  }

  /**
   * Analyze a transcript for a language switch.
   *
   * A switch is reported only when the smoothed language differs from the
   * current one AND the words of this transcript support the smoothed language
   * with at least `SWITCH_CONFIDENCE_THRESHOLD` confidence. Measuring the
   * confidence against the smoothed language prevents a transcript in one
   * language from triggering a switch to the other.
   *
   * @param transcript The transcribed text to analyze
   * @returns The current language after the analysis
   */
  analyze(transcript) {
    const words = this.tokenize(transcript);
    if (words.length === 0) return this.currentLanguage;

    this.recentClassifications.push(this.classifySegment(words));
    if (this.recentClassifications.length > this.windowSize) {
      this.recentClassifications.shift();
    }

    const smoothed = this.smoothedLanguage();
    if (smoothed === this.currentLanguage || smoothed === "unknown") {
      return this.currentLanguage;
    }

    const confidence = this.computeConfidence(words, smoothed);
    if (confidence < SWITCH_CONFIDENCE_THRESHOLD) {
      return this.currentLanguage;
    }

    const event = {
      from: this.currentLanguage,
      to: smoothed,
      // Always 0: no conversation position is tracked here.
      position: 0,
      confidence,
      transcript,
      detectedAt: /* @__PURE__ */ new Date()
    };
    const prev = this.currentLanguage;
    this.currentLanguage = smoothed;
    logger4.info(
      { from: prev, to: smoothed, confidence, transcript: transcript.slice(0, 50) },
      "Language switch detected"
    );
    this.emit("language.switched", event);
    return this.currentLanguage;
  }

  /**
   * Classify every non-neutral token of a transcript.
   *
   * @param transcript Text to analyze
   * @returns Array of `{ word, language }` pairs; `word` is the lowercased
   *   token exactly as split on whitespace.
   */
  analyzeWords(transcript) {
    return this.tokenize(transcript).map((word) => ({
      word,
      language: this.classifyWord(word)
    }));
  }

  /** Restore the primary language and clear the smoothing window. */
  reset() {
    this.currentLanguage = this.primaryLanguage;
    this.recentClassifications = [];
  }

  /** Current detected language. */
  get language() {
    return this.currentLanguage;
  }

  // ─── Private helpers ────────────────────────────────────────────────────────

  /**
   * Remove leading and trailing characters that are not letters, combining
   * marks or digits, so "hello," and "okay." compare as "hello" and "okay".
   */
  stripPunctuation(token) {
    return token.replace(/^[^\p{L}\p{M}\p{N}]+|[^\p{L}\p{M}\p{N}]+$/gu, "");
  }

  /**
   * Lowercase, split on whitespace and drop empty and neutral tokens.
   * Neutral matching ignores surrounding punctuation; kept tokens are returned
   * unmodified.
   */
  tokenize(text) {
    return text
      .toLowerCase()
      .split(/\s+/)
      .filter((w) => w.length > 0 && !NEUTRAL_TOKENS.has(this.stripPunctuation(w)));
  }

  /**
   * Classify one lowercased token: any Devanagari character means "hi-IN";
   * a purely Latin word means "en-IN" (surrounding punctuation ignored,
   * inner apostrophes allowed); anything else is "unknown".
   */
  classifyWord(word) {
    if (DEVANAGARI_RANGE.test(word)) return "hi-IN";
    if (/^[a-z]+(?:['\u2019][a-z]+)*$/.test(this.stripPunctuation(word))) return "en-IN";
    return "unknown";
  }

  /**
   * Majority vote over the tokens. Returns "unknown" when nothing is
   * classifiable and the primary language on a tie.
   */
  classifySegment(words) {
    let hindiCount = 0;
    let englishCount = 0;
    for (const word of words) {
      const lang = this.classifyWord(word);
      if (lang === "hi-IN") hindiCount++;
      else if (lang === "en-IN") englishCount++;
    }
    if (hindiCount === 0 && englishCount === 0) return "unknown";
    if (hindiCount > englishCount) return "hi-IN";
    if (englishCount > hindiCount) return "en-IN";
    return this.primaryLanguage;
  }

  /**
   * Share of classifiable tokens that match `classification`; 0 when fewer
   * than `MIN_WORDS_FOR_CLASSIFICATION` tokens are classifiable.
   */
  computeConfidence(words, classification) {
    const labels = words
      .map((w) => this.classifyWord(w))
      .filter((lang) => lang !== "unknown");
    if (labels.length < MIN_WORDS_FOR_CLASSIFICATION) return 0;
    const matching = labels.filter((lang) => lang === classification);
    return matching.length / labels.length;
  }

  /**
   * Majority language of the rolling window. Only "hi-IN" and "en-IN" votes
   * count; a tie keeps the current language.
   */
  smoothedLanguage() {
    if (this.recentClassifications.length === 0) return this.primaryLanguage;
    let hindiVotes = 0;
    let englishVotes = 0;
    for (const lang of this.recentClassifications) {
      if (lang === "hi-IN") hindiVotes++;
      else if (lang === "en-IN") englishVotes++;
    }
    if (hindiVotes > englishVotes) return "hi-IN";
    if (englishVotes > hindiVotes) return "en-IN";
    return this.currentLanguage;
  }
};
|
|
681
|
-
/**
 * Tell whether a transcript mixes scripts.
 *
 * @param transcript Text to inspect
 * @returns `true` only when the text contains at least one Devanagari
 *   character and at least one ASCII letter.
 */
function isInglish(transcript) {
  if (!DEVANAGARI_RANGE.test(transcript)) {
    return false;
  }
  return /[a-zA-Z]/.test(transcript);
}
|
|
686
|
-
// Logger used by the Whisper provider.
var logger5 = pino({ name: "@voice-kit/core:stt:whisper" });

// Two-letter language codes accepted by the Whisper provider's constructor.
var WHISPER_LANGUAGES = [
  "en", "hi", "ta", "te", "kn", "mr", "bn", "gu", "pa", "ur",
  "fr", "de", "es", "pt", "it", "nl", "pl", "ru",
  "ja", "zh"
];
|
|
709
|
-
/**
 * Speech-to-text provider that posts audio to the OpenAI transcription
 * endpoint.
 *
 * This provider reports `supportsStreaming = false`: `transcribeStream`
 * buffers the whole input and delegates to `transcribeBatch`.
 */
var WhisperSTTProvider = class {
  name = "whisper";
  supportsStreaming = false;
  supportedLanguages = WHISPER_LANGUAGES;
  config;

  /**
   * @param config Provider options. `apiKey` falls back to the
   *   `OPENAI_API_KEY` environment variable; `language` defaults to "en-IN".
   * @throws {STTStreamError} when no API key is available.
   *   NOTE(review): the other providers throw STTConnectionError for a missing
   *   key; kept as-is so existing callers are unaffected.
   * @throws {STTLanguageNotSupportedError} when the language's base code
   *   (the part before "-") is not in `WHISPER_LANGUAGES`.
   */
  constructor(config) {
    const apiKey = config.apiKey ?? process.env["OPENAI_API_KEY"];
    if (!apiKey) throw new STTStreamError("whisper", new Error("OPENAI_API_KEY not set"));
    const language = config.language ?? "en-IN";
    const whisperLang = language.split("-")[0] ?? "en";
    if (!WHISPER_LANGUAGES.includes(whisperLang)) {
      throw new STTLanguageNotSupportedError("whisper", language);
    }
    this.config = {
      language,
      alternateLanguages: config.alternateLanguages ?? [],
      apiKey,
      model: config.model ?? "whisper-1",
      wordTimestamps: config.wordTimestamps ?? false,
      // Interim results, smart formatting and region are fixed for this provider.
      interimResults: false,
      smartFormat: false,
      region: ""
    };
  }

  /**
   * Drains `audio`, concatenates the chunks and yields exactly one result
   * produced by `transcribeBatch`.
   *
   * @param audio Async iterable of audio buffers
   */
  async *transcribeStream(audio) {
    const chunks = [];
    for await (const chunk of audio) chunks.push(chunk);
    const result = await this.transcribeBatch(Buffer.concat(chunks));
    yield result;
  }

  /**
   * Transcribe a complete audio buffer with one multipart POST.
   *
   * @param audio Audio buffer; sent as a file named "audio.wav"
   * @returns Result with `isFinal: true`. Confidence is a fixed 0.95 because
   *   the response is not read for a confidence value. `words` is present only
   *   when word timestamps are enabled and the response contains words.
   * @throws {STTStreamError} on a non-2xx response or any other failure.
   */
  async transcribeBatch(audio) {
    const startMs = Date.now();
    // Only the base language code (before "-") is sent.
    const language = this.config.language.split("-")[0] ?? "en";
    try {
      logger5.debug({ language, bytes: audio.length }, "Whisper batch transcription");

      const file = new File([audio], "audio.wav", { type: "audio/wav" });
      const formData = new FormData();
      formData.append("file", file);
      formData.append("model", this.config.model);
      formData.append("language", language);
      if (this.config.wordTimestamps) {
        formData.append("timestamp_granularities[]", "word");
        formData.append("response_format", "verbose_json");
      }

      const response = await fetch("https://api.openai.com/v1/audio/transcriptions", {
        method: "POST",
        headers: { Authorization: `Bearer ${this.config.apiKey}` },
        body: formData
      });
      if (!response.ok) {
        throw new Error(`Whisper API error: ${response.status} ${response.statusText}`);
      }

      const data = await response.json();
      return {
        transcript: data.text,
        isFinal: true,
        confidence: 0.95,
        language: this.config.language,
        languageSwitchDetected: false,
        words: this.config.wordTimestamps && data.words ? data.words.map((w) => ({
          word: w.word,
          // Converted from seconds to milliseconds.
          startMs: w.start * 1e3,
          endMs: w.end * 1e3,
          confidence: 0.95
        })) : void 0,
        latencyMs: Date.now() - startMs
      };
    } catch (err) {
      if (err instanceof STTStreamError) throw err;
      throw new STTStreamError("whisper", err);
    }
  }
};
|
|
793
|
-
|
|
794
|
-
// src/stt/STT-factory.ts
|
|
795
|
-
/**
 * Build the STT provider registered under `provider`.
 *
 * @param provider One of "deepgram", "whisper", "assemblyai", "sarvam"
 * @param config   Optional provider options; an empty object is used when omitted
 * @returns A new provider instance
 * @throws {Error} when `provider` is not one of the known names
 */
function createSTT(provider, config) {
  const options = config ?? {};
  const registry = new Map([
    ["deepgram", DeepgramSTTProvider],
    ["whisper", WhisperSTTProvider],
    ["assemblyai", AssemblyAISTTProvider],
    ["sarvam", SarvamSTTProvider]
  ]);
  const Provider = registry.get(provider);
  if (Provider === undefined) {
    throw new Error(`Unknown STT provider: ${String(provider)}`);
  }
  return new Provider(options);
}
|
|
812
|
-
|
|
813
|
-
export { AssemblyAISTTProvider, DeepgramSTTProvider, LanguageSwitchDetector, SarvamSTTProvider, WhisperSTTProvider, createSTT, isInglish };
|
|
814
|
-
//# sourceMappingURL=stt.js.map
|
|
815
|
-
//# sourceMappingURL=stt.js.map
|