@voice-kit/core 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. package/dist/index.cjs +2137 -0
  2. package/dist/index.cjs.map +1 -1
  3. package/dist/index.d.cts +1466 -4
  4. package/dist/index.d.ts +1466 -4
  5. package/dist/index.js +2102 -1
  6. package/dist/index.js.map +1 -1
  7. package/package.json +1 -31
  8. package/dist/audio.cjs +0 -533
  9. package/dist/audio.cjs.map +0 -1
  10. package/dist/audio.d.cts +0 -260
  11. package/dist/audio.d.ts +0 -260
  12. package/dist/audio.js +0 -514
  13. package/dist/audio.js.map +0 -1
  14. package/dist/compliance.cjs +0 -343
  15. package/dist/compliance.cjs.map +0 -1
  16. package/dist/compliance.d.cts +0 -163
  17. package/dist/compliance.d.ts +0 -163
  18. package/dist/compliance.js +0 -335
  19. package/dist/compliance.js.map +0 -1
  20. package/dist/errors.cjs +0 -284
  21. package/dist/errors.cjs.map +0 -1
  22. package/dist/errors.d.cts +0 -100
  23. package/dist/errors.d.ts +0 -100
  24. package/dist/errors.js +0 -262
  25. package/dist/errors.js.map +0 -1
  26. package/dist/index-D3KfRXMP.d.cts +0 -319
  27. package/dist/index-D3KfRXMP.d.ts +0 -319
  28. package/dist/memory.cjs +0 -121
  29. package/dist/memory.cjs.map +0 -1
  30. package/dist/memory.d.cts +0 -29
  31. package/dist/memory.d.ts +0 -29
  32. package/dist/memory.js +0 -115
  33. package/dist/memory.js.map +0 -1
  34. package/dist/observability.cjs +0 -229
  35. package/dist/observability.cjs.map +0 -1
  36. package/dist/observability.d.cts +0 -122
  37. package/dist/observability.d.ts +0 -122
  38. package/dist/observability.js +0 -222
  39. package/dist/observability.js.map +0 -1
  40. package/dist/stt.cjs +0 -828
  41. package/dist/stt.cjs.map +0 -1
  42. package/dist/stt.d.cts +0 -308
  43. package/dist/stt.d.ts +0 -308
  44. package/dist/stt.js +0 -815
  45. package/dist/stt.js.map +0 -1
  46. package/dist/telephony.errors-BQYr6-vl.d.cts +0 -80
  47. package/dist/telephony.errors-C0-nScrF.d.ts +0 -80
  48. package/dist/tts.cjs +0 -429
  49. package/dist/tts.cjs.map +0 -1
  50. package/dist/tts.d.cts +0 -151
  51. package/dist/tts.d.ts +0 -151
  52. package/dist/tts.js +0 -418
  53. package/dist/tts.js.map +0 -1
package/dist/stt.cjs DELETED
@@ -1,828 +0,0 @@
1
- 'use strict';
2
-
3
- var assemblyai = require('assemblyai');
4
- var pino = require('pino');
5
- var sdk = require('@deepgram/sdk');
6
- var axios = require('axios');
7
- var events = require('events');
8
- var openai = require('@ai-sdk/openai');
9
-
10
- function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
11
-
12
- var pino__default = /*#__PURE__*/_interopDefault(pino);
13
- var axios__default = /*#__PURE__*/_interopDefault(axios);
14
-
15
- // src/stt/assembly/index.ts
16
-
17
- // src/errors/base.ts
18
- var VoiceKitError = class extends Error {
19
- code;
20
- callId;
21
- provider;
22
- retryable;
23
- severity;
24
- cause;
25
- constructor(params) {
26
- super(params.message);
27
- this.name = this.constructor.name;
28
- this.code = params.code;
29
- this.callId = params.callId;
30
- this.provider = params.provider;
31
- this.retryable = params.retryable ?? false;
32
- this.severity = params.severity ?? "medium";
33
- this.cause = params.cause;
34
- Object.setPrototypeOf(this, new.target.prototype);
35
- }
36
- toJSON() {
37
- return {
38
- name: this.name,
39
- code: this.code,
40
- message: this.message,
41
- callId: this.callId,
42
- provider: this.provider,
43
- retryable: this.retryable,
44
- severity: this.severity
45
- };
46
- }
47
- };
48
-
49
- // src/errors/stt.errors.ts
50
- var STTError = class extends VoiceKitError {
51
- languageCode;
52
- constructor(params) {
53
- super(params);
54
- this.languageCode = params.languageCode;
55
- }
56
- };
57
- var STTConnectionError = class extends STTError {
58
- constructor(provider, cause, callId) {
59
- super({
60
- code: "STT_CONNECTION_FAILED",
61
- message: `Failed to connect to ${provider} STT service`,
62
- provider,
63
- callId,
64
- retryable: true,
65
- severity: "high",
66
- cause
67
- });
68
- }
69
- };
70
- var STTStreamError = class extends STTError {
71
- constructor(provider, cause, callId) {
72
- super({
73
- code: "STT_STREAM_ERROR",
74
- message: `STT stream error from ${provider}`,
75
- provider,
76
- callId,
77
- retryable: true,
78
- severity: "medium",
79
- cause
80
- });
81
- }
82
- };
83
- var STTLanguageNotSupportedError = class extends STTError {
84
- constructor(provider, language) {
85
- super({
86
- code: "STT_LANGUAGE_NOT_SUPPORTED",
87
- message: `Language '${language}' is not supported by ${provider}`,
88
- provider,
89
- retryable: false,
90
- severity: "low",
91
- languageCode: language
92
- });
93
- }
94
- };
95
- var logger = pino__default.default({ name: "@voice-kit/core:stt:assemblyai" });
96
- var SUPPORTED_LANGUAGES = [
97
- "en",
98
- "en_au",
99
- "en_uk",
100
- "en_us",
101
- "hi",
102
- "fr",
103
- "de",
104
- "es",
105
- "it",
106
- "pt",
107
- "nl",
108
- "ja",
109
- "zh"
110
- ];
111
- var AssemblyAISTTProvider = class {
112
- name = "assemblyai";
113
- supportsStreaming = false;
114
- supportedLanguages = SUPPORTED_LANGUAGES;
115
- client;
116
- config;
117
- constructor(config) {
118
- const apiKey = config.apiKey ?? process.env["ASSEMBLYAI_API_KEY"];
119
- if (!apiKey) throw new STTConnectionError("assemblyai", new Error("ASSEMBLYAI_API_KEY not set"));
120
- this.client = new assemblyai.AssemblyAI({ apiKey });
121
- this.config = {
122
- language: config.language ?? "en",
123
- alternateLanguages: config.alternateLanguages ?? [],
124
- apiKey,
125
- model: config.model ?? "best",
126
- wordTimestamps: config.wordTimestamps ?? true,
127
- interimResults: false,
128
- smartFormat: config.smartFormat ?? true,
129
- region: ""
130
- };
131
- }
132
- /**
133
- * Batch-transcribes collected audio. AssemblyAI has no realtime streaming.
134
- * Collects all audio from the iterable, uploads, then polls for result.
135
- *
136
- * @param audio Async iterable of PCM buffers
137
- */
138
- async *transcribeStream(audio) {
139
- const chunks = [];
140
- for await (const chunk of audio) chunks.push(chunk);
141
- const result = await this.transcribeBatch(Buffer.concat(chunks));
142
- yield result;
143
- }
144
- /**
145
- * Upload audio to AssemblyAI and wait for async transcription.
146
- * Suitable for call recordings. Average latency: 15–45s per minute of audio.
147
- *
148
- * @param audio Raw WAV/PCM/MP3 buffer
149
- *
150
- * @example
151
- * ```ts
152
- * const stt = createSTT('assemblyai', { wordTimestamps: true })
153
- * const result = await stt.transcribeBatch(recordingBuffer)
154
- * console.log(result.words) // Word-level timestamps
155
- * ```
156
- */
157
- async transcribeBatch(audio) {
158
- const startMs = Date.now();
159
- try {
160
- logger.debug({ bytes: audio.length, language: this.config.language }, "AssemblyAI transcription started");
161
- const transcript = await this.client.transcripts.transcribe({
162
- audio,
163
- language_code: this.config.language,
164
- speech_model: this.config.model,
165
- punctuate: this.config.smartFormat,
166
- format_text: this.config.smartFormat,
167
- word_boost: [],
168
- ...this.config.wordTimestamps && { timestamps: true }
169
- });
170
- if (transcript.status === "error") {
171
- throw new STTStreamError("assemblyai", new Error(transcript.error ?? "Transcription failed"));
172
- }
173
- logger.info(
174
- { id: transcript.id, duration: transcript.audio_duration, latencyMs: Date.now() - startMs },
175
- "AssemblyAI transcription complete"
176
- );
177
- return {
178
- transcript: transcript.text ?? "",
179
- isFinal: true,
180
- confidence: transcript.confidence ?? 0.9,
181
- language: this.config.language,
182
- languageSwitchDetected: false,
183
- words: this.config.wordTimestamps && transcript.words ? transcript.words.map((w) => ({
184
- word: w.text,
185
- startMs: w.start,
186
- endMs: w.end,
187
- confidence: w.confidence
188
- })) : void 0,
189
- latencyMs: Date.now() - startMs
190
- };
191
- } catch (err) {
192
- if (err instanceof STTStreamError) throw err;
193
- throw new STTStreamError("assemblyai", err);
194
- }
195
- }
196
- };
197
- var logger2 = pino__default.default({ name: "@voice-kit/core:stt:deepgram" });
198
- var SUPPORTED_LANGUAGES2 = [
199
- "en-IN",
200
- "hi-IN",
201
- "ta-IN",
202
- "te-IN",
203
- "kn-IN",
204
- "mr-IN",
205
- "en-US",
206
- "en-GB",
207
- "en-AU"
208
- ];
209
- var BACKOFF = {
210
- baseMs: 100,
211
- maxMs: 5e3,
212
- jitterPct: 0.2,
213
- maxAttempts: 3
214
- };
215
- function backoffDelay(attempt) {
216
- const base = Math.min(BACKOFF.baseMs * Math.pow(2, attempt), BACKOFF.maxMs);
217
- const jitter = base * BACKOFF.jitterPct * (Math.random() * 2 - 1);
218
- return Math.round(base + jitter);
219
- }
220
- var DeepgramSTTProvider = class {
221
- name = "deepgram";
222
- supportsStreaming = true;
223
- supportedLanguages = SUPPORTED_LANGUAGES2;
224
- client;
225
- config;
226
- constructor(config) {
227
- const apiKey = config.apiKey ?? process.env["DEEPGRAM_API_KEY"];
228
- if (!apiKey) throw new STTConnectionError("deepgram", new Error("DEEPGRAM_API_KEY not set"));
229
- this.client = new sdk.DeepgramClient({ apiKey });
230
- this.config = {
231
- language: config.language ?? "en-IN",
232
- alternateLanguages: config.alternateLanguages ?? [],
233
- apiKey,
234
- // nova-3 is now Deepgram's latest recommended model
235
- model: config.model ?? "nova-3",
236
- wordTimestamps: config.wordTimestamps ?? false,
237
- interimResults: config.interimResults ?? true,
238
- smartFormat: config.smartFormat ?? true,
239
- region: config.region ?? ""
240
- };
241
- }
242
- /**
243
- * Stream audio to Deepgram and receive interim + final transcription results.
244
- * Handles reconnection transparently with exponential backoff.
245
- *
246
- * @param audio Async iterable of 16kHz PCM buffers from AudioPipeline
247
- *
248
- * @example
249
- * ```ts
250
- * const stt = createSTT('deepgram', { language: 'hi-IN' })
251
- * for await (const result of stt.transcribeStream(audioIterable)) {
252
- * if (result.isFinal) console.log('User said:', result.transcript)
253
- * }
254
- * ```
255
- */
256
- async *transcribeStream(audio) {
257
- let attempt = 0;
258
- const startMs = Date.now();
259
- while (attempt <= BACKOFF.maxAttempts) {
260
- const connection = await this.connectWithRetry(attempt);
261
- const results = [];
262
- let done = false;
263
- let error = null;
264
- connection.on("message", (data) => {
265
- if (data.type !== "Results") return;
266
- const alt = data.channel?.alternatives?.[0];
267
- if (!alt?.transcript) return;
268
- const isFinal = data.is_final === true;
269
- const result = {
270
- transcript: alt.transcript,
271
- isFinal,
272
- // speech_final=true means Deepgram detected end-of-utterance (endpointing).
273
- // A frame can be speech_final without is_final — callers should act on both.
274
- confidence: alt.confidence ?? 0,
275
- // alt.languages populated when detect_language is enabled
276
- language: alt.languages?.[0] ?? this.config.language,
277
- languageSwitchDetected: false,
278
- words: this.config.wordTimestamps ? alt.words?.map((w) => ({
279
- word: w.word ?? "",
280
- startMs: (w.start ?? 0) * 1e3,
281
- endMs: (w.end ?? 0) * 1e3,
282
- confidence: w.confidence ?? 0,
283
- punctuatedWord: w.punctuated_word
284
- })) : void 0,
285
- latencyMs: Date.now() - startMs
286
- };
287
- results.push(result);
288
- if (isFinal) {
289
- logger2.debug(
290
- { transcript: result.transcript, confidence: result.confidence, language: result.language },
291
- "Deepgram final transcript"
292
- );
293
- }
294
- });
295
- connection.on("close", () => {
296
- done = true;
297
- });
298
- connection.on("error", (err) => {
299
- error = err;
300
- logger2.warn({ err, attempt }, "Deepgram stream error");
301
- });
302
- const sendAudio = async () => {
303
- try {
304
- for await (const chunk of audio) {
305
- connection.socket.send(chunk);
306
- }
307
- connection.socket.send(JSON.stringify({ type: "Finalize" }));
308
- } catch (err) {
309
- error = err instanceof Error ? err : new Error(String(err));
310
- }
311
- };
312
- const sendPromise = sendAudio();
313
- let resultIndex = 0;
314
- while (!done || resultIndex < results.length) {
315
- if (resultIndex < results.length) {
316
- yield results[resultIndex++];
317
- } else {
318
- await new Promise((r) => setTimeout(r, 10));
319
- }
320
- if (error && attempt < BACKOFF.maxAttempts) {
321
- try {
322
- connection.socket.close();
323
- } catch {
324
- }
325
- break;
326
- }
327
- if (error && attempt >= BACKOFF.maxAttempts) {
328
- await sendPromise.catch(() => {
329
- });
330
- throw new STTStreamError("deepgram", error);
331
- }
332
- }
333
- await sendPromise.catch(() => {
334
- });
335
- if (!error) return;
336
- attempt++;
337
- await new Promise((r) => setTimeout(r, backoffDelay(attempt)));
338
- logger2.info({ attempt }, "Deepgram reconnecting...");
339
- }
340
- throw new STTStreamError("deepgram", new Error("Max reconnect attempts exceeded"));
341
- }
342
- /**
343
- * Transcribe a complete audio buffer (non-streaming).
344
- * Uses Deepgram pre-recorded API.
345
- *
346
- * @param audio Raw PCM or WAV buffer
347
- */
348
- async transcribeBatch(audio) {
349
- const startMs = Date.now();
350
- try {
351
- const response = await this.client.listen.v1.media.transcribeFile(
352
- audio,
353
- {
354
- model: this.config.model,
355
- language: this.config.language,
356
- // v5: boolean-like options must be strings
357
- smart_format: true,
358
- diarize: false
359
- }
360
- );
361
- const alt = response?.results?.channels?.[0]?.alternatives?.[0];
362
- return {
363
- transcript: alt?.transcript ?? "",
364
- isFinal: true,
365
- confidence: alt?.confidence ?? 0,
366
- language: this.config.language,
367
- languageSwitchDetected: false,
368
- latencyMs: Date.now() - startMs
369
- };
370
- } catch (err) {
371
- if (err instanceof STTStreamError) throw err;
372
- throw new STTStreamError("deepgram", err instanceof Error ? err : new Error(String(err)));
373
- }
374
- }
375
- /**
376
- * Create and open a live WebSocket connection to Deepgram.
377
- *
378
- * v5 connection lifecycle (3 explicit steps):
379
- * 1. await listen.v1.connect(options) — constructs the connection object
380
- * 2. connection.connect() — initiates the WebSocket handshake
381
- * 3. await connection.waitForOpen() — resolves once the socket is ready
382
- *
383
- * @internal
384
- */
385
- async connectWithRetry(attempt) {
386
- const delay = attempt > 0 ? backoffDelay(attempt) : 0;
387
- if (delay > 0) await new Promise((r) => setTimeout(r, delay));
388
- try {
389
- logger2.debug({ attempt, language: this.config.language }, "Connecting to Deepgram");
390
- const connection = await this.client.listen.v1.connect({
391
- model: this.config.model,
392
- language: this.config.language,
393
- // v5: boolean-like options must be strings
394
- smart_format: "true",
395
- interim_results: String(this.config.interimResults),
396
- encoding: "linear16",
397
- sample_rate: 16e3,
398
- channels: 1,
399
- utterance_end_ms: "1000",
400
- ...this.config.alternateLanguages.length > 0 && {
401
- detect_language: "true",
402
- // language must be omitted when detect_language is enabled
403
- language: void 0
404
- },
405
- Authorization: `Token ${this.config.apiKey}`
406
- });
407
- connection.connect();
408
- await Promise.race([
409
- connection.waitForOpen(),
410
- new Promise(
411
- (_, reject) => setTimeout(
412
- () => reject(new STTConnectionError("deepgram", new Error("Connection timeout"))),
413
- 1e4
414
- )
415
- )
416
- ]);
417
- logger2.info({ attempt, language: this.config.language }, "Deepgram connected");
418
- return connection;
419
- } catch (err) {
420
- if (err instanceof STTConnectionError) throw err;
421
- throw new STTConnectionError("deepgram", err instanceof Error ? err : new Error(String(err)));
422
- }
423
- }
424
- };
425
- var logger3 = pino__default.default({ name: "@voice-kit/core:stt:sarvam" });
426
- var SARVAM_API_BASE = "https://api.sarvam.ai";
427
- var SUPPORTED_LANGUAGES3 = [
428
- "hi-IN",
429
- "kn-IN",
430
- "ta-IN",
431
- "te-IN",
432
- "mr-IN",
433
- "bn-IN",
434
- "gu-IN",
435
- "pa-IN",
436
- "or-IN",
437
- "ml-IN"
438
- ];
439
- var SARVAM_MODELS = {
440
- "hi-IN": "saarika:v1",
441
- "kn-IN": "saarika:v1",
442
- "ta-IN": "saarika:v1",
443
- "te-IN": "saarika:v1",
444
- "mr-IN": "saarika:v1",
445
- "bn-IN": "saarika:v1",
446
- "gu-IN": "saarika:v1",
447
- "pa-IN": "saarika:v1",
448
- "or-IN": "saarika:v1",
449
- "ml-IN": "saarika:v1"
450
- };
451
- var SarvamSTTProvider = class {
452
- name = "sarvam";
453
- supportsStreaming = false;
454
- // Sarvam REST API is batch-only
455
- supportedLanguages = SUPPORTED_LANGUAGES3;
456
- http;
457
- config;
458
- constructor(config) {
459
- const apiKey = config.apiKey ?? process.env["SARVAM_API_KEY"];
460
- if (!apiKey) throw new STTConnectionError("sarvam", new Error("SARVAM_API_KEY not set"));
461
- const language = config.language ?? "hi-IN";
462
- if (!SUPPORTED_LANGUAGES3.includes(language)) {
463
- throw new STTLanguageNotSupportedError("sarvam", language);
464
- }
465
- this.http = axios__default.default.create({
466
- baseURL: SARVAM_API_BASE,
467
- headers: {
468
- "API-Subscription-Key": apiKey,
469
- "Content-Type": "multipart/form-data"
470
- },
471
- timeout: 3e4
472
- });
473
- this.config = {
474
- language,
475
- alternateLanguages: config.alternateLanguages ?? [],
476
- apiKey,
477
- model: config.model ?? SARVAM_MODELS[language] ?? "saarika:v1",
478
- wordTimestamps: false,
479
- // Sarvam doesn't support word timestamps yet
480
- interimResults: false,
481
- smartFormat: config.smartFormat ?? true,
482
- region: config.region ?? ""
483
- };
484
- }
485
- /**
486
- * Collects audio and transcribes via Sarvam batch API.
487
- * Sarvam doesn't support realtime streaming.
488
- *
489
- * @param audio Async iterable of 16kHz PCM buffers
490
- */
491
- async *transcribeStream(audio) {
492
- const chunks = [];
493
- for await (const chunk of audio) chunks.push(chunk);
494
- const result = await this.transcribeBatch(Buffer.concat(chunks));
495
- yield result;
496
- }
497
- /**
498
- * Transcribe a WAV/PCM audio buffer in an Indic language.
499
- *
500
- * @param audio 16kHz PCM or WAV buffer
501
- *
502
- * @example
503
- * ```ts
504
- * const stt = createSTT('sarvam', { language: 'ta-IN' })
505
- * const result = await stt.transcribeBatch(tamilAudioBuffer)
506
- * console.log(result.transcript) // Tamil text
507
- * ```
508
- */
509
- async transcribeBatch(audio) {
510
- const startMs = Date.now();
511
- try {
512
- logger3.debug(
513
- { language: this.config.language, bytes: audio.length },
514
- "Sarvam transcription request"
515
- );
516
- const form = new FormData();
517
- form.append("file", new Blob([audio], { type: "audio/wav" }), "audio.wav");
518
- form.append("language_code", this.config.language);
519
- form.append("model", this.config.model);
520
- if (this.config.smartFormat) {
521
- form.append("with_disfluencies", "false");
522
- }
523
- const response = await this.http.post(
524
- "/speech-to-text",
525
- form
526
- );
527
- const data = response.data;
528
- logger3.info(
529
- { language: data.language_code, confidence: data.confidence, latencyMs: Date.now() - startMs },
530
- "Sarvam transcription complete"
531
- );
532
- return {
533
- transcript: data.transcript,
534
- isFinal: true,
535
- confidence: data.confidence ?? 0.9,
536
- language: data.language_code ?? this.config.language,
537
- languageSwitchDetected: false,
538
- latencyMs: Date.now() - startMs
539
- };
540
- } catch (err) {
541
- if (axios__default.default.isAxiosError(err)) {
542
- throw new STTStreamError(
543
- "sarvam",
544
- new Error(`Sarvam API error: ${err.response?.status} ${JSON.stringify(err.response?.data)}`)
545
- );
546
- }
547
- throw new STTStreamError("sarvam", err);
548
- }
549
- }
550
- };
551
- var logger4 = pino__default.default({ name: "@voice-kit/core:stt:language-detect" });
552
- var DEVANAGARI_RANGE = /[\u0900-\u097F]/;
553
- var MIN_WORDS_FOR_CLASSIFICATION = 2;
554
- var SWITCH_CONFIDENCE_THRESHOLD = 0.6;
555
- var NEUTRAL_TOKENS = /* @__PURE__ */ new Set([
556
- "ok",
557
- "okay",
558
- "haan",
559
- "nahin",
560
- "nahi",
561
- "kya",
562
- "hai",
563
- "ho",
564
- "na",
565
- "toh",
566
- "aur",
567
- "ya",
568
- "matlab",
569
- "yani",
570
- "i",
571
- "a",
572
- "the",
573
- "is",
574
- "are",
575
- "and",
576
- "or"
577
- ]);
578
- var LanguageSwitchDetector = class extends events.EventEmitter {
579
- currentLanguage;
580
- primaryLanguage;
581
- /** Rolling window of recent language classifications for smoothing. */
582
- recentClassifications = [];
583
- windowSize = 5;
584
- constructor(primaryLanguage = "en-IN") {
585
- super();
586
- this.primaryLanguage = primaryLanguage;
587
- this.currentLanguage = primaryLanguage;
588
- }
589
- /**
590
- * Analyze a transcript for language switches.
591
- * Should be called on every STT final result.
592
- *
593
- * @param transcript The transcribed text to analyze
594
- * @returns Detected language of the transcript
595
- */
596
- analyze(transcript) {
597
- const words = this.tokenize(transcript);
598
- if (words.length === 0) return this.currentLanguage;
599
- const classification = this.classifySegment(words);
600
- const confidence = this.computeConfidence(words, classification);
601
- this.recentClassifications.push(classification);
602
- if (this.recentClassifications.length > this.windowSize) {
603
- this.recentClassifications.shift();
604
- }
605
- const smoothed = this.smoothedLanguage();
606
- if (smoothed !== this.currentLanguage && confidence >= SWITCH_CONFIDENCE_THRESHOLD && smoothed !== "unknown") {
607
- const event = {
608
- from: this.currentLanguage,
609
- to: smoothed,
610
- position: 0,
611
- // position in full conversation
612
- confidence,
613
- transcript,
614
- detectedAt: /* @__PURE__ */ new Date()
615
- };
616
- const prev = this.currentLanguage;
617
- this.currentLanguage = smoothed;
618
- logger4.info(
619
- { from: prev, to: smoothed, confidence, transcript: transcript.slice(0, 50) },
620
- "Language switch detected"
621
- );
622
- this.emit("language.switched", event);
623
- }
624
- return this.currentLanguage;
625
- }
626
- /**
627
- * Analyze a transcript and return per-word language classification.
628
- * Useful for word-level Hinglish mixing visualization.
629
- *
630
- * @param transcript Text to analyze
631
- * @returns Array of { word, language } pairs
632
- */
633
- analyzeWords(transcript) {
634
- const words = this.tokenize(transcript);
635
- return words.map((word) => ({
636
- word,
637
- language: this.classifyWord(word)
638
- }));
639
- }
640
- /** Reset to primary language (e.g., on new call). */
641
- reset() {
642
- this.currentLanguage = this.primaryLanguage;
643
- this.recentClassifications = [];
644
- }
645
- /** Current detected language. */
646
- get language() {
647
- return this.currentLanguage;
648
- }
649
- // ─── Private helpers ────────────────────────────────────────────────────────
650
- tokenize(text) {
651
- return text.toLowerCase().split(/\s+/).filter((w) => w.length > 0 && !NEUTRAL_TOKENS.has(w));
652
- }
653
- classifyWord(word) {
654
- if (DEVANAGARI_RANGE.test(word)) return "hi-IN";
655
- if (/^[a-z]+$/.test(word)) return "en-IN";
656
- return "unknown";
657
- }
658
- classifySegment(words) {
659
- let hindiCount = 0;
660
- let englishCount = 0;
661
- for (const word of words) {
662
- const lang = this.classifyWord(word);
663
- if (lang === "hi-IN") hindiCount++;
664
- else if (lang === "en-IN") englishCount++;
665
- }
666
- if (hindiCount === 0 && englishCount === 0) return "unknown";
667
- if (hindiCount > englishCount) return "hi-IN";
668
- if (englishCount > hindiCount) return "en-IN";
669
- return this.primaryLanguage;
670
- }
671
- computeConfidence(words, classification) {
672
- const relevant = words.filter((w) => this.classifyWord(w) !== "unknown");
673
- if (relevant.length < MIN_WORDS_FOR_CLASSIFICATION) return 0;
674
- const matching = relevant.filter((w) => this.classifyWord(w) === classification);
675
- return matching.length / relevant.length;
676
- }
677
- smoothedLanguage() {
678
- if (this.recentClassifications.length === 0) return this.primaryLanguage;
679
- const counts = { "hi-IN": 0, "en-IN": 0, "unknown": 0 };
680
- for (const lang of this.recentClassifications) {
681
- counts[lang]++;
682
- }
683
- if (counts["hi-IN"] > counts["en-IN"]) return "hi-IN";
684
- if (counts["en-IN"] > counts["hi-IN"]) return "en-IN";
685
- return this.currentLanguage;
686
- }
687
- };
688
- function isInglish(transcript) {
689
- const hasDevanagari = DEVANAGARI_RANGE.test(transcript);
690
- const hasLatin = /[a-zA-Z]/.test(transcript);
691
- return hasDevanagari && hasLatin;
692
- }
693
- var logger5 = pino__default.default({ name: "@voice-kit/core:stt:whisper" });
694
- var WHISPER_LANGUAGES = [
695
- "en",
696
- "hi",
697
- "ta",
698
- "te",
699
- "kn",
700
- "mr",
701
- "bn",
702
- "gu",
703
- "pa",
704
- "ur",
705
- "fr",
706
- "de",
707
- "es",
708
- "pt",
709
- "it",
710
- "nl",
711
- "pl",
712
- "ru",
713
- "ja",
714
- "zh"
715
- ];
716
- var WhisperSTTProvider = class {
717
- name = "whisper";
718
- supportsStreaming = false;
719
- supportedLanguages = WHISPER_LANGUAGES;
720
- config;
721
- constructor(config) {
722
- const apiKey = config.apiKey ?? process.env["OPENAI_API_KEY"];
723
- if (!apiKey) throw new STTStreamError("whisper", new Error("OPENAI_API_KEY not set"));
724
- const language = config.language ?? "en-IN";
725
- const whisperLang = language.split("-")[0] ?? "en";
726
- if (!WHISPER_LANGUAGES.includes(whisperLang)) {
727
- throw new STTLanguageNotSupportedError("whisper", language);
728
- }
729
- this.config = {
730
- language,
731
- alternateLanguages: config.alternateLanguages ?? [],
732
- apiKey,
733
- model: config.model ?? "whisper-1",
734
- wordTimestamps: config.wordTimestamps ?? false,
735
- interimResults: false,
736
- smartFormat: false,
737
- region: ""
738
- };
739
- }
740
- /**
741
- * Streaming not supported by Whisper. Collects all audio then transcribes.
742
- * For realtime use, use createSTT('deepgram') instead.
743
- */
744
- async *transcribeStream(audio) {
745
- const chunks = [];
746
- for await (const chunk of audio) chunks.push(chunk);
747
- const result = await this.transcribeBatch(Buffer.concat(chunks));
748
- yield result;
749
- }
750
- /**
751
- * Transcribe a complete audio buffer via Whisper.
752
- *
753
- * @param audio WAV or PCM buffer
754
- */
755
- async transcribeBatch(audio) {
756
- const startMs = Date.now();
757
- const language = this.config.language.split("-")[0] ?? "en";
758
- try {
759
- logger5.debug({ language, bytes: audio.length }, "Whisper batch transcription");
760
- const openai$1 = openai.createOpenAI({ apiKey: this.config.apiKey });
761
- const file = new File([audio], "audio.wav", { type: "audio/wav" });
762
- const formData = new FormData();
763
- formData.append("file", file);
764
- formData.append("model", this.config.model);
765
- formData.append("language", language);
766
- if (this.config.wordTimestamps) {
767
- formData.append("timestamp_granularities[]", "word");
768
- formData.append("response_format", "verbose_json");
769
- }
770
- const response = await fetch("https://api.openai.com/v1/audio/transcriptions", {
771
- method: "POST",
772
- headers: { Authorization: `Bearer ${this.config.apiKey}` },
773
- body: formData
774
- });
775
- if (!response.ok) {
776
- throw new Error(`Whisper API error: ${response.status} ${response.statusText}`);
777
- }
778
- const data = await response.json();
779
- return {
780
- transcript: data.text,
781
- isFinal: true,
782
- confidence: 0.95,
783
- // Whisper doesn't return confidence
784
- language: this.config.language,
785
- languageSwitchDetected: false,
786
- words: this.config.wordTimestamps && data.words ? data.words.map((w) => ({
787
- word: w.word,
788
- startMs: w.start * 1e3,
789
- endMs: w.end * 1e3,
790
- confidence: 0.95
791
- })) : void 0,
792
- latencyMs: Date.now() - startMs
793
- };
794
- } catch (err) {
795
- if (err instanceof STTStreamError) throw err;
796
- throw new STTStreamError("whisper", err);
797
- }
798
- }
799
- };
800
-
801
- // src/stt/STT-factory.ts
802
- function createSTT(provider, config) {
803
- const cfg = config ?? {};
804
- switch (provider) {
805
- case "deepgram":
806
- return new DeepgramSTTProvider(cfg);
807
- case "whisper":
808
- return new WhisperSTTProvider(cfg);
809
- case "assemblyai":
810
- return new AssemblyAISTTProvider(cfg);
811
- case "sarvam":
812
- return new SarvamSTTProvider(cfg);
813
- default: {
814
- const _exhaustive = provider;
815
- throw new Error(`Unknown STT provider: ${String(_exhaustive)}`);
816
- }
817
- }
818
- }
819
-
820
- exports.AssemblyAISTTProvider = AssemblyAISTTProvider;
821
- exports.DeepgramSTTProvider = DeepgramSTTProvider;
822
- exports.LanguageSwitchDetector = LanguageSwitchDetector;
823
- exports.SarvamSTTProvider = SarvamSTTProvider;
824
- exports.WhisperSTTProvider = WhisperSTTProvider;
825
- exports.createSTT = createSTT;
826
- exports.isInglish = isInglish;
827
- //# sourceMappingURL=stt.cjs.map
828
- //# sourceMappingURL=stt.cjs.map