@omote/core 0.4.5 → 0.4.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -2401,7 +2401,7 @@ function isIOSSafari() {
2401
2401
  function isIOS() {
2402
2402
  if (typeof navigator === "undefined") return false;
2403
2403
  const ua = navigator.userAgent.toLowerCase();
2404
- return /iphone|ipad|ipod/.test(ua);
2404
+ return /iphone|ipad|ipod/.test(ua) || /macintosh/.test(ua) && navigator.maxTouchPoints > 1;
2405
2405
  }
2406
2406
  function isAndroid() {
2407
2407
  if (typeof navigator === "undefined") return false;
@@ -3022,10 +3022,16 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
3022
3022
  });
3023
3023
  logger2.debug("Running warmup inference to initialize GPU context");
3024
3024
  const warmupStart = performance.now();
3025
- const silentAudio = new Float32Array(16e3);
3025
+ const warmupAudio = new Float32Array(16e3);
3026
+ const warmupIdentity = new Float32Array(this.numIdentityClasses);
3027
+ warmupIdentity[0] = 1;
3028
+ const warmupFeeds = {
3029
+ "audio": new this.ort.Tensor("float32", warmupAudio, [1, 16e3]),
3030
+ "identity": new this.ort.Tensor("float32", warmupIdentity, [1, this.numIdentityClasses])
3031
+ };
3026
3032
  const WARMUP_TIMEOUT_MS = 15e3;
3027
3033
  const warmupResult = await Promise.race([
3028
- this.infer(silentAudio, 0).then(() => "ok"),
3034
+ this.session.run(warmupFeeds).then(() => "ok"),
3029
3035
  new Promise((r) => setTimeout(() => r("timeout"), WARMUP_TIMEOUT_MS))
3030
3036
  ]);
3031
3037
  const warmupTimeMs = performance.now() - warmupStart;
@@ -3087,7 +3093,7 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
3087
3093
  audio = audioSamplesCopy.slice(0, 16e3);
3088
3094
  }
3089
3095
  const identity = new Float32Array(this.numIdentityClasses);
3090
- identity[Math.min(identityIndex, this.numIdentityClasses - 1)] = 1;
3096
+ identity[Math.max(0, Math.min(identityIndex, this.numIdentityClasses - 1))] = 1;
3091
3097
  const audioCopy = new Float32Array(audio);
3092
3098
  const identityCopy = new Float32Array(identity);
3093
3099
  const feeds = {
@@ -4189,175 +4195,3008 @@ var _SenseVoiceInference = class _SenseVoiceInference {
4189
4195
  _SenseVoiceInference.INFERENCE_TIMEOUT_MS = 1e4;
4190
4196
  var SenseVoiceInference = _SenseVoiceInference;
4191
4197
 
4192
- // src/inference/Wav2ArkitCpuInference.ts
4193
- var logger5 = createLogger("Wav2ArkitCpu");
4194
- var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
4195
- constructor(config) {
4196
- this.modelId = "wav2arkit_cpu";
4197
- this.session = null;
4198
- this.ort = null;
4199
- this._backend = "wasm";
4200
- this.isLoading = false;
4201
- // Inference queue for handling concurrent calls
4202
- this.inferenceQueue = Promise.resolve();
4203
- // Session health: set to true if session.run() times out.
4204
- // A timed-out session may have a zombie WASM dispatch still running,
4205
- // so all future infer() calls reject immediately to prevent concurrent access.
4206
- this.poisoned = false;
4207
- this.config = config;
4198
+ // src/inference/SenseVoiceWorker.ts
4199
+ var logger5 = createLogger("SenseVoiceWorker");
4200
+ var WASM_CDN_PATH2 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
4201
+ var LOAD_TIMEOUT_MS = 3e4;
4202
+ var INFERENCE_TIMEOUT_MS = 1e4;
4203
+ function resolveUrl(url) {
4204
+ if (/^https?:\/\//i.test(url) || /^blob:/i.test(url)) return url;
4205
+ try {
4206
+ return new URL(url, globalThis.location?.origin ?? "https://localhost").href;
4207
+ } catch {
4208
+ return url;
4208
4209
  }
4209
- get backend() {
4210
- return this.session ? this._backend : null;
4210
+ }
4211
+ var WORKER_SCRIPT = `
4212
+ // SenseVoice ASR Worker Script
4213
+ // Loaded via Blob URL - no separate file needed
4214
+
4215
+ var ort = null;
4216
+ var session = null;
4217
+ var tokenMap = null;
4218
+ var negMean = null;
4219
+ var invStddev = null;
4220
+ var languageId = 0;
4221
+ var textNormId = 14;
4222
+ var vocabSize = 0;
4223
+
4224
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
4225
+ // kaldiFbank.ts \u2014 inlined as plain JavaScript
4226
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
4227
+
4228
+ /**
4229
+ * In-place Radix-2 Cooley-Tukey FFT
4230
+ */
4231
+ function fft(re, im) {
4232
+ var n = re.length;
4233
+
4234
+ // Bit-reversal permutation
4235
+ for (var i = 1, j = 0; i < n; i++) {
4236
+ var bit = n >> 1;
4237
+ while (j & bit) {
4238
+ j ^= bit;
4239
+ bit >>= 1;
4240
+ }
4241
+ j ^= bit;
4242
+ if (i < j) {
4243
+ var tmp = re[i]; re[i] = re[j]; re[j] = tmp;
4244
+ tmp = im[i]; im[i] = im[j]; im[j] = tmp;
4245
+ }
4211
4246
  }
4212
- get isLoaded() {
4213
- return this.session !== null;
4247
+
4248
+ // Butterfly passes
4249
+ for (var len = 2; len <= n; len *= 2) {
4250
+ var halfLen = len / 2;
4251
+ var angle = -2 * Math.PI / len;
4252
+ var wRe = Math.cos(angle);
4253
+ var wIm = Math.sin(angle);
4254
+
4255
+ for (var i = 0; i < n; i += len) {
4256
+ var curRe = 1;
4257
+ var curIm = 0;
4258
+ for (var j = 0; j < halfLen; j++) {
4259
+ var a = i + j;
4260
+ var b = a + halfLen;
4261
+ var tRe = curRe * re[b] - curIm * im[b];
4262
+ var tIm = curRe * im[b] + curIm * re[b];
4263
+ re[b] = re[a] - tRe;
4264
+ im[b] = im[a] - tIm;
4265
+ re[a] += tRe;
4266
+ im[a] += tIm;
4267
+ var nextRe = curRe * wRe - curIm * wIm;
4268
+ curIm = curRe * wIm + curIm * wRe;
4269
+ curRe = nextRe;
4270
+ }
4271
+ }
4214
4272
  }
4215
- /**
4216
- * Load the ONNX model
4273
+ }
4274
+
4275
+ /** HTK mel scale */
4276
+ function htkMel(freq) {
4277
+ return 1127.0 * Math.log(1.0 + freq / 700.0);
4278
+ }
4279
+
4280
+ function htkMelInverse(mel) {
4281
+ return 700.0 * (Math.exp(mel / 1127.0) - 1.0);
4282
+ }
4283
+
4284
+ /**
4285
+ * Build triangular mel filterbank matrix
4286
+ */
4287
+ function buildMelFilterbank(numBins, fftSize, sampleRate, lowFreq, highFreq) {
4288
+ var numFftBins = fftSize / 2 + 1;
4289
+ var lowMel = htkMel(lowFreq);
4290
+ var highMel = htkMel(highFreq);
4291
+
4292
+ // numBins + 2 equally spaced points in mel space
4293
+ var melPoints = new Float64Array(numBins + 2);
4294
+ for (var i = 0; i < numBins + 2; i++) {
4295
+ melPoints[i] = lowMel + (highMel - lowMel) * i / (numBins + 1);
4296
+ }
4297
+
4298
+ // Convert mel points to FFT bin indices (float, not rounded)
4299
+ var binFreqs = new Float64Array(numBins + 2);
4300
+ for (var i = 0; i < numBins + 2; i++) {
4301
+ binFreqs[i] = htkMelInverse(melPoints[i]) * fftSize / sampleRate;
4302
+ }
4303
+
4304
+ var filters = [];
4305
+
4306
+ for (var m = 0; m < numBins; m++) {
4307
+ var left = binFreqs[m];
4308
+ var center = binFreqs[m + 1];
4309
+ var right = binFreqs[m + 2];
4310
+
4311
+ var startBin = Math.max(0, Math.ceil(left));
4312
+ var endBin = Math.min(numFftBins - 1, Math.floor(right));
4313
+
4314
+ var weights = new Float32Array(endBin - startBin + 1);
4315
+ for (var k = startBin; k <= endBin; k++) {
4316
+ if (k <= center) {
4317
+ weights[k - startBin] = (center - left) > 0 ? (k - left) / (center - left) : 0;
4318
+ } else {
4319
+ weights[k - startBin] = (right - center) > 0 ? (right - k) / (right - center) : 0;
4320
+ }
4321
+ }
4322
+
4323
+ filters.push({ startBin: startBin, weights: weights });
4324
+ }
4325
+
4326
+ return filters;
4327
+ }
4328
+
4329
+ /** Create Hamming window */
4330
+ function createHammingWindow(length) {
4331
+ var w = new Float32Array(length);
4332
+ for (var i = 0; i < length; i++) {
4333
+ w[i] = 0.54 - 0.46 * Math.cos(2 * Math.PI * i / (length - 1));
4334
+ }
4335
+ return w;
4336
+ }
4337
+
4338
+ /**
4339
+ * Compute Kaldi-compatible log mel filterbank features
4340
+ */
4341
+ function computeKaldiFbank(audio, sampleRate, numMelBins, opts) {
4342
+ var frameLengthMs = (opts && opts.frameLengthMs !== undefined) ? opts.frameLengthMs : 25;
4343
+ var frameShiftMs = (opts && opts.frameShiftMs !== undefined) ? opts.frameShiftMs : 10;
4344
+ var lowFreq = (opts && opts.lowFreq !== undefined) ? opts.lowFreq : 20;
4345
+ var highFreq = (opts && opts.highFreq !== undefined) ? opts.highFreq : (sampleRate / 2);
4346
+ var dither = (opts && opts.dither !== undefined) ? opts.dither : 0;
4347
+ var preemphasis = (opts && opts.preemphasis !== undefined) ? opts.preemphasis : 0.97;
4348
+
4349
+ var frameLengthSamples = Math.round(sampleRate * frameLengthMs / 1000);
4350
+ var frameShiftSamples = Math.round(sampleRate * frameShiftMs / 1000);
4351
+
4352
+ // Kaldi signal scaling: float [-1,1] -> int16 range
4353
+ var scaled = new Float32Array(audio.length);
4354
+ for (var i = 0; i < audio.length; i++) {
4355
+ scaled[i] = audio[i] * 32768;
4356
+ }
4357
+
4358
+ // Optional dithering
4359
+ if (dither > 0) {
4360
+ for (var i = 0; i < scaled.length; i++) {
4361
+ var u1 = Math.random();
4362
+ var u2 = Math.random();
4363
+ scaled[i] += dither * Math.sqrt(-2 * Math.log(u1 + 1e-10)) * Math.cos(2 * Math.PI * u2);
4364
+ }
4365
+ }
4366
+
4367
+ // Number of frames (snip_edges=true: only complete frames)
4368
+ var numFrames = Math.max(0, Math.floor((scaled.length - frameLengthSamples) / frameShiftSamples) + 1);
4369
+ if (numFrames === 0) {
4370
+ return new Float32Array(0);
4371
+ }
4372
+
4373
+ // FFT size: next power of 2
4374
+ var fftSize = 1;
4375
+ while (fftSize < frameLengthSamples) fftSize *= 2;
4376
+
4377
+ var numFftBins = fftSize / 2 + 1;
4378
+
4379
+ // Pre-compute window and filterbank
4380
+ var window = createHammingWindow(frameLengthSamples);
4381
+ var filters = buildMelFilterbank(numMelBins, fftSize, sampleRate, lowFreq, highFreq);
4382
+
4383
+ // Allocate output
4384
+ var output = new Float32Array(numFrames * numMelBins);
4385
+
4386
+ // FFT buffers (reused per frame)
4387
+ var fftRe = new Float64Array(fftSize);
4388
+ var fftIm = new Float64Array(fftSize);
4389
+
4390
+ for (var f = 0; f < numFrames; f++) {
4391
+ var offset = f * frameShiftSamples;
4392
+
4393
+ // Clear FFT buffers
4394
+ fftRe.fill(0);
4395
+ fftIm.fill(0);
4396
+
4397
+ // Extract frame with preemphasis and windowing
4398
+ for (var i = 0; i < frameLengthSamples; i++) {
4399
+ var sample = scaled[offset + i];
4400
+ // Preemphasis: y[n] = x[n] - coeff * x[n-1]
4401
+ if (preemphasis > 0 && i > 0) {
4402
+ sample -= preemphasis * scaled[offset + i - 1];
4403
+ } else if (preemphasis > 0 && i === 0 && offset > 0) {
4404
+ sample -= preemphasis * scaled[offset - 1];
4405
+ }
4406
+ // Apply window
4407
+ fftRe[i] = sample * window[i];
4408
+ }
4409
+
4410
+ // FFT
4411
+ fft(fftRe, fftIm);
4412
+
4413
+ // Power spectrum -> mel filterbank -> log
4414
+ var outOffset = f * numMelBins;
4415
+ for (var m = 0; m < numMelBins; m++) {
4416
+ var filter = filters[m];
4417
+ var energy = 0;
4418
+ for (var k = 0; k < filter.weights.length; k++) {
4419
+ var bin = filter.startBin + k;
4420
+ if (bin < numFftBins) {
4421
+ var powerSpec = fftRe[bin] * fftRe[bin] + fftIm[bin] * fftIm[bin];
4422
+ energy += filter.weights[k] * powerSpec;
4423
+ }
4424
+ }
4425
+ output[outOffset + m] = Math.log(Math.max(energy, 1e-10));
4426
+ }
4427
+ }
4428
+
4429
+ return output;
4430
+ }
4431
+
4432
+ /**
4433
+ * Apply Low Frame Rate stacking for SenseVoice
4434
+ */
4435
+ function applyLFR(features, featureDim, lfrM, lfrN) {
4436
+ var numFrames = features.length / featureDim;
4437
+ if (numFrames === 0) return new Float32Array(0);
4438
+
4439
+ var leftPad = Math.floor((lfrM - 1) / 2); // 3 for lfrM=7
4440
+ var paddedLen = numFrames + leftPad;
4441
+ var numOutputFrames = Math.ceil(paddedLen / lfrN);
4442
+ var outputDim = featureDim * lfrM;
4443
+
4444
+ var output = new Float32Array(numOutputFrames * outputDim);
4445
+
4446
+ for (var i = 0; i < numOutputFrames; i++) {
4447
+ var startFrame = i * lfrN - leftPad;
4448
+
4449
+ for (var j = 0; j < lfrM; j++) {
4450
+ var srcFrame = startFrame + j;
4451
+ // Clamp to valid range
4452
+ if (srcFrame < 0) srcFrame = 0;
4453
+ if (srcFrame >= numFrames) srcFrame = numFrames - 1;
4454
+
4455
+ var srcOffset = srcFrame * featureDim;
4456
+ var dstOffset = i * outputDim + j * featureDim;
4457
+ for (var k = 0; k < featureDim; k++) {
4458
+ output[dstOffset + k] = features[srcOffset + k];
4459
+ }
4460
+ }
4461
+ }
4462
+
4463
+ return output;
4464
+ }
4465
+
4466
+ /**
4467
+ * Apply CMVN normalization in-place
4468
+ */
4469
+ function applyCMVN(features, dim, negMeanVec, invStddevVec) {
4470
+ for (var i = 0; i < features.length; i++) {
4471
+ var d = i % dim;
4472
+ features[i] = (features[i] + negMeanVec[d]) * invStddevVec[d];
4473
+ }
4474
+ return features;
4475
+ }
4476
+
4477
+ /**
4478
+ * Parse CMVN vectors from comma-separated strings (stored in ONNX metadata)
4479
+ */
4480
+ function parseCMVNFromMetadata(negMeanStr, invStddevStr) {
4481
+ var negMeanArr = new Float32Array(
4482
+ negMeanStr.split(',').map(function(s) { return parseFloat(s.trim()); })
4483
+ );
4484
+ var invStddevArr = new Float32Array(
4485
+ invStddevStr.split(',').map(function(s) { return parseFloat(s.trim()); })
4486
+ );
4487
+ return { negMean: negMeanArr, invStddev: invStddevArr };
4488
+ }
4489
+
4490
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
4491
+ // ctcDecoder.ts \u2014 inlined as plain JavaScript
4492
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
4493
+
4494
+ /** SenseVoice language ID -> string mapping */
4495
+ var LANGUAGE_IDS = {
4496
+ 0: 'auto',
4497
+ 3: 'zh',
4498
+ 4: 'en',
4499
+ 7: 'yue',
4500
+ 11: 'ja',
4501
+ 12: 'ko',
4502
+ 13: 'nospeech'
4503
+ };
4504
+
4505
+ /** SenseVoice text normalization ID -> string mapping */
4506
+ var TEXT_NORM_IDS = {
4507
+ 14: 'with_itn',
4508
+ 15: 'without_itn'
4509
+ };
4510
+
4511
+ /** Resolve language string to SenseVoice language ID */
4512
+ function resolveLanguageId(language) {
4513
+ var map = {
4514
+ auto: 0,
4515
+ zh: 3,
4516
+ en: 4,
4517
+ yue: 7,
4518
+ ja: 11,
4519
+ ko: 12
4520
+ };
4521
+ return map[language] !== undefined ? map[language] : 0;
4522
+ }
4523
+
4524
+ /** Resolve text norm string to SenseVoice text norm ID */
4525
+ function resolveTextNormId(textNorm) {
4526
+ return textNorm === 'without_itn' ? 15 : 14;
4527
+ }
4528
+
4529
+ /**
4530
+ * Parse tokens.txt into a token ID -> string map
4531
+ */
4532
+ function parseTokensFile(content) {
4533
+ var map = new Map();
4534
+ var lines = content.split('\\n');
4535
+ for (var idx = 0; idx < lines.length; idx++) {
4536
+ var trimmed = lines[idx].trim();
4537
+ if (!trimmed) continue;
4538
+ // Find the last space - token string may contain spaces
4539
+ var lastSpace = trimmed.lastIndexOf(' ');
4540
+ if (lastSpace === -1) continue;
4541
+ var token = trimmed.substring(0, lastSpace);
4542
+ var id = parseInt(trimmed.substring(lastSpace + 1), 10);
4543
+ if (!isNaN(id)) {
4544
+ map.set(id, token);
4545
+ }
4546
+ }
4547
+ return map;
4548
+ }
4549
+
4550
+ /**
4551
+ * SenseVoice structured token pattern matching
4552
+ */
4553
+ function parseStructuredToken(token) {
4554
+ var match = token.match(/^<\\|(.+)\\|>$/);
4555
+ if (!match) return null;
4556
+
4557
+ var value = match[1];
4558
+
4559
+ // Language tokens
4560
+ if (value === 'zh' || value === 'en' || value === 'ja' || value === 'ko' || value === 'yue' || value === 'nospeech') {
4561
+ return { type: 'language', value: value };
4562
+ }
4563
+
4564
+ // Emotion tokens
4565
+ var emotions = ['HAPPY', 'SAD', 'ANGRY', 'NEUTRAL', 'FEARFUL', 'DISGUSTED', 'SURPRISED', 'EMO_UNKNOWN'];
4566
+ if (emotions.indexOf(value) !== -1) {
4567
+ return { type: 'emotion', value: value };
4568
+ }
4569
+
4570
+ // Audio event tokens
4571
+ var events = ['Speech', 'BGM', 'Applause', 'Laughter', 'Crying', 'Coughing', 'Sneezing', 'EVENT_UNKNOWN'];
4572
+ if (events.indexOf(value) !== -1) {
4573
+ return { type: 'event', value: value };
4574
+ }
4575
+
4576
+ // ITN tokens
4577
+ if (value === 'withitn' || value === 'woitn' || value === 'with_itn' || value === 'without_itn') {
4578
+ return { type: 'textnorm', value: value };
4579
+ }
4580
+
4581
+ return null;
4582
+ }
4583
+
4584
+ /**
4585
+ * CTC greedy decode
4586
+ */
4587
+ function ctcGreedyDecode(logits, seqLen, vocabSz, tokenMapLocal) {
4588
+ // Step 1: Argmax per time step
4589
+ var tokenIds = [];
4590
+ for (var t = 0; t < seqLen; t++) {
4591
+ var offset = t * vocabSz;
4592
+ var maxIdx = 0;
4593
+ var maxVal = logits[offset];
4594
+ for (var v = 1; v < vocabSz; v++) {
4595
+ if (logits[offset + v] > maxVal) {
4596
+ maxVal = logits[offset + v];
4597
+ maxIdx = v;
4598
+ }
4599
+ }
4600
+ tokenIds.push(maxIdx);
4601
+ }
4602
+
4603
+ // Step 2: Collapse consecutive duplicates
4604
+ var collapsed = [];
4605
+ var prev = -1;
4606
+ for (var idx = 0; idx < tokenIds.length; idx++) {
4607
+ var id = tokenIds[idx];
4608
+ if (id !== prev) {
4609
+ collapsed.push(id);
4610
+ prev = id;
4611
+ }
4612
+ }
4613
+
4614
+ // Step 3: Remove blank tokens (ID 0) and special tokens (<s>=1, </s>=2)
4615
+ var filtered = collapsed.filter(function(id) { return id !== 0 && id !== 1 && id !== 2; });
4616
+
4617
+ // Step 4: Convert to token strings and parse structured tokens
4618
+ var language = undefined;
4619
+ var emotion = undefined;
4620
+ var event = undefined;
4621
+ var textTokens = [];
4622
+
4623
+ for (var idx = 0; idx < filtered.length; idx++) {
4624
+ var id = filtered[idx];
4625
+ var token = tokenMapLocal.get(id);
4626
+ if (!token) continue;
4627
+
4628
+ var structured = parseStructuredToken(token);
4629
+ if (structured) {
4630
+ if (structured.type === 'language') language = structured.value;
4631
+ else if (structured.type === 'emotion') emotion = structured.value;
4632
+ else if (structured.type === 'event') event = structured.value;
4633
+ // Skip textnorm tokens
4634
+ } else {
4635
+ textTokens.push(token);
4636
+ }
4637
+ }
4638
+
4639
+ // Step 5: Join tokens, handle SentencePiece boundary marker
4640
+ var text = textTokens.join('');
4641
+ // Replace SentencePiece word boundary (U+2581) with space
4642
+ text = text.replace(/\\u2581/g, ' ').trim();
4643
+
4644
+ return { text: text, language: language, emotion: emotion, event: event };
4645
+ }
4646
+
4647
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
4648
+ // Worker globals and message handler
4649
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
4650
+
4651
+ /**
4652
+ * Load ONNX Runtime from CDN
4653
+ */
4654
+ async function loadOrt(wasmPaths) {
4655
+ if (ort) return;
4656
+
4657
+ // Import ONNX Runtime from CDN
4658
+ var ortUrl = wasmPaths + 'ort.wasm.min.js';
4659
+
4660
+ // Load the script by fetching and executing it
4661
+ var response = await fetch(ortUrl);
4662
+ var scriptText = await response.text();
4663
+
4664
+ // Create a blob URL for the script
4665
+ var blob = new Blob([scriptText], { type: 'application/javascript' });
4666
+ var blobUrl = URL.createObjectURL(blob);
4667
+
4668
+ // Import the module
4669
+ importScripts(blobUrl);
4670
+ URL.revokeObjectURL(blobUrl);
4671
+
4672
+ // ort is now available as global
4673
+ ort = self.ort;
4674
+
4675
+ // Configure WASM settings
4676
+ ort.env.wasm.wasmPaths = wasmPaths;
4677
+ ort.env.wasm.numThreads = 1; // Single thread in worker
4678
+ ort.env.wasm.simd = true;
4679
+ ort.env.wasm.proxy = false; // No proxy in worker
4680
+ }
4681
+
4682
+ /**
4683
+ * Load the SenseVoice model and tokens
4684
+ */
4685
+ async function loadModel(modelUrl, tokensUrl, isIOSDevice, lang, textNorm) {
4686
+ // 1. Fetch and parse tokens.txt
4687
+ var tokensResponse = await fetch(tokensUrl);
4688
+ if (!tokensResponse.ok) {
4689
+ throw new Error('Failed to fetch tokens.txt: ' + tokensResponse.status + ' ' + tokensResponse.statusText);
4690
+ }
4691
+ var tokensText = await tokensResponse.text();
4692
+ tokenMap = parseTokensFile(tokensText);
4693
+
4694
+ // 2. Store language/textNorm IDs
4695
+ languageId = lang;
4696
+ textNormId = textNorm;
4697
+
4698
+ // 3. Create inference session
4699
+ var sessionOptions = {
4700
+ executionProviders: ['wasm'],
4701
+ graphOptimizationLevel: 'all',
4702
+ };
4703
+
4704
+ if (isIOSDevice) {
4705
+ // iOS: pass URL string directly to ORT to avoid 239MB JS heap allocation
4706
+ // ORT fetches into WASM memory, keeping JS heap at ~2MB
4707
+ session = await ort.InferenceSession.create(modelUrl, sessionOptions);
4708
+ } else {
4709
+ // Desktop: fetch ArrayBuffer for potential caching
4710
+ var modelResponse = await fetch(modelUrl);
4711
+ if (!modelResponse.ok) {
4712
+ throw new Error('Failed to fetch model: ' + modelResponse.status + ' ' + modelResponse.statusText);
4713
+ }
4714
+ var modelBuffer = await modelResponse.arrayBuffer();
4715
+ var modelData = new Uint8Array(modelBuffer);
4716
+ session = await ort.InferenceSession.create(modelData, sessionOptions);
4717
+ }
4718
+
4719
+ // 4. Try to read CMVN from model metadata
4720
+ try {
4721
+ var metadata = session.handler && session.handler.metadata;
4722
+ if (metadata && metadata.neg_mean && metadata.inv_stddev) {
4723
+ var cmvn = parseCMVNFromMetadata(metadata.neg_mean, metadata.inv_stddev);
4724
+ negMean = cmvn.negMean;
4725
+ invStddev = cmvn.invStddev;
4726
+ }
4727
+ } catch (cmvnErr) {
4728
+ // CMVN not available \u2014 features will not be normalized
4729
+ }
4730
+
4731
+ // 5. Determine vocab size from tokenMap
4732
+ vocabSize = 0;
4733
+ tokenMap.forEach(function(val, key) {
4734
+ if (key >= vocabSize) vocabSize = key + 1;
4735
+ });
4736
+
4737
+ return {
4738
+ vocabSize: vocabSize,
4739
+ inputNames: session.inputNames.slice(),
4740
+ outputNames: session.outputNames.slice(),
4741
+ };
4742
+ }
4743
+
4744
+ /**
4745
+ * Run transcription on audio samples
4746
+ */
4747
+ async function runTranscription(audio) {
4748
+ var preprocessStart = performance.now();
4749
+
4750
+ // 1. Compute Kaldi fbank features [T, 80]
4751
+ var fbank = computeKaldiFbank(audio, 16000, 80);
4752
+ var numFrames = fbank.length / 80;
4753
+
4754
+ if (numFrames === 0) {
4755
+ return {
4756
+ text: '',
4757
+ language: undefined,
4758
+ emotion: undefined,
4759
+ event: undefined,
4760
+ inferenceTimeMs: performance.now() - preprocessStart,
4761
+ preprocessTimeMs: performance.now() - preprocessStart,
4762
+ };
4763
+ }
4764
+
4765
+ // 2. Apply LFR stacking [T_reduced, 560]
4766
+ var lfrFeatures = applyLFR(fbank, 80, 7, 6);
4767
+ var numLfrFrames = lfrFeatures.length / 560;
4768
+
4769
+ // 3. Apply CMVN normalization (in-place)
4770
+ if (negMean && invStddev) {
4771
+ applyCMVN(lfrFeatures, 560, negMean, invStddev);
4772
+ }
4773
+
4774
+ var preprocessTimeMs = performance.now() - preprocessStart;
4775
+
4776
+ // 4. Build ORT tensors
4777
+ var feeds = {
4778
+ x: new ort.Tensor('float32', lfrFeatures, [1, numLfrFrames, 560]),
4779
+ x_length: new ort.Tensor('int32', new Int32Array([numLfrFrames]), [1]),
4780
+ language: new ort.Tensor('int32', new Int32Array([languageId]), [1]),
4781
+ text_norm: new ort.Tensor('int32', new Int32Array([textNormId]), [1]),
4782
+ };
4783
+
4784
+ // 5. Run inference
4785
+ var results = await session.run(feeds);
4786
+
4787
+ var logitsOutput = results['logits'];
4788
+ if (!logitsOutput) {
4789
+ throw new Error('Model output missing "logits" tensor');
4790
+ }
4791
+
4792
+ var logitsData = logitsOutput.data;
4793
+ var logitsDims = logitsOutput.dims;
4794
+ var seqLen = logitsDims[1];
4795
+ var modelVocabSize = logitsDims[2];
4796
+
4797
+ // 6. CTC decode
4798
+ var decoded = ctcGreedyDecode(logitsData, seqLen, modelVocabSize, tokenMap);
4799
+
4800
+ var totalTimeMs = performance.now() - preprocessStart;
4801
+
4802
+ return {
4803
+ text: decoded.text,
4804
+ language: decoded.language,
4805
+ emotion: decoded.emotion,
4806
+ event: decoded.event,
4807
+ inferenceTimeMs: totalTimeMs,
4808
+ preprocessTimeMs: preprocessTimeMs,
4809
+ };
4810
+ }
4811
+
4812
+ // Message handler
4813
+ self.onmessage = async function(e) {
4814
+ var msg = e.data;
4815
+
4816
+ try {
4817
+ switch (msg.type) {
4818
+ case 'load': {
4819
+ var startTime = performance.now();
4820
+ await loadOrt(msg.wasmPaths);
4821
+ var info = await loadModel(msg.modelUrl, msg.tokensUrl, msg.isIOS, msg.language, msg.textNorm);
4822
+ var loadTimeMs = performance.now() - startTime;
4823
+
4824
+ self.postMessage({
4825
+ type: 'loaded',
4826
+ vocabSize: info.vocabSize,
4827
+ inputNames: info.inputNames,
4828
+ outputNames: info.outputNames,
4829
+ loadTimeMs: loadTimeMs,
4830
+ });
4831
+ break;
4832
+ }
4833
+
4834
+ case 'transcribe': {
4835
+ var result = await runTranscription(msg.audio);
4836
+
4837
+ self.postMessage({
4838
+ type: 'result',
4839
+ text: result.text,
4840
+ language: result.language,
4841
+ emotion: result.emotion,
4842
+ event: result.event,
4843
+ inferenceTimeMs: result.inferenceTimeMs,
4844
+ preprocessTimeMs: result.preprocessTimeMs,
4845
+ });
4846
+ break;
4847
+ }
4848
+
4849
+ case 'dispose': {
4850
+ if (session) {
4851
+ await session.release();
4852
+ session = null;
4853
+ }
4854
+ ort = null;
4855
+ tokenMap = null;
4856
+ negMean = null;
4857
+ invStddev = null;
4858
+ self.postMessage({ type: 'disposed' });
4859
+ break;
4860
+ }
4861
+
4862
+ default:
4863
+ self.postMessage({
4864
+ type: 'error',
4865
+ error: 'Unknown message type: ' + msg.type,
4866
+ });
4867
+ }
4868
+ } catch (err) {
4869
+ var errorMsg = err.message || String(err);
4870
+ // Handle raw C++ exception pointers from ORT WASM
4871
+ if (typeof err === 'number') {
4872
+ errorMsg = 'Raw C++ exception pointer (0x' + err.toString(16) + '). Likely OOM in WASM.';
4873
+ }
4874
+ self.postMessage({
4875
+ type: 'error',
4876
+ error: errorMsg,
4877
+ });
4878
+ }
4879
+ };
4880
+
4881
+ // Error handler
4882
+ self.onerror = function(err) {
4883
+ self.postMessage({
4884
+ type: 'error',
4885
+ error: 'Worker error: ' + (err.message || String(err)),
4886
+ });
4887
+ };
4888
+ `;
4889
+ var SenseVoiceWorker = class {
4890
+ constructor(config) {
4891
+ this.worker = null;
4892
+ this.isLoading = false;
4893
+ this._isLoaded = false;
4894
+ // Inference queue for serialization
4895
+ this.inferenceQueue = Promise.resolve();
4896
+ // Session health: set to true if worker operation times out
4897
+ this.poisoned = false;
4898
+ // Pending message handlers
4899
+ this.pendingResolvers = /* @__PURE__ */ new Map();
4900
+ const modelDir = config.modelUrl.substring(0, config.modelUrl.lastIndexOf("/"));
4901
+ const tokensUrl = config.tokensUrl ?? `${modelDir}/tokens.txt`;
4902
+ this.config = {
4903
+ modelUrl: config.modelUrl,
4904
+ tokensUrl,
4905
+ language: config.language ?? "auto",
4906
+ textNorm: config.textNorm ?? "with_itn"
4907
+ };
4908
+ this.languageId = resolveLanguageId(this.config.language);
4909
+ this.textNormId = resolveTextNormId(this.config.textNorm);
4910
+ }
4911
+ get isLoaded() {
4912
+ return this._isLoaded;
4913
+ }
4914
+ /**
4915
+ * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
4916
+ */
4917
+ get backend() {
4918
+ return this._isLoaded ? "wasm" : null;
4919
+ }
4920
+ /**
4921
+ * Create the worker from inline script
4922
+ */
4923
+ createWorker() {
4924
+ const blob = new Blob([WORKER_SCRIPT], { type: "application/javascript" });
4925
+ const blobUrl = URL.createObjectURL(blob);
4926
+ const worker = new Worker(blobUrl);
4927
+ URL.revokeObjectURL(blobUrl);
4928
+ worker.onmessage = (event) => {
4929
+ this.handleWorkerMessage(event.data);
4930
+ };
4931
+ worker.onerror = (error) => {
4932
+ logger5.error("Worker error", { error: error.message });
4933
+ for (const [, resolver] of this.pendingResolvers) {
4934
+ resolver.reject(new Error(`Worker error: ${error.message}`));
4935
+ }
4936
+ this.pendingResolvers.clear();
4937
+ };
4938
+ return worker;
4939
+ }
4940
+ /**
4941
+ * Handle messages from worker
4942
+ */
4943
+ handleWorkerMessage(result) {
4944
+ const resolver = this.pendingResolvers.get(result.type);
4945
+ if (resolver) {
4946
+ this.pendingResolvers.delete(result.type);
4947
+ if (result.type === "error") {
4948
+ resolver.reject(new Error(result.error));
4949
+ } else {
4950
+ resolver.resolve(result);
4951
+ }
4952
+ }
4953
+ }
4954
+ /**
4955
+ * Send message to worker and wait for response
4956
+ */
4957
+ sendMessage(message, expectedType, timeoutMs) {
4958
+ return new Promise((resolve, reject) => {
4959
+ if (!this.worker) {
4960
+ reject(new Error("Worker not initialized"));
4961
+ return;
4962
+ }
4963
+ const timeoutId = setTimeout(() => {
4964
+ this.pendingResolvers.delete(expectedType);
4965
+ this.poisoned = true;
4966
+ reject(new Error(`Worker operation timed out after ${timeoutMs}ms`));
4967
+ }, timeoutMs);
4968
+ this.pendingResolvers.set(expectedType, {
4969
+ resolve: (value) => {
4970
+ clearTimeout(timeoutId);
4971
+ resolve(value);
4972
+ },
4973
+ reject: (error) => {
4974
+ clearTimeout(timeoutId);
4975
+ reject(error);
4976
+ }
4977
+ });
4978
+ this.pendingResolvers.set("error", {
4979
+ resolve: () => {
4980
+ },
4981
+ // Never called for errors
4982
+ reject: (error) => {
4983
+ clearTimeout(timeoutId);
4984
+ this.pendingResolvers.delete(expectedType);
4985
+ reject(error);
4986
+ }
4987
+ });
4988
+ this.worker.postMessage(message);
4989
+ });
4990
+ }
4991
  /**
   * Load the ONNX model in the worker.
   *
   * Creates the worker, posts a "load" message (model/tokens URLs resolved to
   * absolute form, since workers cannot resolve page-relative URLs), and waits
   * for the "loaded" reply up to LOAD_TIMEOUT_MS. On failure the worker is
   * terminated so a retry starts from a clean slate.
   *
   * @param onProgress - Optional progress callback. Fires once at 100% when load completes
   * (the worker downloads and loads the model internally, so granular progress is not available).
   * @returns Load metadata: backend, timings, model input/output names, vocab size.
   * @throws If a load is already in flight, already completed, or the worker fails/times out.
   */
  async load(onProgress) {
    // Guard against concurrent or repeated loads.
    if (this.isLoading) {
      throw new Error("Model is already loading");
    }
    if (this._isLoaded) {
      throw new Error("Model already loaded. Call dispose() first.");
    }
    this.isLoading = true;
    const startTime = performance.now();
    const telemetry = getTelemetry();
    const span = telemetry?.startSpan("SenseVoiceWorker.load", {
      "model.url": this.config.modelUrl,
      "model.language": this.config.language
    });
    try {
      logger5.info("Creating SenseVoice worker...");
      this.worker = this.createWorker();
      logger5.info("Loading model in worker...", {
        modelUrl: this.config.modelUrl,
        tokensUrl: this.config.tokensUrl,
        language: this.config.language,
        textNorm: this.config.textNorm
      });
      // Hand the worker everything it needs to fetch and initialize the model.
      const result = await this.sendMessage(
        {
          type: "load",
          modelUrl: resolveUrl(this.config.modelUrl),
          tokensUrl: resolveUrl(this.config.tokensUrl),
          wasmPaths: WASM_CDN_PATH2,
          isIOS: isIOS(),
          language: this.languageId,
          textNorm: this.textNormId
        },
        "loaded",
        LOAD_TIMEOUT_MS
      );
      this._isLoaded = true;
      const loadTimeMs = performance.now() - startTime;
      // Single 100% tick — granular progress is unavailable from the worker.
      onProgress?.(1, 1);
      logger5.info("SenseVoice worker loaded successfully", {
        backend: "wasm",
        loadTimeMs: Math.round(loadTimeMs),
        workerLoadTimeMs: Math.round(result.loadTimeMs),
        vocabSize: result.vocabSize,
        language: this.config.language,
        textNorm: this.config.textNorm
      });
      span?.setAttributes({
        "model.backend": "wasm",
        "model.load_time_ms": loadTimeMs,
        "model.worker_load_time_ms": result.loadTimeMs,
        "model.vocab_size": result.vocabSize
      });
      span?.end();
      telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
        model: "sensevoice-worker",
        backend: "wasm"
      });
      return {
        backend: "wasm",
        loadTimeMs,
        inputNames: result.inputNames,
        outputNames: result.outputNames,
        vocabSize: result.vocabSize
      };
    } catch (error) {
      span?.endWithError(error instanceof Error ? error : new Error(String(error)));
      telemetry?.incrementCounter("omote.errors.total", 1, {
        model: "sensevoice-worker",
        error_type: "load_failed"
      });
      // Tear down the half-initialized worker so a retry can start fresh.
      if (this.worker) {
        this.worker.terminate();
        this.worker = null;
      }
      throw error;
    } finally {
      this.isLoading = false;
    }
  }
5077
+ /**
5078
+ * Transcribe audio samples to text
5079
+ *
5080
+ * @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
5081
+ * @returns Transcription result with text, emotion, language, and event
5082
+ */
5083
+ async transcribe(audioSamples) {
5084
+ if (!this._isLoaded || !this.worker) {
5085
+ throw new Error("Worker not loaded. Call load() first.");
5086
+ }
5087
+ if (this.poisoned) {
5088
+ throw new Error("SenseVoice worker timed out \u2014 inference unavailable until page reload");
5089
+ }
5090
+ const audio = new Float32Array(audioSamples);
5091
+ return this.queueInference(audio);
5092
+ }
5093
  /**
   * Queue inference to serialize worker calls.
   *
   * Chains each request onto `inferenceQueue` so only one "transcribe"
   * message is in flight at a time. The async task settles the outer
   * promise itself (resolve/reject) and never throws, so the chain keeps
   * accepting work after a failed request.
   *
   * @param audio Float32Array of 16kHz samples (already a private copy).
   * @returns Promise with text, language, emotion, event, and timing fields.
   */
  queueInference(audio) {
    return new Promise((resolve, reject) => {
      this.inferenceQueue = this.inferenceQueue.then(async () => {
        const telemetry = getTelemetry();
        const span = telemetry?.startSpan("SenseVoiceWorker.transcribe", {
          "inference.backend": "wasm",
          "inference.input_samples": audio.length
        });
        try {
          const startTime = performance.now();
          const result = await this.sendMessage(
            {
              type: "transcribe",
              audio
            },
            "result",
            INFERENCE_TIMEOUT_MS
          );
          // Round-trip time includes postMessage overhead, not just inference.
          const totalTimeMs = performance.now() - startTime;
          logger5.trace("Worker transcription complete", {
            text: result.text.substring(0, 50),
            language: result.language,
            emotion: result.emotion,
            event: result.event,
            preprocessTimeMs: Math.round(result.preprocessTimeMs * 100) / 100,
            inferenceTimeMs: Math.round(result.inferenceTimeMs * 100) / 100,
            roundTripMs: Math.round(totalTimeMs * 100) / 100
          });
          span?.setAttributes({
            "inference.duration_ms": totalTimeMs,
            "inference.worker_duration_ms": result.inferenceTimeMs,
            "inference.preprocess_ms": result.preprocessTimeMs,
            "inference.text_length": result.text.length
          });
          span?.end();
          telemetry?.recordHistogram("omote.inference.latency", totalTimeMs, {
            model: "sensevoice-worker",
            backend: "wasm"
          });
          telemetry?.incrementCounter("omote.inference.total", 1, {
            model: "sensevoice-worker",
            backend: "wasm",
            status: "success"
          });
          resolve({
            text: result.text,
            language: result.language,
            emotion: result.emotion,
            event: result.event,
            inferenceTimeMs: result.inferenceTimeMs,
            preprocessTimeMs: result.preprocessTimeMs
          });
        } catch (err) {
          const errMsg = err instanceof Error ? err.message : String(err);
          // Timeouts poison the worker (see sendMessage); flag loudly since
          // recovery requires a page reload.
          if (errMsg.includes("timed out")) {
            logger5.error("CRITICAL: Worker inference timed out \u2014 SenseVoice worker is dead. Page reload required.", {
              timeoutMs: INFERENCE_TIMEOUT_MS
            });
          } else {
            logger5.error("Worker inference failed", { error: errMsg });
          }
          span?.endWithError(err instanceof Error ? err : new Error(String(err)));
          telemetry?.incrementCounter("omote.inference.total", 1, {
            model: "sensevoice-worker",
            backend: "wasm",
            status: "error"
          });
          reject(err);
        }
      });
    });
  }
5168
+ /**
5169
+ * Dispose of the worker and free resources
5170
+ */
5171
+ async dispose() {
5172
+ if (this.worker) {
5173
+ try {
5174
+ await this.sendMessage({ type: "dispose" }, "disposed", INFERENCE_TIMEOUT_MS);
5175
+ } catch {
5176
+ }
5177
+ this.worker.terminate();
5178
+ this.worker = null;
5179
+ }
5180
+ this._isLoaded = false;
5181
+ this.poisoned = false;
5182
+ this.pendingResolvers.clear();
5183
+ }
5184
+ /**
5185
+ * Check if Web Workers are supported
5186
+ */
5187
+ static isSupported() {
5188
+ return typeof Worker !== "undefined";
5189
+ }
5190
+ };
5191
+
5192
// src/inference/UnifiedInferenceWorker.ts
var logger6 = createLogger("UnifiedInferenceWorker");
// ONNX Runtime WASM artifacts are fetched from this CDN directory.
var WASM_CDN_PATH3 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
// Per-operation timeouts, all in milliseconds.
var INIT_TIMEOUT_MS = 15000;
var SV_LOAD_TIMEOUT_MS = 30000;
var SV_INFER_TIMEOUT_MS = 10000;
var CPU_LOAD_TIMEOUT_MS = 60000;
var CPU_INFER_TIMEOUT_MS = 5000;
var VAD_LOAD_TIMEOUT_MS = 10000;
var VAD_INFER_TIMEOUT_MS = 1000;
var DISPOSE_TIMEOUT_MS = 5000;
5203
// Make a URL absolute so it survives being posted into a worker.
// Absolute http(s)/blob URLs pass through unchanged; everything else is
// resolved against the page origin (with a dummy origin outside browsers).
// On any URL-construction failure the input is returned as-is.
function resolveUrl2(url) {
  if (/^(?:https?:\/\/|blob:)/i.test(url)) {
    return url;
  }
  try {
    const base = globalThis.location?.origin ?? "https://localhost";
    return new URL(url, base).href;
  } catch {
    return url;
  }
}
5211
// Monotonic counter shared by all request IDs issued from this module.
var requestCounter = 0;
// Produce a unique request ID: "req_<counter>_<epoch-ms>".
function nextRequestId() {
  requestCounter += 1;
  return `req_${requestCounter}_${Date.now()}`;
}
5215
+ var WORKER_SCRIPT2 = `
5216
+ // Unified Inference Worker Script
5217
+ // Hosts SenseVoice + Wav2ArkitCpu + Silero VAD in a single ORT instance
5218
+
5219
+ var ort = null;
5220
+
5221
+ // SenseVoice state
5222
+ var svSession = null;
5223
+ var svTokenMap = null;
5224
+ var svNegMean = null;
5225
+ var svInvStddev = null;
5226
+ var svLanguageId = 0;
5227
+ var svTextNormId = 14;
5228
+ var svVocabSize = 0;
5229
+
5230
+ // Wav2ArkitCpu state
5231
+ var cpuSession = null;
5232
+
5233
+ // Silero VAD state
5234
+ var vadSession = null;
5235
+ var vadSampleRate = 16000;
5236
+ var vadChunkSize = 512;
5237
+ var vadContextSize = 64;
5238
+
5239
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5240
+ // kaldiFbank.ts \u2014 inlined as plain JavaScript
5241
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5242
+
5243
+ function fft(re, im) {
5244
+ var n = re.length;
5245
+ for (var i = 1, j = 0; i < n; i++) {
5246
+ var bit = n >> 1;
5247
+ while (j & bit) { j ^= bit; bit >>= 1; }
5248
+ j ^= bit;
5249
+ if (i < j) {
5250
+ var tmp = re[i]; re[i] = re[j]; re[j] = tmp;
5251
+ tmp = im[i]; im[i] = im[j]; im[j] = tmp;
5252
+ }
5253
+ }
5254
+ for (var len = 2; len <= n; len *= 2) {
5255
+ var halfLen = len / 2;
5256
+ var angle = -2 * Math.PI / len;
5257
+ var wRe = Math.cos(angle);
5258
+ var wIm = Math.sin(angle);
5259
+ for (var i = 0; i < n; i += len) {
5260
+ var curRe = 1, curIm = 0;
5261
+ for (var j = 0; j < halfLen; j++) {
5262
+ var a = i + j, b = a + halfLen;
5263
+ var tRe = curRe * re[b] - curIm * im[b];
5264
+ var tIm = curRe * im[b] + curIm * re[b];
5265
+ re[b] = re[a] - tRe; im[b] = im[a] - tIm;
5266
+ re[a] += tRe; im[a] += tIm;
5267
+ var nextRe = curRe * wRe - curIm * wIm;
5268
+ curIm = curRe * wIm + curIm * wRe;
5269
+ curRe = nextRe;
5270
+ }
5271
+ }
5272
+ }
5273
+ }
5274
+
5275
+ function htkMel(freq) { return 1127.0 * Math.log(1.0 + freq / 700.0); }
5276
+ function htkMelInverse(mel) { return 700.0 * (Math.exp(mel / 1127.0) - 1.0); }
5277
+
5278
+ function buildMelFilterbank(numBins, fftSize, sampleRate, lowFreq, highFreq) {
5279
+ var numFftBins = fftSize / 2 + 1;
5280
+ var lowMel = htkMel(lowFreq);
5281
+ var highMel = htkMel(highFreq);
5282
+ var melPoints = new Float64Array(numBins + 2);
5283
+ for (var i = 0; i < numBins + 2; i++) {
5284
+ melPoints[i] = lowMel + (highMel - lowMel) * i / (numBins + 1);
5285
+ }
5286
+ var binFreqs = new Float64Array(numBins + 2);
5287
+ for (var i = 0; i < numBins + 2; i++) {
5288
+ binFreqs[i] = htkMelInverse(melPoints[i]) * fftSize / sampleRate;
5289
+ }
5290
+ var filters = [];
5291
+ for (var m = 0; m < numBins; m++) {
5292
+ var left = binFreqs[m], center = binFreqs[m + 1], right = binFreqs[m + 2];
5293
+ var startBin = Math.max(0, Math.ceil(left));
5294
+ var endBin = Math.min(numFftBins - 1, Math.floor(right));
5295
+ var weights = new Float32Array(endBin - startBin + 1);
5296
+ for (var k = startBin; k <= endBin; k++) {
5297
+ if (k <= center) {
5298
+ weights[k - startBin] = (center - left) > 0 ? (k - left) / (center - left) : 0;
5299
+ } else {
5300
+ weights[k - startBin] = (right - center) > 0 ? (right - k) / (right - center) : 0;
5301
+ }
5302
+ }
5303
+ filters.push({ startBin: startBin, weights: weights });
5304
+ }
5305
+ return filters;
5306
+ }
5307
+
5308
+ function createHammingWindow(length) {
5309
+ var w = new Float32Array(length);
5310
+ for (var i = 0; i < length; i++) {
5311
+ w[i] = 0.54 - 0.46 * Math.cos(2 * Math.PI * i / (length - 1));
5312
+ }
5313
+ return w;
5314
+ }
5315
+
5316
+ function computeKaldiFbank(audio, sampleRate, numMelBins, opts) {
5317
+ var frameLengthMs = (opts && opts.frameLengthMs !== undefined) ? opts.frameLengthMs : 25;
5318
+ var frameShiftMs = (opts && opts.frameShiftMs !== undefined) ? opts.frameShiftMs : 10;
5319
+ var lowFreq = (opts && opts.lowFreq !== undefined) ? opts.lowFreq : 20;
5320
+ var highFreq = (opts && opts.highFreq !== undefined) ? opts.highFreq : (sampleRate / 2);
5321
+ var dither = (opts && opts.dither !== undefined) ? opts.dither : 0;
5322
+ var preemphasis = (opts && opts.preemphasis !== undefined) ? opts.preemphasis : 0.97;
5323
+
5324
+ var frameLengthSamples = Math.round(sampleRate * frameLengthMs / 1000);
5325
+ var frameShiftSamples = Math.round(sampleRate * frameShiftMs / 1000);
5326
+
5327
+ var scaled = new Float32Array(audio.length);
5328
+ for (var i = 0; i < audio.length; i++) { scaled[i] = audio[i] * 32768; }
5329
+
5330
+ if (dither > 0) {
5331
+ for (var i = 0; i < scaled.length; i++) {
5332
+ var u1 = Math.random(), u2 = Math.random();
5333
+ scaled[i] += dither * Math.sqrt(-2 * Math.log(u1 + 1e-10)) * Math.cos(2 * Math.PI * u2);
5334
+ }
5335
+ }
5336
+
5337
+ var numFrames = Math.max(0, Math.floor((scaled.length - frameLengthSamples) / frameShiftSamples) + 1);
5338
+ if (numFrames === 0) return new Float32Array(0);
5339
+
5340
+ var fftSize = 1;
5341
+ while (fftSize < frameLengthSamples) fftSize *= 2;
5342
+ var numFftBins = fftSize / 2 + 1;
5343
+
5344
+ var window = createHammingWindow(frameLengthSamples);
5345
+ var filters = buildMelFilterbank(numMelBins, fftSize, sampleRate, lowFreq, highFreq);
5346
+ var output = new Float32Array(numFrames * numMelBins);
5347
+ var fftRe = new Float64Array(fftSize);
5348
+ var fftIm = new Float64Array(fftSize);
5349
+
5350
+ for (var f = 0; f < numFrames; f++) {
5351
+ var offset = f * frameShiftSamples;
5352
+ fftRe.fill(0); fftIm.fill(0);
5353
+ for (var i = 0; i < frameLengthSamples; i++) {
5354
+ var sample = scaled[offset + i];
5355
+ if (preemphasis > 0 && i > 0) {
5356
+ sample -= preemphasis * scaled[offset + i - 1];
5357
+ } else if (preemphasis > 0 && i === 0 && offset > 0) {
5358
+ sample -= preemphasis * scaled[offset - 1];
5359
+ }
5360
+ fftRe[i] = sample * window[i];
5361
+ }
5362
+ fft(fftRe, fftIm);
5363
+ var outOffset = f * numMelBins;
5364
+ for (var m = 0; m < numMelBins; m++) {
5365
+ var filter = filters[m];
5366
+ var energy = 0;
5367
+ for (var k = 0; k < filter.weights.length; k++) {
5368
+ var bin = filter.startBin + k;
5369
+ if (bin < numFftBins) {
5370
+ var powerSpec = fftRe[bin] * fftRe[bin] + fftIm[bin] * fftIm[bin];
5371
+ energy += filter.weights[k] * powerSpec;
5372
+ }
5373
+ }
5374
+ output[outOffset + m] = Math.log(Math.max(energy, 1e-10));
5375
+ }
5376
+ }
5377
+ return output;
5378
+ }
5379
+
5380
+ function applyLFR(features, featureDim, lfrM, lfrN) {
5381
+ var numFrames = features.length / featureDim;
5382
+ if (numFrames === 0) return new Float32Array(0);
5383
+ var leftPad = Math.floor((lfrM - 1) / 2);
5384
+ var paddedLen = numFrames + leftPad;
5385
+ var numOutputFrames = Math.ceil(paddedLen / lfrN);
5386
+ var outputDim = featureDim * lfrM;
5387
+ var output = new Float32Array(numOutputFrames * outputDim);
5388
+ for (var i = 0; i < numOutputFrames; i++) {
5389
+ var startFrame = i * lfrN - leftPad;
5390
+ for (var j = 0; j < lfrM; j++) {
5391
+ var srcFrame = startFrame + j;
5392
+ if (srcFrame < 0) srcFrame = 0;
5393
+ if (srcFrame >= numFrames) srcFrame = numFrames - 1;
5394
+ var srcOffset = srcFrame * featureDim;
5395
+ var dstOffset = i * outputDim + j * featureDim;
5396
+ for (var k = 0; k < featureDim; k++) {
5397
+ output[dstOffset + k] = features[srcOffset + k];
5398
+ }
5399
+ }
5400
+ }
5401
+ return output;
5402
+ }
5403
+
5404
+ function applyCMVN(features, dim, negMeanVec, invStddevVec) {
5405
+ for (var i = 0; i < features.length; i++) {
5406
+ var d = i % dim;
5407
+ features[i] = (features[i] + negMeanVec[d]) * invStddevVec[d];
5408
+ }
5409
+ return features;
5410
+ }
5411
+
5412
+ function parseCMVNFromMetadata(negMeanStr, invStddevStr) {
5413
+ var negMeanArr = new Float32Array(
5414
+ negMeanStr.split(',').map(function(s) { return parseFloat(s.trim()); })
5415
+ );
5416
+ var invStddevArr = new Float32Array(
5417
+ invStddevStr.split(',').map(function(s) { return parseFloat(s.trim()); })
5418
+ );
5419
+ return { negMean: negMeanArr, invStddev: invStddevArr };
5420
+ }
5421
+
5422
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5423
+ // ctcDecoder.ts \u2014 inlined as plain JavaScript
5424
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5425
+
5426
+ var LANGUAGE_IDS = { 0: 'auto', 3: 'zh', 4: 'en', 7: 'yue', 11: 'ja', 12: 'ko', 13: 'nospeech' };
5427
+ var TEXT_NORM_IDS = { 14: 'with_itn', 15: 'without_itn' };
5428
+
5429
+ function resolveLanguageIdW(language) {
5430
+ var map = { auto: 0, zh: 3, en: 4, yue: 7, ja: 11, ko: 12 };
5431
+ return map[language] !== undefined ? map[language] : 0;
5432
+ }
5433
+
5434
+ function resolveTextNormIdW(textNorm) {
5435
+ return textNorm === 'without_itn' ? 15 : 14;
5436
+ }
5437
+
5438
+ function parseTokensFile(content) {
5439
+ var map = new Map();
5440
+ var lines = content.split('\\n');
5441
+ for (var idx = 0; idx < lines.length; idx++) {
5442
+ var trimmed = lines[idx].trim();
5443
+ if (!trimmed) continue;
5444
+ var lastSpace = trimmed.lastIndexOf(' ');
5445
+ if (lastSpace === -1) continue;
5446
+ var token = trimmed.substring(0, lastSpace);
5447
+ var id = parseInt(trimmed.substring(lastSpace + 1), 10);
5448
+ if (!isNaN(id)) map.set(id, token);
5449
+ }
5450
+ return map;
5451
+ }
5452
+
5453
+ function parseStructuredToken(token) {
5454
+ var match = token.match(/^<\\|(.+)\\|>$/);
5455
+ if (!match) return null;
5456
+ var value = match[1];
5457
+ if (value === 'zh' || value === 'en' || value === 'ja' || value === 'ko' || value === 'yue' || value === 'nospeech') {
5458
+ return { type: 'language', value: value };
5459
+ }
5460
+ var emotions = ['HAPPY', 'SAD', 'ANGRY', 'NEUTRAL', 'FEARFUL', 'DISGUSTED', 'SURPRISED', 'EMO_UNKNOWN'];
5461
+ if (emotions.indexOf(value) !== -1) return { type: 'emotion', value: value };
5462
+ var events = ['Speech', 'BGM', 'Applause', 'Laughter', 'Crying', 'Coughing', 'Sneezing', 'EVENT_UNKNOWN'];
5463
+ if (events.indexOf(value) !== -1) return { type: 'event', value: value };
5464
+ if (value === 'withitn' || value === 'woitn' || value === 'with_itn' || value === 'without_itn') {
5465
+ return { type: 'textnorm', value: value };
5466
+ }
5467
+ return null;
5468
+ }
5469
+
5470
+ function ctcGreedyDecode(logits, seqLen, vocabSz, tokenMapLocal) {
5471
+ var tokenIds = [];
5472
+ for (var t = 0; t < seqLen; t++) {
5473
+ var offset = t * vocabSz;
5474
+ var maxIdx = 0, maxVal = logits[offset];
5475
+ for (var v = 1; v < vocabSz; v++) {
5476
+ if (logits[offset + v] > maxVal) { maxVal = logits[offset + v]; maxIdx = v; }
5477
+ }
5478
+ tokenIds.push(maxIdx);
5479
+ }
5480
+ var collapsed = [], prev = -1;
5481
+ for (var idx = 0; idx < tokenIds.length; idx++) {
5482
+ var id = tokenIds[idx];
5483
+ if (id !== prev) { collapsed.push(id); prev = id; }
5484
+ }
5485
+ var filtered = collapsed.filter(function(id) { return id !== 0 && id !== 1 && id !== 2; });
5486
+ var language = undefined, emotion = undefined, event = undefined;
5487
+ var textTokens = [];
5488
+ for (var idx = 0; idx < filtered.length; idx++) {
5489
+ var id = filtered[idx];
5490
+ var token = tokenMapLocal.get(id);
5491
+ if (!token) continue;
5492
+ var structured = parseStructuredToken(token);
5493
+ if (structured) {
5494
+ if (structured.type === 'language') language = structured.value;
5495
+ else if (structured.type === 'emotion') emotion = structured.value;
5496
+ else if (structured.type === 'event') event = structured.value;
5497
+ } else {
5498
+ textTokens.push(token);
5499
+ }
5500
+ }
5501
+ var text = textTokens.join('');
5502
+ text = text.replace(/\\u2581/g, ' ').trim();
5503
+ return { text: text, language: language, emotion: emotion, event: event };
5504
+ }
5505
+
5506
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5507
+ // blendshapeUtils.ts \u2014 inlined
5508
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5509
+
5510
+ var SYMMETRIC_INDEX_PAIRS = [
5511
+ [23, 25], [32, 38], [43, 44], [29, 30], [27, 28], [45, 46],
5512
+ [35, 36], [47, 48], [33, 34], [49, 50], [6, 7], [0, 1],
5513
+ [3, 4], [8, 9], [16, 17], [10, 11], [12, 13], [14, 15],
5514
+ [18, 19], [20, 21],
5515
+ ];
5516
+
5517
+ function symmetrizeBlendshapes(frame) {
5518
+ var result = new Float32Array(frame);
5519
+ for (var p = 0; p < SYMMETRIC_INDEX_PAIRS.length; p++) {
5520
+ var lIdx = SYMMETRIC_INDEX_PAIRS[p][0], rIdx = SYMMETRIC_INDEX_PAIRS[p][1];
5521
+ var avg = (frame[lIdx] + frame[rIdx]) / 2;
5522
+ result[lIdx] = avg;
5523
+ result[rIdx] = avg;
5524
+ }
5525
+ return result;
5526
+ }
5527
+
5528
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5529
+ // Shared ORT loader
5530
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5531
+
5532
+ async function loadOrt(wasmPaths, isIOSDevice) {
5533
+ if (ort) return;
5534
+ var ortUrl = wasmPaths + 'ort.wasm.min.js';
5535
+ var response = await fetch(ortUrl);
5536
+ var scriptText = await response.text();
5537
+ var blob = new Blob([scriptText], { type: 'application/javascript' });
5538
+ var blobUrl = URL.createObjectURL(blob);
5539
+ importScripts(blobUrl);
5540
+ URL.revokeObjectURL(blobUrl);
5541
+ ort = self.ort;
5542
+ ort.env.wasm.wasmPaths = wasmPaths;
5543
+ ort.env.wasm.numThreads = isIOSDevice ? 1 : 4;
5544
+ ort.env.wasm.simd = true;
5545
+ ort.env.wasm.proxy = false;
5546
+ }
5547
+
5548
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5549
+ // SenseVoice handlers
5550
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5551
+
5552
+ async function svLoad(msg) {
5553
+ var tokensResponse = await fetch(msg.tokensUrl);
5554
+ if (!tokensResponse.ok) throw new Error('Failed to fetch tokens.txt: ' + tokensResponse.status);
5555
+ var tokensText = await tokensResponse.text();
5556
+ svTokenMap = parseTokensFile(tokensText);
5557
+ svLanguageId = msg.language;
5558
+ svTextNormId = msg.textNorm;
5559
+
5560
+ var sessionOptions = { executionProviders: ['wasm'], graphOptimizationLevel: 'all' };
5561
+ if (msg.isIOS) {
5562
+ svSession = await ort.InferenceSession.create(msg.modelUrl, sessionOptions);
5563
+ } else {
5564
+ var modelResponse = await fetch(msg.modelUrl);
5565
+ if (!modelResponse.ok) throw new Error('Failed to fetch model: ' + modelResponse.status);
5566
+ var modelBuffer = await modelResponse.arrayBuffer();
5567
+ svSession = await ort.InferenceSession.create(new Uint8Array(modelBuffer), sessionOptions);
5568
+ }
5569
+
5570
+ try {
5571
+ var metadata = svSession.handler && svSession.handler.metadata;
5572
+ if (metadata && metadata.neg_mean && metadata.inv_stddev) {
5573
+ var cmvn = parseCMVNFromMetadata(metadata.neg_mean, metadata.inv_stddev);
5574
+ svNegMean = cmvn.negMean;
5575
+ svInvStddev = cmvn.invStddev;
5576
+ }
5577
+ } catch (e) { /* CMVN not available */ }
5578
+
5579
+ svVocabSize = 0;
5580
+ svTokenMap.forEach(function(val, key) { if (key >= svVocabSize) svVocabSize = key + 1; });
5581
+
5582
+ return {
5583
+ vocabSize: svVocabSize,
5584
+ inputNames: svSession.inputNames.slice(),
5585
+ outputNames: svSession.outputNames.slice(),
5586
+ };
5587
+ }
5588
+
5589
+ async function svTranscribe(audio) {
5590
+ var preprocessStart = performance.now();
5591
+ var fbank = computeKaldiFbank(audio, 16000, 80);
5592
+ var numFrames = fbank.length / 80;
5593
+ if (numFrames === 0) {
5594
+ return { text: '', inferenceTimeMs: performance.now() - preprocessStart, preprocessTimeMs: performance.now() - preprocessStart };
5595
+ }
5596
+ var lfrFeatures = applyLFR(fbank, 80, 7, 6);
5597
+ var numLfrFrames = lfrFeatures.length / 560;
5598
+ if (svNegMean && svInvStddev) applyCMVN(lfrFeatures, 560, svNegMean, svInvStddev);
5599
+ var preprocessTimeMs = performance.now() - preprocessStart;
5600
+
5601
+ var feeds = {
5602
+ x: new ort.Tensor('float32', lfrFeatures, [1, numLfrFrames, 560]),
5603
+ x_length: new ort.Tensor('int32', new Int32Array([numLfrFrames]), [1]),
5604
+ language: new ort.Tensor('int32', new Int32Array([svLanguageId]), [1]),
5605
+ text_norm: new ort.Tensor('int32', new Int32Array([svTextNormId]), [1]),
5606
+ };
5607
+ var results = await svSession.run(feeds);
5608
+ var logitsOutput = results['logits'];
5609
+ if (!logitsOutput) throw new Error('Model output missing "logits" tensor');
5610
+
5611
+ var decoded = ctcGreedyDecode(logitsOutput.data, logitsOutput.dims[1], logitsOutput.dims[2], svTokenMap);
5612
+ var totalTimeMs = performance.now() - preprocessStart;
5613
+
5614
+ return {
5615
+ text: decoded.text, language: decoded.language, emotion: decoded.emotion, event: decoded.event,
5616
+ inferenceTimeMs: totalTimeMs, preprocessTimeMs: preprocessTimeMs,
5617
+ };
5618
+ }
5619
+
5620
+ async function svDispose() {
5621
+ if (svSession) { await svSession.release(); svSession = null; }
5622
+ svTokenMap = null; svNegMean = null; svInvStddev = null;
5623
+ }
5624
+
5625
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5626
+ // Wav2ArkitCpu handlers
5627
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5628
+
5629
+ async function cpuLoad(msg) {
5630
+ var sessionOptions = { executionProviders: ['wasm'], graphOptimizationLevel: 'all' };
5631
+ var dataFilename = msg.externalDataUrl ? msg.externalDataUrl.split('/').pop() : null;
5632
+
5633
+ if (msg.isIOS) {
5634
+ if (msg.externalDataUrl && dataFilename) {
5635
+ sessionOptions.externalData = [{ path: dataFilename, data: msg.externalDataUrl }];
5636
+ }
5637
+ cpuSession = await ort.InferenceSession.create(msg.modelUrl, sessionOptions);
5638
+ } else {
5639
+ var graphResponse = await fetch(msg.modelUrl);
5640
+ if (!graphResponse.ok) throw new Error('Failed to fetch model graph: ' + graphResponse.status);
5641
+ var graphBuffer = await graphResponse.arrayBuffer();
5642
+ if (msg.externalDataUrl && dataFilename) {
5643
+ var dataResponse = await fetch(msg.externalDataUrl);
5644
+ if (!dataResponse.ok) throw new Error('Failed to fetch external data: ' + dataResponse.status);
5645
+ var dataBuffer = await dataResponse.arrayBuffer();
5646
+ sessionOptions.externalData = [{ path: dataFilename, data: new Uint8Array(dataBuffer) }];
5647
+ }
5648
+ cpuSession = await ort.InferenceSession.create(new Uint8Array(graphBuffer), sessionOptions);
5649
+ }
5650
+
5651
+ // Warmup
5652
+ var warmupAudio = new Float32Array(16000);
5653
+ var warmupTensor = new ort.Tensor('float32', warmupAudio, [1, warmupAudio.length]);
5654
+ await cpuSession.run({ audio_waveform: warmupTensor });
5655
+
5656
+ return {
5657
+ inputNames: cpuSession.inputNames.slice(),
5658
+ outputNames: cpuSession.outputNames.slice(),
5659
+ };
5660
+ }
5661
+
5662
+ async function cpuInfer(audio) {
5663
+ var tensor = new ort.Tensor('float32', audio, [1, audio.length]);
5664
+ var results = await cpuSession.run({ audio_waveform: tensor });
5665
+ var blendshapeOutput = results['blendshapes'];
5666
+ if (!blendshapeOutput) throw new Error('Missing blendshapes output from model');
5667
+
5668
+ var blendshapeData = blendshapeOutput.data;
5669
+ var numFrames = blendshapeOutput.dims[1];
5670
+ var numBlendshapes = blendshapeOutput.dims[2];
5671
+
5672
+ var flatBuffer = new Float32Array(numFrames * numBlendshapes);
5673
+ for (var f = 0; f < numFrames; f++) {
5674
+ var offset = f * numBlendshapes;
5675
+ var rawFrame = blendshapeData.slice(offset, offset + numBlendshapes);
5676
+ var symmetrized = symmetrizeBlendshapes(rawFrame);
5677
+ flatBuffer.set(symmetrized, offset);
5678
+ }
5679
+ return { flatBuffer: flatBuffer, numFrames: numFrames, numBlendshapes: numBlendshapes };
5680
+ }
5681
+
5682
+ async function cpuDispose() {
5683
+ if (cpuSession) { await cpuSession.release(); cpuSession = null; }
5684
+ }
5685
+
5686
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5687
+ // Silero VAD handlers
5688
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5689
+
5690
+ async function vadLoad(msg) {
5691
+ vadSampleRate = msg.sampleRate;
5692
+ vadChunkSize = vadSampleRate === 16000 ? 512 : 256;
5693
+ vadContextSize = vadSampleRate === 16000 ? 64 : 32;
5694
+
5695
+ var response = await fetch(msg.modelUrl);
5696
+ if (!response.ok) throw new Error('Failed to fetch VAD model: ' + response.status);
5697
+ var modelBuffer = await response.arrayBuffer();
5698
+ vadSession = await ort.InferenceSession.create(new Uint8Array(modelBuffer), {
5699
+ executionProviders: ['wasm'],
5700
+ graphOptimizationLevel: 'all',
5701
+ });
5702
+
5703
+ return {
5704
+ inputNames: vadSession.inputNames.slice(),
5705
+ outputNames: vadSession.outputNames.slice(),
5706
+ };
5707
+ }
5708
+
5709
+ async function vadProcess(audio, state, context) {
5710
+ var inputSize = vadContextSize + vadChunkSize;
5711
+ var inputBuffer = new Float32Array(inputSize);
5712
+ inputBuffer.set(context, 0);
5713
+ inputBuffer.set(audio, vadContextSize);
5714
+
5715
+ var inputTensor = new ort.Tensor('float32', new Float32Array(inputBuffer), [1, inputSize]);
5716
+ var stateTensor = new ort.Tensor('float32', new Float32Array(state), [2, 1, 128]);
5717
+ var srTensor;
5718
+ try {
5719
+ srTensor = new ort.Tensor('int64', new BigInt64Array([BigInt(vadSampleRate)]), []);
5720
+ } catch (e) {
5721
+ srTensor = new ort.Tensor('int64', [BigInt(vadSampleRate)], []);
5722
+ }
5723
+
5724
+ var feeds = { 'input': inputTensor, 'state': stateTensor, 'sr': srTensor };
5725
+ var results = await vadSession.run(feeds);
5726
+ var outputTensor = results['output'];
5727
+ var newStateTensor = results['stateN'] || results['state'];
5728
+ if (!outputTensor) throw new Error('Missing output tensor from VAD model');
5729
+
5730
+ return { probability: outputTensor.data[0], newState: new Float32Array(newStateTensor.data) };
5731
+ }
5732
+
5733
+ function vadCreateInitialState() {
5734
+ return new Float32Array(2 * 1 * 128);
5735
+ }
5736
+
5737
+ async function vadDispose() {
5738
+ if (vadSession) { await vadSession.release(); vadSession = null; }
5739
+ }
5740
+
5741
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5742
+ // Message handler
5743
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5744
+
5745
+ self.onmessage = async function(e) {
5746
+ var msg = e.data;
5747
+ var requestId = msg.requestId;
5748
+
5749
+ try {
5750
+ switch (msg.type) {
5751
+ case 'init': {
5752
+ var startTime = performance.now();
5753
+ await loadOrt(msg.wasmPaths, msg.isIOS);
5754
+ self.postMessage({ type: 'init:done', requestId: requestId, loadTimeMs: performance.now() - startTime });
5755
+ break;
5756
+ }
5757
+
5758
+ case 'sv:load': {
5759
+ var startTime = performance.now();
5760
+ var info = await svLoad(msg);
5761
+ self.postMessage({
5762
+ type: 'sv:loaded', requestId: requestId, vocabSize: info.vocabSize,
5763
+ inputNames: info.inputNames, outputNames: info.outputNames,
5764
+ loadTimeMs: performance.now() - startTime,
5765
+ });
5766
+ break;
5767
+ }
5768
+
5769
+ case 'sv:transcribe': {
5770
+ var result = await svTranscribe(msg.audio);
5771
+ self.postMessage({
5772
+ type: 'sv:result', requestId: requestId,
5773
+ text: result.text, language: result.language, emotion: result.emotion, event: result.event,
5774
+ inferenceTimeMs: result.inferenceTimeMs, preprocessTimeMs: result.preprocessTimeMs,
5775
+ });
5776
+ break;
5777
+ }
5778
+
5779
+ case 'sv:dispose': {
5780
+ await svDispose();
5781
+ self.postMessage({ type: 'sv:disposed', requestId: requestId });
5782
+ break;
5783
+ }
5784
+
5785
+ case 'cpu:load': {
5786
+ var startTime = performance.now();
5787
+ var info = await cpuLoad(msg);
5788
+ self.postMessage({
5789
+ type: 'cpu:loaded', requestId: requestId,
5790
+ inputNames: info.inputNames, outputNames: info.outputNames,
5791
+ loadTimeMs: performance.now() - startTime,
5792
+ });
5793
+ break;
5794
+ }
5795
+
5796
+ case 'cpu:infer': {
5797
+ var startTime = performance.now();
5798
+ var result = await cpuInfer(msg.audio);
5799
+ var inferenceTimeMs = performance.now() - startTime;
5800
+ self.postMessage({
5801
+ type: 'cpu:result', requestId: requestId,
5802
+ blendshapes: result.flatBuffer, numFrames: result.numFrames,
5803
+ numBlendshapes: result.numBlendshapes, inferenceTimeMs: inferenceTimeMs,
5804
+ }, [result.flatBuffer.buffer]);
5805
+ break;
5806
+ }
5807
+
5808
+ case 'cpu:dispose': {
5809
+ await cpuDispose();
5810
+ self.postMessage({ type: 'cpu:disposed', requestId: requestId });
5811
+ break;
5812
+ }
5813
+
5814
+ case 'vad:load': {
5815
+ var startTime = performance.now();
5816
+ var info = await vadLoad(msg);
5817
+ self.postMessage({
5818
+ type: 'vad:loaded', requestId: requestId,
5819
+ inputNames: info.inputNames, outputNames: info.outputNames,
5820
+ loadTimeMs: performance.now() - startTime,
5821
+ });
5822
+ break;
5823
+ }
5824
+
5825
+ case 'vad:process': {
5826
+ var startTime = performance.now();
5827
+ var result = await vadProcess(msg.audio, msg.state, msg.context);
5828
+ self.postMessage({
5829
+ type: 'vad:result', requestId: requestId,
5830
+ probability: result.probability, state: result.newState,
5831
+ inferenceTimeMs: performance.now() - startTime,
5832
+ });
5833
+ break;
5834
+ }
5835
+
5836
+ case 'vad:reset': {
5837
+ var state = vadCreateInitialState();
5838
+ self.postMessage({ type: 'vad:reset', requestId: requestId, state: state });
5839
+ break;
5840
+ }
5841
+
5842
+ case 'vad:dispose': {
5843
+ await vadDispose();
5844
+ self.postMessage({ type: 'vad:disposed', requestId: requestId });
5845
+ break;
5846
+ }
5847
+
5848
+ case 'dispose-all': {
5849
+ await svDispose();
5850
+ await cpuDispose();
5851
+ await vadDispose();
5852
+ ort = null;
5853
+ self.postMessage({ type: 'dispose-all:done', requestId: requestId });
5854
+ break;
5855
+ }
5856
+
5857
+ default:
5858
+ self.postMessage({ type: 'error', requestId: requestId, error: 'Unknown message type: ' + msg.type });
5859
+ }
5860
+ } catch (err) {
5861
+ var errorMsg = err.message || String(err);
5862
+ if (typeof err === 'number') {
5863
+ errorMsg = 'Raw C++ exception pointer (0x' + err.toString(16) + '). Likely OOM in WASM.';
5864
+ }
5865
+ self.postMessage({ type: 'error', requestId: requestId, error: errorMsg });
5866
+ }
5867
+ };
5868
+
5869
+ self.onerror = function(err) {
5870
+ self.postMessage({ type: 'error', requestId: null, error: 'Worker error: ' + (err.message || String(err)) });
5871
+ };
5872
+ `;
5873
var UnifiedInferenceWorker = class {
  /**
   * Owns the single Web Worker that hosts every WASM inference session
   * (SenseVoice ASR, Wav2ArkitCpu lip sync, Silero VAD) and exposes a
   * promise-based request/response protocol with per-operation timeouts.
   *
   * Timeout handling is deliberately strict: a timed-out operation marks the
   * instance "poisoned" because the worker may still be stuck inside a WASM
   * dispatch, and all subsequent calls are refused until dispose().
   */
  constructor() {
    this.worker = null;
    // requestId -> { resolve, reject, timeout } for every in-flight request.
    this.pendingRequests = /* @__PURE__ */ new Map();
    this.initialized = false;
    this.poisoned = false;
  }
  /**
   * Initialize the worker (load ORT WASM from CDN)
   */
  async init() {
    if (this.initialized) return;
    const startTime = performance.now();
    const telemetry = getTelemetry();
    const span = telemetry?.startSpan("UnifiedInferenceWorker.init");
    try {
      logger6.info("Creating unified inference worker...");
      this.worker = this.createWorker();
      await this.sendMessage(
        { type: "init", wasmPaths: WASM_CDN_PATH3, isIOS: isIOS() },
        "init:done",
        INIT_TIMEOUT_MS
      );
      this.initialized = true;
      const loadTimeMs = performance.now() - startTime;
      logger6.info("Unified worker initialized", { loadTimeMs: Math.round(loadTimeMs) });
      span?.setAttributes({ "worker.init_time_ms": loadTimeMs });
      span?.end();
    } catch (error) {
      span?.endWithError(error instanceof Error ? error : new Error(String(error)));
      this.cleanup();
      throw error;
    }
  }
  // ── SenseVoice ────────────────────────────────────────────────────────
  /** Load the SenseVoice ASR model inside the worker. */
  async loadSenseVoice(config) {
    this.assertReady();
    const startTime = performance.now();
    const result = await this.sendMessage(
      {
        type: "sv:load",
        modelUrl: resolveUrl2(config.modelUrl),
        tokensUrl: resolveUrl2(config.tokensUrl),
        isIOS: isIOS(),
        language: config.language,
        textNorm: config.textNorm
      },
      "sv:loaded",
      SV_LOAD_TIMEOUT_MS
    );
    const loadTimeMs = performance.now() - startTime;
    return {
      backend: "wasm",
      loadTimeMs,
      inputNames: result.inputNames,
      outputNames: result.outputNames,
      vocabSize: result.vocabSize
    };
  }
  /** Run ASR on a Float32Array of PCM samples via the worker. */
  async transcribe(audio) {
    this.assertReady();
    const result = await this.sendMessage(
      { type: "sv:transcribe", audio },
      "sv:result",
      SV_INFER_TIMEOUT_MS
    );
    return {
      text: result.text,
      language: result.language,
      emotion: result.emotion,
      event: result.event,
      inferenceTimeMs: result.inferenceTimeMs,
      preprocessTimeMs: result.preprocessTimeMs
    };
  }
  /** Release the SenseVoice session; no-op if the worker is already gone. */
  async disposeSenseVoice() {
    if (!this.worker) return;
    await this.sendMessage({ type: "sv:dispose" }, "sv:disposed", DISPOSE_TIMEOUT_MS);
  }
  // ── Wav2ArkitCpu (Lip Sync) ──────────────────────────────────────────
  /** Load the lip-sync model (optionally with an external weights file). */
  async loadLipSync(config) {
    this.assertReady();
    const startTime = performance.now();
    const result = await this.sendMessage(
      {
        type: "cpu:load",
        modelUrl: resolveUrl2(config.modelUrl),
        externalDataUrl: config.externalDataUrl ? resolveUrl2(config.externalDataUrl) : null,
        isIOS: isIOS()
      },
      "cpu:loaded",
      CPU_LOAD_TIMEOUT_MS
    );
    const loadTimeMs = performance.now() - startTime;
    return {
      backend: "wasm",
      loadTimeMs,
      inputNames: result.inputNames,
      outputNames: result.outputNames
    };
  }
  /** Run lip-sync inference; resolves with the worker's flat result payload. */
  async inferLipSync(audio) {
    this.assertReady();
    return this.sendMessage(
      { type: "cpu:infer", audio },
      "cpu:result",
      CPU_INFER_TIMEOUT_MS
    );
  }
  /** Release the lip-sync session; no-op if the worker is already gone. */
  async disposeLipSync() {
    if (!this.worker) return;
    await this.sendMessage({ type: "cpu:dispose" }, "cpu:disposed", DISPOSE_TIMEOUT_MS);
  }
  // ── Silero VAD ────────────────────────────────────────────────────────
  /** Load the Silero VAD model; chunk size is derived from the sample rate. */
  async loadVAD(config) {
    this.assertReady();
    const startTime = performance.now();
    const chunkSize = config.sampleRate === 16e3 ? 512 : 256;
    const result = await this.sendMessage(
      {
        type: "vad:load",
        modelUrl: resolveUrl2(config.modelUrl),
        sampleRate: config.sampleRate
      },
      "vad:loaded",
      VAD_LOAD_TIMEOUT_MS
    );
    const loadTimeMs = performance.now() - startTime;
    return {
      backend: "wasm",
      loadTimeMs,
      inputNames: result.inputNames,
      outputNames: result.outputNames,
      sampleRate: config.sampleRate,
      chunkSize
    };
  }
  /** Run one VAD step: audio chunk plus recurrent state and audio context. */
  async processVAD(audio, state, context) {
    this.assertReady();
    return this.sendMessage(
      { type: "vad:process", audio, state, context },
      "vad:result",
      VAD_INFER_TIMEOUT_MS
    );
  }
  /** Ask the worker for a zeroed VAD recurrent state. */
  async resetVAD() {
    this.assertReady();
    const result = await this.sendMessage(
      { type: "vad:reset" },
      "vad:reset",
      VAD_INFER_TIMEOUT_MS
    );
    return result.state;
  }
  /** Release the VAD session; no-op if the worker is already gone. */
  async disposeVAD() {
    if (!this.worker) return;
    await this.sendMessage({ type: "vad:dispose" }, "vad:disposed", DISPOSE_TIMEOUT_MS);
  }
  // ── Lifecycle ─────────────────────────────────────────────────────────
  /**
   * Dispose all sessions and terminate the worker. Best-effort: a failing
   * "dispose-all" round-trip is swallowed because the worker is terminated
   * regardless.
   */
  async dispose() {
    if (this.worker) {
      try {
        await this.sendMessage({ type: "dispose-all" }, "dispose-all:done", DISPOSE_TIMEOUT_MS);
      } catch {
      }
      this.worker.terminate();
      this.worker = null;
    }
    this.initialized = false;
    this.poisoned = false;
    // rejectAllPending() already clears the map; no extra clear() is needed.
    this.rejectAllPending("Worker disposed");
  }
  /** Check if the worker is initialized and not poisoned */
  get isReady() {
    return this.initialized && !this.poisoned && this.worker !== null;
  }
  /** Check if Web Workers are supported */
  static isSupported() {
    return typeof Worker !== "undefined";
  }
  // ── Private ───────────────────────────────────────────────────────────
  /** Throw if the worker is unusable (never initialized, or poisoned). */
  assertReady() {
    if (!this.initialized || !this.worker) {
      throw new Error("UnifiedInferenceWorker not initialized. Call init() first.");
    }
    if (this.poisoned) {
      throw new Error("UnifiedInferenceWorker timed out \u2014 unavailable until page reload");
    }
  }
  /** Build the worker from the inlined script via a (revoked) blob URL. */
  createWorker() {
    const blob = new Blob([WORKER_SCRIPT2], { type: "application/javascript" });
    const blobUrl = URL.createObjectURL(blob);
    const worker = new Worker(blobUrl);
    // Safe to revoke immediately: the worker's script fetch has started.
    URL.revokeObjectURL(blobUrl);
    worker.onmessage = (event) => {
      this.handleWorkerMessage(event.data);
    };
    worker.onerror = (error) => {
      logger6.error("Unified worker error", { error: error.message });
      this.rejectAllPending(`Worker error: ${error.message}`);
    };
    return worker;
  }
  /**
   * Route a worker message to its pending request by requestId.
   * An "error" without a matching request is treated as a broadcast failure
   * and rejects everything in flight.
   */
  handleWorkerMessage(data) {
    const requestId = data.requestId;
    if (data.type === "error") {
      if (requestId && this.pendingRequests.has(requestId)) {
        const pending = this.pendingRequests.get(requestId);
        clearTimeout(pending.timeout);
        this.pendingRequests.delete(requestId);
        pending.reject(new Error(data.error));
      } else {
        logger6.error("Worker broadcast error", { error: data.error });
        this.rejectAllPending(data.error);
      }
      return;
    }
    if (requestId && this.pendingRequests.has(requestId)) {
      const pending = this.pendingRequests.get(requestId);
      clearTimeout(pending.timeout);
      this.pendingRequests.delete(requestId);
      pending.resolve(data);
    }
  }
  /**
   * Post a message and wait for its reply (matched by requestId; the
   * expectedType parameter is part of the call contract but matching is
   * id-based). A timeout poisons the instance permanently.
   */
  sendMessage(message, expectedType, timeoutMs) {
    return new Promise((resolve, reject) => {
      if (!this.worker) {
        reject(new Error("Worker not initialized"));
        return;
      }
      const requestId = nextRequestId();
      const timeout = setTimeout(() => {
        this.pendingRequests.delete(requestId);
        this.poisoned = true;
        logger6.error("CRITICAL: Worker operation timed out \u2014 worker is dead", {
          type: message.type,
          timeoutMs
        });
        reject(new Error(`Worker operation '${message.type}' timed out after ${timeoutMs}ms`));
      }, timeoutMs);
      this.pendingRequests.set(requestId, {
        resolve,
        reject,
        timeout
      });
      this.worker.postMessage({ ...message, requestId });
    });
  }
  /** Reject every in-flight request with the given reason and clear the map. */
  rejectAllPending(reason) {
    for (const [, pending] of this.pendingRequests) {
      clearTimeout(pending.timeout);
      pending.reject(new Error(reason));
    }
    this.pendingRequests.clear();
  }
  /** Tear down after a failed init: terminate and fail anything pending. */
  cleanup() {
    if (this.worker) {
      this.worker.terminate();
      this.worker = null;
    }
    this.initialized = false;
    // rejectAllPending() already clears the map; no extra clear() is needed.
    this.rejectAllPending("Worker cleanup");
  }
};
6139
var SenseVoiceUnifiedAdapter = class {
  /**
   * SenseVoice ASR adapter that delegates load/transcribe/dispose to a
   * shared UnifiedInferenceWorker instead of owning its own worker.
   */
  constructor(worker, config) {
    this._isLoaded = false;
    // Chains transcriptions so the worker handles one request at a time.
    this.inferenceQueue = Promise.resolve();
    this.worker = worker;
    const baseDir = config.modelUrl.substring(0, config.modelUrl.lastIndexOf("/"));
    this.config = {
      modelUrl: config.modelUrl,
      tokensUrl: config.tokensUrl ?? `${baseDir}/tokens.txt`,
      language: config.language ?? "auto",
      textNorm: config.textNorm ?? "with_itn"
    };
    this.languageId = resolveLanguageId(this.config.language);
    this.textNormId = resolveTextNormId(this.config.textNorm);
  }
  get isLoaded() {
    return this._isLoaded;
  }
  get backend() {
    return this._isLoaded ? "wasm" : null;
  }
  /** Load the model in the shared worker; reports 1/1 progress on success. */
  async load(onProgress) {
    const telemetry = getTelemetry();
    const span = telemetry?.startSpan("SenseVoiceUnifiedAdapter.load", {
      "model.url": this.config.modelUrl
    });
    try {
      const loadResult = await this.worker.loadSenseVoice({
        modelUrl: this.config.modelUrl,
        tokensUrl: this.config.tokensUrl,
        language: this.languageId,
        textNorm: this.textNormId
      });
      this._isLoaded = true;
      onProgress?.(1, 1);
      logger6.info("SenseVoice loaded via unified worker", {
        backend: "wasm",
        loadTimeMs: Math.round(loadResult.loadTimeMs),
        vocabSize: loadResult.vocabSize
      });
      span?.setAttributes({ "model.backend": "wasm", "model.load_time_ms": loadResult.loadTimeMs });
      span?.end();
      telemetry?.recordHistogram("omote.model.load_time", loadResult.loadTimeMs, {
        model: "sensevoice-unified",
        backend: "wasm"
      });
      return loadResult;
    } catch (err) {
      span?.endWithError(err instanceof Error ? err : new Error(String(err)));
      throw err;
    }
  }
  /** Queue a transcription; the input is copied before being handed off. */
  async transcribe(audioSamples) {
    if (!this._isLoaded) throw new Error("Model not loaded. Call load() first.");
    const snapshot = new Float32Array(audioSamples);
    return new Promise((resolve, reject) => {
      this.inferenceQueue = this.inferenceQueue.then(
        () => this.worker.transcribe(snapshot).then(resolve, reject)
      );
    });
  }
  /** Release the worker-side session if one was loaded. */
  async dispose() {
    if (this._isLoaded) {
      await this.worker.disposeSenseVoice();
      this._isLoaded = false;
    }
  }
};
6212
var Wav2ArkitCpuUnifiedAdapter = class {
  /**
   * Lip-sync (audio -> ARKit blendshapes) adapter that runs inference in the
   * shared UnifiedInferenceWorker.
   */
  constructor(worker, config) {
    this.modelId = "wav2arkit_cpu";
    this._isLoaded = false;
    // Chains infer() calls so the worker sees one request at a time.
    this.inferenceQueue = Promise.resolve();
    this.worker = worker;
    this.config = config;
  }
  get isLoaded() {
    return this._isLoaded;
  }
  get backend() {
    return this._isLoaded ? "wasm" : null;
  }
  /** Load the model in the shared worker, resolving the weights-file URL. */
  async load() {
    const telemetry = getTelemetry();
    const span = telemetry?.startSpan("Wav2ArkitCpuUnifiedAdapter.load", {
      "model.url": this.config.modelUrl
    });
    try {
      // `externalDataUrl === false` disables external weights entirely;
      // otherwise fall back to the conventional "<model>.data" sibling file.
      const externalDataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${this.config.modelUrl}.data` : null;
      const loadResult = await this.worker.loadLipSync({
        modelUrl: this.config.modelUrl,
        externalDataUrl: externalDataUrl || null
      });
      this._isLoaded = true;
      logger6.info("Wav2ArkitCpu loaded via unified worker", {
        backend: "wasm",
        loadTimeMs: Math.round(loadResult.loadTimeMs)
      });
      span?.setAttributes({ "model.backend": "wasm", "model.load_time_ms": loadResult.loadTimeMs });
      span?.end();
      telemetry?.recordHistogram("omote.model.load_time", loadResult.loadTimeMs, {
        model: "wav2arkit_cpu-unified",
        backend: "wasm"
      });
      return loadResult;
    } catch (err) {
      span?.endWithError(err instanceof Error ? err : new Error(String(err)));
      throw err;
    }
  }
  /**
   * Queue an inference. The worker returns one flat Float32Array; it is
   * re-split here into one per-frame array of blendshape weights.
   * (_identityIndex is accepted for interface parity but unused.)
   */
  async infer(audioSamples, _identityIndex) {
    if (!this._isLoaded) throw new Error("Model not loaded. Call load() first.");
    const pcm = new Float32Array(audioSamples);
    return new Promise((resolve, reject) => {
      this.inferenceQueue = this.inferenceQueue.then(async () => {
        const telemetry = getTelemetry();
        const span = telemetry?.startSpan("Wav2ArkitCpuUnifiedAdapter.infer", {
          "inference.input_samples": pcm.length
        });
        try {
          const t0 = performance.now();
          const raw = await this.worker.inferLipSync(pcm);
          const inferenceTimeMs = performance.now() - t0;
          const { numFrames, numBlendshapes } = raw;
          const flat = raw.blendshapes;
          const blendshapes = Array.from({ length: numFrames }, (_, frame) =>
            flat.slice(frame * numBlendshapes, (frame + 1) * numBlendshapes)
          );
          span?.setAttributes({
            "inference.duration_ms": inferenceTimeMs,
            "inference.frames": numFrames
          });
          span?.end();
          resolve({ blendshapes, numFrames, inferenceTimeMs });
        } catch (err) {
          span?.endWithError(err instanceof Error ? err : new Error(String(err)));
          reject(err);
        }
      });
    });
  }
  /** Release the worker-side session if one was loaded. */
  async dispose() {
    if (this._isLoaded) {
      await this.worker.disposeLipSync();
      this._isLoaded = false;
    }
  }
};
6293
var SileroVADUnifiedAdapter = class {
  /**
   * Silero VAD adapter backed by the shared UnifiedInferenceWorker.
   *
   * Keeps the model's recurrent state and trailing audio context locally and
   * ships both with every chunk; also maintains a bounded "pre-speech"
   * buffer so audio immediately preceding a speech onset is not lost.
   */
  constructor(worker, config) {
    this._isLoaded = false;
    // Serializes process() calls against the worker.
    this.inferenceQueue = Promise.resolve();
    // Chunks buffered while silent, handed back on speech onset.
    this.preSpeechBuffer = [];
    this.wasSpeaking = false;
    this.worker = worker;
    const sampleRate = config.sampleRate ?? 16000;
    this.config = {
      modelUrl: config.modelUrl,
      backend: config.backend ?? "wasm",
      sampleRate,
      threshold: config.threshold ?? 0.5,
      preSpeechBufferChunks: config.preSpeechBufferChunks ?? 10
    };
    const is16k = sampleRate === 16000;
    this.chunkSize = is16k ? 512 : 256;
    this.contextSize = is16k ? 64 : 32;
    this.state = new Float32Array(2 * 1 * 128);
    this.context = new Float32Array(this.contextSize);
  }
  get isLoaded() {
    return this._isLoaded;
  }
  get backend() {
    return this._isLoaded ? "wasm" : null;
  }
  get sampleRate() {
    return this.config.sampleRate;
  }
  get threshold() {
    return this.config.threshold;
  }
  /** Required chunk length in samples (512 @ 16 kHz, 256 otherwise). */
  getChunkSize() {
    return this.chunkSize;
  }
  /** Duration of one chunk in milliseconds. */
  getChunkDurationMs() {
    return this.chunkSize / this.config.sampleRate * 1e3;
  }
  /** Load the VAD model in the shared worker. */
  async load() {
    const telemetry = getTelemetry();
    const span = telemetry?.startSpan("SileroVADUnifiedAdapter.load", {
      "model.url": this.config.modelUrl
    });
    try {
      const loadResult = await this.worker.loadVAD({
        modelUrl: this.config.modelUrl,
        sampleRate: this.config.sampleRate
      });
      this._isLoaded = true;
      logger6.info("SileroVAD loaded via unified worker", {
        backend: "wasm",
        loadTimeMs: Math.round(loadResult.loadTimeMs),
        sampleRate: this.config.sampleRate,
        chunkSize: this.chunkSize
      });
      span?.setAttributes({ "model.backend": "wasm", "model.load_time_ms": loadResult.loadTimeMs });
      span?.end();
      telemetry?.recordHistogram("omote.model.load_time", loadResult.loadTimeMs, {
        model: "silero-vad-unified",
        backend: "wasm"
      });
      return loadResult;
    } catch (err) {
      span?.endWithError(err instanceof Error ? err : new Error(String(err)));
      throw err;
    }
  }
  /**
   * Run one VAD step on a chunk of exactly getChunkSize() samples.
   * Resolves with { probability, isSpeech, inferenceTimeMs, preSpeechChunks },
   * where preSpeechChunks is defined only on a speech onset.
   */
  async process(audioChunk) {
    if (!this._isLoaded) throw new Error("Model not loaded. Call load() first.");
    if (audioChunk.length !== this.chunkSize) {
      throw new Error(
        `Audio chunk must be exactly ${this.chunkSize} samples (got ${audioChunk.length}). Use getChunkSize() to get required size.`
      );
    }
    const frame = new Float32Array(audioChunk);
    return new Promise((resolve, reject) => {
      this.inferenceQueue = this.inferenceQueue.then(async () => {
        try {
          const t0 = performance.now();
          const outcome = await this.worker.processVAD(frame, this.state, this.context);
          this.state = outcome.state;
          // The tail of this frame becomes the context for the next one.
          this.context = frame.slice(-this.contextSize);
          const inferenceTimeMs = performance.now() - t0;
          const isSpeech = outcome.probability > this.config.threshold;
          let preSpeechChunks;
          if (!this.wasSpeaking) {
            if (isSpeech) {
              // Speech onset: hand buffered silence back to the caller.
              preSpeechChunks = this.preSpeechBuffer.slice();
              this.preSpeechBuffer = [];
            } else {
              // Still silent: keep a bounded rolling buffer of chunks.
              this.preSpeechBuffer.push(new Float32Array(frame));
              if (this.preSpeechBuffer.length > this.config.preSpeechBufferChunks) {
                this.preSpeechBuffer.shift();
              }
            }
          } else if (!isSpeech) {
            // Speech just ended: discard any stale buffered audio.
            this.preSpeechBuffer = [];
          }
          this.wasSpeaking = isSpeech;
          resolve({
            probability: outcome.probability,
            isSpeech,
            inferenceTimeMs,
            preSpeechChunks
          });
        } catch (err) {
          reject(err);
        }
      });
    });
  }
  /** Fetch a fresh recurrent state from the worker and clear local buffers. */
  async reset() {
    if (!this._isLoaded) throw new Error("Model not loaded. Call load() first.");
    this.state = await this.worker.resetVAD();
    this.context = new Float32Array(this.contextSize);
    this.preSpeechBuffer = [];
    this.wasSpeaking = false;
  }
  /** Release the worker-side session and reset all local state. */
  async dispose() {
    if (this._isLoaded) {
      await this.worker.disposeVAD();
      this._isLoaded = false;
    }
    this.state = new Float32Array(2 * 1 * 128);
    this.context = new Float32Array(this.contextSize);
    this.preSpeechBuffer = [];
    this.wasSpeaking = false;
  }
};
6423
+
6424
+ // src/inference/createSenseVoice.ts
6425
// Logger is created once at module scope and shared by every factory call.
var logger7 = createLogger("createSenseVoice");
/**
 * Factory for a SenseVoice ASR instance.
 *
 * Selection order:
 *   1. `config.unifiedWorker` set  -> adapter on the shared unified worker.
 *   2. `config.useWorker === true`  -> dedicated SenseVoiceWorker (throws if
 *      Web Workers are unavailable).
 *   3. `config.useWorker === false` -> main-thread SenseVoiceInference.
 *   4. "auto" (default): worker when supported and not iOS (iOS shares one
 *      ORT instance on the main thread), otherwise main thread.
 */
function createSenseVoice(config) {
  // All constructors consume the same option subset; build it once (DRY —
  // the original repeated this literal for every branch).
  const baseConfig = {
    modelUrl: config.modelUrl,
    tokensUrl: config.tokensUrl,
    language: config.language,
    textNorm: config.textNorm
  };
  if (config.unifiedWorker) {
    logger7.info("Creating SenseVoiceUnifiedAdapter (shared unified worker)");
    return new SenseVoiceUnifiedAdapter(config.unifiedWorker, baseConfig);
  }
  const useWorker = config.useWorker ?? "auto";
  if (useWorker === true) {
    if (!SenseVoiceWorker.isSupported()) {
      throw new Error("Web Workers are not supported in this environment");
    }
    logger7.info("Creating SenseVoiceWorker (off-main-thread)");
    return new SenseVoiceWorker(baseConfig);
  }
  if (useWorker === false) {
    logger7.info("Creating SenseVoiceInference (main thread)");
    return new SenseVoiceInference(baseConfig);
  }
  if (SenseVoiceWorker.isSupported() && !isIOS()) {
    logger7.info("Auto-detected: creating SenseVoiceWorker (off-main-thread)");
    return new SenseVoiceWorker(baseConfig);
  }
  logger7.info("Auto-detected: creating SenseVoiceInference (main thread)", {
    reason: isIOS() ? "iOS (shared ORT instance)" : "Worker unsupported"
  });
  return new SenseVoiceInference(baseConfig);
}
6477
+
6478
+ // src/inference/Wav2ArkitCpuInference.ts
6479
+ var logger8 = createLogger("Wav2ArkitCpu");
6480
+ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
6481
+ constructor(config) {
6482
+ this.modelId = "wav2arkit_cpu";
6483
+ this.session = null;
6484
+ this.ort = null;
6485
+ this._backend = "wasm";
6486
+ this.isLoading = false;
6487
+ // Inference queue for handling concurrent calls
6488
+ this.inferenceQueue = Promise.resolve();
6489
+ // Session health: set to true if session.run() times out.
6490
+ // A timed-out session may have a zombie WASM dispatch still running,
6491
+ // so all future infer() calls reject immediately to prevent concurrent access.
6492
+ this.poisoned = false;
6493
+ this.config = config;
6494
+ }
6495
+ get backend() {
6496
+ return this.session ? this._backend : null;
6497
+ }
6498
+ get isLoaded() {
6499
+ return this.session !== null;
6500
+ }
6501
+ /**
6502
+ * Load the ONNX model
6503
+ */
6504
+ async load() {
6505
+ if (this.isLoading) {
6506
+ throw new Error("Model is already loading");
6507
+ }
6508
+ if (this.session) {
6509
+ throw new Error("Model already loaded. Call dispose() first.");
6510
+ }
6511
+ this.isLoading = true;
6512
+ const startTime = performance.now();
6513
+ const telemetry = getTelemetry();
6514
+ const span = telemetry?.startSpan("Wav2ArkitCpu.load", {
6515
+ "model.url": this.config.modelUrl,
6516
+ "model.backend_requested": this.config.backend || "wasm"
6517
+ });
6518
+ try {
6519
+ const preference = this.config.backend || "wasm";
6520
+ logger8.info("Loading ONNX Runtime...", { preference });
6521
+ const { ort, backend } = await getOnnxRuntimeForPreference(preference);
6522
+ this.ort = ort;
6523
+ this._backend = backend;
6524
+ logger8.info("ONNX Runtime loaded", { backend: this._backend });
6525
+ const modelUrl = this.config.modelUrl;
6526
+ const dataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${modelUrl}.data` : null;
6527
+ const sessionOptions = getSessionOptions(this._backend);
6528
+ if (isIOS()) {
6529
+ logger8.info("iOS: passing model URLs directly to ORT (low-memory path)", {
6530
+ modelUrl,
6531
+ dataUrl
6532
+ });
6533
+ if (dataUrl) {
6534
+ const dataFilename = dataUrl.split("/").pop();
6535
+ sessionOptions.externalData = [{
6536
+ path: dataFilename,
6537
+ data: dataUrl
6538
+ // URL string — ORT fetches directly into WASM
6539
+ }];
6540
+ }
6541
+ this.session = await this.ort.InferenceSession.create(modelUrl, sessionOptions);
6542
+ } else {
6543
+ const cache = getModelCache();
6544
+ const isCached = await cache.has(modelUrl);
6545
+ let modelBuffer;
6546
+ if (isCached) {
6547
+ logger8.debug("Loading model from cache", { modelUrl });
6548
+ modelBuffer = await cache.get(modelUrl);
6549
+ if (!modelBuffer) {
6550
+ logger8.warn("Cache corruption detected, clearing and retrying", { modelUrl });
6551
+ await cache.delete(modelUrl);
6552
+ modelBuffer = await fetchWithCache(modelUrl);
6553
+ }
6554
+ } else {
6555
+ logger8.debug("Fetching and caching model graph", { modelUrl });
6556
+ modelBuffer = await fetchWithCache(modelUrl);
6557
+ }
6558
+ if (!modelBuffer) {
6559
+ throw new Error(`Failed to load model: ${modelUrl}`);
6560
+ }
6561
+ let externalDataBuffer = null;
6562
+ if (dataUrl) {
6563
+ try {
6564
+ const isDataCached = await cache.has(dataUrl);
6565
+ if (isDataCached) {
6566
+ logger8.debug("Loading external data from cache", { dataUrl });
6567
+ externalDataBuffer = await cache.get(dataUrl);
6568
+ if (!externalDataBuffer) {
6569
+ logger8.warn("Cache corruption for external data, retrying", { dataUrl });
6570
+ await cache.delete(dataUrl);
6571
+ externalDataBuffer = await fetchWithCache(dataUrl);
6572
+ }
6573
+ } else {
6574
+ logger8.info("Fetching external model data", {
6575
+ dataUrl,
6576
+ note: "This may be a large download (400MB+)"
6577
+ });
6578
+ externalDataBuffer = await fetchWithCache(dataUrl);
6579
+ }
6580
+ logger8.info("External data loaded", {
6581
+ size: formatBytes(externalDataBuffer.byteLength)
6582
+ });
6583
+ } catch (err) {
6584
+ logger8.debug("No external data file found (single-file model)", {
6585
+ dataUrl,
6586
+ error: err.message
6587
+ });
6588
+ }
6589
+ }
6590
+ logger8.debug("Creating ONNX session", {
6591
+ graphSize: formatBytes(modelBuffer.byteLength),
6592
+ externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
6593
+ backend: this._backend
6594
+ });
6595
+ if (externalDataBuffer) {
6596
+ const dataFilename = dataUrl.split("/").pop();
6597
+ sessionOptions.externalData = [{
6598
+ path: dataFilename,
6599
+ data: new Uint8Array(externalDataBuffer)
6600
+ }];
6601
+ }
6602
+ const modelData = new Uint8Array(modelBuffer);
6603
+ this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
6604
+ }
6605
+ const loadTimeMs = performance.now() - startTime;
6606
+ logger8.info("Model loaded successfully", {
6607
+ backend: this._backend,
6608
+ loadTimeMs: Math.round(loadTimeMs),
6609
+ inputs: this.session.inputNames,
6610
+ outputs: this.session.outputNames
6611
+ });
6612
+ span?.setAttributes({
6613
+ "model.backend": this._backend,
6614
+ "model.load_time_ms": loadTimeMs,
6615
+ "model.cached": !isIOS()
6616
+ });
6617
+ span?.end();
6618
+ telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
6619
+ model: "wav2arkit_cpu",
6620
+ backend: this._backend
6621
+ });
6622
+ logger8.debug("Running warmup inference");
6623
+ const warmupStart = performance.now();
6624
+ const silentAudio = new Float32Array(16e3);
6625
+ await this.infer(silentAudio);
6626
+ const warmupTimeMs = performance.now() - warmupStart;
6627
+ logger8.info("Warmup inference complete", {
6628
+ warmupTimeMs: Math.round(warmupTimeMs),
6629
+ backend: this._backend
6630
+ });
6631
+ telemetry?.recordHistogram("omote.model.warmup_time", warmupTimeMs, {
6632
+ model: "wav2arkit_cpu",
6633
+ backend: this._backend
6634
+ });
6635
+ return {
6636
+ backend: this._backend,
6637
+ loadTimeMs,
6638
+ inputNames: [...this.session.inputNames],
6639
+ outputNames: [...this.session.outputNames]
6640
+ };
6641
+ } catch (error) {
6642
+ span?.endWithError(error instanceof Error ? error : new Error(String(error)));
6643
+ telemetry?.incrementCounter("omote.errors.total", 1, {
6644
+ model: "wav2arkit_cpu",
6645
+ error_type: "load_failed"
6646
+ });
6647
+ throw error;
6648
+ } finally {
6649
+ this.isLoading = false;
6650
+ }
6651
+ }
6652
+ /**
6653
+ * Run inference on raw audio
6654
+ *
6655
+ * Accepts variable-length audio (not fixed to 16000 samples).
6656
+ * Output frames = ceil(30 * numSamples / 16000).
6657
+ *
6658
+ * @param audioSamples - Float32Array of raw audio at 16kHz
6659
+ * @param _identityIndex - Ignored (identity 11 is baked into the model)
6660
+ */
6661
+ async infer(audioSamples, _identityIndex) {
6662
+ if (!this.session) {
6663
+ throw new Error("Model not loaded. Call load() first.");
6664
+ }
6665
+ if (this.poisoned) {
6666
+ throw new Error("Wav2ArkitCpu session timed out \u2014 inference unavailable until page reload");
6667
+ }
6668
+ const audioCopy = new Float32Array(audioSamples);
6669
+ const feeds = {
6670
+ "audio_waveform": new this.ort.Tensor("float32", audioCopy, [1, audioCopy.length])
6671
+ };
6672
+ return this.queueInference(feeds, audioCopy.length);
6673
+ }
6674
+ /**
6675
+ * Queue inference to serialize ONNX session calls
6676
+ */
6677
+ queueInference(feeds, inputSamples) {
6678
+ return new Promise((resolve, reject) => {
6679
+ this.inferenceQueue = this.inferenceQueue.then(async () => {
6680
+ const telemetry = getTelemetry();
6681
+ const span = telemetry?.startSpan("Wav2ArkitCpu.infer", {
6682
+ "inference.backend": this._backend,
6683
+ "inference.input_samples": inputSamples
6684
+ });
6685
+ try {
6686
+ const startTime = performance.now();
6687
+ let timeoutId;
6688
+ const results = await Promise.race([
6689
+ this.session.run(feeds).then((r) => {
6690
+ clearTimeout(timeoutId);
6691
+ return r;
6692
+ }),
6693
+ new Promise((_, rej) => {
6694
+ timeoutId = setTimeout(
6695
+ () => rej(new Error(`Wav2ArkitCpu inference timed out after ${_Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS}ms`)),
6696
+ _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS
6697
+ );
6698
+ })
6699
+ ]);
6700
+ const inferenceTimeMs = performance.now() - startTime;
6701
+ const blendshapeOutput = results["blendshapes"];
6702
+ if (!blendshapeOutput) {
6703
+ throw new Error("Missing blendshapes output from model");
6704
+ }
6705
+ const blendshapeData = blendshapeOutput.data;
6706
+ const numFrames = blendshapeOutput.dims[1];
6707
+ const numBlendshapes = blendshapeOutput.dims[2];
6708
+ const blendshapes = [];
6709
+ for (let f = 0; f < numFrames; f++) {
6710
+ const rawFrame = blendshapeData.slice(f * numBlendshapes, (f + 1) * numBlendshapes);
6711
+ const symmetrized = symmetrizeBlendshapes(rawFrame);
6712
+ blendshapes.push(symmetrized);
6713
+ }
6714
+ logger8.trace("Inference completed", {
6715
+ inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
6716
+ numFrames,
6717
+ inputSamples
6718
+ });
6719
+ span?.setAttributes({
6720
+ "inference.duration_ms": inferenceTimeMs,
6721
+ "inference.frames": numFrames
6722
+ });
6723
+ span?.end();
6724
+ telemetry?.recordHistogram("omote.inference.latency", inferenceTimeMs, {
6725
+ model: "wav2arkit_cpu",
6726
+ backend: this._backend
6727
+ });
6728
+ telemetry?.incrementCounter("omote.inference.total", 1, {
6729
+ model: "wav2arkit_cpu",
6730
+ backend: this._backend,
6731
+ status: "success"
6732
+ });
6733
+ resolve({
6734
+ blendshapes,
6735
+ numFrames,
6736
+ inferenceTimeMs
6737
+ });
6738
+ } catch (err) {
6739
+ const errMsg = err instanceof Error ? err.message : String(err);
6740
+ if (errMsg.includes("timed out")) {
6741
+ this.poisoned = true;
6742
+ logger8.error("CRITICAL: Inference session timed out \u2014 Wav2ArkitCpu is dead. Page reload required.", {
6743
+ backend: this._backend,
6744
+ timeoutMs: _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS
6745
+ });
6746
+ } else if (typeof err === "number") {
6747
+ const oomError = new Error(
6748
+ `Wav2ArkitCpu inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
6749
+ );
6750
+ logger8.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
6751
+ pointer: `0x${err.toString(16)}`,
6752
+ backend: this._backend
6753
+ });
6754
+ span?.endWithError(oomError);
6755
+ telemetry?.incrementCounter("omote.inference.total", 1, {
6756
+ model: "wav2arkit_cpu",
6757
+ backend: this._backend,
6758
+ status: "error"
6759
+ });
6760
+ reject(oomError);
6761
+ return;
6762
+ } else {
6763
+ logger8.error("Inference failed", { error: errMsg, backend: this._backend });
6764
+ }
6765
+ span?.endWithError(err instanceof Error ? err : new Error(String(err)));
6766
+ telemetry?.incrementCounter("omote.inference.total", 1, {
6767
+ model: "wav2arkit_cpu",
6768
+ backend: this._backend,
6769
+ status: "error"
6770
+ });
6771
+ reject(err);
6772
+ }
6773
+ });
6774
+ });
6775
+ }
6776
+ /**
6777
+ * Dispose of the model and free resources
6778
+ */
6779
+ async dispose() {
6780
+ if (this.session) {
6781
+ await this.session.release();
6782
+ this.session = null;
6783
+ }
6784
+ }
6785
+ };
6786
+ _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS = 5e3;
6787
+ var Wav2ArkitCpuInference = _Wav2ArkitCpuInference;
6788
+
6789
+ // src/inference/Wav2ArkitCpuWorker.ts
6790
+ var logger9 = createLogger("Wav2ArkitCpuWorker");
6791
+ var WASM_CDN_PATH4 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
6792
+ var LOAD_TIMEOUT_MS2 = 6e4;
6793
+ var INFERENCE_TIMEOUT_MS2 = 5e3;
6794
+ function resolveUrl3(url) {
6795
+ if (/^https?:\/\//i.test(url) || /^blob:/i.test(url)) return url;
6796
+ try {
6797
+ return new URL(url, globalThis.location?.origin ?? "https://localhost").href;
6798
+ } catch {
6799
+ return url;
6800
+ }
6801
+ }
6802
+ var WORKER_SCRIPT3 = `
6803
+ // Wav2ArkitCpu Worker Script
6804
+ // Loaded via Blob URL - no separate file needed
6805
+
6806
+ var ort = null;
6807
+ var session = null;
6808
+
6809
+ // Precomputed symmetric index pairs from LAM_BLENDSHAPES alphabetical ordering
6810
+ // Used to average left/right blendshape pairs for symmetrized output
6811
+ const SYMMETRIC_INDEX_PAIRS = [
6812
+ [23, 25], // jawLeft, jawRight
6813
+ [32, 38], // mouthLeft, mouthRight
6814
+ [43, 44], // mouthSmileLeft, mouthSmileRight
6815
+ [29, 30], // mouthFrownLeft, mouthFrownRight
6816
+ [27, 28], // mouthDimpleLeft, mouthDimpleRight
6817
+ [45, 46], // mouthStretchLeft, mouthStretchRight
6818
+ [35, 36], // mouthPressLeft, mouthPressRight
6819
+ [47, 48], // mouthUpperUpLeft, mouthUpperUpRight
6820
+ [33, 34], // mouthLowerDownLeft, mouthLowerDownRight
6821
+ [49, 50], // noseSneerLeft, noseSneerRight
6822
+ [6, 7], // cheekSquintLeft, cheekSquintRight
6823
+ [0, 1], // browDownLeft, browDownRight
6824
+ [3, 4], // browOuterUpLeft, browOuterUpRight
6825
+ [8, 9], // eyeBlinkLeft, eyeBlinkRight
6826
+ [16, 17], // eyeLookUpLeft, eyeLookUpRight
6827
+ [10, 11], // eyeLookDownLeft, eyeLookDownRight
6828
+ [12, 13], // eyeLookInLeft, eyeLookInRight
6829
+ [14, 15], // eyeLookOutLeft, eyeLookOutRight
6830
+ [18, 19], // eyeSquintLeft, eyeSquintRight
6831
+ [20, 21], // eyeWideLeft, eyeWideRight
6832
+ ];
6833
+
6834
+ /**
6835
+ * Symmetrize blendshapes by averaging left/right pairs
6836
+ * Inlined from blendshapeUtils.ts for worker context
6837
+ */
6838
+ function symmetrizeBlendshapes(frame) {
6839
+ const result = new Float32Array(frame);
6840
+ for (const [lIdx, rIdx] of SYMMETRIC_INDEX_PAIRS) {
6841
+ const avg = (frame[lIdx] + frame[rIdx]) / 2;
6842
+ result[lIdx] = avg;
6843
+ result[rIdx] = avg;
6844
+ }
6845
+ return result;
6846
+ }
6847
+
6848
+ /**
6849
+ * Load ONNX Runtime from CDN
6850
+ */
6851
+ async function loadOrt(wasmPaths) {
6852
+ if (ort) return;
6853
+
6854
+ // Import ONNX Runtime from CDN
6855
+ const ortUrl = wasmPaths + 'ort.wasm.min.js';
6856
+
6857
+ // Load the script by fetching and executing it
6858
+ const response = await fetch(ortUrl);
6859
+ const scriptText = await response.text();
6860
+
6861
+ // Create a blob URL for the script
6862
+ const blob = new Blob([scriptText], { type: 'application/javascript' });
6863
+ const blobUrl = URL.createObjectURL(blob);
6864
+
6865
+ // Import the module
6866
+ importScripts(blobUrl);
6867
+ URL.revokeObjectURL(blobUrl);
6868
+
6869
+ // ort is now available as global
6870
+ ort = self.ort;
6871
+
6872
+ // Configure WASM settings
6873
+ ort.env.wasm.wasmPaths = wasmPaths;
6874
+ ort.env.wasm.numThreads = 1; // Single thread in worker
6875
+ ort.env.wasm.simd = true;
6876
+ ort.env.wasm.proxy = false; // No proxy in worker
6877
+ }
6878
+
6879
+ /**
6880
+ * Load the wav2arkit_cpu model
6881
+ */
6882
+ async function loadModel(modelUrl, externalDataUrl, isIOS) {
6883
+ const sessionOptions = {
6884
+ executionProviders: ['wasm'],
6885
+ graphOptimizationLevel: 'all',
6886
+ };
6887
+
6888
+ const dataFilename = externalDataUrl ? externalDataUrl.split('/').pop() : null;
6889
+
6890
+ if (isIOS) {
6891
+ // iOS: Pass URLs directly to ORT to avoid loading 402MB into JS heap.
6892
+ // ORT fetches externally into WASM memory, cutting peak JS memory from
6893
+ // ~800MB to ~2MB (just the graph).
6894
+ if (externalDataUrl && dataFilename) {
6895
+ sessionOptions.externalData = [{ path: dataFilename, data: externalDataUrl }];
6896
+ }
6897
+ session = await ort.InferenceSession.create(modelUrl, sessionOptions);
6898
+ } else {
6899
+ // Desktop: fetch model graph as ArrayBuffer
6900
+ const graphResponse = await fetch(modelUrl);
6901
+ if (!graphResponse.ok) {
6902
+ throw new Error('Failed to fetch model graph: ' + graphResponse.status + ' ' + graphResponse.statusText);
6903
+ }
6904
+ const graphBuffer = await graphResponse.arrayBuffer();
6905
+
6906
+ // Fetch external data file if present
6907
+ if (externalDataUrl && dataFilename) {
6908
+ const dataResponse = await fetch(externalDataUrl);
6909
+ if (!dataResponse.ok) {
6910
+ throw new Error('Failed to fetch external data: ' + dataResponse.status + ' ' + dataResponse.statusText);
6911
+ }
6912
+ const dataBuffer = await dataResponse.arrayBuffer();
6913
+ sessionOptions.externalData = [{ path: dataFilename, data: new Uint8Array(dataBuffer) }];
6914
+ }
6915
+
6916
+ session = await ort.InferenceSession.create(new Uint8Array(graphBuffer), sessionOptions);
6917
+ }
6918
+
6919
+ // Warmup inference with 16000 silent samples
6920
+ const warmupAudio = new Float32Array(16000);
6921
+ const warmupTensor = new ort.Tensor('float32', warmupAudio, [1, warmupAudio.length]);
6922
+ await session.run({ audio_waveform: warmupTensor });
6923
+
6924
+ return {
6925
+ inputNames: session.inputNames.slice(),
6926
+ outputNames: session.outputNames.slice(),
6927
+ };
6928
+ }
6929
+
6930
+ /**
6931
+ * Run lip sync inference
6932
+ */
6933
+ async function runInference(audio) {
6934
+ const tensor = new ort.Tensor('float32', audio, [1, audio.length]);
6935
+ const results = await session.run({ audio_waveform: tensor });
6936
+
6937
+ const blendshapeOutput = results['blendshapes'];
6938
+ if (!blendshapeOutput) {
6939
+ throw new Error('Missing blendshapes output from model');
6940
+ }
6941
+
6942
+ const blendshapeData = blendshapeOutput.data;
6943
+ const numFrames = blendshapeOutput.dims[1];
6944
+ const numBlendshapes = blendshapeOutput.dims[2];
6945
+
6946
+ // Symmetrize each frame and flatten into a single Float32Array for transfer
6947
+ const flatBuffer = new Float32Array(numFrames * numBlendshapes);
6948
+ for (let f = 0; f < numFrames; f++) {
6949
+ const offset = f * numBlendshapes;
6950
+ const rawFrame = blendshapeData.slice(offset, offset + numBlendshapes);
6951
+ const symmetrized = symmetrizeBlendshapes(rawFrame);
6952
+ flatBuffer.set(symmetrized, offset);
6953
+ }
6954
+
6955
+ return { flatBuffer, numFrames, numBlendshapes };
6956
+ }
6957
+
6958
+ // Message handler
6959
+ self.onmessage = async function(e) {
6960
+ const msg = e.data;
6961
+
6962
+ try {
6963
+ switch (msg.type) {
6964
+ case 'load': {
6965
+ const startTime = performance.now();
6966
+ await loadOrt(msg.wasmPaths);
6967
+ const { inputNames, outputNames } = await loadModel(msg.modelUrl, msg.externalDataUrl, msg.isIOS);
6968
+ const loadTimeMs = performance.now() - startTime;
6969
+
6970
+ self.postMessage({
6971
+ type: 'loaded',
6972
+ inputNames,
6973
+ outputNames,
6974
+ loadTimeMs,
6975
+ });
6976
+ break;
6977
+ }
6978
+
6979
+ case 'infer': {
6980
+ const startTime = performance.now();
6981
+ const { flatBuffer, numFrames, numBlendshapes } = await runInference(msg.audio);
6982
+ const inferenceTimeMs = performance.now() - startTime;
6983
+
6984
+ self.postMessage({
6985
+ type: 'result',
6986
+ blendshapes: flatBuffer,
6987
+ numFrames,
6988
+ numBlendshapes,
6989
+ inferenceTimeMs,
6990
+ }, [flatBuffer.buffer]);
6991
+ break;
6992
+ }
6993
+
6994
+ case 'dispose': {
6995
+ if (session) {
6996
+ await session.release();
6997
+ session = null;
6998
+ }
6999
+ ort = null;
7000
+ self.postMessage({ type: 'disposed' });
7001
+ break;
7002
+ }
7003
+
7004
+ default:
7005
+ self.postMessage({
7006
+ type: 'error',
7007
+ error: 'Unknown message type: ' + msg.type,
7008
+ });
7009
+ }
7010
+ } catch (err) {
7011
+ let errorMessage;
7012
+ if (typeof err === 'number') {
7013
+ // ORT WASM throws raw C++ exception pointers as bare numbers
7014
+ errorMessage = 'ORT WASM C++ exception pointer (0x' + err.toString(16) + ') \u2014 likely OOM';
7015
+ } else {
7016
+ errorMessage = err.message || String(err);
7017
+ }
7018
+ self.postMessage({
7019
+ type: 'error',
7020
+ error: errorMessage,
7021
+ });
7022
+ }
7023
+ };
7024
+
7025
+ // Error handler
7026
+ self.onerror = function(err) {
7027
+ self.postMessage({
7028
+ type: 'error',
7029
+ error: 'Worker error: ' + (err.message || String(err)),
7030
+ });
7031
+ };
7032
+ `;
7033
+ var Wav2ArkitCpuWorker = class {
7034
+ constructor(config) {
7035
+ this.modelId = "wav2arkit_cpu";
7036
+ this.worker = null;
7037
+ this.isLoading = false;
7038
+ this._isLoaded = false;
7039
+ // Inference queue for serialization
7040
+ this.inferenceQueue = Promise.resolve();
7041
+ // Session health: set to true if worker inference times out.
7042
+ // A timed-out worker may have a zombie WASM dispatch still running,
7043
+ // so all future infer() calls reject immediately to prevent concurrent access.
7044
+ this.poisoned = false;
7045
+ // Pending message handlers
7046
+ this.pendingResolvers = /* @__PURE__ */ new Map();
7047
+ this.config = config;
7048
+ }
7049
+ get isLoaded() {
7050
+ return this._isLoaded;
7051
+ }
7052
+ /**
7053
+ * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
7054
+ */
7055
+ get backend() {
7056
+ return this._isLoaded ? "wasm" : null;
7057
+ }
7058
+ /**
7059
+ * Create the worker from inline script
7060
+ */
7061
+ createWorker() {
7062
+ const blob = new Blob([WORKER_SCRIPT3], { type: "application/javascript" });
7063
+ const blobUrl = URL.createObjectURL(blob);
7064
+ const worker = new Worker(blobUrl);
7065
+ URL.revokeObjectURL(blobUrl);
7066
+ worker.onmessage = (event) => {
7067
+ this.handleWorkerMessage(event.data);
7068
+ };
7069
+ worker.onerror = (error) => {
7070
+ logger9.error("Worker error", { error: error.message });
7071
+ for (const [, resolver] of this.pendingResolvers) {
7072
+ resolver.reject(new Error(`Worker error: ${error.message}`));
7073
+ }
7074
+ this.pendingResolvers.clear();
7075
+ };
7076
+ return worker;
7077
+ }
7078
+ /**
7079
+ * Handle messages from worker
7080
+ */
7081
+ handleWorkerMessage(result) {
7082
+ const resolver = this.pendingResolvers.get(result.type);
7083
+ if (resolver) {
7084
+ this.pendingResolvers.delete(result.type);
7085
+ if (result.type === "error") {
7086
+ resolver.reject(new Error(result.error));
7087
+ } else {
7088
+ resolver.resolve(result);
7089
+ }
7090
+ }
7091
+ }
7092
+ /**
7093
+ * Send message to worker and wait for response
7094
+ */
7095
+ sendMessage(message, expectedType, timeoutMs) {
7096
+ return new Promise((resolve, reject) => {
7097
+ if (!this.worker) {
7098
+ reject(new Error("Worker not initialized"));
7099
+ return;
7100
+ }
7101
+ const timeoutId = setTimeout(() => {
7102
+ this.pendingResolvers.delete(expectedType);
7103
+ reject(new Error(`Worker operation timed out after ${timeoutMs}ms`));
7104
+ }, timeoutMs);
7105
+ this.pendingResolvers.set(expectedType, {
7106
+ resolve: (value) => {
7107
+ clearTimeout(timeoutId);
7108
+ resolve(value);
7109
+ },
7110
+ reject: (error) => {
7111
+ clearTimeout(timeoutId);
7112
+ reject(error);
7113
+ }
7114
+ });
7115
+ this.pendingResolvers.set("error", {
7116
+ resolve: () => {
7117
+ },
7118
+ // Never called for errors
7119
+ reject: (error) => {
7120
+ clearTimeout(timeoutId);
7121
+ this.pendingResolvers.delete(expectedType);
7122
+ reject(error);
7123
+ }
7124
+ });
7125
+ this.worker.postMessage(message);
7126
+ });
7127
+ }
7128
+ /**
7129
+ * Load the ONNX model in the worker
4217
7130
  */
4218
7131
  async load() {
4219
7132
  if (this.isLoading) {
4220
7133
  throw new Error("Model is already loading");
4221
7134
  }
4222
- if (this.session) {
7135
+ if (this._isLoaded) {
4223
7136
  throw new Error("Model already loaded. Call dispose() first.");
4224
7137
  }
4225
7138
  this.isLoading = true;
4226
7139
  const startTime = performance.now();
4227
7140
  const telemetry = getTelemetry();
4228
- const span = telemetry?.startSpan("Wav2ArkitCpu.load", {
7141
+ const span = telemetry?.startSpan("Wav2ArkitCpuWorker.load", {
4229
7142
  "model.url": this.config.modelUrl,
4230
- "model.backend_requested": this.config.backend || "wasm"
7143
+ "model.backend_requested": "wasm"
4231
7144
  });
4232
7145
  try {
4233
- const preference = this.config.backend || "wasm";
4234
- logger5.info("Loading ONNX Runtime...", { preference });
4235
- const { ort, backend } = await getOnnxRuntimeForPreference(preference);
4236
- this.ort = ort;
4237
- this._backend = backend;
4238
- logger5.info("ONNX Runtime loaded", { backend: this._backend });
4239
- const modelUrl = this.config.modelUrl;
4240
- const dataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${modelUrl}.data` : null;
4241
- const sessionOptions = getSessionOptions(this._backend);
4242
- if (isIOS()) {
4243
- logger5.info("iOS: passing model URLs directly to ORT (low-memory path)", {
4244
- modelUrl,
4245
- dataUrl
4246
- });
4247
- if (dataUrl) {
4248
- const dataFilename = dataUrl.split("/").pop();
4249
- sessionOptions.externalData = [{
4250
- path: dataFilename,
4251
- data: dataUrl
4252
- // URL string — ORT fetches directly into WASM
4253
- }];
4254
- }
4255
- this.session = await this.ort.InferenceSession.create(modelUrl, sessionOptions);
4256
- } else {
4257
- const cache = getModelCache();
4258
- const isCached = await cache.has(modelUrl);
4259
- let modelBuffer;
4260
- if (isCached) {
4261
- logger5.debug("Loading model from cache", { modelUrl });
4262
- modelBuffer = await cache.get(modelUrl);
4263
- if (!modelBuffer) {
4264
- logger5.warn("Cache corruption detected, clearing and retrying", { modelUrl });
4265
- await cache.delete(modelUrl);
4266
- modelBuffer = await fetchWithCache(modelUrl);
4267
- }
4268
- } else {
4269
- logger5.debug("Fetching and caching model graph", { modelUrl });
4270
- modelBuffer = await fetchWithCache(modelUrl);
4271
- }
4272
- if (!modelBuffer) {
4273
- throw new Error(`Failed to load model: ${modelUrl}`);
4274
- }
4275
- let externalDataBuffer = null;
4276
- if (dataUrl) {
4277
- try {
4278
- const isDataCached = await cache.has(dataUrl);
4279
- if (isDataCached) {
4280
- logger5.debug("Loading external data from cache", { dataUrl });
4281
- externalDataBuffer = await cache.get(dataUrl);
4282
- if (!externalDataBuffer) {
4283
- logger5.warn("Cache corruption for external data, retrying", { dataUrl });
4284
- await cache.delete(dataUrl);
4285
- externalDataBuffer = await fetchWithCache(dataUrl);
4286
- }
4287
- } else {
4288
- logger5.info("Fetching external model data", {
4289
- dataUrl,
4290
- note: "This may be a large download (400MB+)"
4291
- });
4292
- externalDataBuffer = await fetchWithCache(dataUrl);
4293
- }
4294
- logger5.info("External data loaded", {
4295
- size: formatBytes(externalDataBuffer.byteLength)
4296
- });
4297
- } catch (err) {
4298
- logger5.debug("No external data file found (single-file model)", {
4299
- dataUrl,
4300
- error: err.message
4301
- });
4302
- }
4303
- }
4304
- logger5.debug("Creating ONNX session", {
4305
- graphSize: formatBytes(modelBuffer.byteLength),
4306
- externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
4307
- backend: this._backend
4308
- });
4309
- if (externalDataBuffer) {
4310
- const dataFilename = dataUrl.split("/").pop();
4311
- sessionOptions.externalData = [{
4312
- path: dataFilename,
4313
- data: new Uint8Array(externalDataBuffer)
4314
- }];
4315
- }
4316
- const modelData = new Uint8Array(modelBuffer);
4317
- this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
4318
- }
7146
+ logger9.info("Creating wav2arkit_cpu worker...");
7147
+ this.worker = this.createWorker();
7148
+ const externalDataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${this.config.modelUrl}.data` : null;
7149
+ logger9.info("Loading model in worker...", {
7150
+ modelUrl: this.config.modelUrl,
7151
+ externalDataUrl,
7152
+ isIOS: isIOS()
7153
+ });
7154
+ const result = await this.sendMessage(
7155
+ {
7156
+ type: "load",
7157
+ modelUrl: resolveUrl3(this.config.modelUrl),
7158
+ externalDataUrl: externalDataUrl ? resolveUrl3(externalDataUrl) : null,
7159
+ wasmPaths: WASM_CDN_PATH4,
7160
+ isIOS: isIOS()
7161
+ },
7162
+ "loaded",
7163
+ LOAD_TIMEOUT_MS2
7164
+ );
7165
+ this._isLoaded = true;
4319
7166
  const loadTimeMs = performance.now() - startTime;
4320
- logger5.info("Model loaded successfully", {
4321
- backend: this._backend,
7167
+ logger9.info("Wav2ArkitCpu worker loaded successfully", {
7168
+ backend: "wasm",
4322
7169
  loadTimeMs: Math.round(loadTimeMs),
4323
- inputs: this.session.inputNames,
4324
- outputs: this.session.outputNames
7170
+ workerLoadTimeMs: Math.round(result.loadTimeMs),
7171
+ inputs: result.inputNames,
7172
+ outputs: result.outputNames
4325
7173
  });
4326
7174
  span?.setAttributes({
4327
- "model.backend": this._backend,
7175
+ "model.backend": "wasm",
4328
7176
  "model.load_time_ms": loadTimeMs,
4329
- "model.cached": !isIOS()
7177
+ "model.worker_load_time_ms": result.loadTimeMs
4330
7178
  });
4331
7179
  span?.end();
4332
7180
  telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
4333
- model: "wav2arkit_cpu",
4334
- backend: this._backend
4335
- });
4336
- logger5.debug("Running warmup inference");
4337
- const warmupStart = performance.now();
4338
- const silentAudio = new Float32Array(16e3);
4339
- await this.infer(silentAudio);
4340
- const warmupTimeMs = performance.now() - warmupStart;
4341
- logger5.info("Warmup inference complete", {
4342
- warmupTimeMs: Math.round(warmupTimeMs),
4343
- backend: this._backend
4344
- });
4345
- telemetry?.recordHistogram("omote.model.warmup_time", warmupTimeMs, {
4346
- model: "wav2arkit_cpu",
4347
- backend: this._backend
7181
+ model: "wav2arkit_cpu-worker",
7182
+ backend: "wasm"
4348
7183
  });
4349
7184
  return {
4350
- backend: this._backend,
7185
+ backend: "wasm",
4351
7186
  loadTimeMs,
4352
- inputNames: [...this.session.inputNames],
4353
- outputNames: [...this.session.outputNames]
7187
+ inputNames: result.inputNames,
7188
+ outputNames: result.outputNames
4354
7189
  };
4355
7190
  } catch (error) {
4356
7191
  span?.endWithError(error instanceof Error ? error : new Error(String(error)));
4357
7192
  telemetry?.incrementCounter("omote.errors.total", 1, {
4358
- model: "wav2arkit_cpu",
7193
+ model: "wav2arkit_cpu-worker",
4359
7194
  error_type: "load_failed"
4360
7195
  });
7196
+ if (this.worker) {
7197
+ this.worker.terminate();
7198
+ this.worker = null;
7199
+ }
4361
7200
  throw error;
4362
7201
  } finally {
4363
7202
  this.isLoading = false;
@@ -4373,75 +7212,62 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
4373
7212
  * @param _identityIndex - Ignored (identity 11 is baked into the model)
4374
7213
  */
4375
7214
  async infer(audioSamples, _identityIndex) {
4376
- if (!this.session) {
7215
+ if (!this._isLoaded || !this.worker) {
4377
7216
  throw new Error("Model not loaded. Call load() first.");
4378
7217
  }
4379
7218
  if (this.poisoned) {
4380
- throw new Error("Wav2ArkitCpu session timed out \u2014 inference unavailable until page reload");
7219
+ throw new Error("Wav2ArkitCpu worker session timed out \u2014 inference unavailable until page reload");
4381
7220
  }
4382
7221
  const audioCopy = new Float32Array(audioSamples);
4383
- const feeds = {
4384
- "audio_waveform": new this.ort.Tensor("float32", audioCopy, [1, audioCopy.length])
4385
- };
4386
- return this.queueInference(feeds, audioCopy.length);
7222
+ return this.queueInference(audioCopy);
4387
7223
  }
4388
7224
  /**
4389
- * Queue inference to serialize ONNX session calls
7225
+ * Queue inference to serialize worker calls
4390
7226
  */
4391
- queueInference(feeds, inputSamples) {
7227
+ queueInference(audioSamples) {
4392
7228
  return new Promise((resolve, reject) => {
4393
7229
  this.inferenceQueue = this.inferenceQueue.then(async () => {
4394
7230
  const telemetry = getTelemetry();
4395
- const span = telemetry?.startSpan("Wav2ArkitCpu.infer", {
4396
- "inference.backend": this._backend,
4397
- "inference.input_samples": inputSamples
7231
+ const span = telemetry?.startSpan("Wav2ArkitCpuWorker.infer", {
7232
+ "inference.backend": "wasm",
7233
+ "inference.input_samples": audioSamples.length
4398
7234
  });
4399
7235
  try {
4400
7236
  const startTime = performance.now();
4401
- let timeoutId;
4402
- const results = await Promise.race([
4403
- this.session.run(feeds).then((r) => {
4404
- clearTimeout(timeoutId);
4405
- return r;
4406
- }),
4407
- new Promise((_, rej) => {
4408
- timeoutId = setTimeout(
4409
- () => rej(new Error(`Wav2ArkitCpu inference timed out after ${_Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS}ms`)),
4410
- _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS
4411
- );
4412
- })
4413
- ]);
7237
+ const result = await this.sendMessage(
7238
+ {
7239
+ type: "infer",
7240
+ audio: audioSamples
7241
+ },
7242
+ "result",
7243
+ INFERENCE_TIMEOUT_MS2
7244
+ );
4414
7245
  const inferenceTimeMs = performance.now() - startTime;
4415
- const blendshapeOutput = results["blendshapes"];
4416
- if (!blendshapeOutput) {
4417
- throw new Error("Missing blendshapes output from model");
4418
- }
4419
- const blendshapeData = blendshapeOutput.data;
4420
- const numFrames = blendshapeOutput.dims[1];
4421
- const numBlendshapes = blendshapeOutput.dims[2];
7246
+ const flatBuffer = result.blendshapes;
7247
+ const { numFrames, numBlendshapes } = result;
4422
7248
  const blendshapes = [];
4423
7249
  for (let f = 0; f < numFrames; f++) {
4424
- const rawFrame = blendshapeData.slice(f * numBlendshapes, (f + 1) * numBlendshapes);
4425
- const symmetrized = symmetrizeBlendshapes(rawFrame);
4426
- blendshapes.push(symmetrized);
7250
+ blendshapes.push(flatBuffer.slice(f * numBlendshapes, (f + 1) * numBlendshapes));
4427
7251
  }
4428
- logger5.trace("Inference completed", {
7252
+ logger9.trace("Worker inference completed", {
4429
7253
  inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
7254
+ workerTimeMs: Math.round(result.inferenceTimeMs * 100) / 100,
4430
7255
  numFrames,
4431
- inputSamples
7256
+ inputSamples: audioSamples.length
4432
7257
  });
4433
7258
  span?.setAttributes({
4434
7259
  "inference.duration_ms": inferenceTimeMs,
7260
+ "inference.worker_duration_ms": result.inferenceTimeMs,
4435
7261
  "inference.frames": numFrames
4436
7262
  });
4437
7263
  span?.end();
4438
7264
  telemetry?.recordHistogram("omote.inference.latency", inferenceTimeMs, {
4439
- model: "wav2arkit_cpu",
4440
- backend: this._backend
7265
+ model: "wav2arkit_cpu-worker",
7266
+ backend: "wasm"
4441
7267
  });
4442
7268
  telemetry?.incrementCounter("omote.inference.total", 1, {
4443
- model: "wav2arkit_cpu",
4444
- backend: this._backend,
7269
+ model: "wav2arkit_cpu-worker",
7270
+ backend: "wasm",
4445
7271
  status: "success"
4446
7272
  });
4447
7273
  resolve({
@@ -4453,33 +7279,17 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
4453
7279
  const errMsg = err instanceof Error ? err.message : String(err);
4454
7280
  if (errMsg.includes("timed out")) {
4455
7281
  this.poisoned = true;
4456
- logger5.error("CRITICAL: Inference session timed out \u2014 Wav2ArkitCpu is dead. Page reload required.", {
4457
- backend: this._backend,
4458
- timeoutMs: _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS
4459
- });
4460
- } else if (typeof err === "number") {
4461
- const oomError = new Error(
4462
- `Wav2ArkitCpu inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
4463
- );
4464
- logger5.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
4465
- pointer: `0x${err.toString(16)}`,
4466
- backend: this._backend
4467
- });
4468
- span?.endWithError(oomError);
4469
- telemetry?.incrementCounter("omote.inference.total", 1, {
4470
- model: "wav2arkit_cpu",
4471
- backend: this._backend,
4472
- status: "error"
7282
+ logger9.error("CRITICAL: Worker inference timed out \u2014 Wav2ArkitCpu worker is dead. Page reload required.", {
7283
+ backend: "wasm",
7284
+ timeoutMs: INFERENCE_TIMEOUT_MS2
4473
7285
  });
4474
- reject(oomError);
4475
- return;
4476
7286
  } else {
4477
- logger5.error("Inference failed", { error: errMsg, backend: this._backend });
7287
+ logger9.error("Worker inference failed", { error: errMsg, backend: "wasm" });
4478
7288
  }
4479
7289
  span?.endWithError(err instanceof Error ? err : new Error(String(err)));
4480
7290
  telemetry?.incrementCounter("omote.inference.total", 1, {
4481
- model: "wav2arkit_cpu",
4482
- backend: this._backend,
7291
+ model: "wav2arkit_cpu-worker",
7292
+ backend: "wasm",
4483
7293
  status: "error"
4484
7294
  });
4485
7295
  reject(err);
@@ -4488,39 +7298,62 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
4488
7298
  });
4489
7299
  }
4490
7300
  /**
4491
- * Dispose of the model and free resources
7301
+ * Dispose of the worker and free resources
4492
7302
  */
4493
7303
  async dispose() {
4494
- if (this.session) {
4495
- await this.session.release();
4496
- this.session = null;
7304
+ if (this.worker) {
7305
+ try {
7306
+ await this.sendMessage({ type: "dispose" }, "disposed", INFERENCE_TIMEOUT_MS2);
7307
+ } catch {
7308
+ }
7309
+ this.worker.terminate();
7310
+ this.worker = null;
4497
7311
  }
7312
+ this._isLoaded = false;
7313
+ this.poisoned = false;
7314
+ this.pendingResolvers.clear();
7315
+ }
7316
+ /**
7317
+ * Check if Web Workers are supported
7318
+ */
7319
+ static isSupported() {
7320
+ return typeof Worker !== "undefined";
4498
7321
  }
4499
7322
  };
4500
- _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS = 5e3;
4501
- var Wav2ArkitCpuInference = _Wav2ArkitCpuInference;
4502
7323
 
4503
7324
  // src/inference/createLipSync.ts
4504
- var logger6 = createLogger("createLipSync");
7325
+ var logger10 = createLogger("createLipSync");
4505
7326
  function createLipSync(config) {
4506
7327
  const mode = config.mode ?? "auto";
4507
7328
  const fallbackOnError = config.fallbackOnError ?? true;
4508
7329
  let useCpu;
4509
7330
  if (mode === "cpu") {
4510
7331
  useCpu = true;
4511
- logger6.info("Forcing CPU lip sync model (wav2arkit_cpu)");
7332
+ logger10.info("Forcing CPU lip sync model (wav2arkit_cpu)");
4512
7333
  } else if (mode === "gpu") {
4513
7334
  useCpu = false;
4514
- logger6.info("Forcing GPU lip sync model (Wav2Vec2)");
7335
+ logger10.info("Forcing GPU lip sync model (Wav2Vec2)");
4515
7336
  } else {
4516
7337
  useCpu = shouldUseCpuLipSync();
4517
- logger6.info("Auto-detected lip sync model", {
7338
+ logger10.info("Auto-detected lip sync model", {
4518
7339
  useCpu,
4519
7340
  isSafari: isSafari()
4520
7341
  });
4521
7342
  }
4522
7343
  if (useCpu) {
4523
- logger6.info("Creating Wav2ArkitCpuInference (404MB, WASM)");
7344
+ if (config.unifiedWorker) {
7345
+ logger10.info("Creating Wav2ArkitCpuUnifiedAdapter (404MB, WASM, shared unified worker)");
7346
+ return new Wav2ArkitCpuUnifiedAdapter(config.unifiedWorker, {
7347
+ modelUrl: config.cpuModelUrl
7348
+ });
7349
+ }
7350
+ if (config.useWorker && Wav2ArkitCpuWorker.isSupported() && !isIOS()) {
7351
+ logger10.info("Creating Wav2ArkitCpuWorker (404MB, WASM, off-main-thread)");
7352
+ return new Wav2ArkitCpuWorker({
7353
+ modelUrl: config.cpuModelUrl
7354
+ });
7355
+ }
7356
+ logger10.info("Creating Wav2ArkitCpuInference (404MB, WASM)");
4524
7357
  return new Wav2ArkitCpuInference({
4525
7358
  modelUrl: config.cpuModelUrl
4526
7359
  });
@@ -4532,10 +7365,10 @@ function createLipSync(config) {
4532
7365
  numIdentityClasses: config.numIdentityClasses
4533
7366
  });
4534
7367
  if (fallbackOnError) {
4535
- logger6.info("Creating Wav2Vec2Inference with CPU fallback");
7368
+ logger10.info("Creating Wav2Vec2Inference with CPU fallback");
4536
7369
  return new LipSyncWithFallback(gpuInstance, config);
4537
7370
  }
4538
- logger6.info("Creating Wav2Vec2Inference (no fallback)");
7371
+ logger10.info("Creating Wav2Vec2Inference (no fallback)");
4539
7372
  return gpuInstance;
4540
7373
  }
4541
7374
  var LipSyncWithFallback = class {
@@ -4561,16 +7394,28 @@ var LipSyncWithFallback = class {
4561
7394
  }
4562
7395
  }
4563
7396
  async fallbackToCpu(reason) {
4564
- logger6.warn("GPU model load failed, falling back to CPU model", { reason });
7397
+ logger10.warn("GPU model load failed, falling back to CPU model", { reason });
4565
7398
  try {
4566
7399
  await this.implementation.dispose();
4567
7400
  } catch {
4568
7401
  }
4569
- this.implementation = new Wav2ArkitCpuInference({
4570
- modelUrl: this.config.cpuModelUrl
4571
- });
7402
+ if (this.config.unifiedWorker) {
7403
+ this.implementation = new Wav2ArkitCpuUnifiedAdapter(this.config.unifiedWorker, {
7404
+ modelUrl: this.config.cpuModelUrl
7405
+ });
7406
+ logger10.info("Fallback to Wav2ArkitCpuUnifiedAdapter successful");
7407
+ } else if (this.config.useWorker && Wav2ArkitCpuWorker.isSupported() && !isIOS()) {
7408
+ this.implementation = new Wav2ArkitCpuWorker({
7409
+ modelUrl: this.config.cpuModelUrl
7410
+ });
7411
+ logger10.info("Fallback to Wav2ArkitCpuWorker successful");
7412
+ } else {
7413
+ this.implementation = new Wav2ArkitCpuInference({
7414
+ modelUrl: this.config.cpuModelUrl
7415
+ });
7416
+ logger10.info("Fallback to Wav2ArkitCpuInference successful");
7417
+ }
4572
7418
  this.hasFallenBack = true;
4573
- logger6.info("Fallback to Wav2ArkitCpuInference successful");
4574
7419
  return await this.implementation.load();
4575
7420
  }
4576
7421
  async infer(audioSamples, identityIndex) {
@@ -4582,7 +7427,7 @@ var LipSyncWithFallback = class {
4582
7427
  };
4583
7428
 
4584
7429
  // src/inference/SileroVADInference.ts
4585
- var logger7 = createLogger("SileroVAD");
7430
+ var logger11 = createLogger("SileroVAD");
4586
7431
  var SileroVADInference = class {
4587
7432
  constructor(config) {
4588
7433
  this.session = null;
@@ -4656,23 +7501,23 @@ var SileroVADInference = class {
4656
7501
  "model.sample_rate": this.config.sampleRate
4657
7502
  });
4658
7503
  try {
4659
- logger7.info("Loading ONNX Runtime...", { preference: this.config.backend });
7504
+ logger11.info("Loading ONNX Runtime...", { preference: this.config.backend });
4660
7505
  const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
4661
7506
  this.ort = ort;
4662
7507
  this._backend = backend;
4663
- logger7.info("ONNX Runtime loaded", { backend: this._backend });
7508
+ logger11.info("ONNX Runtime loaded", { backend: this._backend });
4664
7509
  const cache = getModelCache();
4665
7510
  const modelUrl = this.config.modelUrl;
4666
7511
  const isCached = await cache.has(modelUrl);
4667
7512
  let modelBuffer;
4668
7513
  if (isCached) {
4669
- logger7.debug("Loading model from cache", { modelUrl });
7514
+ logger11.debug("Loading model from cache", { modelUrl });
4670
7515
  modelBuffer = await cache.get(modelUrl);
4671
7516
  } else {
4672
- logger7.debug("Fetching and caching model", { modelUrl });
7517
+ logger11.debug("Fetching and caching model", { modelUrl });
4673
7518
  modelBuffer = await fetchWithCache(modelUrl);
4674
7519
  }
4675
- logger7.debug("Creating ONNX session", {
7520
+ logger11.debug("Creating ONNX session", {
4676
7521
  size: formatBytes(modelBuffer.byteLength),
4677
7522
  backend: this._backend
4678
7523
  });
@@ -4681,7 +7526,7 @@ var SileroVADInference = class {
4681
7526
  this.session = await ort.InferenceSession.create(modelData, sessionOptions);
4682
7527
  this.reset();
4683
7528
  const loadTimeMs = performance.now() - startTime;
4684
- logger7.info("Model loaded successfully", {
7529
+ logger11.info("Model loaded successfully", {
4685
7530
  backend: this._backend,
4686
7531
  loadTimeMs: Math.round(loadTimeMs),
4687
7532
  sampleRate: this.config.sampleRate,
@@ -4736,7 +7581,7 @@ var SileroVADInference = class {
4736
7581
  []
4737
7582
  );
4738
7583
  } catch (e) {
4739
- logger7.warn("BigInt64Array not available, using bigint array fallback", {
7584
+ logger11.warn("BigInt64Array not available, using bigint array fallback", {
4740
7585
  error: e instanceof Error ? e.message : String(e)
4741
7586
  });
4742
7587
  this.srTensor = new this.ort.Tensor(
@@ -4842,7 +7687,7 @@ var SileroVADInference = class {
4842
7687
  this.preSpeechBuffer.shift();
4843
7688
  }
4844
7689
  }
4845
- logger7.trace("Skipping VAD inference - audio too quiet", {
7690
+ logger11.trace("Skipping VAD inference - audio too quiet", {
4846
7691
  rms: Math.round(rms * 1e4) / 1e4,
4847
7692
  threshold: MIN_ENERGY_THRESHOLD
4848
7693
  });
@@ -4896,7 +7741,7 @@ var SileroVADInference = class {
4896
7741
  if (isSpeech && !this.wasSpeaking) {
4897
7742
  preSpeechChunks = [...this.preSpeechBuffer];
4898
7743
  this.preSpeechBuffer = [];
4899
- logger7.debug("Speech started with pre-speech buffer", {
7744
+ logger11.debug("Speech started with pre-speech buffer", {
4900
7745
  preSpeechChunks: preSpeechChunks.length,
4901
7746
  durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
4902
7747
  });
@@ -4909,7 +7754,7 @@ var SileroVADInference = class {
4909
7754
  this.preSpeechBuffer = [];
4910
7755
  }
4911
7756
  this.wasSpeaking = isSpeech;
4912
- logger7.trace("VAD inference completed", {
7757
+ logger11.trace("VAD inference completed", {
4913
7758
  probability: Math.round(probability * 1e3) / 1e3,
4914
7759
  isSpeech,
4915
7760
  inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100
@@ -4940,7 +7785,7 @@ var SileroVADInference = class {
4940
7785
  const oomError = new Error(
4941
7786
  `SileroVAD inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reducing concurrent model sessions or reloading the page.`
4942
7787
  );
4943
- logger7.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
7788
+ logger11.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
4944
7789
  pointer: `0x${err.toString(16)}`,
4945
7790
  backend: this._backend
4946
7791
  });
@@ -4983,19 +7828,27 @@ var SileroVADInference = class {
4983
7828
  SileroVADInference.isWebGPUAvailable = isWebGPUAvailable;
4984
7829
 
4985
7830
  // src/inference/SileroVADWorker.ts
4986
- var logger8 = createLogger("SileroVADWorker");
4987
- var WASM_CDN_PATH2 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
4988
- var LOAD_TIMEOUT_MS = 1e4;
4989
- var INFERENCE_TIMEOUT_MS = 1e3;
4990
- var WORKER_SCRIPT = `
7831
+ var logger12 = createLogger("SileroVADWorker");
7832
+ var WASM_CDN_PATH5 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
7833
+ var LOAD_TIMEOUT_MS3 = 1e4;
7834
+ var INFERENCE_TIMEOUT_MS3 = 1e3;
7835
+ function resolveUrl4(url) {
7836
+ if (/^https?:\/\//i.test(url) || /^blob:/i.test(url)) return url;
7837
+ try {
7838
+ return new URL(url, globalThis.location?.origin ?? "https://localhost").href;
7839
+ } catch {
7840
+ return url;
7841
+ }
7842
+ }
7843
+ var WORKER_SCRIPT4 = `
4991
7844
  // Silero VAD Worker Script
4992
7845
  // Loaded via Blob URL - no separate file needed
4993
7846
 
4994
- let ort = null;
4995
- let session = null;
4996
- let sampleRate = 16000;
4997
- let chunkSize = 512;
4998
- let contextSize = 64;
7847
+ var ort = null;
7848
+ var session = null;
7849
+ var sampleRate = 16000;
7850
+ var chunkSize = 512;
7851
+ var contextSize = 64;
4999
7852
 
5000
7853
  /**
5001
7854
  * Load ONNX Runtime from CDN
@@ -5245,7 +8098,7 @@ var SileroVADWorker = class {
5245
8098
  * Create the worker from inline script
5246
8099
  */
5247
8100
  createWorker() {
5248
- const blob = new Blob([WORKER_SCRIPT], { type: "application/javascript" });
8101
+ const blob = new Blob([WORKER_SCRIPT4], { type: "application/javascript" });
5249
8102
  const blobUrl = URL.createObjectURL(blob);
5250
8103
  const worker = new Worker(blobUrl);
5251
8104
  URL.revokeObjectURL(blobUrl);
@@ -5253,7 +8106,7 @@ var SileroVADWorker = class {
5253
8106
  this.handleWorkerMessage(event.data);
5254
8107
  };
5255
8108
  worker.onerror = (error) => {
5256
- logger8.error("Worker error", { error: error.message });
8109
+ logger12.error("Worker error", { error: error.message });
5257
8110
  for (const [, resolver] of this.pendingResolvers) {
5258
8111
  resolver.reject(new Error(`Worker error: ${error.message}`));
5259
8112
  }
@@ -5329,25 +8182,25 @@ var SileroVADWorker = class {
5329
8182
  "model.sample_rate": this.config.sampleRate
5330
8183
  });
5331
8184
  try {
5332
- logger8.info("Creating VAD worker...");
8185
+ logger12.info("Creating VAD worker...");
5333
8186
  this.worker = this.createWorker();
5334
- logger8.info("Loading model in worker...", {
8187
+ logger12.info("Loading model in worker...", {
5335
8188
  modelUrl: this.config.modelUrl,
5336
8189
  sampleRate: this.config.sampleRate
5337
8190
  });
5338
8191
  const result = await this.sendMessage(
5339
8192
  {
5340
8193
  type: "load",
5341
- modelUrl: this.config.modelUrl,
8194
+ modelUrl: resolveUrl4(this.config.modelUrl),
5342
8195
  sampleRate: this.config.sampleRate,
5343
- wasmPaths: WASM_CDN_PATH2
8196
+ wasmPaths: WASM_CDN_PATH5
5344
8197
  },
5345
8198
  "loaded",
5346
- LOAD_TIMEOUT_MS
8199
+ LOAD_TIMEOUT_MS3
5347
8200
  );
5348
8201
  this._isLoaded = true;
5349
8202
  const loadTimeMs = performance.now() - startTime;
5350
- logger8.info("VAD worker loaded successfully", {
8203
+ logger12.info("VAD worker loaded successfully", {
5351
8204
  backend: "wasm",
5352
8205
  loadTimeMs: Math.round(loadTimeMs),
5353
8206
  workerLoadTimeMs: Math.round(result.loadTimeMs),
@@ -5398,7 +8251,7 @@ var SileroVADWorker = class {
5398
8251
  const result = await this.sendMessage(
5399
8252
  { type: "reset" },
5400
8253
  "reset",
5401
- INFERENCE_TIMEOUT_MS
8254
+ INFERENCE_TIMEOUT_MS3
5402
8255
  );
5403
8256
  this.state = result.state;
5404
8257
  this.context = new Float32Array(this.contextSize);
@@ -5444,7 +8297,7 @@ var SileroVADWorker = class {
5444
8297
  context: this.context
5445
8298
  },
5446
8299
  "result",
5447
- INFERENCE_TIMEOUT_MS
8300
+ INFERENCE_TIMEOUT_MS3
5448
8301
  );
5449
8302
  this.state = result.state;
5450
8303
  this.context = audioChunkCopy.slice(-this.contextSize);
@@ -5454,7 +8307,7 @@ var SileroVADWorker = class {
5454
8307
  if (isSpeech && !this.wasSpeaking) {
5455
8308
  preSpeechChunks = [...this.preSpeechBuffer];
5456
8309
  this.preSpeechBuffer = [];
5457
- logger8.debug("Speech started with pre-speech buffer", {
8310
+ logger12.debug("Speech started with pre-speech buffer", {
5458
8311
  preSpeechChunks: preSpeechChunks.length,
5459
8312
  durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
5460
8313
  });
@@ -5467,7 +8320,7 @@ var SileroVADWorker = class {
5467
8320
  this.preSpeechBuffer = [];
5468
8321
  }
5469
8322
  this.wasSpeaking = isSpeech;
5470
- logger8.trace("VAD worker inference completed", {
8323
+ logger12.trace("VAD worker inference completed", {
5471
8324
  probability: Math.round(result.probability * 1e3) / 1e3,
5472
8325
  isSpeech,
5473
8326
  inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
@@ -5513,7 +8366,7 @@ var SileroVADWorker = class {
5513
8366
  async dispose() {
5514
8367
  if (this.worker) {
5515
8368
  try {
5516
- await this.sendMessage({ type: "dispose" }, "disposed", INFERENCE_TIMEOUT_MS);
8369
+ await this.sendMessage({ type: "dispose" }, "disposed", INFERENCE_TIMEOUT_MS3);
5517
8370
  } catch {
5518
8371
  }
5519
8372
  this.worker.terminate();
@@ -5535,40 +8388,44 @@ var SileroVADWorker = class {
5535
8388
  };
5536
8389
 
5537
8390
  // src/inference/createSileroVAD.ts
5538
- var logger9 = createLogger("createSileroVAD");
8391
+ var logger13 = createLogger("createSileroVAD");
5539
8392
  function supportsVADWorker() {
5540
8393
  if (typeof Worker === "undefined") {
5541
- logger9.debug("Worker not supported: Worker constructor undefined");
8394
+ logger13.debug("Worker not supported: Worker constructor undefined");
5542
8395
  return false;
5543
8396
  }
5544
8397
  if (typeof URL === "undefined" || typeof URL.createObjectURL === "undefined") {
5545
- logger9.debug("Worker not supported: URL.createObjectURL unavailable");
8398
+ logger13.debug("Worker not supported: URL.createObjectURL unavailable");
5546
8399
  return false;
5547
8400
  }
5548
8401
  if (typeof Blob === "undefined") {
5549
- logger9.debug("Worker not supported: Blob constructor unavailable");
8402
+ logger13.debug("Worker not supported: Blob constructor unavailable");
5550
8403
  return false;
5551
8404
  }
5552
8405
  return true;
5553
8406
  }
5554
8407
  function createSileroVAD(config) {
8408
+ if (config.unifiedWorker) {
8409
+ logger13.info("Creating SileroVADUnifiedAdapter (shared unified worker)");
8410
+ return new SileroVADUnifiedAdapter(config.unifiedWorker, config);
8411
+ }
5555
8412
  const fallbackOnError = config.fallbackOnError ?? true;
5556
8413
  let useWorker;
5557
8414
  if (config.useWorker !== void 0) {
5558
8415
  useWorker = config.useWorker;
5559
- logger9.debug("Worker preference explicitly set", { useWorker });
8416
+ logger13.debug("Worker preference explicitly set", { useWorker });
5560
8417
  } else {
5561
8418
  const workerSupported = supportsVADWorker();
5562
8419
  const onMobile = isMobile();
5563
8420
  useWorker = workerSupported && !onMobile;
5564
- logger9.debug("Auto-detected Worker preference", {
8421
+ logger13.debug("Auto-detected Worker preference", {
5565
8422
  useWorker,
5566
8423
  workerSupported,
5567
8424
  onMobile
5568
8425
  });
5569
8426
  }
5570
8427
  if (useWorker) {
5571
- logger9.info("Creating SileroVADWorker (off-main-thread)");
8428
+ logger13.info("Creating SileroVADWorker (off-main-thread)");
5572
8429
  const worker = new SileroVADWorker({
5573
8430
  modelUrl: config.modelUrl,
5574
8431
  sampleRate: config.sampleRate,
@@ -5580,7 +8437,7 @@ function createSileroVAD(config) {
5580
8437
  }
5581
8438
  return worker;
5582
8439
  }
5583
- logger9.info("Creating SileroVADInference (main thread)");
8440
+ logger13.info("Creating SileroVADInference (main thread)");
5584
8441
  return new SileroVADInference(config);
5585
8442
  }
5586
8443
  var VADWorkerWithFallback = class {
@@ -5606,7 +8463,7 @@ var VADWorkerWithFallback = class {
5606
8463
  try {
5607
8464
  return await this.implementation.load();
5608
8465
  } catch (error) {
5609
- logger9.warn("Worker load failed, falling back to main thread", {
8466
+ logger13.warn("Worker load failed, falling back to main thread", {
5610
8467
  error: error instanceof Error ? error.message : String(error)
5611
8468
  });
5612
8469
  try {
@@ -5615,7 +8472,7 @@ var VADWorkerWithFallback = class {
5615
8472
  }
5616
8473
  this.implementation = new SileroVADInference(this.config);
5617
8474
  this.hasFallenBack = true;
5618
- logger9.info("Fallback to SileroVADInference successful");
8475
+ logger13.info("Fallback to SileroVADInference successful");
5619
8476
  return await this.implementation.load();
5620
8477
  }
5621
8478
  }
@@ -5637,7 +8494,7 @@ var VADWorkerWithFallback = class {
5637
8494
  };
5638
8495
 
5639
8496
  // src/inference/SafariSpeechRecognition.ts
5640
- var logger10 = createLogger("SafariSpeech");
8497
+ var logger14 = createLogger("SafariSpeech");
5641
8498
  var SafariSpeechRecognition = class _SafariSpeechRecognition {
5642
8499
  constructor(config = {}) {
5643
8500
  this.recognition = null;
@@ -5656,7 +8513,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5656
8513
  interimResults: config.interimResults ?? true,
5657
8514
  maxAlternatives: config.maxAlternatives ?? 1
5658
8515
  };
5659
- logger10.debug("SafariSpeechRecognition created", {
8516
+ logger14.debug("SafariSpeechRecognition created", {
5660
8517
  language: this.config.language,
5661
8518
  continuous: this.config.continuous
5662
8519
  });
@@ -5717,7 +8574,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5717
8574
  */
5718
8575
  async start() {
5719
8576
  if (this.isListening) {
5720
- logger10.warn("Already listening");
8577
+ logger14.warn("Already listening");
5721
8578
  return;
5722
8579
  }
5723
8580
  if (!_SafariSpeechRecognition.isAvailable()) {
@@ -5747,7 +8604,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5747
8604
  this.isListening = true;
5748
8605
  this.startTime = performance.now();
5749
8606
  this.accumulatedText = "";
5750
- logger10.info("Speech recognition started", {
8607
+ logger14.info("Speech recognition started", {
5751
8608
  language: this.config.language
5752
8609
  });
5753
8610
  span?.end();
@@ -5762,7 +8619,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5762
8619
  */
5763
8620
  async stop() {
5764
8621
  if (!this.isListening || !this.recognition) {
5765
- logger10.warn("Not currently listening");
8622
+ logger14.warn("Not currently listening");
5766
8623
  return {
5767
8624
  text: this.accumulatedText,
5768
8625
  language: this.config.language,
@@ -5791,7 +8648,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5791
8648
  if (this.recognition && this.isListening) {
5792
8649
  this.recognition.abort();
5793
8650
  this.isListening = false;
5794
- logger10.info("Speech recognition aborted");
8651
+ logger14.info("Speech recognition aborted");
5795
8652
  }
5796
8653
  }
5797
8654
  /**
@@ -5822,7 +8679,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5822
8679
  this.isListening = false;
5823
8680
  this.resultCallbacks = [];
5824
8681
  this.errorCallbacks = [];
5825
- logger10.debug("SafariSpeechRecognition disposed");
8682
+ logger14.debug("SafariSpeechRecognition disposed");
5826
8683
  }
5827
8684
  /**
5828
8685
  * Set up event handlers for the recognition instance
@@ -5850,7 +8707,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5850
8707
  confidence: alternative.confidence
5851
8708
  };
5852
8709
  this.emitResult(speechResult);
5853
- logger10.trace("Speech result", {
8710
+ logger14.trace("Speech result", {
5854
8711
  text: text.substring(0, 50),
5855
8712
  isFinal,
5856
8713
  confidence: alternative.confidence
@@ -5860,12 +8717,12 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5860
8717
  span?.end();
5861
8718
  } catch (error) {
5862
8719
  span?.endWithError(error instanceof Error ? error : new Error(String(error)));
5863
- logger10.error("Error processing speech result", { error });
8720
+ logger14.error("Error processing speech result", { error });
5864
8721
  }
5865
8722
  };
5866
8723
  this.recognition.onerror = (event) => {
5867
8724
  const error = new Error(`Speech recognition error: ${event.error} - ${event.message}`);
5868
- logger10.error("Speech recognition error", { error: event.error, message: event.message });
8725
+ logger14.error("Speech recognition error", { error: event.error, message: event.message });
5869
8726
  this.emitError(error);
5870
8727
  if (this.stopRejecter) {
5871
8728
  this.stopRejecter(error);
@@ -5875,7 +8732,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5875
8732
  };
5876
8733
  this.recognition.onend = () => {
5877
8734
  this.isListening = false;
5878
- logger10.info("Speech recognition ended", {
8735
+ logger14.info("Speech recognition ended", {
5879
8736
  totalText: this.accumulatedText.length,
5880
8737
  durationMs: performance.now() - this.startTime
5881
8738
  });
@@ -5892,13 +8749,13 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5892
8749
  }
5893
8750
  };
5894
8751
  this.recognition.onstart = () => {
5895
- logger10.debug("Speech recognition started by browser");
8752
+ logger14.debug("Speech recognition started by browser");
5896
8753
  };
5897
8754
  this.recognition.onspeechstart = () => {
5898
- logger10.debug("Speech detected");
8755
+ logger14.debug("Speech detected");
5899
8756
  };
5900
8757
  this.recognition.onspeechend = () => {
5901
- logger10.debug("Speech ended");
8758
+ logger14.debug("Speech ended");
5902
8759
  };
5903
8760
  }
5904
8761
  /**
@@ -5909,7 +8766,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5909
8766
  try {
5910
8767
  callback(result);
5911
8768
  } catch (error) {
5912
- logger10.error("Error in result callback", { error });
8769
+ logger14.error("Error in result callback", { error });
5913
8770
  }
5914
8771
  }
5915
8772
  }
@@ -5921,7 +8778,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5921
8778
  try {
5922
8779
  callback(error);
5923
8780
  } catch (callbackError) {
5924
- logger10.error("Error in error callback", { error: callbackError });
8781
+ logger14.error("Error in error callback", { error: callbackError });
5925
8782
  }
5926
8783
  }
5927
8784
  }
@@ -6248,11 +9105,14 @@ var AgentCoreAdapter = class extends EventEmitter {
6248
9105
  return new Promise((resolve) => {
6249
9106
  const timeout = setTimeout(() => resolve(false), 5e3);
6250
9107
  const handler = (event) => {
6251
- const data = JSON.parse(event.data);
6252
- if (data.type === "pong") {
6253
- clearTimeout(timeout);
6254
- this.ws?.removeEventListener("message", handler);
6255
- resolve(true);
9108
+ try {
9109
+ const data = JSON.parse(event.data);
9110
+ if (data.type === "pong") {
9111
+ clearTimeout(timeout);
9112
+ this.ws?.removeEventListener("message", handler);
9113
+ resolve(true);
9114
+ }
9115
+ } catch {
6256
9116
  }
6257
9117
  };
6258
9118
  this.ws?.addEventListener("message", handler);
@@ -6378,7 +9238,10 @@ var AgentCoreAdapter = class extends EventEmitter {
6378
9238
  }));
6379
9239
  };
6380
9240
  this.ws.onmessage = (event) => {
6381
- this.handleAgentCoreMessage(JSON.parse(event.data));
9241
+ try {
9242
+ this.handleAgentCoreMessage(JSON.parse(event.data));
9243
+ } catch {
9244
+ }
6382
9245
  };
6383
9246
  this.ws.onerror = () => {
6384
9247
  reject(new Error("WebSocket connection failed"));
@@ -6390,14 +9253,17 @@ var AgentCoreAdapter = class extends EventEmitter {
6390
9253
  reject(new Error("Auth timeout"));
6391
9254
  }, 1e4);
6392
9255
  const authHandler = (event) => {
6393
- const data = JSON.parse(event.data);
6394
- if (data.type === "auth_success") {
6395
- clearTimeout(authTimeout);
6396
- this.ws?.removeEventListener("message", authHandler);
6397
- resolve();
6398
- } else if (data.type === "auth_failed") {
6399
- clearTimeout(authTimeout);
6400
- reject(new Error(data.message));
9256
+ try {
9257
+ const data = JSON.parse(event.data);
9258
+ if (data.type === "auth_success") {
9259
+ clearTimeout(authTimeout);
9260
+ this.ws?.removeEventListener("message", authHandler);
9261
+ resolve();
9262
+ } else if (data.type === "auth_failed") {
9263
+ clearTimeout(authTimeout);
9264
+ reject(new Error(data.message));
9265
+ }
9266
+ } catch {
6401
9267
  }
6402
9268
  };
6403
9269
  this.ws.addEventListener("message", authHandler);
@@ -7206,20 +10072,6 @@ var InterruptionHandler = class extends EventEmitter {
7206
10072
  this.onSilenceDetected();
7207
10073
  }
7208
10074
  }
7209
- /**
7210
- * @deprecated Use processVADResult() instead. This method uses naive RMS detection.
7211
- * Process audio samples for VAD (legacy - uses simple RMS)
7212
- */
7213
- processAudio(samples) {
7214
- if (!this.config.enabled) return;
7215
- const rms = this.calculateRMS(samples);
7216
- const vadProbability = Math.min(rms / 0.02, 1);
7217
- if (vadProbability > this.config.vadThreshold) {
7218
- this.onSpeechDetected(rms);
7219
- } else {
7220
- this.onSilenceDetected();
7221
- }
7222
- }
7223
10075
  /**
7224
10076
  * Notify that AI started speaking
7225
10077
  */
@@ -7264,15 +10116,6 @@ var InterruptionHandler = class extends EventEmitter {
7264
10116
  };
7265
10117
  }
7266
10118
  // ==================== Private Methods ====================
7267
- calculateRMS(samples) {
7268
- let sum = 0;
7269
- const scale = samples instanceof Int16Array ? 32768 : 1;
7270
- for (let i = 0; i < samples.length; i++) {
7271
- const sample = samples[i] / scale;
7272
- sum += sample * sample;
7273
- }
7274
- return Math.sqrt(sum / samples.length);
7275
- }
7276
10119
  onSpeechDetected(rms) {
7277
10120
  const now = Date.now();
7278
10121
  this.lastSpeechTime = now;
@@ -8383,13 +11226,19 @@ export {
8383
11226
  RingBuffer,
8384
11227
  SafariSpeechRecognition,
8385
11228
  SenseVoiceInference,
11229
+ SenseVoiceUnifiedAdapter,
11230
+ SenseVoiceWorker,
8386
11231
  SileroVADInference,
11232
+ SileroVADUnifiedAdapter,
8387
11233
  SileroVADWorker,
8388
11234
  SyncedAudioPipeline,
8389
11235
  TenantManager,
8390
11236
  UPPER_FACE_BLENDSHAPES,
11237
+ UnifiedInferenceWorker,
8391
11238
  WAV2ARKIT_BLENDSHAPES,
8392
11239
  Wav2ArkitCpuInference,
11240
+ Wav2ArkitCpuUnifiedAdapter,
11241
+ Wav2ArkitCpuWorker,
8393
11242
  Wav2Vec2Inference,
8394
11243
  applyCMVN,
8395
11244
  applyLFR,
@@ -8403,6 +11252,7 @@ export {
8403
11252
  createEmotionVector,
8404
11253
  createLipSync,
8405
11254
  createLogger,
11255
+ createSenseVoice,
8406
11256
  createSessionWithFallback,
8407
11257
  createSileroVAD,
8408
11258
  ctcGreedyDecode,