@omote/core 0.4.4 → 0.4.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +1165 -673
- package/dist/index.d.ts +1165 -673
- package/dist/index.js +3307 -337
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +3302 -332
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -67,13 +67,19 @@ __export(index_exports, {
|
|
|
67
67
|
RingBuffer: () => RingBuffer,
|
|
68
68
|
SafariSpeechRecognition: () => SafariSpeechRecognition,
|
|
69
69
|
SenseVoiceInference: () => SenseVoiceInference,
|
|
70
|
+
SenseVoiceUnifiedAdapter: () => SenseVoiceUnifiedAdapter,
|
|
71
|
+
SenseVoiceWorker: () => SenseVoiceWorker,
|
|
70
72
|
SileroVADInference: () => SileroVADInference,
|
|
73
|
+
SileroVADUnifiedAdapter: () => SileroVADUnifiedAdapter,
|
|
71
74
|
SileroVADWorker: () => SileroVADWorker,
|
|
72
75
|
SyncedAudioPipeline: () => SyncedAudioPipeline,
|
|
73
76
|
TenantManager: () => TenantManager,
|
|
74
77
|
UPPER_FACE_BLENDSHAPES: () => UPPER_FACE_BLENDSHAPES,
|
|
78
|
+
UnifiedInferenceWorker: () => UnifiedInferenceWorker,
|
|
75
79
|
WAV2ARKIT_BLENDSHAPES: () => WAV2ARKIT_BLENDSHAPES,
|
|
76
80
|
Wav2ArkitCpuInference: () => Wav2ArkitCpuInference,
|
|
81
|
+
Wav2ArkitCpuUnifiedAdapter: () => Wav2ArkitCpuUnifiedAdapter,
|
|
82
|
+
Wav2ArkitCpuWorker: () => Wav2ArkitCpuWorker,
|
|
77
83
|
Wav2Vec2Inference: () => Wav2Vec2Inference,
|
|
78
84
|
applyCMVN: () => applyCMVN,
|
|
79
85
|
applyLFR: () => applyLFR,
|
|
@@ -87,6 +93,7 @@ __export(index_exports, {
|
|
|
87
93
|
createEmotionVector: () => createEmotionVector,
|
|
88
94
|
createLipSync: () => createLipSync,
|
|
89
95
|
createLogger: () => createLogger,
|
|
96
|
+
createSenseVoice: () => createSenseVoice,
|
|
90
97
|
createSessionWithFallback: () => createSessionWithFallback,
|
|
91
98
|
createSileroVAD: () => createSileroVAD,
|
|
92
99
|
ctcGreedyDecode: () => ctcGreedyDecode,
|
|
@@ -469,7 +476,14 @@ var AudioScheduler = class {
|
|
|
469
476
|
source.connect(gainNode);
|
|
470
477
|
const scheduleTime = this.nextPlayTime;
|
|
471
478
|
source.start(scheduleTime);
|
|
472
|
-
|
|
479
|
+
const entry = { source, gainNode };
|
|
480
|
+
this.scheduledSources.push(entry);
|
|
481
|
+
source.onended = () => {
|
|
482
|
+
const idx = this.scheduledSources.indexOf(entry);
|
|
483
|
+
if (idx !== -1) {
|
|
484
|
+
this.scheduledSources.splice(idx, 1);
|
|
485
|
+
}
|
|
486
|
+
};
|
|
473
487
|
const duration = audioData.length / ctx.sampleRate;
|
|
474
488
|
this.nextPlayTime = scheduleTime + duration;
|
|
475
489
|
return scheduleTime;
|
|
@@ -825,7 +839,7 @@ var LAMPipeline = class {
|
|
|
825
839
|
}
|
|
826
840
|
};
|
|
827
841
|
|
|
828
|
-
// src/audio/
|
|
842
|
+
// src/audio/audioUtils.ts
|
|
829
843
|
function pcm16ToFloat32(buffer) {
|
|
830
844
|
const byteLen = buffer.byteLength & ~1;
|
|
831
845
|
const int16 = byteLen === buffer.byteLength ? new Int16Array(buffer) : new Int16Array(buffer, 0, byteLen / 2);
|
|
@@ -835,6 +849,15 @@ function pcm16ToFloat32(buffer) {
|
|
|
835
849
|
}
|
|
836
850
|
return float32;
|
|
837
851
|
}
|
|
852
|
+
function int16ToFloat32(int16) {
|
|
853
|
+
const float32 = new Float32Array(int16.length);
|
|
854
|
+
for (let i = 0; i < int16.length; i++) {
|
|
855
|
+
float32[i] = int16[i] / 32768;
|
|
856
|
+
}
|
|
857
|
+
return float32;
|
|
858
|
+
}
|
|
859
|
+
|
|
860
|
+
// src/audio/SyncedAudioPipeline.ts
|
|
838
861
|
var SyncedAudioPipeline = class extends EventEmitter {
|
|
839
862
|
constructor(options) {
|
|
840
863
|
super();
|
|
@@ -2773,12 +2796,12 @@ var Logger = class _Logger {
|
|
|
2773
2796
|
};
|
|
2774
2797
|
var loggerCache = /* @__PURE__ */ new Map();
|
|
2775
2798
|
function createLogger(module2) {
|
|
2776
|
-
let
|
|
2777
|
-
if (!
|
|
2778
|
-
|
|
2779
|
-
loggerCache.set(module2,
|
|
2799
|
+
let logger15 = loggerCache.get(module2);
|
|
2800
|
+
if (!logger15) {
|
|
2801
|
+
logger15 = new Logger(module2);
|
|
2802
|
+
loggerCache.set(module2, logger15);
|
|
2780
2803
|
}
|
|
2781
|
-
return
|
|
2804
|
+
return logger15;
|
|
2782
2805
|
}
|
|
2783
2806
|
var noopLogger = {
|
|
2784
2807
|
module: "noop",
|
|
@@ -2806,7 +2829,7 @@ function isIOSSafari() {
|
|
|
2806
2829
|
function isIOS() {
|
|
2807
2830
|
if (typeof navigator === "undefined") return false;
|
|
2808
2831
|
const ua = navigator.userAgent.toLowerCase();
|
|
2809
|
-
return /iphone|ipad|ipod/.test(ua);
|
|
2832
|
+
return /iphone|ipad|ipod/.test(ua) || /macintosh/.test(ua) && navigator.maxTouchPoints > 1;
|
|
2810
2833
|
}
|
|
2811
2834
|
function isAndroid() {
|
|
2812
2835
|
if (typeof navigator === "undefined") return false;
|
|
@@ -3427,10 +3450,16 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3427
3450
|
});
|
|
3428
3451
|
logger2.debug("Running warmup inference to initialize GPU context");
|
|
3429
3452
|
const warmupStart = performance.now();
|
|
3430
|
-
const
|
|
3453
|
+
const warmupAudio = new Float32Array(16e3);
|
|
3454
|
+
const warmupIdentity = new Float32Array(this.numIdentityClasses);
|
|
3455
|
+
warmupIdentity[0] = 1;
|
|
3456
|
+
const warmupFeeds = {
|
|
3457
|
+
"audio": new this.ort.Tensor("float32", warmupAudio, [1, 16e3]),
|
|
3458
|
+
"identity": new this.ort.Tensor("float32", warmupIdentity, [1, this.numIdentityClasses])
|
|
3459
|
+
};
|
|
3431
3460
|
const WARMUP_TIMEOUT_MS = 15e3;
|
|
3432
3461
|
const warmupResult = await Promise.race([
|
|
3433
|
-
this.
|
|
3462
|
+
this.session.run(warmupFeeds).then(() => "ok"),
|
|
3434
3463
|
new Promise((r) => setTimeout(() => r("timeout"), WARMUP_TIMEOUT_MS))
|
|
3435
3464
|
]);
|
|
3436
3465
|
const warmupTimeMs = performance.now() - warmupStart;
|
|
@@ -3536,14 +3565,18 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3536
3565
|
});
|
|
3537
3566
|
try {
|
|
3538
3567
|
const startTime = performance.now();
|
|
3568
|
+
let timeoutId;
|
|
3539
3569
|
const results = await Promise.race([
|
|
3540
|
-
this.session.run(feeds)
|
|
3541
|
-
|
|
3542
|
-
|
|
3570
|
+
this.session.run(feeds).then((r) => {
|
|
3571
|
+
clearTimeout(timeoutId);
|
|
3572
|
+
return r;
|
|
3573
|
+
}),
|
|
3574
|
+
new Promise((_, rej) => {
|
|
3575
|
+
timeoutId = setTimeout(
|
|
3543
3576
|
() => rej(new Error(`Wav2Vec2 inference timed out after ${_Wav2Vec2Inference.INFERENCE_TIMEOUT_MS}ms`)),
|
|
3544
3577
|
_Wav2Vec2Inference.INFERENCE_TIMEOUT_MS
|
|
3545
|
-
)
|
|
3546
|
-
)
|
|
3578
|
+
);
|
|
3579
|
+
})
|
|
3547
3580
|
]);
|
|
3548
3581
|
const inferenceTimeMs = performance.now() - startTime;
|
|
3549
3582
|
const asrOutput = results["asr_logits"];
|
|
@@ -3649,15 +3682,6 @@ var Wav2Vec2Inference = _Wav2Vec2Inference;
|
|
|
3649
3682
|
|
|
3650
3683
|
// src/audio/FullFacePipeline.ts
|
|
3651
3684
|
var logger3 = createLogger("FullFacePipeline");
|
|
3652
|
-
function pcm16ToFloat322(buffer) {
|
|
3653
|
-
const byteLen = buffer.byteLength & ~1;
|
|
3654
|
-
const int16 = byteLen === buffer.byteLength ? new Int16Array(buffer) : new Int16Array(buffer, 0, byteLen / 2);
|
|
3655
|
-
const float32 = new Float32Array(int16.length);
|
|
3656
|
-
for (let i = 0; i < int16.length; i++) {
|
|
3657
|
-
float32[i] = int16[i] / 32768;
|
|
3658
|
-
}
|
|
3659
|
-
return float32;
|
|
3660
|
-
}
|
|
3661
3685
|
var BLENDSHAPE_INDEX_MAP = /* @__PURE__ */ new Map();
|
|
3662
3686
|
LAM_BLENDSHAPES.forEach((name, index) => {
|
|
3663
3687
|
BLENDSHAPE_INDEX_MAP.set(name, index);
|
|
@@ -3807,7 +3831,7 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3807
3831
|
if (!combined) {
|
|
3808
3832
|
return;
|
|
3809
3833
|
}
|
|
3810
|
-
const float32 =
|
|
3834
|
+
const float32 = pcm16ToFloat32(combined);
|
|
3811
3835
|
const scheduleTime = await this.scheduler.schedule(float32);
|
|
3812
3836
|
if (!this.playbackStarted) {
|
|
3813
3837
|
this.playbackStarted = true;
|
|
@@ -4290,13 +4314,18 @@ function ctcGreedyDecode(logits, seqLen, vocabSize, tokenMap) {
|
|
|
4290
4314
|
|
|
4291
4315
|
// src/inference/SenseVoiceInference.ts
|
|
4292
4316
|
var logger4 = createLogger("SenseVoice");
|
|
4293
|
-
var
|
|
4317
|
+
var _SenseVoiceInference = class _SenseVoiceInference {
|
|
4294
4318
|
constructor(config) {
|
|
4295
4319
|
this.session = null;
|
|
4296
4320
|
this.ort = null;
|
|
4297
4321
|
this._backend = "wasm";
|
|
4298
4322
|
this.isLoading = false;
|
|
4299
4323
|
this.inferenceQueue = Promise.resolve();
|
|
4324
|
+
// Session health: set to true if session.run() times out.
|
|
4325
|
+
// A timed-out session may have a zombie WASM dispatch still running,
|
|
4326
|
+
// so all future transcribe() calls reject immediately to prevent concurrent access.
|
|
4327
|
+
this.poisoned = false;
|
|
4328
|
+
// 10s for SenseVoice (heavier preprocessing)
|
|
4300
4329
|
// Preprocessing state (loaded once)
|
|
4301
4330
|
this.tokenMap = null;
|
|
4302
4331
|
this.negMean = null;
|
|
@@ -4444,6 +4473,9 @@ var SenseVoiceInference = class {
|
|
|
4444
4473
|
if (!this.session || !this.ort || !this.tokenMap) {
|
|
4445
4474
|
throw new Error("Model not loaded. Call load() first.");
|
|
4446
4475
|
}
|
|
4476
|
+
if (this.poisoned) {
|
|
4477
|
+
throw new Error("SenseVoice session timed out \u2014 inference unavailable until page reload");
|
|
4478
|
+
}
|
|
4447
4479
|
const audio = new Float32Array(audioSamples);
|
|
4448
4480
|
return this.queueInference(audio);
|
|
4449
4481
|
}
|
|
@@ -4481,7 +4513,19 @@ var SenseVoiceInference = class {
|
|
|
4481
4513
|
language: new ort.Tensor("int32", new Int32Array([this.languageId]), [1]),
|
|
4482
4514
|
text_norm: new ort.Tensor("int32", new Int32Array([this.textNormId]), [1])
|
|
4483
4515
|
};
|
|
4484
|
-
|
|
4516
|
+
let timeoutId;
|
|
4517
|
+
const results = await Promise.race([
|
|
4518
|
+
this.session.run(feeds).then((r) => {
|
|
4519
|
+
clearTimeout(timeoutId);
|
|
4520
|
+
return r;
|
|
4521
|
+
}),
|
|
4522
|
+
new Promise((_, rej) => {
|
|
4523
|
+
timeoutId = setTimeout(
|
|
4524
|
+
() => rej(new Error(`SenseVoice inference timed out after ${_SenseVoiceInference.INFERENCE_TIMEOUT_MS}ms`)),
|
|
4525
|
+
_SenseVoiceInference.INFERENCE_TIMEOUT_MS
|
|
4526
|
+
);
|
|
4527
|
+
})
|
|
4528
|
+
]);
|
|
4485
4529
|
const logitsOutput = results["logits"];
|
|
4486
4530
|
if (!logitsOutput) {
|
|
4487
4531
|
throw new Error('Model output missing "logits" tensor');
|
|
@@ -4527,6 +4571,32 @@ var SenseVoiceInference = class {
|
|
|
4527
4571
|
preprocessTimeMs
|
|
4528
4572
|
});
|
|
4529
4573
|
} catch (err) {
|
|
4574
|
+
const errMsg = err instanceof Error ? err.message : String(err);
|
|
4575
|
+
if (errMsg.includes("timed out")) {
|
|
4576
|
+
this.poisoned = true;
|
|
4577
|
+
logger4.error("CRITICAL: Inference session timed out \u2014 SenseVoice is dead. Page reload required.", {
|
|
4578
|
+
backend: this._backend,
|
|
4579
|
+
timeoutMs: _SenseVoiceInference.INFERENCE_TIMEOUT_MS
|
|
4580
|
+
});
|
|
4581
|
+
} else if (typeof err === "number") {
|
|
4582
|
+
const oomError = new Error(
|
|
4583
|
+
`SenseVoice inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
|
|
4584
|
+
);
|
|
4585
|
+
logger4.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
4586
|
+
pointer: `0x${err.toString(16)}`,
|
|
4587
|
+
backend: this._backend
|
|
4588
|
+
});
|
|
4589
|
+
span?.endWithError(oomError);
|
|
4590
|
+
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
4591
|
+
model: "sensevoice",
|
|
4592
|
+
backend: this._backend,
|
|
4593
|
+
status: "error"
|
|
4594
|
+
});
|
|
4595
|
+
reject(oomError);
|
|
4596
|
+
return;
|
|
4597
|
+
} else {
|
|
4598
|
+
logger4.error("Inference failed", { error: errMsg, backend: this._backend });
|
|
4599
|
+
}
|
|
4530
4600
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
4531
4601
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
4532
4602
|
model: "sensevoice",
|
|
@@ -4550,241 +4620,3082 @@ var SenseVoiceInference = class {
|
|
|
4550
4620
|
this.invStddev = null;
|
|
4551
4621
|
}
|
|
4552
4622
|
};
|
|
4623
|
+
_SenseVoiceInference.INFERENCE_TIMEOUT_MS = 1e4;
|
|
4624
|
+
var SenseVoiceInference = _SenseVoiceInference;
|
|
4553
4625
|
|
|
4554
|
-
// src/inference/
|
|
4555
|
-
var logger5 = createLogger("
|
|
4556
|
-
var
|
|
4557
|
-
|
|
4558
|
-
|
|
4559
|
-
|
|
4560
|
-
|
|
4561
|
-
|
|
4562
|
-
|
|
4563
|
-
|
|
4564
|
-
|
|
4565
|
-
this.config = config;
|
|
4566
|
-
}
|
|
4567
|
-
get backend() {
|
|
4568
|
-
return this.session ? this._backend : null;
|
|
4569
|
-
}
|
|
4570
|
-
get isLoaded() {
|
|
4571
|
-
return this.session !== null;
|
|
4626
|
+
// src/inference/SenseVoiceWorker.ts
|
|
4627
|
+
var logger5 = createLogger("SenseVoiceWorker");
|
|
4628
|
+
var WASM_CDN_PATH2 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
4629
|
+
var LOAD_TIMEOUT_MS = 3e4;
|
|
4630
|
+
var INFERENCE_TIMEOUT_MS = 1e4;
|
|
4631
|
+
function resolveUrl(url) {
|
|
4632
|
+
if (/^https?:\/\//i.test(url) || /^blob:/i.test(url)) return url;
|
|
4633
|
+
try {
|
|
4634
|
+
return new URL(url, globalThis.location?.origin ?? "https://localhost").href;
|
|
4635
|
+
} catch {
|
|
4636
|
+
return url;
|
|
4572
4637
|
}
|
|
4573
|
-
|
|
4574
|
-
|
|
4575
|
-
|
|
4576
|
-
|
|
4577
|
-
|
|
4578
|
-
|
|
4638
|
+
}
|
|
4639
|
+
var WORKER_SCRIPT = `
|
|
4640
|
+
// SenseVoice ASR Worker Script
|
|
4641
|
+
// Loaded via Blob URL - no separate file needed
|
|
4642
|
+
|
|
4643
|
+
var ort = null;
|
|
4644
|
+
var session = null;
|
|
4645
|
+
var tokenMap = null;
|
|
4646
|
+
var negMean = null;
|
|
4647
|
+
var invStddev = null;
|
|
4648
|
+
var languageId = 0;
|
|
4649
|
+
var textNormId = 14;
|
|
4650
|
+
var vocabSize = 0;
|
|
4651
|
+
|
|
4652
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
4653
|
+
// kaldiFbank.ts \u2014 inlined as plain JavaScript
|
|
4654
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
4655
|
+
|
|
4656
|
+
/**
|
|
4657
|
+
* In-place Radix-2 Cooley-Tukey FFT
|
|
4658
|
+
*/
|
|
4659
|
+
function fft(re, im) {
|
|
4660
|
+
var n = re.length;
|
|
4661
|
+
|
|
4662
|
+
// Bit-reversal permutation
|
|
4663
|
+
for (var i = 1, j = 0; i < n; i++) {
|
|
4664
|
+
var bit = n >> 1;
|
|
4665
|
+
while (j & bit) {
|
|
4666
|
+
j ^= bit;
|
|
4667
|
+
bit >>= 1;
|
|
4579
4668
|
}
|
|
4580
|
-
|
|
4581
|
-
|
|
4669
|
+
j ^= bit;
|
|
4670
|
+
if (i < j) {
|
|
4671
|
+
var tmp = re[i]; re[i] = re[j]; re[j] = tmp;
|
|
4672
|
+
tmp = im[i]; im[i] = im[j]; im[j] = tmp;
|
|
4582
4673
|
}
|
|
4583
|
-
|
|
4584
|
-
|
|
4585
|
-
|
|
4586
|
-
|
|
4587
|
-
|
|
4588
|
-
|
|
4589
|
-
|
|
4590
|
-
|
|
4591
|
-
|
|
4592
|
-
|
|
4593
|
-
|
|
4594
|
-
|
|
4595
|
-
|
|
4596
|
-
|
|
4597
|
-
|
|
4598
|
-
|
|
4599
|
-
|
|
4600
|
-
|
|
4601
|
-
|
|
4602
|
-
|
|
4603
|
-
|
|
4604
|
-
|
|
4605
|
-
|
|
4606
|
-
|
|
4607
|
-
sessionOptions.externalData = [{
|
|
4608
|
-
path: dataFilename,
|
|
4609
|
-
data: dataUrl
|
|
4610
|
-
// URL string — ORT fetches directly into WASM
|
|
4611
|
-
}];
|
|
4612
|
-
}
|
|
4613
|
-
this.session = await this.ort.InferenceSession.create(modelUrl, sessionOptions);
|
|
4614
|
-
} else {
|
|
4615
|
-
const cache = getModelCache();
|
|
4616
|
-
const isCached = await cache.has(modelUrl);
|
|
4617
|
-
let modelBuffer;
|
|
4618
|
-
if (isCached) {
|
|
4619
|
-
logger5.debug("Loading model from cache", { modelUrl });
|
|
4620
|
-
modelBuffer = await cache.get(modelUrl);
|
|
4621
|
-
if (!modelBuffer) {
|
|
4622
|
-
logger5.warn("Cache corruption detected, clearing and retrying", { modelUrl });
|
|
4623
|
-
await cache.delete(modelUrl);
|
|
4624
|
-
modelBuffer = await fetchWithCache(modelUrl);
|
|
4625
|
-
}
|
|
4626
|
-
} else {
|
|
4627
|
-
logger5.debug("Fetching and caching model graph", { modelUrl });
|
|
4628
|
-
modelBuffer = await fetchWithCache(modelUrl);
|
|
4629
|
-
}
|
|
4630
|
-
if (!modelBuffer) {
|
|
4631
|
-
throw new Error(`Failed to load model: ${modelUrl}`);
|
|
4632
|
-
}
|
|
4633
|
-
let externalDataBuffer = null;
|
|
4634
|
-
if (dataUrl) {
|
|
4635
|
-
try {
|
|
4636
|
-
const isDataCached = await cache.has(dataUrl);
|
|
4637
|
-
if (isDataCached) {
|
|
4638
|
-
logger5.debug("Loading external data from cache", { dataUrl });
|
|
4639
|
-
externalDataBuffer = await cache.get(dataUrl);
|
|
4640
|
-
if (!externalDataBuffer) {
|
|
4641
|
-
logger5.warn("Cache corruption for external data, retrying", { dataUrl });
|
|
4642
|
-
await cache.delete(dataUrl);
|
|
4643
|
-
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
4644
|
-
}
|
|
4645
|
-
} else {
|
|
4646
|
-
logger5.info("Fetching external model data", {
|
|
4647
|
-
dataUrl,
|
|
4648
|
-
note: "This may be a large download (400MB+)"
|
|
4649
|
-
});
|
|
4650
|
-
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
4651
|
-
}
|
|
4652
|
-
logger5.info("External data loaded", {
|
|
4653
|
-
size: formatBytes(externalDataBuffer.byteLength)
|
|
4654
|
-
});
|
|
4655
|
-
} catch (err) {
|
|
4656
|
-
logger5.debug("No external data file found (single-file model)", {
|
|
4657
|
-
dataUrl,
|
|
4658
|
-
error: err.message
|
|
4659
|
-
});
|
|
4660
|
-
}
|
|
4661
|
-
}
|
|
4662
|
-
logger5.debug("Creating ONNX session", {
|
|
4663
|
-
graphSize: formatBytes(modelBuffer.byteLength),
|
|
4664
|
-
externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
|
|
4665
|
-
backend: this._backend
|
|
4666
|
-
});
|
|
4667
|
-
if (externalDataBuffer) {
|
|
4668
|
-
const dataFilename = dataUrl.split("/").pop();
|
|
4669
|
-
sessionOptions.externalData = [{
|
|
4670
|
-
path: dataFilename,
|
|
4671
|
-
data: new Uint8Array(externalDataBuffer)
|
|
4672
|
-
}];
|
|
4673
|
-
}
|
|
4674
|
-
const modelData = new Uint8Array(modelBuffer);
|
|
4675
|
-
this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
|
|
4674
|
+
}
|
|
4675
|
+
|
|
4676
|
+
// Butterfly passes
|
|
4677
|
+
for (var len = 2; len <= n; len *= 2) {
|
|
4678
|
+
var halfLen = len / 2;
|
|
4679
|
+
var angle = -2 * Math.PI / len;
|
|
4680
|
+
var wRe = Math.cos(angle);
|
|
4681
|
+
var wIm = Math.sin(angle);
|
|
4682
|
+
|
|
4683
|
+
for (var i = 0; i < n; i += len) {
|
|
4684
|
+
var curRe = 1;
|
|
4685
|
+
var curIm = 0;
|
|
4686
|
+
for (var j = 0; j < halfLen; j++) {
|
|
4687
|
+
var a = i + j;
|
|
4688
|
+
var b = a + halfLen;
|
|
4689
|
+
var tRe = curRe * re[b] - curIm * im[b];
|
|
4690
|
+
var tIm = curRe * im[b] + curIm * re[b];
|
|
4691
|
+
re[b] = re[a] - tRe;
|
|
4692
|
+
im[b] = im[a] - tIm;
|
|
4693
|
+
re[a] += tRe;
|
|
4694
|
+
im[a] += tIm;
|
|
4695
|
+
var nextRe = curRe * wRe - curIm * wIm;
|
|
4696
|
+
curIm = curRe * wIm + curIm * wRe;
|
|
4697
|
+
curRe = nextRe;
|
|
4676
4698
|
}
|
|
4677
|
-
const loadTimeMs = performance.now() - startTime;
|
|
4678
|
-
logger5.info("Model loaded successfully", {
|
|
4679
|
-
backend: this._backend,
|
|
4680
|
-
loadTimeMs: Math.round(loadTimeMs),
|
|
4681
|
-
inputs: this.session.inputNames,
|
|
4682
|
-
outputs: this.session.outputNames
|
|
4683
|
-
});
|
|
4684
|
-
span?.setAttributes({
|
|
4685
|
-
"model.backend": this._backend,
|
|
4686
|
-
"model.load_time_ms": loadTimeMs,
|
|
4687
|
-
"model.cached": !isIOS()
|
|
4688
|
-
});
|
|
4689
|
-
span?.end();
|
|
4690
|
-
telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
|
|
4691
|
-
model: "wav2arkit_cpu",
|
|
4692
|
-
backend: this._backend
|
|
4693
|
-
});
|
|
4694
|
-
logger5.debug("Running warmup inference");
|
|
4695
|
-
const warmupStart = performance.now();
|
|
4696
|
-
const silentAudio = new Float32Array(16e3);
|
|
4697
|
-
await this.infer(silentAudio);
|
|
4698
|
-
const warmupTimeMs = performance.now() - warmupStart;
|
|
4699
|
-
logger5.info("Warmup inference complete", {
|
|
4700
|
-
warmupTimeMs: Math.round(warmupTimeMs),
|
|
4701
|
-
backend: this._backend
|
|
4702
|
-
});
|
|
4703
|
-
telemetry?.recordHistogram("omote.model.warmup_time", warmupTimeMs, {
|
|
4704
|
-
model: "wav2arkit_cpu",
|
|
4705
|
-
backend: this._backend
|
|
4706
|
-
});
|
|
4707
|
-
return {
|
|
4708
|
-
backend: this._backend,
|
|
4709
|
-
loadTimeMs,
|
|
4710
|
-
inputNames: [...this.session.inputNames],
|
|
4711
|
-
outputNames: [...this.session.outputNames]
|
|
4712
|
-
};
|
|
4713
|
-
} catch (error) {
|
|
4714
|
-
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
4715
|
-
telemetry?.incrementCounter("omote.errors.total", 1, {
|
|
4716
|
-
model: "wav2arkit_cpu",
|
|
4717
|
-
error_type: "load_failed"
|
|
4718
|
-
});
|
|
4719
|
-
throw error;
|
|
4720
|
-
} finally {
|
|
4721
|
-
this.isLoading = false;
|
|
4722
4699
|
}
|
|
4723
4700
|
}
|
|
4724
|
-
|
|
4725
|
-
|
|
4726
|
-
|
|
4727
|
-
|
|
4728
|
-
|
|
4729
|
-
|
|
4730
|
-
|
|
4731
|
-
|
|
4732
|
-
|
|
4733
|
-
|
|
4734
|
-
|
|
4735
|
-
|
|
4701
|
+
}
|
|
4702
|
+
|
|
4703
|
+
/** HTK mel scale */
|
|
4704
|
+
function htkMel(freq) {
|
|
4705
|
+
return 1127.0 * Math.log(1.0 + freq / 700.0);
|
|
4706
|
+
}
|
|
4707
|
+
|
|
4708
|
+
function htkMelInverse(mel) {
|
|
4709
|
+
return 700.0 * (Math.exp(mel / 1127.0) - 1.0);
|
|
4710
|
+
}
|
|
4711
|
+
|
|
4712
|
+
/**
|
|
4713
|
+
* Build triangular mel filterbank matrix
|
|
4714
|
+
*/
|
|
4715
|
+
function buildMelFilterbank(numBins, fftSize, sampleRate, lowFreq, highFreq) {
|
|
4716
|
+
var numFftBins = fftSize / 2 + 1;
|
|
4717
|
+
var lowMel = htkMel(lowFreq);
|
|
4718
|
+
var highMel = htkMel(highFreq);
|
|
4719
|
+
|
|
4720
|
+
// numBins + 2 equally spaced points in mel space
|
|
4721
|
+
var melPoints = new Float64Array(numBins + 2);
|
|
4722
|
+
for (var i = 0; i < numBins + 2; i++) {
|
|
4723
|
+
melPoints[i] = lowMel + (highMel - lowMel) * i / (numBins + 1);
|
|
4724
|
+
}
|
|
4725
|
+
|
|
4726
|
+
// Convert mel points to FFT bin indices (float, not rounded)
|
|
4727
|
+
var binFreqs = new Float64Array(numBins + 2);
|
|
4728
|
+
for (var i = 0; i < numBins + 2; i++) {
|
|
4729
|
+
binFreqs[i] = htkMelInverse(melPoints[i]) * fftSize / sampleRate;
|
|
4730
|
+
}
|
|
4731
|
+
|
|
4732
|
+
var filters = [];
|
|
4733
|
+
|
|
4734
|
+
for (var m = 0; m < numBins; m++) {
|
|
4735
|
+
var left = binFreqs[m];
|
|
4736
|
+
var center = binFreqs[m + 1];
|
|
4737
|
+
var right = binFreqs[m + 2];
|
|
4738
|
+
|
|
4739
|
+
var startBin = Math.max(0, Math.ceil(left));
|
|
4740
|
+
var endBin = Math.min(numFftBins - 1, Math.floor(right));
|
|
4741
|
+
|
|
4742
|
+
var weights = new Float32Array(endBin - startBin + 1);
|
|
4743
|
+
for (var k = startBin; k <= endBin; k++) {
|
|
4744
|
+
if (k <= center) {
|
|
4745
|
+
weights[k - startBin] = (center - left) > 0 ? (k - left) / (center - left) : 0;
|
|
4746
|
+
} else {
|
|
4747
|
+
weights[k - startBin] = (right - center) > 0 ? (right - k) / (right - center) : 0;
|
|
4748
|
+
}
|
|
4736
4749
|
}
|
|
4737
|
-
|
|
4738
|
-
|
|
4739
|
-
|
|
4740
|
-
|
|
4741
|
-
|
|
4750
|
+
|
|
4751
|
+
filters.push({ startBin: startBin, weights: weights });
|
|
4752
|
+
}
|
|
4753
|
+
|
|
4754
|
+
return filters;
|
|
4755
|
+
}
|
|
4756
|
+
|
|
4757
|
+
/** Create Hamming window */
|
|
4758
|
+
function createHammingWindow(length) {
|
|
4759
|
+
var w = new Float32Array(length);
|
|
4760
|
+
for (var i = 0; i < length; i++) {
|
|
4761
|
+
w[i] = 0.54 - 0.46 * Math.cos(2 * Math.PI * i / (length - 1));
|
|
4762
|
+
}
|
|
4763
|
+
return w;
|
|
4764
|
+
}
|
|
4765
|
+
|
|
4766
|
+
/**
|
|
4767
|
+
* Compute Kaldi-compatible log mel filterbank features
|
|
4768
|
+
*/
|
|
4769
|
+
function computeKaldiFbank(audio, sampleRate, numMelBins, opts) {
|
|
4770
|
+
var frameLengthMs = (opts && opts.frameLengthMs !== undefined) ? opts.frameLengthMs : 25;
|
|
4771
|
+
var frameShiftMs = (opts && opts.frameShiftMs !== undefined) ? opts.frameShiftMs : 10;
|
|
4772
|
+
var lowFreq = (opts && opts.lowFreq !== undefined) ? opts.lowFreq : 20;
|
|
4773
|
+
var highFreq = (opts && opts.highFreq !== undefined) ? opts.highFreq : (sampleRate / 2);
|
|
4774
|
+
var dither = (opts && opts.dither !== undefined) ? opts.dither : 0;
|
|
4775
|
+
var preemphasis = (opts && opts.preemphasis !== undefined) ? opts.preemphasis : 0.97;
|
|
4776
|
+
|
|
4777
|
+
var frameLengthSamples = Math.round(sampleRate * frameLengthMs / 1000);
|
|
4778
|
+
var frameShiftSamples = Math.round(sampleRate * frameShiftMs / 1000);
|
|
4779
|
+
|
|
4780
|
+
// Kaldi signal scaling: float [-1,1] -> int16 range
|
|
4781
|
+
var scaled = new Float32Array(audio.length);
|
|
4782
|
+
for (var i = 0; i < audio.length; i++) {
|
|
4783
|
+
scaled[i] = audio[i] * 32768;
|
|
4784
|
+
}
|
|
4785
|
+
|
|
4786
|
+
// Optional dithering
|
|
4787
|
+
if (dither > 0) {
|
|
4788
|
+
for (var i = 0; i < scaled.length; i++) {
|
|
4789
|
+
var u1 = Math.random();
|
|
4790
|
+
var u2 = Math.random();
|
|
4791
|
+
scaled[i] += dither * Math.sqrt(-2 * Math.log(u1 + 1e-10)) * Math.cos(2 * Math.PI * u2);
|
|
4792
|
+
}
|
|
4793
|
+
}
|
|
4794
|
+
|
|
4795
|
+
// Number of frames (snip_edges=true: only complete frames)
|
|
4796
|
+
var numFrames = Math.max(0, Math.floor((scaled.length - frameLengthSamples) / frameShiftSamples) + 1);
|
|
4797
|
+
if (numFrames === 0) {
|
|
4798
|
+
return new Float32Array(0);
|
|
4799
|
+
}
|
|
4800
|
+
|
|
4801
|
+
// FFT size: next power of 2
|
|
4802
|
+
var fftSize = 1;
|
|
4803
|
+
while (fftSize < frameLengthSamples) fftSize *= 2;
|
|
4804
|
+
|
|
4805
|
+
var numFftBins = fftSize / 2 + 1;
|
|
4806
|
+
|
|
4807
|
+
// Pre-compute window and filterbank
|
|
4808
|
+
var window = createHammingWindow(frameLengthSamples);
|
|
4809
|
+
var filters = buildMelFilterbank(numMelBins, fftSize, sampleRate, lowFreq, highFreq);
|
|
4810
|
+
|
|
4811
|
+
// Allocate output
|
|
4812
|
+
var output = new Float32Array(numFrames * numMelBins);
|
|
4813
|
+
|
|
4814
|
+
// FFT buffers (reused per frame)
|
|
4815
|
+
var fftRe = new Float64Array(fftSize);
|
|
4816
|
+
var fftIm = new Float64Array(fftSize);
|
|
4817
|
+
|
|
4818
|
+
for (var f = 0; f < numFrames; f++) {
|
|
4819
|
+
var offset = f * frameShiftSamples;
|
|
4820
|
+
|
|
4821
|
+
// Clear FFT buffers
|
|
4822
|
+
fftRe.fill(0);
|
|
4823
|
+
fftIm.fill(0);
|
|
4824
|
+
|
|
4825
|
+
// Extract frame with preemphasis and windowing
|
|
4826
|
+
for (var i = 0; i < frameLengthSamples; i++) {
|
|
4827
|
+
var sample = scaled[offset + i];
|
|
4828
|
+
// Preemphasis: y[n] = x[n] - coeff * x[n-1]
|
|
4829
|
+
if (preemphasis > 0 && i > 0) {
|
|
4830
|
+
sample -= preemphasis * scaled[offset + i - 1];
|
|
4831
|
+
} else if (preemphasis > 0 && i === 0 && offset > 0) {
|
|
4832
|
+
sample -= preemphasis * scaled[offset - 1];
|
|
4833
|
+
}
|
|
4834
|
+
// Apply window
|
|
4835
|
+
fftRe[i] = sample * window[i];
|
|
4836
|
+
}
|
|
4837
|
+
|
|
4838
|
+
// FFT
|
|
4839
|
+
fft(fftRe, fftIm);
|
|
4840
|
+
|
|
4841
|
+
// Power spectrum -> mel filterbank -> log
|
|
4842
|
+
var outOffset = f * numMelBins;
|
|
4843
|
+
for (var m = 0; m < numMelBins; m++) {
|
|
4844
|
+
var filter = filters[m];
|
|
4845
|
+
var energy = 0;
|
|
4846
|
+
for (var k = 0; k < filter.weights.length; k++) {
|
|
4847
|
+
var bin = filter.startBin + k;
|
|
4848
|
+
if (bin < numFftBins) {
|
|
4849
|
+
var powerSpec = fftRe[bin] * fftRe[bin] + fftIm[bin] * fftIm[bin];
|
|
4850
|
+
energy += filter.weights[k] * powerSpec;
|
|
4851
|
+
}
|
|
4852
|
+
}
|
|
4853
|
+
output[outOffset + m] = Math.log(Math.max(energy, 1e-10));
|
|
4854
|
+
}
|
|
4855
|
+
}
|
|
4856
|
+
|
|
4857
|
+
return output;
|
|
4858
|
+
}
|
|
4859
|
+
|
|
4860
|
+
/**
|
|
4861
|
+
* Apply Low Frame Rate stacking for SenseVoice
|
|
4862
|
+
*/
|
|
4863
|
+
function applyLFR(features, featureDim, lfrM, lfrN) {
|
|
4864
|
+
var numFrames = features.length / featureDim;
|
|
4865
|
+
if (numFrames === 0) return new Float32Array(0);
|
|
4866
|
+
|
|
4867
|
+
var leftPad = Math.floor((lfrM - 1) / 2); // 3 for lfrM=7
|
|
4868
|
+
var paddedLen = numFrames + leftPad;
|
|
4869
|
+
var numOutputFrames = Math.ceil(paddedLen / lfrN);
|
|
4870
|
+
var outputDim = featureDim * lfrM;
|
|
4871
|
+
|
|
4872
|
+
var output = new Float32Array(numOutputFrames * outputDim);
|
|
4873
|
+
|
|
4874
|
+
for (var i = 0; i < numOutputFrames; i++) {
|
|
4875
|
+
var startFrame = i * lfrN - leftPad;
|
|
4876
|
+
|
|
4877
|
+
for (var j = 0; j < lfrM; j++) {
|
|
4878
|
+
var srcFrame = startFrame + j;
|
|
4879
|
+
// Clamp to valid range
|
|
4880
|
+
if (srcFrame < 0) srcFrame = 0;
|
|
4881
|
+
if (srcFrame >= numFrames) srcFrame = numFrames - 1;
|
|
4882
|
+
|
|
4883
|
+
var srcOffset = srcFrame * featureDim;
|
|
4884
|
+
var dstOffset = i * outputDim + j * featureDim;
|
|
4885
|
+
for (var k = 0; k < featureDim; k++) {
|
|
4886
|
+
output[dstOffset + k] = features[srcOffset + k];
|
|
4887
|
+
}
|
|
4888
|
+
}
|
|
4889
|
+
}
|
|
4890
|
+
|
|
4891
|
+
return output;
|
|
4892
|
+
}
|
|
4893
|
+
|
|
4894
|
+
/**
|
|
4895
|
+
* Apply CMVN normalization in-place
|
|
4896
|
+
*/
|
|
4897
|
+
function applyCMVN(features, dim, negMeanVec, invStddevVec) {
|
|
4898
|
+
for (var i = 0; i < features.length; i++) {
|
|
4899
|
+
var d = i % dim;
|
|
4900
|
+
features[i] = (features[i] + negMeanVec[d]) * invStddevVec[d];
|
|
4901
|
+
}
|
|
4902
|
+
return features;
|
|
4903
|
+
}
|
|
4904
|
+
|
|
4905
|
+
/**
|
|
4906
|
+
* Parse CMVN vectors from comma-separated strings (stored in ONNX metadata)
|
|
4907
|
+
*/
|
|
4908
|
+
function parseCMVNFromMetadata(negMeanStr, invStddevStr) {
|
|
4909
|
+
var negMeanArr = new Float32Array(
|
|
4910
|
+
negMeanStr.split(',').map(function(s) { return parseFloat(s.trim()); })
|
|
4911
|
+
);
|
|
4912
|
+
var invStddevArr = new Float32Array(
|
|
4913
|
+
invStddevStr.split(',').map(function(s) { return parseFloat(s.trim()); })
|
|
4914
|
+
);
|
|
4915
|
+
return { negMean: negMeanArr, invStddev: invStddevArr };
|
|
4916
|
+
}
|
|
4917
|
+
|
|
4918
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
4919
|
+
// ctcDecoder.ts \u2014 inlined as plain JavaScript
|
|
4920
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
4921
|
+
|
|
4922
|
+
/** SenseVoice language ID -> string mapping */
// These IDs are the model's special-token indices; the inverse mapping used
// for requests lives in resolveLanguageId below (which deliberately omits
// 'nospeech' -- callers cannot request it, only the model emits it).
var LANGUAGE_IDS = {
  0: 'auto',
  3: 'zh',
  4: 'en',
  7: 'yue',
  11: 'ja',
  12: 'ko',
  13: 'nospeech'
};

/** SenseVoice text normalization ID -> string mapping */
// 'with_itn' enables inverse text normalization (digits, punctuation);
// resolveTextNormId maps the string form back to these IDs.
var TEXT_NORM_IDS = {
  14: 'with_itn',
  15: 'without_itn'
};
|
|
4938
|
+
|
|
4939
|
+
/** Resolve language string to SenseVoice language ID (unknown values fall back to 0 = auto) */
function resolveLanguageId(language) {
  var ids = {
    auto: 0,
    zh: 3,
    en: 4,
    yue: 7,
    ja: 11,
    ko: 12
  };
  var id = ids[language];
  return id === undefined ? 0 : id;
}
|
|
4951
|
+
|
|
4952
|
+
/** Resolve text norm string to SenseVoice text norm ID (anything but 'without_itn' means 14 = with_itn) */
function resolveTextNormId(textNorm) {
  if (textNorm === 'without_itn') {
    return 15;
  }
  return 14;
}
|
|
4956
|
+
|
|
4957
|
+
/**
 * Parse tokens.txt into a token ID -> string map.
 * Each non-empty line is "<token> <id>"; the token itself may contain
 * spaces, so the numeric ID is taken from after the LAST space.
 * Lines without a space or with a non-numeric ID are skipped.
 */
function parseTokensFile(content) {
  var map = new Map();
  var lines = content.split('\\n');
  for (var i = 0; i < lines.length; i++) {
    var line = lines[i].trim();
    if (!line) continue;
    var sep = line.lastIndexOf(' ');
    if (sep === -1) continue;
    var id = parseInt(line.substring(sep + 1), 10);
    if (isNaN(id)) continue;
    map.set(id, line.substring(0, sep));
  }
  return map;
}
|
|
4977
|
+
|
|
4978
|
+
/**
 * SenseVoice structured token pattern matching.
 * Recognizes the model's special markers and classifies them as
 * language, emotion, audio event, or text-normalization flags.
 * Returns { type, value } for a recognized marker, null for plain text.
 */
function parseStructuredToken(token) {
  var m = token.match(/^<\\|(.+)\\|>$/);
  if (!m) return null;

  var value = m[1];

  var languages = ['zh', 'en', 'ja', 'ko', 'yue', 'nospeech'];
  if (languages.indexOf(value) !== -1) {
    return { type: 'language', value: value };
  }

  var emotions = ['HAPPY', 'SAD', 'ANGRY', 'NEUTRAL', 'FEARFUL', 'DISGUSTED', 'SURPRISED', 'EMO_UNKNOWN'];
  if (emotions.indexOf(value) !== -1) {
    return { type: 'emotion', value: value };
  }

  var events = ['Speech', 'BGM', 'Applause', 'Laughter', 'Crying', 'Coughing', 'Sneezing', 'EVENT_UNKNOWN'];
  if (events.indexOf(value) !== -1) {
    return { type: 'event', value: value };
  }

  var norms = ['withitn', 'woitn', 'with_itn', 'without_itn'];
  if (norms.indexOf(value) !== -1) {
    return { type: 'textnorm', value: value };
  }

  return null;
}
|
|
5011
|
+
|
|
5012
|
+
/**
 * CTC greedy decode.
 * Takes the argmax token per time step, collapses consecutive repeats,
 * drops the CTC blank (0) and the <s>/</s> specials (1, 2), then maps the
 * surviving IDs to token strings. SenseVoice structured tokens are peeled
 * off into language/emotion/event fields; textnorm markers are discarded.
 */
function ctcGreedyDecode(logits, seqLen, vocabSz, tokenMapLocal) {
  // Argmax per frame with on-the-fly collapse of consecutive duplicates
  var collapsed = [];
  var prevId = -1;
  for (var t = 0; t < seqLen; t++) {
    var base = t * vocabSz;
    var best = 0;
    var bestVal = logits[base];
    for (var v = 1; v < vocabSz; v++) {
      var val = logits[base + v];
      if (val > bestVal) {
        bestVal = val;
        best = v;
      }
    }
    if (best !== prevId) {
      collapsed.push(best);
      prevId = best;
    }
  }

  // Convert IDs to strings, skipping blank/special IDs and unknown IDs
  var language = undefined;
  var emotion = undefined;
  var event = undefined;
  var textTokens = [];
  for (var k = 0; k < collapsed.length; k++) {
    var id = collapsed[k];
    if (id === 0 || id === 1 || id === 2) continue;
    var token = tokenMapLocal.get(id);
    if (!token) continue;
    var structured = parseStructuredToken(token);
    if (!structured) {
      textTokens.push(token);
    } else if (structured.type === 'language') {
      language = structured.value;
    } else if (structured.type === 'emotion') {
      emotion = structured.value;
    } else if (structured.type === 'event') {
      event = structured.value;
    }
    // textnorm markers are intentionally dropped
  }

  // Join tokens and turn SentencePiece word boundaries (U+2581) into spaces
  var text = textTokens.join('').replace(/\\u2581/g, ' ').trim();

  return { text: text, language: language, emotion: emotion, event: event };
}
|
|
5074
|
+
|
|
5075
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
5076
|
+
// Worker globals and message handler
|
|
5077
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
5078
|
+
|
|
5079
|
+
/**
 * Load ONNX Runtime from CDN.
 * Fetches the ORT script text, wraps it in a blob URL and pulls it in with
 * importScripts, then configures the WASM backend for single-threaded,
 * SIMD-enabled, non-proxied operation inside this worker.
 * Idempotent: a second call is a no-op once 'ort' is set.
 * NOTE(review): response.ok is not checked here; a 404 would surface as an
 * importScripts parse error rather than a clear fetch failure.
 */
async function loadOrt(wasmPaths) {
  if (ort) return;

  // Import ONNX Runtime from CDN
  var ortUrl = wasmPaths + 'ort.wasm.min.js';

  // Load the script by fetching and executing it
  // (fetch + blob rather than importScripts(ortUrl) directly -- presumably
  // to sidestep cross-origin importScripts restrictions; verify if changing)
  var response = await fetch(ortUrl);
  var scriptText = await response.text();

  // Create a blob URL for the script
  var blob = new Blob([scriptText], { type: 'application/javascript' });
  var blobUrl = URL.createObjectURL(blob);

  // Import the module (importScripts executes synchronously)
  importScripts(blobUrl);
  URL.revokeObjectURL(blobUrl);

  // ort is now available as global
  ort = self.ort;

  // Configure WASM settings
  ort.env.wasm.wasmPaths = wasmPaths;
  ort.env.wasm.numThreads = 1; // Single thread in worker
  ort.env.wasm.simd = true;
  ort.env.wasm.proxy = false; // No proxy in worker
}
|
|
5109
|
+
|
|
5110
|
+
/**
 * Load the SenseVoice model and tokens.
 * Populates the worker globals: tokenMap, languageId, textNormId, session,
 * negMean/invStddev (if CMVN metadata is present) and vocabSize.
 * @param modelUrl     absolute URL of the ONNX model
 * @param tokensUrl    absolute URL of tokens.txt
 * @param isIOSDevice  true to let ORT fetch the model itself (memory-saving path)
 * @param lang         numeric SenseVoice language ID
 * @param textNorm     numeric SenseVoice text-norm ID
 * @returns { vocabSize, inputNames, outputNames }
 * @throws on failed tokens/model fetch or session creation
 */
async function loadModel(modelUrl, tokensUrl, isIOSDevice, lang, textNorm) {
  // 1. Fetch and parse tokens.txt
  var tokensResponse = await fetch(tokensUrl);
  if (!tokensResponse.ok) {
    throw new Error('Failed to fetch tokens.txt: ' + tokensResponse.status + ' ' + tokensResponse.statusText);
  }
  var tokensText = await tokensResponse.text();
  tokenMap = parseTokensFile(tokensText);

  // 2. Store language/textNorm IDs
  languageId = lang;
  textNormId = textNorm;

  // 3. Create inference session
  var sessionOptions = {
    executionProviders: ['wasm'],
    graphOptimizationLevel: 'all',
  };

  if (isIOSDevice) {
    // iOS: pass URL string directly to ORT to avoid 239MB JS heap allocation
    // ORT fetches into WASM memory, keeping JS heap at ~2MB
    session = await ort.InferenceSession.create(modelUrl, sessionOptions);
  } else {
    // Desktop: fetch ArrayBuffer for potential caching
    var modelResponse = await fetch(modelUrl);
    if (!modelResponse.ok) {
      throw new Error('Failed to fetch model: ' + modelResponse.status + ' ' + modelResponse.statusText);
    }
    var modelBuffer = await modelResponse.arrayBuffer();
    var modelData = new Uint8Array(modelBuffer);
    session = await ort.InferenceSession.create(modelData, sessionOptions);
  }

  // 4. Try to read CMVN from model metadata
  // NOTE(review): session.handler.metadata is an ORT-internal, undocumented
  // property -- may break across onnxruntime-web versions; hence the try/catch.
  try {
    var metadata = session.handler && session.handler.metadata;
    if (metadata && metadata.neg_mean && metadata.inv_stddev) {
      var cmvn = parseCMVNFromMetadata(metadata.neg_mean, metadata.inv_stddev);
      negMean = cmvn.negMean;
      invStddev = cmvn.invStddev;
    }
  } catch (cmvnErr) {
    // CMVN not available -- features will not be normalized
  }

  // 5. Determine vocab size from tokenMap (max token ID + 1)
  vocabSize = 0;
  tokenMap.forEach(function(val, key) {
    if (key >= vocabSize) vocabSize = key + 1;
  });

  return {
    vocabSize: vocabSize,
    inputNames: session.inputNames.slice(),
    outputNames: session.outputNames.slice(),
  };
}
|
|
5171
|
+
|
|
5172
|
+
/**
 * Run transcription on audio samples.
 * Pipeline: Kaldi fbank (80 mel bins, 16kHz) -> LFR stacking (window 7,
 * shift 6 -> 560-dim frames) -> CMVN (if loaded) -> ONNX inference ->
 * CTC greedy decode.
 * NOTE: the returned inferenceTimeMs is measured from preprocessStart, so it
 * is total wall time INCLUDING preprocessing, not pure model time.
 */
async function runTranscription(audio) {
  var preprocessStart = performance.now();

  // 1. Compute Kaldi fbank features [T, 80]
  var fbank = computeKaldiFbank(audio, 16000, 80);
  var numFrames = fbank.length / 80;

  if (numFrames === 0) {
    // Not enough audio for a single frame: return an empty result
    return {
      text: '',
      language: undefined,
      emotion: undefined,
      event: undefined,
      inferenceTimeMs: performance.now() - preprocessStart,
      preprocessTimeMs: performance.now() - preprocessStart,
    };
  }

  // 2. Apply LFR stacking [T_reduced, 560]
  var lfrFeatures = applyLFR(fbank, 80, 7, 6);
  var numLfrFrames = lfrFeatures.length / 560;

  // 3. Apply CMVN normalization (in-place); skipped when the model
  // metadata did not provide mean/stddev vectors
  if (negMean && invStddev) {
    applyCMVN(lfrFeatures, 560, negMean, invStddev);
  }

  var preprocessTimeMs = performance.now() - preprocessStart;

  // 4. Build ORT tensors (input names match the SenseVoice ONNX export)
  var feeds = {
    x: new ort.Tensor('float32', lfrFeatures, [1, numLfrFrames, 560]),
    x_length: new ort.Tensor('int32', new Int32Array([numLfrFrames]), [1]),
    language: new ort.Tensor('int32', new Int32Array([languageId]), [1]),
    text_norm: new ort.Tensor('int32', new Int32Array([textNormId]), [1]),
  };

  // 5. Run inference
  var results = await session.run(feeds);

  var logitsOutput = results['logits'];
  if (!logitsOutput) {
    throw new Error('Model output missing "logits" tensor');
  }

  // logits dims are [batch, seqLen, vocab]; batch is always 1 here
  var logitsData = logitsOutput.data;
  var logitsDims = logitsOutput.dims;
  var seqLen = logitsDims[1];
  var modelVocabSize = logitsDims[2];

  // 6. CTC decode
  var decoded = ctcGreedyDecode(logitsData, seqLen, modelVocabSize, tokenMap);

  var totalTimeMs = performance.now() - preprocessStart;

  return {
    text: decoded.text,
    language: decoded.language,
    emotion: decoded.emotion,
    event: decoded.event,
    inferenceTimeMs: totalTimeMs,
    preprocessTimeMs: preprocessTimeMs,
  };
}
|
|
5239
|
+
|
|
5240
|
+
// Message handler: dispatches 'load' / 'transcribe' / 'dispose' requests
// from the host. Every path replies with exactly one postMessage; failures
// are converted to a single { type: 'error' } message.
self.onmessage = async function(e) {
  var msg = e.data;

  try {
    switch (msg.type) {
      case 'load': {
        var startTime = performance.now();
        await loadOrt(msg.wasmPaths);
        var info = await loadModel(msg.modelUrl, msg.tokensUrl, msg.isIOS, msg.language, msg.textNorm);
        var loadTimeMs = performance.now() - startTime;

        self.postMessage({
          type: 'loaded',
          vocabSize: info.vocabSize,
          inputNames: info.inputNames,
          outputNames: info.outputNames,
          loadTimeMs: loadTimeMs,
        });
        break;
      }

      case 'transcribe': {
        var result = await runTranscription(msg.audio);

        self.postMessage({
          type: 'result',
          text: result.text,
          language: result.language,
          emotion: result.emotion,
          event: result.event,
          inferenceTimeMs: result.inferenceTimeMs,
          preprocessTimeMs: result.preprocessTimeMs,
        });
        break;
      }

      case 'dispose': {
        // Release the ORT session and drop all cached state so the worker
        // can be garbage-collected cleanly after termination
        if (session) {
          await session.release();
          session = null;
        }
        ort = null;
        tokenMap = null;
        negMean = null;
        invStddev = null;
        self.postMessage({ type: 'disposed' });
        break;
      }

      default:
        self.postMessage({
          type: 'error',
          error: 'Unknown message type: ' + msg.type,
        });
    }
  } catch (err) {
    var errorMsg = err.message || String(err);
    // Handle raw C++ exception pointers from ORT WASM
    if (typeof err === 'number') {
      errorMsg = 'Raw C++ exception pointer (0x' + err.toString(16) + '). Likely OOM in WASM.';
    }
    self.postMessage({
      type: 'error',
      error: errorMsg,
    });
  }
};
|
|
5308
|
+
|
|
5309
|
+
// Error handler: forwards uncaught worker script errors to the host as a
// regular 'error' message so the host's pending resolver can reject.
self.onerror = function(err) {
  self.postMessage({
    type: 'error',
    error: 'Worker error: ' + (err.message || String(err)),
  });
};
|
|
5316
|
+
`;
|
|
5317
|
+
/**
 * SenseVoiceWorker — host-side wrapper that runs the inline SenseVoice ASR
 * worker script (WORKER_SCRIPT) in a dedicated Web Worker.
 * Responsibilities: worker lifecycle (load/dispose), request/response
 * correlation by message TYPE (not request ID — safe only because
 * queueInference serializes all inference calls), per-operation timeouts,
 * and telemetry. After any timeout the instance is 'poisoned' and refuses
 * further inference until recreated.
 */
var SenseVoiceWorker = class {
  constructor(config) {
    this.worker = null;
    this.isLoading = false;
    this._isLoaded = false;
    // Inference queue for serialization
    this.inferenceQueue = Promise.resolve();
    // Session health: set to true if worker operation times out
    this.poisoned = false;
    // Pending message handlers
    this.pendingResolvers = /* @__PURE__ */ new Map();
    // tokens.txt defaults to living next to the model file
    const modelDir = config.modelUrl.substring(0, config.modelUrl.lastIndexOf("/"));
    const tokensUrl = config.tokensUrl ?? `${modelDir}/tokens.txt`;
    this.config = {
      modelUrl: config.modelUrl,
      tokensUrl,
      language: config.language ?? "auto",
      textNorm: config.textNorm ?? "with_itn"
    };
    this.languageId = resolveLanguageId(this.config.language);
    this.textNormId = resolveTextNormId(this.config.textNorm);
  }
  get isLoaded() {
    return this._isLoaded;
  }
  /**
   * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
   */
  get backend() {
    return this._isLoaded ? "wasm" : null;
  }
  /**
   * Create the worker from inline script
   */
  createWorker() {
    const blob = new Blob([WORKER_SCRIPT], { type: "application/javascript" });
    const blobUrl = URL.createObjectURL(blob);
    const worker = new Worker(blobUrl);
    // Revoked immediately after construction — the Worker holds its own
    // reference to the script once created
    URL.revokeObjectURL(blobUrl);
    worker.onmessage = (event) => {
      this.handleWorkerMessage(event.data);
    };
    worker.onerror = (error) => {
      // Fatal worker-level error: fail every in-flight request
      logger5.error("Worker error", { error: error.message });
      for (const [, resolver] of this.pendingResolvers) {
        resolver.reject(new Error(`Worker error: ${error.message}`));
      }
      this.pendingResolvers.clear();
    };
    return worker;
  }
  /**
   * Handle messages from worker
   * Resolvers are keyed by message type; an 'error' message rejects via the
   * dedicated 'error' resolver registered in sendMessage.
   */
  handleWorkerMessage(result) {
    const resolver = this.pendingResolvers.get(result.type);
    if (resolver) {
      this.pendingResolvers.delete(result.type);
      if (result.type === "error") {
        resolver.reject(new Error(result.error));
      } else {
        resolver.resolve(result);
      }
    }
  }
  /**
   * Send message to worker and wait for response
   * Registers two resolvers: one for the expected success type and one for
   * 'error'; either path clears the shared timeout. A timeout marks the
   * instance as poisoned (the worker is assumed wedged).
   */
  sendMessage(message, expectedType, timeoutMs) {
    return new Promise((resolve, reject) => {
      if (!this.worker) {
        reject(new Error("Worker not initialized"));
        return;
      }
      const timeoutId = setTimeout(() => {
        this.pendingResolvers.delete(expectedType);
        this.poisoned = true;
        reject(new Error(`Worker operation timed out after ${timeoutMs}ms`));
      }, timeoutMs);
      this.pendingResolvers.set(expectedType, {
        resolve: (value) => {
          clearTimeout(timeoutId);
          resolve(value);
        },
        reject: (error) => {
          clearTimeout(timeoutId);
          reject(error);
        }
      });
      this.pendingResolvers.set("error", {
        resolve: () => {
        },
        // Never called for errors
        reject: (error) => {
          clearTimeout(timeoutId);
          this.pendingResolvers.delete(expectedType);
          reject(error);
        }
      });
      this.worker.postMessage(message);
    });
  }
  /**
   * Load the ONNX model in the worker
   *
   * @param onProgress - Optional progress callback. Fires once at 100% when load completes
   * (the worker downloads and loads the model internally, so granular progress is not available).
   */
  async load(onProgress) {
    if (this.isLoading) {
      throw new Error("Model is already loading");
    }
    if (this._isLoaded) {
      throw new Error("Model already loaded. Call dispose() first.");
    }
    this.isLoading = true;
    const startTime = performance.now();
    const telemetry = getTelemetry();
    const span = telemetry?.startSpan("SenseVoiceWorker.load", {
      "model.url": this.config.modelUrl,
      "model.language": this.config.language
    });
    try {
      logger5.info("Creating SenseVoice worker...");
      this.worker = this.createWorker();
      logger5.info("Loading model in worker...", {
        modelUrl: this.config.modelUrl,
        tokensUrl: this.config.tokensUrl,
        language: this.config.language,
        textNorm: this.config.textNorm
      });
      const result = await this.sendMessage(
        {
          type: "load",
          modelUrl: resolveUrl(this.config.modelUrl),
          tokensUrl: resolveUrl(this.config.tokensUrl),
          wasmPaths: WASM_CDN_PATH2,
          isIOS: isIOS(),
          language: this.languageId,
          textNorm: this.textNormId
        },
        "loaded",
        LOAD_TIMEOUT_MS
      );
      this._isLoaded = true;
      const loadTimeMs = performance.now() - startTime;
      onProgress?.(1, 1);
      logger5.info("SenseVoice worker loaded successfully", {
        backend: "wasm",
        loadTimeMs: Math.round(loadTimeMs),
        workerLoadTimeMs: Math.round(result.loadTimeMs),
        vocabSize: result.vocabSize,
        language: this.config.language,
        textNorm: this.config.textNorm
      });
      span?.setAttributes({
        "model.backend": "wasm",
        "model.load_time_ms": loadTimeMs,
        "model.worker_load_time_ms": result.loadTimeMs,
        "model.vocab_size": result.vocabSize
      });
      span?.end();
      telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
        model: "sensevoice-worker",
        backend: "wasm"
      });
      return {
        backend: "wasm",
        loadTimeMs,
        inputNames: result.inputNames,
        outputNames: result.outputNames,
        vocabSize: result.vocabSize
      };
    } catch (error) {
      span?.endWithError(error instanceof Error ? error : new Error(String(error)));
      telemetry?.incrementCounter("omote.errors.total", 1, {
        model: "sensevoice-worker",
        error_type: "load_failed"
      });
      // Tear the worker down on a failed load so a retry starts fresh
      if (this.worker) {
        this.worker.terminate();
        this.worker = null;
      }
      throw error;
    } finally {
      this.isLoading = false;
    }
  }
  /**
   * Transcribe audio samples to text
   *
   * @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
   * @returns Transcription result with text, emotion, language, and event
   */
  async transcribe(audioSamples) {
    if (!this._isLoaded || !this.worker) {
      throw new Error("Worker not loaded. Call load() first.");
    }
    if (this.poisoned) {
      throw new Error("SenseVoice worker timed out \u2014 inference unavailable until page reload");
    }
    // Copy so the caller's buffer can be reused while the message is in flight
    const audio = new Float32Array(audioSamples);
    return this.queueInference(audio);
  }
  /**
   * Queue inference to serialize worker calls
   * Chains onto inferenceQueue so at most one transcribe message is in
   * flight — this is what makes type-keyed resolver correlation safe.
   */
  queueInference(audio) {
    return new Promise((resolve, reject) => {
      this.inferenceQueue = this.inferenceQueue.then(async () => {
        const telemetry = getTelemetry();
        const span = telemetry?.startSpan("SenseVoiceWorker.transcribe", {
          "inference.backend": "wasm",
          "inference.input_samples": audio.length
        });
        try {
          const startTime = performance.now();
          const result = await this.sendMessage(
            {
              type: "transcribe",
              audio
            },
            "result",
            INFERENCE_TIMEOUT_MS
          );
          const totalTimeMs = performance.now() - startTime;
          logger5.trace("Worker transcription complete", {
            text: result.text.substring(0, 50),
            language: result.language,
            emotion: result.emotion,
            event: result.event,
            preprocessTimeMs: Math.round(result.preprocessTimeMs * 100) / 100,
            inferenceTimeMs: Math.round(result.inferenceTimeMs * 100) / 100,
            roundTripMs: Math.round(totalTimeMs * 100) / 100
          });
          span?.setAttributes({
            "inference.duration_ms": totalTimeMs,
            "inference.worker_duration_ms": result.inferenceTimeMs,
            "inference.preprocess_ms": result.preprocessTimeMs,
            "inference.text_length": result.text.length
          });
          span?.end();
          telemetry?.recordHistogram("omote.inference.latency", totalTimeMs, {
            model: "sensevoice-worker",
            backend: "wasm"
          });
          telemetry?.incrementCounter("omote.inference.total", 1, {
            model: "sensevoice-worker",
            backend: "wasm",
            status: "success"
          });
          resolve({
            text: result.text,
            language: result.language,
            emotion: result.emotion,
            event: result.event,
            inferenceTimeMs: result.inferenceTimeMs,
            preprocessTimeMs: result.preprocessTimeMs
          });
        } catch (err) {
          const errMsg = err instanceof Error ? err.message : String(err);
          if (errMsg.includes("timed out")) {
            logger5.error("CRITICAL: Worker inference timed out \u2014 SenseVoice worker is dead. Page reload required.", {
              timeoutMs: INFERENCE_TIMEOUT_MS
            });
          } else {
            logger5.error("Worker inference failed", { error: errMsg });
          }
          span?.endWithError(err instanceof Error ? err : new Error(String(err)));
          telemetry?.incrementCounter("omote.inference.total", 1, {
            model: "sensevoice-worker",
            backend: "wasm",
            status: "error"
          });
          reject(err);
        }
      });
    });
  }
  /**
   * Dispose of the worker and free resources
   * Best-effort 'dispose' handshake (errors swallowed deliberately), then
   * a hard terminate; also resets the poisoned flag.
   */
  async dispose() {
    if (this.worker) {
      try {
        await this.sendMessage({ type: "dispose" }, "disposed", INFERENCE_TIMEOUT_MS);
      } catch {
      }
      this.worker.terminate();
      this.worker = null;
    }
    this._isLoaded = false;
    this.poisoned = false;
    this.pendingResolvers.clear();
  }
  /**
   * Check if Web Workers are supported
   */
  static isSupported() {
    return typeof Worker !== "undefined";
  }
};
|
|
5619
|
+
|
|
5620
|
+
// src/inference/UnifiedInferenceWorker.ts
|
|
5621
|
+
// Module logger and operation budgets for the unified inference worker.
var logger6 = createLogger("UnifiedInferenceWorker");
// ONNX Runtime Web distribution the worker loads its WASM backend from
var WASM_CDN_PATH3 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
// Per-operation timeouts in milliseconds (names indicate the operation:
// SV = SenseVoice ASR, CPU = Wav2ArkitCpu, VAD = Silero VAD)
var INIT_TIMEOUT_MS = 15e3;       // 15s: worker bootstrap / ORT init
var SV_LOAD_TIMEOUT_MS = 3e4;     // 30s: SenseVoice model load
var SV_INFER_TIMEOUT_MS = 1e4;    // 10s: SenseVoice transcription
var CPU_LOAD_TIMEOUT_MS = 6e4;    // 60s: Wav2ArkitCpu model load
var CPU_INFER_TIMEOUT_MS = 5e3;   // 5s: Wav2ArkitCpu inference
var VAD_LOAD_TIMEOUT_MS = 1e4;    // 10s: Silero VAD model load
var VAD_INFER_TIMEOUT_MS = 1e3;   // 1s: VAD inference (runs per audio chunk)
var DISPOSE_TIMEOUT_MS = 5e3;     // 5s: graceful dispose handshake
|
|
5631
|
+
/**
 * Resolve a possibly-relative URL to an absolute one.
 * Absolute http(s)/blob URLs pass through unchanged; relative paths are
 * resolved against the page origin (with a dummy base outside browsers).
 * Any resolution failure falls back to returning the input unchanged.
 */
function resolveUrl2(url) {
  var isAbsolute = /^https?:\/\//i.test(url) || /^blob:/i.test(url);
  if (isAbsolute) {
    return url;
  }
  try {
    var base = globalThis.location?.origin ?? "https://localhost";
    return new URL(url, base).href;
  } catch {
    return url;
  }
}
|
|
5639
|
+
var requestCounter = 0;
/** Generate a unique request ID: monotonically increasing counter plus timestamp. */
function nextRequestId() {
  requestCounter += 1;
  return "req_" + requestCounter + "_" + Date.now();
}
|
|
5643
|
+
// Inline script for the unified worker (continues below; the template
// literal is closed after the full script body).
var WORKER_SCRIPT2 = `
// Unified Inference Worker Script
// Hosts SenseVoice + Wav2ArkitCpu + Silero VAD in a single ORT instance

// Shared ONNX Runtime handle (set once by the worker's init path)
var ort = null;

// SenseVoice state
var svSession = null;
var svTokenMap = null;      // token ID -> string map from tokens.txt
var svNegMean = null;       // CMVN negated-mean vector (null = no normalization)
var svInvStddev = null;     // CMVN inverse-stddev vector
var svLanguageId = 0;       // 0 = auto (see LANGUAGE_IDS)
var svTextNormId = 14;      // 14 = with_itn (see TEXT_NORM_IDS)
var svVocabSize = 0;

// Wav2ArkitCpu state
var cpuSession = null;

// Silero VAD state
var vadSession = null;
var vadSampleRate = 16000;
var vadChunkSize = 512;     // samples per VAD chunk
var vadContextSize = 64;    // samples of left context carried between chunks
|
|
5666
|
+
|
|
5667
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
5668
|
+
// kaldiFbank.ts \u2014 inlined as plain JavaScript
|
|
5669
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
5670
|
+
|
|
5671
|
+
// In-place iterative radix-2 Cooley-Tukey FFT (decimation in time).
// re/im hold the real and imaginary parts and are overwritten with the
// transform. Assumes re.length is a power of two (true for the FFT sizes
// used by the fbank pipeline) -- results are undefined otherwise.
function fft(re, im) {
  var n = re.length;
  // Bit-reversal permutation: reorder inputs so butterflies read contiguously
  for (var i = 1, j = 0; i < n; i++) {
    var bit = n >> 1;
    while (j & bit) { j ^= bit; bit >>= 1; }
    j ^= bit;
    if (i < j) {
      var tmp = re[i]; re[i] = re[j]; re[j] = tmp;
      tmp = im[i]; im[i] = im[j]; im[j] = tmp;
    }
  }
  // Butterfly stages: sub-transform length doubles each pass
  for (var len = 2; len <= n; len *= 2) {
    var halfLen = len / 2;
    var angle = -2 * Math.PI / len;
    // Unit twiddle rotation for this stage
    var wRe = Math.cos(angle);
    var wIm = Math.sin(angle);
    for (var i = 0; i < n; i += len) {
      // Current twiddle factor, starts at 1 + 0i
      var curRe = 1, curIm = 0;
      for (var j = 0; j < halfLen; j++) {
        var a = i + j, b = a + halfLen;
        // t = twiddle * x[b]; then x[b] = x[a] - t, x[a] = x[a] + t
        var tRe = curRe * re[b] - curIm * im[b];
        var tIm = curRe * im[b] + curIm * re[b];
        re[b] = re[a] - tRe; im[b] = im[a] - tIm;
        re[a] += tRe; im[a] += tIm;
        // Advance the twiddle factor by one rotation step
        var nextRe = curRe * wRe - curIm * wIm;
        curIm = curRe * wIm + curIm * wRe;
        curRe = nextRe;
      }
    }
  }
}
|
|
5702
|
+
|
|
5703
|
+
// HTK-style mel scale: mel = 1127 * ln(1 + f/700), and its inverse.
function htkMel(freq) {
  return 1127.0 * Math.log(1.0 + freq / 700.0);
}
function htkMelInverse(mel) {
  return 700.0 * (Math.exp(mel / 1127.0) - 1.0);
}
|
|
5705
|
+
|
|
5706
|
+
// Build a triangular mel filterbank on the HTK mel scale.
// Returns an array of numBins filters, each { startBin, weights } where
// weights apply to FFT bins startBin..startBin+weights.length-1. Filter
// edges are equally spaced in mel between lowFreq and highFreq and kept
// as FRACTIONAL bin positions (no snapping), matching Kaldi's behavior.
function buildMelFilterbank(numBins, fftSize, sampleRate, lowFreq, highFreq) {
  var numFftBins = fftSize / 2 + 1;
  var lowMel = htkMel(lowFreq);
  var highMel = htkMel(highFreq);
  // numBins triangles need numBins + 2 edge points on the mel axis
  var melPoints = new Float64Array(numBins + 2);
  for (var i = 0; i < numBins + 2; i++) {
    melPoints[i] = lowMel + (highMel - lowMel) * i / (numBins + 1);
  }
  // Convert each mel edge back to a (fractional) FFT bin position
  var binFreqs = new Float64Array(numBins + 2);
  for (var i = 0; i < numBins + 2; i++) {
    binFreqs[i] = htkMelInverse(melPoints[i]) * fftSize / sampleRate;
  }
  var filters = [];
  for (var m = 0; m < numBins; m++) {
    var left = binFreqs[m], center = binFreqs[m + 1], right = binFreqs[m + 2];
    // Only integer FFT bins strictly inside [left, right] get weights,
    // clamped to the valid bin range
    var startBin = Math.max(0, Math.ceil(left));
    var endBin = Math.min(numFftBins - 1, Math.floor(right));
    var weights = new Float32Array(endBin - startBin + 1);
    for (var k = startBin; k <= endBin; k++) {
      if (k <= center) {
        // Rising edge of the triangle; guard against degenerate zero-width edges
        weights[k - startBin] = (center - left) > 0 ? (k - left) / (center - left) : 0;
      } else {
        // Falling edge
        weights[k - startBin] = (right - center) > 0 ? (right - k) / (right - center) : 0;
      }
    }
    filters.push({ startBin: startBin, weights: weights });
  }
  return filters;
}
|
|
5735
|
+
|
|
5736
|
+
function createHammingWindow(length) {
|
|
5737
|
+
var w = new Float32Array(length);
|
|
5738
|
+
for (var i = 0; i < length; i++) {
|
|
5739
|
+
w[i] = 0.54 - 0.46 * Math.cos(2 * Math.PI * i / (length - 1));
|
|
5740
|
+
}
|
|
5741
|
+
return w;
|
|
5742
|
+
}
|
|
5743
|
+
|
|
5744
|
+
function computeKaldiFbank(audio, sampleRate, numMelBins, opts) {
|
|
5745
|
+
var frameLengthMs = (opts && opts.frameLengthMs !== undefined) ? opts.frameLengthMs : 25;
|
|
5746
|
+
var frameShiftMs = (opts && opts.frameShiftMs !== undefined) ? opts.frameShiftMs : 10;
|
|
5747
|
+
var lowFreq = (opts && opts.lowFreq !== undefined) ? opts.lowFreq : 20;
|
|
5748
|
+
var highFreq = (opts && opts.highFreq !== undefined) ? opts.highFreq : (sampleRate / 2);
|
|
5749
|
+
var dither = (opts && opts.dither !== undefined) ? opts.dither : 0;
|
|
5750
|
+
var preemphasis = (opts && opts.preemphasis !== undefined) ? opts.preemphasis : 0.97;
|
|
5751
|
+
|
|
5752
|
+
var frameLengthSamples = Math.round(sampleRate * frameLengthMs / 1000);
|
|
5753
|
+
var frameShiftSamples = Math.round(sampleRate * frameShiftMs / 1000);
|
|
5754
|
+
|
|
5755
|
+
var scaled = new Float32Array(audio.length);
|
|
5756
|
+
for (var i = 0; i < audio.length; i++) { scaled[i] = audio[i] * 32768; }
|
|
5757
|
+
|
|
5758
|
+
if (dither > 0) {
|
|
5759
|
+
for (var i = 0; i < scaled.length; i++) {
|
|
5760
|
+
var u1 = Math.random(), u2 = Math.random();
|
|
5761
|
+
scaled[i] += dither * Math.sqrt(-2 * Math.log(u1 + 1e-10)) * Math.cos(2 * Math.PI * u2);
|
|
5762
|
+
}
|
|
5763
|
+
}
|
|
5764
|
+
|
|
5765
|
+
var numFrames = Math.max(0, Math.floor((scaled.length - frameLengthSamples) / frameShiftSamples) + 1);
|
|
5766
|
+
if (numFrames === 0) return new Float32Array(0);
|
|
5767
|
+
|
|
5768
|
+
var fftSize = 1;
|
|
5769
|
+
while (fftSize < frameLengthSamples) fftSize *= 2;
|
|
5770
|
+
var numFftBins = fftSize / 2 + 1;
|
|
5771
|
+
|
|
5772
|
+
var window = createHammingWindow(frameLengthSamples);
|
|
5773
|
+
var filters = buildMelFilterbank(numMelBins, fftSize, sampleRate, lowFreq, highFreq);
|
|
5774
|
+
var output = new Float32Array(numFrames * numMelBins);
|
|
5775
|
+
var fftRe = new Float64Array(fftSize);
|
|
5776
|
+
var fftIm = new Float64Array(fftSize);
|
|
5777
|
+
|
|
5778
|
+
for (var f = 0; f < numFrames; f++) {
|
|
5779
|
+
var offset = f * frameShiftSamples;
|
|
5780
|
+
fftRe.fill(0); fftIm.fill(0);
|
|
5781
|
+
for (var i = 0; i < frameLengthSamples; i++) {
|
|
5782
|
+
var sample = scaled[offset + i];
|
|
5783
|
+
if (preemphasis > 0 && i > 0) {
|
|
5784
|
+
sample -= preemphasis * scaled[offset + i - 1];
|
|
5785
|
+
} else if (preemphasis > 0 && i === 0 && offset > 0) {
|
|
5786
|
+
sample -= preemphasis * scaled[offset - 1];
|
|
5787
|
+
}
|
|
5788
|
+
fftRe[i] = sample * window[i];
|
|
5789
|
+
}
|
|
5790
|
+
fft(fftRe, fftIm);
|
|
5791
|
+
var outOffset = f * numMelBins;
|
|
5792
|
+
for (var m = 0; m < numMelBins; m++) {
|
|
5793
|
+
var filter = filters[m];
|
|
5794
|
+
var energy = 0;
|
|
5795
|
+
for (var k = 0; k < filter.weights.length; k++) {
|
|
5796
|
+
var bin = filter.startBin + k;
|
|
5797
|
+
if (bin < numFftBins) {
|
|
5798
|
+
var powerSpec = fftRe[bin] * fftRe[bin] + fftIm[bin] * fftIm[bin];
|
|
5799
|
+
energy += filter.weights[k] * powerSpec;
|
|
5800
|
+
}
|
|
5801
|
+
}
|
|
5802
|
+
output[outOffset + m] = Math.log(Math.max(energy, 1e-10));
|
|
5803
|
+
}
|
|
5804
|
+
}
|
|
5805
|
+
return output;
|
|
5806
|
+
}
|
|
5807
|
+
|
|
5808
|
+
function applyLFR(features, featureDim, lfrM, lfrN) {
|
|
5809
|
+
var numFrames = features.length / featureDim;
|
|
5810
|
+
if (numFrames === 0) return new Float32Array(0);
|
|
5811
|
+
var leftPad = Math.floor((lfrM - 1) / 2);
|
|
5812
|
+
var paddedLen = numFrames + leftPad;
|
|
5813
|
+
var numOutputFrames = Math.ceil(paddedLen / lfrN);
|
|
5814
|
+
var outputDim = featureDim * lfrM;
|
|
5815
|
+
var output = new Float32Array(numOutputFrames * outputDim);
|
|
5816
|
+
for (var i = 0; i < numOutputFrames; i++) {
|
|
5817
|
+
var startFrame = i * lfrN - leftPad;
|
|
5818
|
+
for (var j = 0; j < lfrM; j++) {
|
|
5819
|
+
var srcFrame = startFrame + j;
|
|
5820
|
+
if (srcFrame < 0) srcFrame = 0;
|
|
5821
|
+
if (srcFrame >= numFrames) srcFrame = numFrames - 1;
|
|
5822
|
+
var srcOffset = srcFrame * featureDim;
|
|
5823
|
+
var dstOffset = i * outputDim + j * featureDim;
|
|
5824
|
+
for (var k = 0; k < featureDim; k++) {
|
|
5825
|
+
output[dstOffset + k] = features[srcOffset + k];
|
|
5826
|
+
}
|
|
5827
|
+
}
|
|
5828
|
+
}
|
|
5829
|
+
return output;
|
|
5830
|
+
}
|
|
5831
|
+
|
|
5832
|
+
function applyCMVN(features, dim, negMeanVec, invStddevVec) {
|
|
5833
|
+
for (var i = 0; i < features.length; i++) {
|
|
5834
|
+
var d = i % dim;
|
|
5835
|
+
features[i] = (features[i] + negMeanVec[d]) * invStddevVec[d];
|
|
5836
|
+
}
|
|
5837
|
+
return features;
|
|
5838
|
+
}
|
|
5839
|
+
|
|
5840
|
+
function parseCMVNFromMetadata(negMeanStr, invStddevStr) {
|
|
5841
|
+
var negMeanArr = new Float32Array(
|
|
5842
|
+
negMeanStr.split(',').map(function(s) { return parseFloat(s.trim()); })
|
|
5843
|
+
);
|
|
5844
|
+
var invStddevArr = new Float32Array(
|
|
5845
|
+
invStddevStr.split(',').map(function(s) { return parseFloat(s.trim()); })
|
|
5846
|
+
);
|
|
5847
|
+
return { negMean: negMeanArr, invStddev: invStddevArr };
|
|
5848
|
+
}
|
|
5849
|
+
|
|
5850
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
5851
|
+
// ctcDecoder.ts \u2014 inlined as plain JavaScript
|
|
5852
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
5853
|
+
|
|
5854
|
+
var LANGUAGE_IDS = { 0: 'auto', 3: 'zh', 4: 'en', 7: 'yue', 11: 'ja', 12: 'ko', 13: 'nospeech' };
|
|
5855
|
+
var TEXT_NORM_IDS = { 14: 'with_itn', 15: 'without_itn' };
|
|
5856
|
+
|
|
5857
|
+
function resolveLanguageIdW(language) {
|
|
5858
|
+
var map = { auto: 0, zh: 3, en: 4, yue: 7, ja: 11, ko: 12 };
|
|
5859
|
+
return map[language] !== undefined ? map[language] : 0;
|
|
5860
|
+
}
|
|
5861
|
+
|
|
5862
|
+
function resolveTextNormIdW(textNorm) {
|
|
5863
|
+
return textNorm === 'without_itn' ? 15 : 14;
|
|
5864
|
+
}
|
|
5865
|
+
|
|
5866
|
+
function parseTokensFile(content) {
|
|
5867
|
+
var map = new Map();
|
|
5868
|
+
var lines = content.split('\\n');
|
|
5869
|
+
for (var idx = 0; idx < lines.length; idx++) {
|
|
5870
|
+
var trimmed = lines[idx].trim();
|
|
5871
|
+
if (!trimmed) continue;
|
|
5872
|
+
var lastSpace = trimmed.lastIndexOf(' ');
|
|
5873
|
+
if (lastSpace === -1) continue;
|
|
5874
|
+
var token = trimmed.substring(0, lastSpace);
|
|
5875
|
+
var id = parseInt(trimmed.substring(lastSpace + 1), 10);
|
|
5876
|
+
if (!isNaN(id)) map.set(id, token);
|
|
5877
|
+
}
|
|
5878
|
+
return map;
|
|
5879
|
+
}
|
|
5880
|
+
|
|
5881
|
+
function parseStructuredToken(token) {
|
|
5882
|
+
var match = token.match(/^<\\|(.+)\\|>$/);
|
|
5883
|
+
if (!match) return null;
|
|
5884
|
+
var value = match[1];
|
|
5885
|
+
if (value === 'zh' || value === 'en' || value === 'ja' || value === 'ko' || value === 'yue' || value === 'nospeech') {
|
|
5886
|
+
return { type: 'language', value: value };
|
|
5887
|
+
}
|
|
5888
|
+
var emotions = ['HAPPY', 'SAD', 'ANGRY', 'NEUTRAL', 'FEARFUL', 'DISGUSTED', 'SURPRISED', 'EMO_UNKNOWN'];
|
|
5889
|
+
if (emotions.indexOf(value) !== -1) return { type: 'emotion', value: value };
|
|
5890
|
+
var events = ['Speech', 'BGM', 'Applause', 'Laughter', 'Crying', 'Coughing', 'Sneezing', 'EVENT_UNKNOWN'];
|
|
5891
|
+
if (events.indexOf(value) !== -1) return { type: 'event', value: value };
|
|
5892
|
+
if (value === 'withitn' || value === 'woitn' || value === 'with_itn' || value === 'without_itn') {
|
|
5893
|
+
return { type: 'textnorm', value: value };
|
|
5894
|
+
}
|
|
5895
|
+
return null;
|
|
5896
|
+
}
|
|
5897
|
+
|
|
5898
|
+
function ctcGreedyDecode(logits, seqLen, vocabSz, tokenMapLocal) {
|
|
5899
|
+
var tokenIds = [];
|
|
5900
|
+
for (var t = 0; t < seqLen; t++) {
|
|
5901
|
+
var offset = t * vocabSz;
|
|
5902
|
+
var maxIdx = 0, maxVal = logits[offset];
|
|
5903
|
+
for (var v = 1; v < vocabSz; v++) {
|
|
5904
|
+
if (logits[offset + v] > maxVal) { maxVal = logits[offset + v]; maxIdx = v; }
|
|
5905
|
+
}
|
|
5906
|
+
tokenIds.push(maxIdx);
|
|
5907
|
+
}
|
|
5908
|
+
var collapsed = [], prev = -1;
|
|
5909
|
+
for (var idx = 0; idx < tokenIds.length; idx++) {
|
|
5910
|
+
var id = tokenIds[idx];
|
|
5911
|
+
if (id !== prev) { collapsed.push(id); prev = id; }
|
|
5912
|
+
}
|
|
5913
|
+
var filtered = collapsed.filter(function(id) { return id !== 0 && id !== 1 && id !== 2; });
|
|
5914
|
+
var language = undefined, emotion = undefined, event = undefined;
|
|
5915
|
+
var textTokens = [];
|
|
5916
|
+
for (var idx = 0; idx < filtered.length; idx++) {
|
|
5917
|
+
var id = filtered[idx];
|
|
5918
|
+
var token = tokenMapLocal.get(id);
|
|
5919
|
+
if (!token) continue;
|
|
5920
|
+
var structured = parseStructuredToken(token);
|
|
5921
|
+
if (structured) {
|
|
5922
|
+
if (structured.type === 'language') language = structured.value;
|
|
5923
|
+
else if (structured.type === 'emotion') emotion = structured.value;
|
|
5924
|
+
else if (structured.type === 'event') event = structured.value;
|
|
5925
|
+
} else {
|
|
5926
|
+
textTokens.push(token);
|
|
5927
|
+
}
|
|
5928
|
+
}
|
|
5929
|
+
var text = textTokens.join('');
|
|
5930
|
+
text = text.replace(/\\u2581/g, ' ').trim();
|
|
5931
|
+
return { text: text, language: language, emotion: emotion, event: event };
|
|
5932
|
+
}
|
|
5933
|
+
|
|
5934
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
5935
|
+
// blendshapeUtils.ts \u2014 inlined
|
|
5936
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
5937
|
+
|
|
5938
|
+
var SYMMETRIC_INDEX_PAIRS = [
|
|
5939
|
+
[23, 25], [32, 38], [43, 44], [29, 30], [27, 28], [45, 46],
|
|
5940
|
+
[35, 36], [47, 48], [33, 34], [49, 50], [6, 7], [0, 1],
|
|
5941
|
+
[3, 4], [8, 9], [16, 17], [10, 11], [12, 13], [14, 15],
|
|
5942
|
+
[18, 19], [20, 21],
|
|
5943
|
+
];
|
|
5944
|
+
|
|
5945
|
+
function symmetrizeBlendshapes(frame) {
|
|
5946
|
+
var result = new Float32Array(frame);
|
|
5947
|
+
for (var p = 0; p < SYMMETRIC_INDEX_PAIRS.length; p++) {
|
|
5948
|
+
var lIdx = SYMMETRIC_INDEX_PAIRS[p][0], rIdx = SYMMETRIC_INDEX_PAIRS[p][1];
|
|
5949
|
+
var avg = (frame[lIdx] + frame[rIdx]) / 2;
|
|
5950
|
+
result[lIdx] = avg;
|
|
5951
|
+
result[rIdx] = avg;
|
|
5952
|
+
}
|
|
5953
|
+
return result;
|
|
5954
|
+
}
|
|
5955
|
+
|
|
5956
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
5957
|
+
// Shared ORT loader
|
|
5958
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
5959
|
+
|
|
5960
|
+
async function loadOrt(wasmPaths, isIOSDevice) {
|
|
5961
|
+
if (ort) return;
|
|
5962
|
+
var ortUrl = wasmPaths + 'ort.wasm.min.js';
|
|
5963
|
+
var response = await fetch(ortUrl);
|
|
5964
|
+
var scriptText = await response.text();
|
|
5965
|
+
var blob = new Blob([scriptText], { type: 'application/javascript' });
|
|
5966
|
+
var blobUrl = URL.createObjectURL(blob);
|
|
5967
|
+
importScripts(blobUrl);
|
|
5968
|
+
URL.revokeObjectURL(blobUrl);
|
|
5969
|
+
ort = self.ort;
|
|
5970
|
+
ort.env.wasm.wasmPaths = wasmPaths;
|
|
5971
|
+
ort.env.wasm.numThreads = isIOSDevice ? 1 : 4;
|
|
5972
|
+
ort.env.wasm.simd = true;
|
|
5973
|
+
ort.env.wasm.proxy = false;
|
|
5974
|
+
}
|
|
5975
|
+
|
|
5976
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
5977
|
+
// SenseVoice handlers
|
|
5978
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
5979
|
+
|
|
5980
|
+
async function svLoad(msg) {
|
|
5981
|
+
var tokensResponse = await fetch(msg.tokensUrl);
|
|
5982
|
+
if (!tokensResponse.ok) throw new Error('Failed to fetch tokens.txt: ' + tokensResponse.status);
|
|
5983
|
+
var tokensText = await tokensResponse.text();
|
|
5984
|
+
svTokenMap = parseTokensFile(tokensText);
|
|
5985
|
+
svLanguageId = msg.language;
|
|
5986
|
+
svTextNormId = msg.textNorm;
|
|
5987
|
+
|
|
5988
|
+
var sessionOptions = { executionProviders: ['wasm'], graphOptimizationLevel: 'all' };
|
|
5989
|
+
if (msg.isIOS) {
|
|
5990
|
+
svSession = await ort.InferenceSession.create(msg.modelUrl, sessionOptions);
|
|
5991
|
+
} else {
|
|
5992
|
+
var modelResponse = await fetch(msg.modelUrl);
|
|
5993
|
+
if (!modelResponse.ok) throw new Error('Failed to fetch model: ' + modelResponse.status);
|
|
5994
|
+
var modelBuffer = await modelResponse.arrayBuffer();
|
|
5995
|
+
svSession = await ort.InferenceSession.create(new Uint8Array(modelBuffer), sessionOptions);
|
|
5996
|
+
}
|
|
5997
|
+
|
|
5998
|
+
try {
|
|
5999
|
+
var metadata = svSession.handler && svSession.handler.metadata;
|
|
6000
|
+
if (metadata && metadata.neg_mean && metadata.inv_stddev) {
|
|
6001
|
+
var cmvn = parseCMVNFromMetadata(metadata.neg_mean, metadata.inv_stddev);
|
|
6002
|
+
svNegMean = cmvn.negMean;
|
|
6003
|
+
svInvStddev = cmvn.invStddev;
|
|
6004
|
+
}
|
|
6005
|
+
} catch (e) { /* CMVN not available */ }
|
|
6006
|
+
|
|
6007
|
+
svVocabSize = 0;
|
|
6008
|
+
svTokenMap.forEach(function(val, key) { if (key >= svVocabSize) svVocabSize = key + 1; });
|
|
6009
|
+
|
|
6010
|
+
return {
|
|
6011
|
+
vocabSize: svVocabSize,
|
|
6012
|
+
inputNames: svSession.inputNames.slice(),
|
|
6013
|
+
outputNames: svSession.outputNames.slice(),
|
|
6014
|
+
};
|
|
6015
|
+
}
|
|
6016
|
+
|
|
6017
|
+
async function svTranscribe(audio) {
|
|
6018
|
+
var preprocessStart = performance.now();
|
|
6019
|
+
var fbank = computeKaldiFbank(audio, 16000, 80);
|
|
6020
|
+
var numFrames = fbank.length / 80;
|
|
6021
|
+
if (numFrames === 0) {
|
|
6022
|
+
return { text: '', inferenceTimeMs: performance.now() - preprocessStart, preprocessTimeMs: performance.now() - preprocessStart };
|
|
6023
|
+
}
|
|
6024
|
+
var lfrFeatures = applyLFR(fbank, 80, 7, 6);
|
|
6025
|
+
var numLfrFrames = lfrFeatures.length / 560;
|
|
6026
|
+
if (svNegMean && svInvStddev) applyCMVN(lfrFeatures, 560, svNegMean, svInvStddev);
|
|
6027
|
+
var preprocessTimeMs = performance.now() - preprocessStart;
|
|
6028
|
+
|
|
6029
|
+
var feeds = {
|
|
6030
|
+
x: new ort.Tensor('float32', lfrFeatures, [1, numLfrFrames, 560]),
|
|
6031
|
+
x_length: new ort.Tensor('int32', new Int32Array([numLfrFrames]), [1]),
|
|
6032
|
+
language: new ort.Tensor('int32', new Int32Array([svLanguageId]), [1]),
|
|
6033
|
+
text_norm: new ort.Tensor('int32', new Int32Array([svTextNormId]), [1]),
|
|
6034
|
+
};
|
|
6035
|
+
var results = await svSession.run(feeds);
|
|
6036
|
+
var logitsOutput = results['logits'];
|
|
6037
|
+
if (!logitsOutput) throw new Error('Model output missing "logits" tensor');
|
|
6038
|
+
|
|
6039
|
+
var decoded = ctcGreedyDecode(logitsOutput.data, logitsOutput.dims[1], logitsOutput.dims[2], svTokenMap);
|
|
6040
|
+
var totalTimeMs = performance.now() - preprocessStart;
|
|
6041
|
+
|
|
6042
|
+
return {
|
|
6043
|
+
text: decoded.text, language: decoded.language, emotion: decoded.emotion, event: decoded.event,
|
|
6044
|
+
inferenceTimeMs: totalTimeMs, preprocessTimeMs: preprocessTimeMs,
|
|
6045
|
+
};
|
|
6046
|
+
}
|
|
6047
|
+
|
|
6048
|
+
async function svDispose() {
|
|
6049
|
+
if (svSession) { await svSession.release(); svSession = null; }
|
|
6050
|
+
svTokenMap = null; svNegMean = null; svInvStddev = null;
|
|
6051
|
+
}
|
|
6052
|
+
|
|
6053
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
6054
|
+
// Wav2ArkitCpu handlers
|
|
6055
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
6056
|
+
|
|
6057
|
+
async function cpuLoad(msg) {
|
|
6058
|
+
var sessionOptions = { executionProviders: ['wasm'], graphOptimizationLevel: 'all' };
|
|
6059
|
+
var dataFilename = msg.externalDataUrl ? msg.externalDataUrl.split('/').pop() : null;
|
|
6060
|
+
|
|
6061
|
+
if (msg.isIOS) {
|
|
6062
|
+
if (msg.externalDataUrl && dataFilename) {
|
|
6063
|
+
sessionOptions.externalData = [{ path: dataFilename, data: msg.externalDataUrl }];
|
|
6064
|
+
}
|
|
6065
|
+
cpuSession = await ort.InferenceSession.create(msg.modelUrl, sessionOptions);
|
|
6066
|
+
} else {
|
|
6067
|
+
var graphResponse = await fetch(msg.modelUrl);
|
|
6068
|
+
if (!graphResponse.ok) throw new Error('Failed to fetch model graph: ' + graphResponse.status);
|
|
6069
|
+
var graphBuffer = await graphResponse.arrayBuffer();
|
|
6070
|
+
if (msg.externalDataUrl && dataFilename) {
|
|
6071
|
+
var dataResponse = await fetch(msg.externalDataUrl);
|
|
6072
|
+
if (!dataResponse.ok) throw new Error('Failed to fetch external data: ' + dataResponse.status);
|
|
6073
|
+
var dataBuffer = await dataResponse.arrayBuffer();
|
|
6074
|
+
sessionOptions.externalData = [{ path: dataFilename, data: new Uint8Array(dataBuffer) }];
|
|
6075
|
+
}
|
|
6076
|
+
cpuSession = await ort.InferenceSession.create(new Uint8Array(graphBuffer), sessionOptions);
|
|
6077
|
+
}
|
|
6078
|
+
|
|
6079
|
+
// Warmup
|
|
6080
|
+
var warmupAudio = new Float32Array(16000);
|
|
6081
|
+
var warmupTensor = new ort.Tensor('float32', warmupAudio, [1, warmupAudio.length]);
|
|
6082
|
+
await cpuSession.run({ audio_waveform: warmupTensor });
|
|
6083
|
+
|
|
6084
|
+
return {
|
|
6085
|
+
inputNames: cpuSession.inputNames.slice(),
|
|
6086
|
+
outputNames: cpuSession.outputNames.slice(),
|
|
6087
|
+
};
|
|
6088
|
+
}
|
|
6089
|
+
|
|
6090
|
+
async function cpuInfer(audio) {
|
|
6091
|
+
var tensor = new ort.Tensor('float32', audio, [1, audio.length]);
|
|
6092
|
+
var results = await cpuSession.run({ audio_waveform: tensor });
|
|
6093
|
+
var blendshapeOutput = results['blendshapes'];
|
|
6094
|
+
if (!blendshapeOutput) throw new Error('Missing blendshapes output from model');
|
|
6095
|
+
|
|
6096
|
+
var blendshapeData = blendshapeOutput.data;
|
|
6097
|
+
var numFrames = blendshapeOutput.dims[1];
|
|
6098
|
+
var numBlendshapes = blendshapeOutput.dims[2];
|
|
6099
|
+
|
|
6100
|
+
var flatBuffer = new Float32Array(numFrames * numBlendshapes);
|
|
6101
|
+
for (var f = 0; f < numFrames; f++) {
|
|
6102
|
+
var offset = f * numBlendshapes;
|
|
6103
|
+
var rawFrame = blendshapeData.slice(offset, offset + numBlendshapes);
|
|
6104
|
+
var symmetrized = symmetrizeBlendshapes(rawFrame);
|
|
6105
|
+
flatBuffer.set(symmetrized, offset);
|
|
6106
|
+
}
|
|
6107
|
+
return { flatBuffer: flatBuffer, numFrames: numFrames, numBlendshapes: numBlendshapes };
|
|
6108
|
+
}
|
|
6109
|
+
|
|
6110
|
+
async function cpuDispose() {
|
|
6111
|
+
if (cpuSession) { await cpuSession.release(); cpuSession = null; }
|
|
6112
|
+
}
|
|
6113
|
+
|
|
6114
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
6115
|
+
// Silero VAD handlers
|
|
6116
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
6117
|
+
|
|
6118
|
+
async function vadLoad(msg) {
|
|
6119
|
+
vadSampleRate = msg.sampleRate;
|
|
6120
|
+
vadChunkSize = vadSampleRate === 16000 ? 512 : 256;
|
|
6121
|
+
vadContextSize = vadSampleRate === 16000 ? 64 : 32;
|
|
6122
|
+
|
|
6123
|
+
var response = await fetch(msg.modelUrl);
|
|
6124
|
+
if (!response.ok) throw new Error('Failed to fetch VAD model: ' + response.status);
|
|
6125
|
+
var modelBuffer = await response.arrayBuffer();
|
|
6126
|
+
vadSession = await ort.InferenceSession.create(new Uint8Array(modelBuffer), {
|
|
6127
|
+
executionProviders: ['wasm'],
|
|
6128
|
+
graphOptimizationLevel: 'all',
|
|
6129
|
+
});
|
|
6130
|
+
|
|
6131
|
+
return {
|
|
6132
|
+
inputNames: vadSession.inputNames.slice(),
|
|
6133
|
+
outputNames: vadSession.outputNames.slice(),
|
|
6134
|
+
};
|
|
6135
|
+
}
|
|
6136
|
+
|
|
6137
|
+
async function vadProcess(audio, state, context) {
|
|
6138
|
+
var inputSize = vadContextSize + vadChunkSize;
|
|
6139
|
+
var inputBuffer = new Float32Array(inputSize);
|
|
6140
|
+
inputBuffer.set(context, 0);
|
|
6141
|
+
inputBuffer.set(audio, vadContextSize);
|
|
6142
|
+
|
|
6143
|
+
var inputTensor = new ort.Tensor('float32', new Float32Array(inputBuffer), [1, inputSize]);
|
|
6144
|
+
var stateTensor = new ort.Tensor('float32', new Float32Array(state), [2, 1, 128]);
|
|
6145
|
+
var srTensor;
|
|
6146
|
+
try {
|
|
6147
|
+
srTensor = new ort.Tensor('int64', new BigInt64Array([BigInt(vadSampleRate)]), []);
|
|
6148
|
+
} catch (e) {
|
|
6149
|
+
srTensor = new ort.Tensor('int64', [BigInt(vadSampleRate)], []);
|
|
6150
|
+
}
|
|
6151
|
+
|
|
6152
|
+
var feeds = { 'input': inputTensor, 'state': stateTensor, 'sr': srTensor };
|
|
6153
|
+
var results = await vadSession.run(feeds);
|
|
6154
|
+
var outputTensor = results['output'];
|
|
6155
|
+
var newStateTensor = results['stateN'] || results['state'];
|
|
6156
|
+
if (!outputTensor) throw new Error('Missing output tensor from VAD model');
|
|
6157
|
+
|
|
6158
|
+
return { probability: outputTensor.data[0], newState: new Float32Array(newStateTensor.data) };
|
|
6159
|
+
}
|
|
6160
|
+
|
|
6161
|
+
function vadCreateInitialState() {
|
|
6162
|
+
return new Float32Array(2 * 1 * 128);
|
|
6163
|
+
}
|
|
6164
|
+
|
|
6165
|
+
async function vadDispose() {
|
|
6166
|
+
if (vadSession) { await vadSession.release(); vadSession = null; }
|
|
6167
|
+
}
|
|
6168
|
+
|
|
6169
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
6170
|
+
// Message handler
|
|
6171
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
6172
|
+
|
|
6173
|
+
self.onmessage = async function(e) {
|
|
6174
|
+
var msg = e.data;
|
|
6175
|
+
var requestId = msg.requestId;
|
|
6176
|
+
|
|
6177
|
+
try {
|
|
6178
|
+
switch (msg.type) {
|
|
6179
|
+
case 'init': {
|
|
6180
|
+
var startTime = performance.now();
|
|
6181
|
+
await loadOrt(msg.wasmPaths, msg.isIOS);
|
|
6182
|
+
self.postMessage({ type: 'init:done', requestId: requestId, loadTimeMs: performance.now() - startTime });
|
|
6183
|
+
break;
|
|
6184
|
+
}
|
|
6185
|
+
|
|
6186
|
+
case 'sv:load': {
|
|
6187
|
+
var startTime = performance.now();
|
|
6188
|
+
var info = await svLoad(msg);
|
|
6189
|
+
self.postMessage({
|
|
6190
|
+
type: 'sv:loaded', requestId: requestId, vocabSize: info.vocabSize,
|
|
6191
|
+
inputNames: info.inputNames, outputNames: info.outputNames,
|
|
6192
|
+
loadTimeMs: performance.now() - startTime,
|
|
6193
|
+
});
|
|
6194
|
+
break;
|
|
6195
|
+
}
|
|
6196
|
+
|
|
6197
|
+
case 'sv:transcribe': {
|
|
6198
|
+
var result = await svTranscribe(msg.audio);
|
|
6199
|
+
self.postMessage({
|
|
6200
|
+
type: 'sv:result', requestId: requestId,
|
|
6201
|
+
text: result.text, language: result.language, emotion: result.emotion, event: result.event,
|
|
6202
|
+
inferenceTimeMs: result.inferenceTimeMs, preprocessTimeMs: result.preprocessTimeMs,
|
|
6203
|
+
});
|
|
6204
|
+
break;
|
|
6205
|
+
}
|
|
6206
|
+
|
|
6207
|
+
case 'sv:dispose': {
|
|
6208
|
+
await svDispose();
|
|
6209
|
+
self.postMessage({ type: 'sv:disposed', requestId: requestId });
|
|
6210
|
+
break;
|
|
6211
|
+
}
|
|
6212
|
+
|
|
6213
|
+
case 'cpu:load': {
|
|
6214
|
+
var startTime = performance.now();
|
|
6215
|
+
var info = await cpuLoad(msg);
|
|
6216
|
+
self.postMessage({
|
|
6217
|
+
type: 'cpu:loaded', requestId: requestId,
|
|
6218
|
+
inputNames: info.inputNames, outputNames: info.outputNames,
|
|
6219
|
+
loadTimeMs: performance.now() - startTime,
|
|
6220
|
+
});
|
|
6221
|
+
break;
|
|
6222
|
+
}
|
|
6223
|
+
|
|
6224
|
+
case 'cpu:infer': {
|
|
6225
|
+
var startTime = performance.now();
|
|
6226
|
+
var result = await cpuInfer(msg.audio);
|
|
6227
|
+
var inferenceTimeMs = performance.now() - startTime;
|
|
6228
|
+
self.postMessage({
|
|
6229
|
+
type: 'cpu:result', requestId: requestId,
|
|
6230
|
+
blendshapes: result.flatBuffer, numFrames: result.numFrames,
|
|
6231
|
+
numBlendshapes: result.numBlendshapes, inferenceTimeMs: inferenceTimeMs,
|
|
6232
|
+
}, [result.flatBuffer.buffer]);
|
|
6233
|
+
break;
|
|
6234
|
+
}
|
|
6235
|
+
|
|
6236
|
+
case 'cpu:dispose': {
|
|
6237
|
+
await cpuDispose();
|
|
6238
|
+
self.postMessage({ type: 'cpu:disposed', requestId: requestId });
|
|
6239
|
+
break;
|
|
6240
|
+
}
|
|
6241
|
+
|
|
6242
|
+
case 'vad:load': {
|
|
6243
|
+
var startTime = performance.now();
|
|
6244
|
+
var info = await vadLoad(msg);
|
|
6245
|
+
self.postMessage({
|
|
6246
|
+
type: 'vad:loaded', requestId: requestId,
|
|
6247
|
+
inputNames: info.inputNames, outputNames: info.outputNames,
|
|
6248
|
+
loadTimeMs: performance.now() - startTime,
|
|
6249
|
+
});
|
|
6250
|
+
break;
|
|
6251
|
+
}
|
|
6252
|
+
|
|
6253
|
+
case 'vad:process': {
|
|
6254
|
+
var startTime = performance.now();
|
|
6255
|
+
var result = await vadProcess(msg.audio, msg.state, msg.context);
|
|
6256
|
+
self.postMessage({
|
|
6257
|
+
type: 'vad:result', requestId: requestId,
|
|
6258
|
+
probability: result.probability, state: result.newState,
|
|
6259
|
+
inferenceTimeMs: performance.now() - startTime,
|
|
6260
|
+
});
|
|
6261
|
+
break;
|
|
6262
|
+
}
|
|
6263
|
+
|
|
6264
|
+
case 'vad:reset': {
|
|
6265
|
+
var state = vadCreateInitialState();
|
|
6266
|
+
self.postMessage({ type: 'vad:reset', requestId: requestId, state: state });
|
|
6267
|
+
break;
|
|
6268
|
+
}
|
|
6269
|
+
|
|
6270
|
+
case 'vad:dispose': {
|
|
6271
|
+
await vadDispose();
|
|
6272
|
+
self.postMessage({ type: 'vad:disposed', requestId: requestId });
|
|
6273
|
+
break;
|
|
6274
|
+
}
|
|
6275
|
+
|
|
6276
|
+
case 'dispose-all': {
|
|
6277
|
+
await svDispose();
|
|
6278
|
+
await cpuDispose();
|
|
6279
|
+
await vadDispose();
|
|
6280
|
+
ort = null;
|
|
6281
|
+
self.postMessage({ type: 'dispose-all:done', requestId: requestId });
|
|
6282
|
+
break;
|
|
6283
|
+
}
|
|
6284
|
+
|
|
6285
|
+
default:
|
|
6286
|
+
self.postMessage({ type: 'error', requestId: requestId, error: 'Unknown message type: ' + msg.type });
|
|
6287
|
+
}
|
|
6288
|
+
} catch (err) {
|
|
6289
|
+
var errorMsg = err.message || String(err);
|
|
6290
|
+
if (typeof err === 'number') {
|
|
6291
|
+
errorMsg = 'Raw C++ exception pointer (0x' + err.toString(16) + '). Likely OOM in WASM.';
|
|
6292
|
+
}
|
|
6293
|
+
self.postMessage({ type: 'error', requestId: requestId, error: errorMsg });
|
|
6294
|
+
}
|
|
6295
|
+
};
|
|
6296
|
+
|
|
6297
|
+
self.onerror = function(err) {
|
|
6298
|
+
self.postMessage({ type: 'error', requestId: null, error: 'Worker error: ' + (err.message || String(err)) });
|
|
6299
|
+
};
|
|
6300
|
+
`;
|
|
6301
|
+
var UnifiedInferenceWorker = class {
|
|
6302
|
+
constructor() {
|
|
6303
|
+
this.worker = null;
|
|
6304
|
+
this.pendingRequests = /* @__PURE__ */ new Map();
|
|
6305
|
+
this.initialized = false;
|
|
6306
|
+
this.poisoned = false;
|
|
6307
|
+
}
|
|
6308
|
+
/**
|
|
6309
|
+
* Initialize the worker (load ORT WASM from CDN)
|
|
6310
|
+
*/
|
|
6311
|
+
async init() {
|
|
6312
|
+
if (this.initialized) return;
|
|
6313
|
+
const startTime = performance.now();
|
|
6314
|
+
const telemetry = getTelemetry();
|
|
6315
|
+
const span = telemetry?.startSpan("UnifiedInferenceWorker.init");
|
|
6316
|
+
try {
|
|
6317
|
+
logger6.info("Creating unified inference worker...");
|
|
6318
|
+
this.worker = this.createWorker();
|
|
6319
|
+
await this.sendMessage(
|
|
6320
|
+
{ type: "init", wasmPaths: WASM_CDN_PATH3, isIOS: isIOS() },
|
|
6321
|
+
"init:done",
|
|
6322
|
+
INIT_TIMEOUT_MS
|
|
6323
|
+
);
|
|
6324
|
+
this.initialized = true;
|
|
6325
|
+
const loadTimeMs = performance.now() - startTime;
|
|
6326
|
+
logger6.info("Unified worker initialized", { loadTimeMs: Math.round(loadTimeMs) });
|
|
6327
|
+
span?.setAttributes({ "worker.init_time_ms": loadTimeMs });
|
|
6328
|
+
span?.end();
|
|
6329
|
+
} catch (error) {
|
|
6330
|
+
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
6331
|
+
this.cleanup();
|
|
6332
|
+
throw error;
|
|
6333
|
+
}
|
|
6334
|
+
}
|
|
6335
|
+
// ── SenseVoice ────────────────────────────────────────────────────────
|
|
6336
|
+
async loadSenseVoice(config) {
|
|
6337
|
+
this.assertReady();
|
|
6338
|
+
const startTime = performance.now();
|
|
6339
|
+
const result = await this.sendMessage(
|
|
6340
|
+
{
|
|
6341
|
+
type: "sv:load",
|
|
6342
|
+
modelUrl: resolveUrl2(config.modelUrl),
|
|
6343
|
+
tokensUrl: resolveUrl2(config.tokensUrl),
|
|
6344
|
+
isIOS: isIOS(),
|
|
6345
|
+
language: config.language,
|
|
6346
|
+
textNorm: config.textNorm
|
|
6347
|
+
},
|
|
6348
|
+
"sv:loaded",
|
|
6349
|
+
SV_LOAD_TIMEOUT_MS
|
|
6350
|
+
);
|
|
6351
|
+
const loadTimeMs = performance.now() - startTime;
|
|
6352
|
+
return {
|
|
6353
|
+
backend: "wasm",
|
|
6354
|
+
loadTimeMs,
|
|
6355
|
+
inputNames: result.inputNames,
|
|
6356
|
+
outputNames: result.outputNames,
|
|
6357
|
+
vocabSize: result.vocabSize
|
|
6358
|
+
};
|
|
6359
|
+
}
|
|
6360
|
+
async transcribe(audio) {
|
|
6361
|
+
this.assertReady();
|
|
6362
|
+
const result = await this.sendMessage(
|
|
6363
|
+
{ type: "sv:transcribe", audio },
|
|
6364
|
+
"sv:result",
|
|
6365
|
+
SV_INFER_TIMEOUT_MS
|
|
6366
|
+
);
|
|
6367
|
+
return {
|
|
6368
|
+
text: result.text,
|
|
6369
|
+
language: result.language,
|
|
6370
|
+
emotion: result.emotion,
|
|
6371
|
+
event: result.event,
|
|
6372
|
+
inferenceTimeMs: result.inferenceTimeMs,
|
|
6373
|
+
preprocessTimeMs: result.preprocessTimeMs
|
|
6374
|
+
};
|
|
6375
|
+
}
|
|
6376
|
+
async disposeSenseVoice() {
|
|
6377
|
+
if (!this.worker) return;
|
|
6378
|
+
await this.sendMessage({ type: "sv:dispose" }, "sv:disposed", DISPOSE_TIMEOUT_MS);
|
|
6379
|
+
}
|
|
6380
|
+
// ── Wav2ArkitCpu (Lip Sync) ──────────────────────────────────────────
|
|
6381
|
+
async loadLipSync(config) {
|
|
6382
|
+
this.assertReady();
|
|
6383
|
+
const startTime = performance.now();
|
|
6384
|
+
const result = await this.sendMessage(
|
|
6385
|
+
{
|
|
6386
|
+
type: "cpu:load",
|
|
6387
|
+
modelUrl: resolveUrl2(config.modelUrl),
|
|
6388
|
+
externalDataUrl: config.externalDataUrl ? resolveUrl2(config.externalDataUrl) : null,
|
|
6389
|
+
isIOS: isIOS()
|
|
6390
|
+
},
|
|
6391
|
+
"cpu:loaded",
|
|
6392
|
+
CPU_LOAD_TIMEOUT_MS
|
|
6393
|
+
);
|
|
6394
|
+
const loadTimeMs = performance.now() - startTime;
|
|
6395
|
+
return {
|
|
6396
|
+
backend: "wasm",
|
|
6397
|
+
loadTimeMs,
|
|
6398
|
+
inputNames: result.inputNames,
|
|
6399
|
+
outputNames: result.outputNames
|
|
6400
|
+
};
|
|
6401
|
+
}
|
|
6402
|
+
async inferLipSync(audio) {
|
|
6403
|
+
this.assertReady();
|
|
6404
|
+
return this.sendMessage(
|
|
6405
|
+
{ type: "cpu:infer", audio },
|
|
6406
|
+
"cpu:result",
|
|
6407
|
+
CPU_INFER_TIMEOUT_MS
|
|
6408
|
+
);
|
|
6409
|
+
}
|
|
6410
|
+
async disposeLipSync() {
|
|
6411
|
+
if (!this.worker) return;
|
|
6412
|
+
await this.sendMessage({ type: "cpu:dispose" }, "cpu:disposed", DISPOSE_TIMEOUT_MS);
|
|
6413
|
+
}
|
|
6414
|
+
// ── Silero VAD ────────────────────────────────────────────────────────
|
|
6415
|
+
async loadVAD(config) {
|
|
6416
|
+
this.assertReady();
|
|
6417
|
+
const startTime = performance.now();
|
|
6418
|
+
const chunkSize = config.sampleRate === 16e3 ? 512 : 256;
|
|
6419
|
+
const result = await this.sendMessage(
|
|
6420
|
+
{
|
|
6421
|
+
type: "vad:load",
|
|
6422
|
+
modelUrl: resolveUrl2(config.modelUrl),
|
|
6423
|
+
sampleRate: config.sampleRate
|
|
6424
|
+
},
|
|
6425
|
+
"vad:loaded",
|
|
6426
|
+
VAD_LOAD_TIMEOUT_MS
|
|
6427
|
+
);
|
|
6428
|
+
const loadTimeMs = performance.now() - startTime;
|
|
6429
|
+
return {
|
|
6430
|
+
backend: "wasm",
|
|
6431
|
+
loadTimeMs,
|
|
6432
|
+
inputNames: result.inputNames,
|
|
6433
|
+
outputNames: result.outputNames,
|
|
6434
|
+
sampleRate: config.sampleRate,
|
|
6435
|
+
chunkSize
|
|
6436
|
+
};
|
|
6437
|
+
}
|
|
6438
|
+
async processVAD(audio, state, context) {
|
|
6439
|
+
this.assertReady();
|
|
6440
|
+
return this.sendMessage(
|
|
6441
|
+
{ type: "vad:process", audio, state, context },
|
|
6442
|
+
"vad:result",
|
|
6443
|
+
VAD_INFER_TIMEOUT_MS
|
|
6444
|
+
);
|
|
6445
|
+
}
|
|
6446
|
+
async resetVAD() {
|
|
6447
|
+
this.assertReady();
|
|
6448
|
+
const result = await this.sendMessage(
|
|
6449
|
+
{ type: "vad:reset" },
|
|
6450
|
+
"vad:reset",
|
|
6451
|
+
VAD_INFER_TIMEOUT_MS
|
|
6452
|
+
);
|
|
6453
|
+
return result.state;
|
|
6454
|
+
}
|
|
6455
|
+
async disposeVAD() {
|
|
6456
|
+
if (!this.worker) return;
|
|
6457
|
+
await this.sendMessage({ type: "vad:dispose" }, "vad:disposed", DISPOSE_TIMEOUT_MS);
|
|
6458
|
+
}
|
|
6459
|
+
// ── Lifecycle ─────────────────────────────────────────────────────────
|
|
6460
|
+
async dispose() {
|
|
6461
|
+
if (this.worker) {
|
|
6462
|
+
try {
|
|
6463
|
+
await this.sendMessage({ type: "dispose-all" }, "dispose-all:done", DISPOSE_TIMEOUT_MS);
|
|
6464
|
+
} catch {
|
|
6465
|
+
}
|
|
6466
|
+
this.worker.terminate();
|
|
6467
|
+
this.worker = null;
|
|
6468
|
+
}
|
|
6469
|
+
this.initialized = false;
|
|
6470
|
+
this.poisoned = false;
|
|
6471
|
+
this.rejectAllPending("Worker disposed");
|
|
6472
|
+
this.pendingRequests.clear();
|
|
6473
|
+
}
|
|
6474
|
+
/** Check if the worker is initialized and not poisoned */
|
|
6475
|
+
get isReady() {
|
|
6476
|
+
return this.initialized && !this.poisoned && this.worker !== null;
|
|
6477
|
+
}
|
|
6478
|
+
/** Check if Web Workers are supported */
|
|
6479
|
+
static isSupported() {
|
|
6480
|
+
return typeof Worker !== "undefined";
|
|
6481
|
+
}
|
|
6482
|
+
// ── Private ───────────────────────────────────────────────────────────
|
|
6483
|
+
assertReady() {
|
|
6484
|
+
if (!this.initialized || !this.worker) {
|
|
6485
|
+
throw new Error("UnifiedInferenceWorker not initialized. Call init() first.");
|
|
6486
|
+
}
|
|
6487
|
+
if (this.poisoned) {
|
|
6488
|
+
throw new Error("UnifiedInferenceWorker timed out \u2014 unavailable until page reload");
|
|
6489
|
+
}
|
|
6490
|
+
}
|
|
6491
|
+
createWorker() {
|
|
6492
|
+
const blob = new Blob([WORKER_SCRIPT2], { type: "application/javascript" });
|
|
6493
|
+
const blobUrl = URL.createObjectURL(blob);
|
|
6494
|
+
const worker = new Worker(blobUrl);
|
|
6495
|
+
URL.revokeObjectURL(blobUrl);
|
|
6496
|
+
worker.onmessage = (event) => {
|
|
6497
|
+
this.handleWorkerMessage(event.data);
|
|
6498
|
+
};
|
|
6499
|
+
worker.onerror = (error) => {
|
|
6500
|
+
logger6.error("Unified worker error", { error: error.message });
|
|
6501
|
+
this.rejectAllPending(`Worker error: ${error.message}`);
|
|
6502
|
+
};
|
|
6503
|
+
return worker;
|
|
6504
|
+
}
|
|
6505
|
+
handleWorkerMessage(data) {
|
|
6506
|
+
const requestId = data.requestId;
|
|
6507
|
+
if (data.type === "error") {
|
|
6508
|
+
if (requestId && this.pendingRequests.has(requestId)) {
|
|
6509
|
+
const pending = this.pendingRequests.get(requestId);
|
|
6510
|
+
clearTimeout(pending.timeout);
|
|
6511
|
+
this.pendingRequests.delete(requestId);
|
|
6512
|
+
pending.reject(new Error(data.error));
|
|
6513
|
+
} else {
|
|
6514
|
+
logger6.error("Worker broadcast error", { error: data.error });
|
|
6515
|
+
this.rejectAllPending(data.error);
|
|
6516
|
+
}
|
|
6517
|
+
return;
|
|
6518
|
+
}
|
|
6519
|
+
if (requestId && this.pendingRequests.has(requestId)) {
|
|
6520
|
+
const pending = this.pendingRequests.get(requestId);
|
|
6521
|
+
clearTimeout(pending.timeout);
|
|
6522
|
+
this.pendingRequests.delete(requestId);
|
|
6523
|
+
pending.resolve(data);
|
|
6524
|
+
}
|
|
6525
|
+
}
|
|
6526
|
+
sendMessage(message, expectedType, timeoutMs) {
|
|
6527
|
+
return new Promise((resolve, reject) => {
|
|
6528
|
+
if (!this.worker) {
|
|
6529
|
+
reject(new Error("Worker not initialized"));
|
|
6530
|
+
return;
|
|
6531
|
+
}
|
|
6532
|
+
const requestId = nextRequestId();
|
|
6533
|
+
const timeout = setTimeout(() => {
|
|
6534
|
+
this.pendingRequests.delete(requestId);
|
|
6535
|
+
this.poisoned = true;
|
|
6536
|
+
logger6.error("CRITICAL: Worker operation timed out \u2014 worker is dead", {
|
|
6537
|
+
type: message.type,
|
|
6538
|
+
timeoutMs
|
|
6539
|
+
});
|
|
6540
|
+
reject(new Error(`Worker operation '${message.type}' timed out after ${timeoutMs}ms`));
|
|
6541
|
+
}, timeoutMs);
|
|
6542
|
+
this.pendingRequests.set(requestId, {
|
|
6543
|
+
resolve,
|
|
6544
|
+
reject,
|
|
6545
|
+
timeout
|
|
6546
|
+
});
|
|
6547
|
+
this.worker.postMessage({ ...message, requestId });
|
|
6548
|
+
});
|
|
6549
|
+
}
|
|
6550
|
+
rejectAllPending(reason) {
|
|
6551
|
+
for (const [, pending] of this.pendingRequests) {
|
|
6552
|
+
clearTimeout(pending.timeout);
|
|
6553
|
+
pending.reject(new Error(reason));
|
|
6554
|
+
}
|
|
6555
|
+
this.pendingRequests.clear();
|
|
6556
|
+
}
|
|
6557
|
+
cleanup() {
|
|
6558
|
+
if (this.worker) {
|
|
6559
|
+
this.worker.terminate();
|
|
6560
|
+
this.worker = null;
|
|
6561
|
+
}
|
|
6562
|
+
this.initialized = false;
|
|
6563
|
+
this.rejectAllPending("Worker cleanup");
|
|
6564
|
+
this.pendingRequests.clear();
|
|
6565
|
+
}
|
|
6566
|
+
};
|
|
6567
|
+
var SenseVoiceUnifiedAdapter = class {
|
|
6568
|
+
constructor(worker, config) {
|
|
6569
|
+
this._isLoaded = false;
|
|
6570
|
+
this.inferenceQueue = Promise.resolve();
|
|
6571
|
+
this.worker = worker;
|
|
6572
|
+
const modelDir = config.modelUrl.substring(0, config.modelUrl.lastIndexOf("/"));
|
|
6573
|
+
this.config = {
|
|
6574
|
+
modelUrl: config.modelUrl,
|
|
6575
|
+
tokensUrl: config.tokensUrl ?? `${modelDir}/tokens.txt`,
|
|
6576
|
+
language: config.language ?? "auto",
|
|
6577
|
+
textNorm: config.textNorm ?? "with_itn"
|
|
6578
|
+
};
|
|
6579
|
+
this.languageId = resolveLanguageId(this.config.language);
|
|
6580
|
+
this.textNormId = resolveTextNormId(this.config.textNorm);
|
|
6581
|
+
}
|
|
6582
|
+
get isLoaded() {
|
|
6583
|
+
return this._isLoaded;
|
|
6584
|
+
}
|
|
6585
|
+
get backend() {
|
|
6586
|
+
return this._isLoaded ? "wasm" : null;
|
|
6587
|
+
}
|
|
6588
|
+
async load(onProgress) {
|
|
6589
|
+
const telemetry = getTelemetry();
|
|
6590
|
+
const span = telemetry?.startSpan("SenseVoiceUnifiedAdapter.load", {
|
|
6591
|
+
"model.url": this.config.modelUrl
|
|
6592
|
+
});
|
|
6593
|
+
try {
|
|
6594
|
+
const result = await this.worker.loadSenseVoice({
|
|
6595
|
+
modelUrl: this.config.modelUrl,
|
|
6596
|
+
tokensUrl: this.config.tokensUrl,
|
|
6597
|
+
language: this.languageId,
|
|
6598
|
+
textNorm: this.textNormId
|
|
6599
|
+
});
|
|
6600
|
+
this._isLoaded = true;
|
|
6601
|
+
onProgress?.(1, 1);
|
|
6602
|
+
logger6.info("SenseVoice loaded via unified worker", {
|
|
6603
|
+
backend: "wasm",
|
|
6604
|
+
loadTimeMs: Math.round(result.loadTimeMs),
|
|
6605
|
+
vocabSize: result.vocabSize
|
|
6606
|
+
});
|
|
6607
|
+
span?.setAttributes({ "model.backend": "wasm", "model.load_time_ms": result.loadTimeMs });
|
|
6608
|
+
span?.end();
|
|
6609
|
+
telemetry?.recordHistogram("omote.model.load_time", result.loadTimeMs, {
|
|
6610
|
+
model: "sensevoice-unified",
|
|
6611
|
+
backend: "wasm"
|
|
6612
|
+
});
|
|
6613
|
+
return result;
|
|
6614
|
+
} catch (error) {
|
|
6615
|
+
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
6616
|
+
throw error;
|
|
6617
|
+
}
|
|
6618
|
+
}
|
|
6619
|
+
async transcribe(audioSamples) {
|
|
6620
|
+
if (!this._isLoaded) throw new Error("Model not loaded. Call load() first.");
|
|
6621
|
+
const audio = new Float32Array(audioSamples);
|
|
6622
|
+
return new Promise((resolve, reject) => {
|
|
6623
|
+
this.inferenceQueue = this.inferenceQueue.then(async () => {
|
|
6624
|
+
try {
|
|
6625
|
+
const result = await this.worker.transcribe(audio);
|
|
6626
|
+
resolve(result);
|
|
6627
|
+
} catch (err) {
|
|
6628
|
+
reject(err);
|
|
6629
|
+
}
|
|
6630
|
+
});
|
|
6631
|
+
});
|
|
6632
|
+
}
|
|
6633
|
+
async dispose() {
|
|
6634
|
+
if (this._isLoaded) {
|
|
6635
|
+
await this.worker.disposeSenseVoice();
|
|
6636
|
+
this._isLoaded = false;
|
|
6637
|
+
}
|
|
6638
|
+
}
|
|
6639
|
+
};
|
|
6640
|
+
var Wav2ArkitCpuUnifiedAdapter = class {
|
|
6641
|
+
constructor(worker, config) {
|
|
6642
|
+
this.modelId = "wav2arkit_cpu";
|
|
6643
|
+
this._isLoaded = false;
|
|
6644
|
+
this.inferenceQueue = Promise.resolve();
|
|
6645
|
+
this.worker = worker;
|
|
6646
|
+
this.config = config;
|
|
6647
|
+
}
|
|
6648
|
+
get isLoaded() {
|
|
6649
|
+
return this._isLoaded;
|
|
6650
|
+
}
|
|
6651
|
+
get backend() {
|
|
6652
|
+
return this._isLoaded ? "wasm" : null;
|
|
6653
|
+
}
|
|
6654
|
+
async load() {
|
|
6655
|
+
const telemetry = getTelemetry();
|
|
6656
|
+
const span = telemetry?.startSpan("Wav2ArkitCpuUnifiedAdapter.load", {
|
|
6657
|
+
"model.url": this.config.modelUrl
|
|
6658
|
+
});
|
|
6659
|
+
try {
|
|
6660
|
+
const externalDataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${this.config.modelUrl}.data` : null;
|
|
6661
|
+
const result = await this.worker.loadLipSync({
|
|
6662
|
+
modelUrl: this.config.modelUrl,
|
|
6663
|
+
externalDataUrl: externalDataUrl || null
|
|
6664
|
+
});
|
|
6665
|
+
this._isLoaded = true;
|
|
6666
|
+
logger6.info("Wav2ArkitCpu loaded via unified worker", {
|
|
6667
|
+
backend: "wasm",
|
|
6668
|
+
loadTimeMs: Math.round(result.loadTimeMs)
|
|
6669
|
+
});
|
|
6670
|
+
span?.setAttributes({ "model.backend": "wasm", "model.load_time_ms": result.loadTimeMs });
|
|
6671
|
+
span?.end();
|
|
6672
|
+
telemetry?.recordHistogram("omote.model.load_time", result.loadTimeMs, {
|
|
6673
|
+
model: "wav2arkit_cpu-unified",
|
|
6674
|
+
backend: "wasm"
|
|
6675
|
+
});
|
|
6676
|
+
return result;
|
|
6677
|
+
} catch (error) {
|
|
6678
|
+
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
6679
|
+
throw error;
|
|
6680
|
+
}
|
|
6681
|
+
}
|
|
6682
|
+
async infer(audioSamples, _identityIndex) {
|
|
6683
|
+
if (!this._isLoaded) throw new Error("Model not loaded. Call load() first.");
|
|
6684
|
+
const audioCopy = new Float32Array(audioSamples);
|
|
6685
|
+
return new Promise((resolve, reject) => {
|
|
6686
|
+
this.inferenceQueue = this.inferenceQueue.then(async () => {
|
|
6687
|
+
const telemetry = getTelemetry();
|
|
6688
|
+
const span = telemetry?.startSpan("Wav2ArkitCpuUnifiedAdapter.infer", {
|
|
6689
|
+
"inference.input_samples": audioCopy.length
|
|
6690
|
+
});
|
|
6691
|
+
try {
|
|
6692
|
+
const startTime = performance.now();
|
|
6693
|
+
const result = await this.worker.inferLipSync(audioCopy);
|
|
6694
|
+
const inferenceTimeMs = performance.now() - startTime;
|
|
6695
|
+
const flatBuffer = result.blendshapes;
|
|
6696
|
+
const { numFrames, numBlendshapes } = result;
|
|
6697
|
+
const blendshapes = [];
|
|
6698
|
+
for (let f = 0; f < numFrames; f++) {
|
|
6699
|
+
blendshapes.push(flatBuffer.slice(f * numBlendshapes, (f + 1) * numBlendshapes));
|
|
6700
|
+
}
|
|
6701
|
+
span?.setAttributes({
|
|
6702
|
+
"inference.duration_ms": inferenceTimeMs,
|
|
6703
|
+
"inference.frames": numFrames
|
|
6704
|
+
});
|
|
6705
|
+
span?.end();
|
|
6706
|
+
resolve({ blendshapes, numFrames, inferenceTimeMs });
|
|
6707
|
+
} catch (err) {
|
|
6708
|
+
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
6709
|
+
reject(err);
|
|
6710
|
+
}
|
|
6711
|
+
});
|
|
6712
|
+
});
|
|
6713
|
+
}
|
|
6714
|
+
async dispose() {
|
|
6715
|
+
if (this._isLoaded) {
|
|
6716
|
+
await this.worker.disposeLipSync();
|
|
6717
|
+
this._isLoaded = false;
|
|
6718
|
+
}
|
|
6719
|
+
}
|
|
6720
|
+
};
|
|
6721
|
+
var SileroVADUnifiedAdapter = class {
|
|
6722
|
+
constructor(worker, config) {
|
|
6723
|
+
this._isLoaded = false;
|
|
6724
|
+
// Inference queue
|
|
6725
|
+
this.inferenceQueue = Promise.resolve();
|
|
6726
|
+
// Pre-speech buffer
|
|
6727
|
+
this.preSpeechBuffer = [];
|
|
6728
|
+
this.wasSpeaking = false;
|
|
6729
|
+
this.worker = worker;
|
|
6730
|
+
const sr = config.sampleRate ?? 16e3;
|
|
6731
|
+
this.config = {
|
|
6732
|
+
modelUrl: config.modelUrl,
|
|
6733
|
+
backend: config.backend ?? "wasm",
|
|
6734
|
+
sampleRate: sr,
|
|
6735
|
+
threshold: config.threshold ?? 0.5,
|
|
6736
|
+
preSpeechBufferChunks: config.preSpeechBufferChunks ?? 10
|
|
6737
|
+
};
|
|
6738
|
+
this.chunkSize = sr === 16e3 ? 512 : 256;
|
|
6739
|
+
this.contextSize = sr === 16e3 ? 64 : 32;
|
|
6740
|
+
this.state = new Float32Array(2 * 1 * 128);
|
|
6741
|
+
this.context = new Float32Array(this.contextSize);
|
|
6742
|
+
}
|
|
6743
|
+
get isLoaded() {
|
|
6744
|
+
return this._isLoaded;
|
|
6745
|
+
}
|
|
6746
|
+
get backend() {
|
|
6747
|
+
return this._isLoaded ? "wasm" : null;
|
|
6748
|
+
}
|
|
6749
|
+
get sampleRate() {
|
|
6750
|
+
return this.config.sampleRate;
|
|
6751
|
+
}
|
|
6752
|
+
get threshold() {
|
|
6753
|
+
return this.config.threshold;
|
|
6754
|
+
}
|
|
6755
|
+
getChunkSize() {
|
|
6756
|
+
return this.chunkSize;
|
|
6757
|
+
}
|
|
6758
|
+
getChunkDurationMs() {
|
|
6759
|
+
return this.chunkSize / this.config.sampleRate * 1e3;
|
|
6760
|
+
}
|
|
6761
|
+
async load() {
|
|
6762
|
+
const telemetry = getTelemetry();
|
|
6763
|
+
const span = telemetry?.startSpan("SileroVADUnifiedAdapter.load", {
|
|
6764
|
+
"model.url": this.config.modelUrl
|
|
6765
|
+
});
|
|
6766
|
+
try {
|
|
6767
|
+
const result = await this.worker.loadVAD({
|
|
6768
|
+
modelUrl: this.config.modelUrl,
|
|
6769
|
+
sampleRate: this.config.sampleRate
|
|
6770
|
+
});
|
|
6771
|
+
this._isLoaded = true;
|
|
6772
|
+
logger6.info("SileroVAD loaded via unified worker", {
|
|
6773
|
+
backend: "wasm",
|
|
6774
|
+
loadTimeMs: Math.round(result.loadTimeMs),
|
|
6775
|
+
sampleRate: this.config.sampleRate,
|
|
6776
|
+
chunkSize: this.chunkSize
|
|
6777
|
+
});
|
|
6778
|
+
span?.setAttributes({ "model.backend": "wasm", "model.load_time_ms": result.loadTimeMs });
|
|
6779
|
+
span?.end();
|
|
6780
|
+
telemetry?.recordHistogram("omote.model.load_time", result.loadTimeMs, {
|
|
6781
|
+
model: "silero-vad-unified",
|
|
6782
|
+
backend: "wasm"
|
|
6783
|
+
});
|
|
6784
|
+
return result;
|
|
6785
|
+
} catch (error) {
|
|
6786
|
+
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
6787
|
+
throw error;
|
|
6788
|
+
}
|
|
6789
|
+
}
|
|
6790
|
+
async process(audioChunk) {
|
|
6791
|
+
if (!this._isLoaded) throw new Error("Model not loaded. Call load() first.");
|
|
6792
|
+
if (audioChunk.length !== this.chunkSize) {
|
|
6793
|
+
throw new Error(
|
|
6794
|
+
`Audio chunk must be exactly ${this.chunkSize} samples (got ${audioChunk.length}). Use getChunkSize() to get required size.`
|
|
6795
|
+
);
|
|
6796
|
+
}
|
|
6797
|
+
const audioChunkCopy = new Float32Array(audioChunk);
|
|
6798
|
+
return new Promise((resolve, reject) => {
|
|
6799
|
+
this.inferenceQueue = this.inferenceQueue.then(async () => {
|
|
6800
|
+
try {
|
|
6801
|
+
const startTime = performance.now();
|
|
6802
|
+
const result = await this.worker.processVAD(audioChunkCopy, this.state, this.context);
|
|
6803
|
+
this.state = result.state;
|
|
6804
|
+
this.context = audioChunkCopy.slice(-this.contextSize);
|
|
6805
|
+
const inferenceTimeMs = performance.now() - startTime;
|
|
6806
|
+
const isSpeech = result.probability > this.config.threshold;
|
|
6807
|
+
let preSpeechChunks;
|
|
6808
|
+
if (isSpeech && !this.wasSpeaking) {
|
|
6809
|
+
preSpeechChunks = [...this.preSpeechBuffer];
|
|
6810
|
+
this.preSpeechBuffer = [];
|
|
6811
|
+
} else if (!isSpeech && !this.wasSpeaking) {
|
|
6812
|
+
this.preSpeechBuffer.push(new Float32Array(audioChunkCopy));
|
|
6813
|
+
if (this.preSpeechBuffer.length > this.config.preSpeechBufferChunks) {
|
|
6814
|
+
this.preSpeechBuffer.shift();
|
|
6815
|
+
}
|
|
6816
|
+
} else if (!isSpeech && this.wasSpeaking) {
|
|
6817
|
+
this.preSpeechBuffer = [];
|
|
6818
|
+
}
|
|
6819
|
+
this.wasSpeaking = isSpeech;
|
|
6820
|
+
resolve({
|
|
6821
|
+
probability: result.probability,
|
|
6822
|
+
isSpeech,
|
|
6823
|
+
inferenceTimeMs,
|
|
6824
|
+
preSpeechChunks
|
|
6825
|
+
});
|
|
6826
|
+
} catch (err) {
|
|
6827
|
+
reject(err);
|
|
6828
|
+
}
|
|
6829
|
+
});
|
|
6830
|
+
});
|
|
6831
|
+
}
|
|
6832
|
+
async reset() {
|
|
6833
|
+
if (!this._isLoaded) throw new Error("Model not loaded. Call load() first.");
|
|
6834
|
+
const newState = await this.worker.resetVAD();
|
|
6835
|
+
this.state = newState;
|
|
6836
|
+
this.context = new Float32Array(this.contextSize);
|
|
6837
|
+
this.preSpeechBuffer = [];
|
|
6838
|
+
this.wasSpeaking = false;
|
|
6839
|
+
}
|
|
6840
|
+
async dispose() {
|
|
6841
|
+
if (this._isLoaded) {
|
|
6842
|
+
await this.worker.disposeVAD();
|
|
6843
|
+
this._isLoaded = false;
|
|
6844
|
+
}
|
|
6845
|
+
this.state = new Float32Array(2 * 1 * 128);
|
|
6846
|
+
this.context = new Float32Array(this.contextSize);
|
|
6847
|
+
this.preSpeechBuffer = [];
|
|
6848
|
+
this.wasSpeaking = false;
|
|
6849
|
+
}
|
|
6850
|
+
};
|
|
6851
|
+
|
|
6852
|
+
// src/inference/createSenseVoice.ts
|
|
6853
|
+
var logger7 = createLogger("createSenseVoice");
|
|
6854
|
+
function createSenseVoice(config) {
|
|
6855
|
+
if (config.unifiedWorker) {
|
|
6856
|
+
logger7.info("Creating SenseVoiceUnifiedAdapter (shared unified worker)");
|
|
6857
|
+
return new SenseVoiceUnifiedAdapter(config.unifiedWorker, {
|
|
6858
|
+
modelUrl: config.modelUrl,
|
|
6859
|
+
tokensUrl: config.tokensUrl,
|
|
6860
|
+
language: config.language,
|
|
6861
|
+
textNorm: config.textNorm
|
|
6862
|
+
});
|
|
6863
|
+
}
|
|
6864
|
+
const useWorker = config.useWorker ?? "auto";
|
|
6865
|
+
if (useWorker === true) {
|
|
6866
|
+
if (!SenseVoiceWorker.isSupported()) {
|
|
6867
|
+
throw new Error("Web Workers are not supported in this environment");
|
|
6868
|
+
}
|
|
6869
|
+
logger7.info("Creating SenseVoiceWorker (off-main-thread)");
|
|
6870
|
+
return new SenseVoiceWorker({
|
|
6871
|
+
modelUrl: config.modelUrl,
|
|
6872
|
+
tokensUrl: config.tokensUrl,
|
|
6873
|
+
language: config.language,
|
|
6874
|
+
textNorm: config.textNorm
|
|
6875
|
+
});
|
|
6876
|
+
}
|
|
6877
|
+
if (useWorker === false) {
|
|
6878
|
+
logger7.info("Creating SenseVoiceInference (main thread)");
|
|
6879
|
+
return new SenseVoiceInference({
|
|
6880
|
+
modelUrl: config.modelUrl,
|
|
6881
|
+
tokensUrl: config.tokensUrl,
|
|
6882
|
+
language: config.language,
|
|
6883
|
+
textNorm: config.textNorm
|
|
6884
|
+
});
|
|
6885
|
+
}
|
|
6886
|
+
if (SenseVoiceWorker.isSupported() && !isIOS()) {
|
|
6887
|
+
logger7.info("Auto-detected: creating SenseVoiceWorker (off-main-thread)");
|
|
6888
|
+
return new SenseVoiceWorker({
|
|
6889
|
+
modelUrl: config.modelUrl,
|
|
6890
|
+
tokensUrl: config.tokensUrl,
|
|
6891
|
+
language: config.language,
|
|
6892
|
+
textNorm: config.textNorm
|
|
6893
|
+
});
|
|
6894
|
+
}
|
|
6895
|
+
logger7.info("Auto-detected: creating SenseVoiceInference (main thread)", {
|
|
6896
|
+
reason: isIOS() ? "iOS (shared ORT instance)" : "Worker unsupported"
|
|
6897
|
+
});
|
|
6898
|
+
return new SenseVoiceInference({
|
|
6899
|
+
modelUrl: config.modelUrl,
|
|
6900
|
+
tokensUrl: config.tokensUrl,
|
|
6901
|
+
language: config.language,
|
|
6902
|
+
textNorm: config.textNorm
|
|
6903
|
+
});
|
|
6904
|
+
}
|
|
6905
|
+
|
|
6906
|
+
// src/inference/Wav2ArkitCpuInference.ts
|
|
6907
|
+
var logger8 = createLogger("Wav2ArkitCpu");
|
|
6908
|
+
var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
6909
|
+
constructor(config) {
|
|
6910
|
+
this.modelId = "wav2arkit_cpu";
|
|
6911
|
+
this.session = null;
|
|
6912
|
+
this.ort = null;
|
|
6913
|
+
this._backend = "wasm";
|
|
6914
|
+
this.isLoading = false;
|
|
6915
|
+
// Inference queue for handling concurrent calls
|
|
6916
|
+
this.inferenceQueue = Promise.resolve();
|
|
6917
|
+
// Session health: set to true if session.run() times out.
|
|
6918
|
+
// A timed-out session may have a zombie WASM dispatch still running,
|
|
6919
|
+
// so all future infer() calls reject immediately to prevent concurrent access.
|
|
6920
|
+
this.poisoned = false;
|
|
6921
|
+
this.config = config;
|
|
6922
|
+
}
|
|
6923
|
+
get backend() {
|
|
6924
|
+
return this.session ? this._backend : null;
|
|
6925
|
+
}
|
|
6926
|
+
get isLoaded() {
|
|
6927
|
+
return this.session !== null;
|
|
6928
|
+
}
|
|
6929
|
+
/**
|
|
6930
|
+
* Load the ONNX model
|
|
6931
|
+
*/
|
|
6932
|
+
async load() {
|
|
6933
|
+
if (this.isLoading) {
|
|
6934
|
+
throw new Error("Model is already loading");
|
|
6935
|
+
}
|
|
6936
|
+
if (this.session) {
|
|
6937
|
+
throw new Error("Model already loaded. Call dispose() first.");
|
|
6938
|
+
}
|
|
6939
|
+
this.isLoading = true;
|
|
6940
|
+
const startTime = performance.now();
|
|
6941
|
+
const telemetry = getTelemetry();
|
|
6942
|
+
const span = telemetry?.startSpan("Wav2ArkitCpu.load", {
|
|
6943
|
+
"model.url": this.config.modelUrl,
|
|
6944
|
+
"model.backend_requested": this.config.backend || "wasm"
|
|
6945
|
+
});
|
|
6946
|
+
try {
|
|
6947
|
+
const preference = this.config.backend || "wasm";
|
|
6948
|
+
logger8.info("Loading ONNX Runtime...", { preference });
|
|
6949
|
+
const { ort, backend } = await getOnnxRuntimeForPreference(preference);
|
|
6950
|
+
this.ort = ort;
|
|
6951
|
+
this._backend = backend;
|
|
6952
|
+
logger8.info("ONNX Runtime loaded", { backend: this._backend });
|
|
6953
|
+
const modelUrl = this.config.modelUrl;
|
|
6954
|
+
const dataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${modelUrl}.data` : null;
|
|
6955
|
+
const sessionOptions = getSessionOptions(this._backend);
|
|
6956
|
+
if (isIOS()) {
|
|
6957
|
+
logger8.info("iOS: passing model URLs directly to ORT (low-memory path)", {
|
|
6958
|
+
modelUrl,
|
|
6959
|
+
dataUrl
|
|
6960
|
+
});
|
|
6961
|
+
if (dataUrl) {
|
|
6962
|
+
const dataFilename = dataUrl.split("/").pop();
|
|
6963
|
+
sessionOptions.externalData = [{
|
|
6964
|
+
path: dataFilename,
|
|
6965
|
+
data: dataUrl
|
|
6966
|
+
// URL string — ORT fetches directly into WASM
|
|
6967
|
+
}];
|
|
6968
|
+
}
|
|
6969
|
+
this.session = await this.ort.InferenceSession.create(modelUrl, sessionOptions);
|
|
6970
|
+
} else {
|
|
6971
|
+
const cache = getModelCache();
|
|
6972
|
+
const isCached = await cache.has(modelUrl);
|
|
6973
|
+
let modelBuffer;
|
|
6974
|
+
if (isCached) {
|
|
6975
|
+
logger8.debug("Loading model from cache", { modelUrl });
|
|
6976
|
+
modelBuffer = await cache.get(modelUrl);
|
|
6977
|
+
if (!modelBuffer) {
|
|
6978
|
+
logger8.warn("Cache corruption detected, clearing and retrying", { modelUrl });
|
|
6979
|
+
await cache.delete(modelUrl);
|
|
6980
|
+
modelBuffer = await fetchWithCache(modelUrl);
|
|
6981
|
+
}
|
|
6982
|
+
} else {
|
|
6983
|
+
logger8.debug("Fetching and caching model graph", { modelUrl });
|
|
6984
|
+
modelBuffer = await fetchWithCache(modelUrl);
|
|
6985
|
+
}
|
|
6986
|
+
if (!modelBuffer) {
|
|
6987
|
+
throw new Error(`Failed to load model: ${modelUrl}`);
|
|
6988
|
+
}
|
|
6989
|
+
let externalDataBuffer = null;
|
|
6990
|
+
if (dataUrl) {
|
|
6991
|
+
try {
|
|
6992
|
+
const isDataCached = await cache.has(dataUrl);
|
|
6993
|
+
if (isDataCached) {
|
|
6994
|
+
logger8.debug("Loading external data from cache", { dataUrl });
|
|
6995
|
+
externalDataBuffer = await cache.get(dataUrl);
|
|
6996
|
+
if (!externalDataBuffer) {
|
|
6997
|
+
logger8.warn("Cache corruption for external data, retrying", { dataUrl });
|
|
6998
|
+
await cache.delete(dataUrl);
|
|
6999
|
+
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
7000
|
+
}
|
|
7001
|
+
} else {
|
|
7002
|
+
logger8.info("Fetching external model data", {
|
|
7003
|
+
dataUrl,
|
|
7004
|
+
note: "This may be a large download (400MB+)"
|
|
7005
|
+
});
|
|
7006
|
+
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
7007
|
+
}
|
|
7008
|
+
logger8.info("External data loaded", {
|
|
7009
|
+
size: formatBytes(externalDataBuffer.byteLength)
|
|
7010
|
+
});
|
|
7011
|
+
} catch (err) {
|
|
7012
|
+
logger8.debug("No external data file found (single-file model)", {
|
|
7013
|
+
dataUrl,
|
|
7014
|
+
error: err.message
|
|
7015
|
+
});
|
|
7016
|
+
}
|
|
7017
|
+
}
|
|
7018
|
+
logger8.debug("Creating ONNX session", {
|
|
7019
|
+
graphSize: formatBytes(modelBuffer.byteLength),
|
|
7020
|
+
externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
|
|
7021
|
+
backend: this._backend
|
|
7022
|
+
});
|
|
7023
|
+
if (externalDataBuffer) {
|
|
7024
|
+
const dataFilename = dataUrl.split("/").pop();
|
|
7025
|
+
sessionOptions.externalData = [{
|
|
7026
|
+
path: dataFilename,
|
|
7027
|
+
data: new Uint8Array(externalDataBuffer)
|
|
7028
|
+
}];
|
|
7029
|
+
}
|
|
7030
|
+
const modelData = new Uint8Array(modelBuffer);
|
|
7031
|
+
this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
|
|
7032
|
+
}
|
|
7033
|
+
const loadTimeMs = performance.now() - startTime;
|
|
7034
|
+
logger8.info("Model loaded successfully", {
|
|
7035
|
+
backend: this._backend,
|
|
7036
|
+
loadTimeMs: Math.round(loadTimeMs),
|
|
7037
|
+
inputs: this.session.inputNames,
|
|
7038
|
+
outputs: this.session.outputNames
|
|
7039
|
+
});
|
|
7040
|
+
span?.setAttributes({
|
|
7041
|
+
"model.backend": this._backend,
|
|
7042
|
+
"model.load_time_ms": loadTimeMs,
|
|
7043
|
+
"model.cached": !isIOS()
|
|
7044
|
+
});
|
|
7045
|
+
span?.end();
|
|
7046
|
+
telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
|
|
7047
|
+
model: "wav2arkit_cpu",
|
|
7048
|
+
backend: this._backend
|
|
7049
|
+
});
|
|
7050
|
+
logger8.debug("Running warmup inference");
|
|
7051
|
+
const warmupStart = performance.now();
|
|
7052
|
+
const silentAudio = new Float32Array(16e3);
|
|
7053
|
+
await this.infer(silentAudio);
|
|
7054
|
+
const warmupTimeMs = performance.now() - warmupStart;
|
|
7055
|
+
logger8.info("Warmup inference complete", {
|
|
7056
|
+
warmupTimeMs: Math.round(warmupTimeMs),
|
|
7057
|
+
backend: this._backend
|
|
7058
|
+
});
|
|
7059
|
+
telemetry?.recordHistogram("omote.model.warmup_time", warmupTimeMs, {
|
|
7060
|
+
model: "wav2arkit_cpu",
|
|
7061
|
+
backend: this._backend
|
|
7062
|
+
});
|
|
7063
|
+
return {
|
|
7064
|
+
backend: this._backend,
|
|
7065
|
+
loadTimeMs,
|
|
7066
|
+
inputNames: [...this.session.inputNames],
|
|
7067
|
+
outputNames: [...this.session.outputNames]
|
|
7068
|
+
};
|
|
7069
|
+
} catch (error) {
|
|
7070
|
+
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
7071
|
+
telemetry?.incrementCounter("omote.errors.total", 1, {
|
|
7072
|
+
model: "wav2arkit_cpu",
|
|
7073
|
+
error_type: "load_failed"
|
|
7074
|
+
});
|
|
7075
|
+
throw error;
|
|
7076
|
+
} finally {
|
|
7077
|
+
this.isLoading = false;
|
|
7078
|
+
}
|
|
7079
|
+
}
|
|
7080
|
+
/**
|
|
7081
|
+
* Run inference on raw audio
|
|
7082
|
+
*
|
|
7083
|
+
* Accepts variable-length audio (not fixed to 16000 samples).
|
|
7084
|
+
* Output frames = ceil(30 * numSamples / 16000).
|
|
7085
|
+
*
|
|
7086
|
+
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
7087
|
+
* @param _identityIndex - Ignored (identity 11 is baked into the model)
|
|
7088
|
+
*/
|
|
7089
|
+
async infer(audioSamples, _identityIndex) {
|
|
7090
|
+
if (!this.session) {
|
|
7091
|
+
throw new Error("Model not loaded. Call load() first.");
|
|
7092
|
+
}
|
|
7093
|
+
if (this.poisoned) {
|
|
7094
|
+
throw new Error("Wav2ArkitCpu session timed out \u2014 inference unavailable until page reload");
|
|
7095
|
+
}
|
|
7096
|
+
const audioCopy = new Float32Array(audioSamples);
|
|
7097
|
+
const feeds = {
|
|
7098
|
+
"audio_waveform": new this.ort.Tensor("float32", audioCopy, [1, audioCopy.length])
|
|
7099
|
+
};
|
|
7100
|
+
return this.queueInference(feeds, audioCopy.length);
|
|
7101
|
+
}
|
|
7102
|
+
/**
 * Queue inference to serialize ONNX session calls
 *
 * Chains the run onto `this.inferenceQueue` so only one `session.run()` is
 * ever in flight at a time. Each run races against a hard timeout
 * (`INFERENCE_TIMEOUT_MS`, 5s); on timeout the instance is marked
 * `poisoned` so all later calls fail fast, because the losing WASM dispatch
 * may still be executing and concurrent session access would be unsafe.
 *
 * Resolves with `{ blendshapes, numFrames, inferenceTimeMs }` where
 * `blendshapes` is an array of per-frame Float32Arrays, each symmetrized
 * via `symmetrizeBlendshapes`.
 */
queueInference(feeds, inputSamples) {
  return new Promise((resolve, reject) => {
    this.inferenceQueue = this.inferenceQueue.then(async () => {
      const telemetry = getTelemetry();
      const span = telemetry?.startSpan("Wav2ArkitCpu.infer", {
        "inference.backend": this._backend,
        "inference.input_samples": inputSamples
      });
      try {
        const startTime = performance.now();
        let timeoutId;
        // Race the real run against a timer; the winner clears the timer so a
        // successful run does not leave a pending rejection behind.
        const results = await Promise.race([
          this.session.run(feeds).then((r) => {
            clearTimeout(timeoutId);
            return r;
          }),
          new Promise((_, rej) => {
            timeoutId = setTimeout(
              () => rej(new Error(`Wav2ArkitCpu inference timed out after ${_Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS}ms`)),
              _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS
            );
          })
        ]);
        const inferenceTimeMs = performance.now() - startTime;
        const blendshapeOutput = results["blendshapes"];
        if (!blendshapeOutput) {
          throw new Error("Missing blendshapes output from model");
        }
        const blendshapeData = blendshapeOutput.data;
        // Output tensor layout is [batch, frames, blendshapes].
        const numFrames = blendshapeOutput.dims[1];
        const numBlendshapes = blendshapeOutput.dims[2];
        const blendshapes = [];
        for (let f = 0; f < numFrames; f++) {
          // slice() on a typed array copies, so each frame owns its memory.
          const rawFrame = blendshapeData.slice(f * numBlendshapes, (f + 1) * numBlendshapes);
          const symmetrized = symmetrizeBlendshapes(rawFrame);
          blendshapes.push(symmetrized);
        }
        logger8.trace("Inference completed", {
          inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
          numFrames,
          inputSamples
        });
        span?.setAttributes({
          "inference.duration_ms": inferenceTimeMs,
          "inference.frames": numFrames
        });
        span?.end();
        telemetry?.recordHistogram("omote.inference.latency", inferenceTimeMs, {
          model: "wav2arkit_cpu",
          backend: this._backend
        });
        telemetry?.incrementCounter("omote.inference.total", 1, {
          model: "wav2arkit_cpu",
          backend: this._backend,
          status: "success"
        });
        resolve({
          blendshapes,
          numFrames,
          inferenceTimeMs
        });
      } catch (err) {
        const errMsg = err instanceof Error ? err.message : String(err);
        if (errMsg.includes("timed out")) {
          // Timeout: poison the instance; the zombie run may still hold the session.
          this.poisoned = true;
          logger8.error("CRITICAL: Inference session timed out \u2014 Wav2ArkitCpu is dead. Page reload required.", {
            backend: this._backend,
            timeoutMs: _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS
          });
        } else if (typeof err === "number") {
          // ORT's WASM build can throw a bare number (raw C++ exception
          // pointer); wrap it in a real Error so callers get a stack/message.
          const oomError = new Error(
            `Wav2ArkitCpu inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
          );
          logger8.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
            pointer: `0x${err.toString(16)}`,
            backend: this._backend
          });
          span?.endWithError(oomError);
          telemetry?.incrementCounter("omote.inference.total", 1, {
            model: "wav2arkit_cpu",
            backend: this._backend,
            status: "error"
          });
          reject(oomError);
          // Early return: the generic error accounting below must not double-count.
          return;
        } else {
          logger8.error("Inference failed", { error: errMsg, backend: this._backend });
        }
        span?.endWithError(err instanceof Error ? err : new Error(String(err)));
        telemetry?.incrementCounter("omote.inference.total", 1, {
          model: "wav2arkit_cpu",
          backend: this._backend,
          status: "error"
        });
        reject(err);
      }
    });
  });
}
|
|
7204
|
+
/**
|
|
7205
|
+
* Dispose of the model and free resources
|
|
7206
|
+
*/
|
|
7207
|
+
async dispose() {
|
|
7208
|
+
if (this.session) {
|
|
7209
|
+
await this.session.release();
|
|
7210
|
+
this.session = null;
|
|
7211
|
+
}
|
|
7212
|
+
}
|
|
7213
|
+
};
|
|
7214
|
+
// Hard ceiling (5s) for a single session.run(); hitting it marks the instance
// poisoned in queueInference().
_Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS = 5e3;
// Public name for the class expression — presumably a bundler lowering of a
// static class field (the `_Name` / `var Name = _Name` pattern); verify against build config.
var Wav2ArkitCpuInference = _Wav2ArkitCpuInference;
|
|
7216
|
+
|
|
7217
|
+
// src/inference/Wav2ArkitCpuWorker.ts
|
|
7218
|
+
var logger9 = createLogger("Wav2ArkitCpuWorker");
// CDN base: fetched as `ort.wasm.min.js` inside the worker and also assigned
// to ort.env.wasm.wasmPaths so ORT resolves its .wasm binaries from the same place.
var WASM_CDN_PATH4 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
// 60s load budget — the model may pull a large external-data file (the worker
// script notes 400MB+ downloads).
var LOAD_TIMEOUT_MS2 = 6e4;
// 5s per-inference ceiling; a timeout poisons the worker wrapper (see queueInference).
var INFERENCE_TIMEOUT_MS2 = 5e3;
|
|
7222
|
+
/**
 * Resolve a possibly-relative URL to an absolute one.
 *
 * Already-absolute http(s)/blob URLs pass through untouched; anything else is
 * resolved against the page origin (falling back to a localhost placeholder
 * when no `location` global exists). On any URL-parse failure the input is
 * returned unchanged.
 */
function resolveUrl3(url) {
  const isAbsolute = /^https?:\/\//i.test(url) || /^blob:/i.test(url);
  if (isAbsolute) {
    return url;
  }
  const base = globalThis.location?.origin ?? "https://localhost";
  try {
    return new URL(url, base).href;
  } catch {
    return url;
  }
}
|
|
7230
|
+
var WORKER_SCRIPT3 = `
|
|
7231
|
+
// Wav2ArkitCpu Worker Script
|
|
7232
|
+
// Loaded via Blob URL - no separate file needed
|
|
7233
|
+
|
|
7234
|
+
var ort = null;
|
|
7235
|
+
var session = null;
|
|
7236
|
+
|
|
7237
|
+
// Precomputed symmetric index pairs from LAM_BLENDSHAPES alphabetical ordering
|
|
7238
|
+
// Used to average left/right blendshape pairs for symmetrized output
|
|
7239
|
+
const SYMMETRIC_INDEX_PAIRS = [
|
|
7240
|
+
[23, 25], // jawLeft, jawRight
|
|
7241
|
+
[32, 38], // mouthLeft, mouthRight
|
|
7242
|
+
[43, 44], // mouthSmileLeft, mouthSmileRight
|
|
7243
|
+
[29, 30], // mouthFrownLeft, mouthFrownRight
|
|
7244
|
+
[27, 28], // mouthDimpleLeft, mouthDimpleRight
|
|
7245
|
+
[45, 46], // mouthStretchLeft, mouthStretchRight
|
|
7246
|
+
[35, 36], // mouthPressLeft, mouthPressRight
|
|
7247
|
+
[47, 48], // mouthUpperUpLeft, mouthUpperUpRight
|
|
7248
|
+
[33, 34], // mouthLowerDownLeft, mouthLowerDownRight
|
|
7249
|
+
[49, 50], // noseSneerLeft, noseSneerRight
|
|
7250
|
+
[6, 7], // cheekSquintLeft, cheekSquintRight
|
|
7251
|
+
[0, 1], // browDownLeft, browDownRight
|
|
7252
|
+
[3, 4], // browOuterUpLeft, browOuterUpRight
|
|
7253
|
+
[8, 9], // eyeBlinkLeft, eyeBlinkRight
|
|
7254
|
+
[16, 17], // eyeLookUpLeft, eyeLookUpRight
|
|
7255
|
+
[10, 11], // eyeLookDownLeft, eyeLookDownRight
|
|
7256
|
+
[12, 13], // eyeLookInLeft, eyeLookInRight
|
|
7257
|
+
[14, 15], // eyeLookOutLeft, eyeLookOutRight
|
|
7258
|
+
[18, 19], // eyeSquintLeft, eyeSquintRight
|
|
7259
|
+
[20, 21], // eyeWideLeft, eyeWideRight
|
|
7260
|
+
];
|
|
7261
|
+
|
|
7262
|
+
/**
|
|
7263
|
+
* Symmetrize blendshapes by averaging left/right pairs
|
|
7264
|
+
* Inlined from blendshapeUtils.ts for worker context
|
|
7265
|
+
*/
|
|
7266
|
+
function symmetrizeBlendshapes(frame) {
|
|
7267
|
+
const result = new Float32Array(frame);
|
|
7268
|
+
for (const [lIdx, rIdx] of SYMMETRIC_INDEX_PAIRS) {
|
|
7269
|
+
const avg = (frame[lIdx] + frame[rIdx]) / 2;
|
|
7270
|
+
result[lIdx] = avg;
|
|
7271
|
+
result[rIdx] = avg;
|
|
7272
|
+
}
|
|
7273
|
+
return result;
|
|
7274
|
+
}
|
|
7275
|
+
|
|
7276
|
+
/**
|
|
7277
|
+
* Load ONNX Runtime from CDN
|
|
7278
|
+
*/
|
|
7279
|
+
async function loadOrt(wasmPaths) {
|
|
7280
|
+
if (ort) return;
|
|
7281
|
+
|
|
7282
|
+
// Import ONNX Runtime from CDN
|
|
7283
|
+
const ortUrl = wasmPaths + 'ort.wasm.min.js';
|
|
7284
|
+
|
|
7285
|
+
// Load the script by fetching and executing it
|
|
7286
|
+
const response = await fetch(ortUrl);
|
|
7287
|
+
const scriptText = await response.text();
|
|
7288
|
+
|
|
7289
|
+
// Create a blob URL for the script
|
|
7290
|
+
const blob = new Blob([scriptText], { type: 'application/javascript' });
|
|
7291
|
+
const blobUrl = URL.createObjectURL(blob);
|
|
7292
|
+
|
|
7293
|
+
// Import the module
|
|
7294
|
+
importScripts(blobUrl);
|
|
7295
|
+
URL.revokeObjectURL(blobUrl);
|
|
7296
|
+
|
|
7297
|
+
// ort is now available as global
|
|
7298
|
+
ort = self.ort;
|
|
7299
|
+
|
|
7300
|
+
// Configure WASM settings
|
|
7301
|
+
ort.env.wasm.wasmPaths = wasmPaths;
|
|
7302
|
+
ort.env.wasm.numThreads = 1; // Single thread in worker
|
|
7303
|
+
ort.env.wasm.simd = true;
|
|
7304
|
+
ort.env.wasm.proxy = false; // No proxy in worker
|
|
7305
|
+
}
|
|
7306
|
+
|
|
7307
|
+
/**
|
|
7308
|
+
* Load the wav2arkit_cpu model
|
|
7309
|
+
*/
|
|
7310
|
+
async function loadModel(modelUrl, externalDataUrl, isIOS) {
|
|
7311
|
+
const sessionOptions = {
|
|
7312
|
+
executionProviders: ['wasm'],
|
|
7313
|
+
graphOptimizationLevel: 'all',
|
|
7314
|
+
};
|
|
7315
|
+
|
|
7316
|
+
const dataFilename = externalDataUrl ? externalDataUrl.split('/').pop() : null;
|
|
7317
|
+
|
|
7318
|
+
if (isIOS) {
|
|
7319
|
+
// iOS: Pass URLs directly to ORT to avoid loading 402MB into JS heap.
|
|
7320
|
+
// ORT fetches externally into WASM memory, cutting peak JS memory from
|
|
7321
|
+
// ~800MB to ~2MB (just the graph).
|
|
7322
|
+
if (externalDataUrl && dataFilename) {
|
|
7323
|
+
sessionOptions.externalData = [{ path: dataFilename, data: externalDataUrl }];
|
|
7324
|
+
}
|
|
7325
|
+
session = await ort.InferenceSession.create(modelUrl, sessionOptions);
|
|
7326
|
+
} else {
|
|
7327
|
+
// Desktop: fetch model graph as ArrayBuffer
|
|
7328
|
+
const graphResponse = await fetch(modelUrl);
|
|
7329
|
+
if (!graphResponse.ok) {
|
|
7330
|
+
throw new Error('Failed to fetch model graph: ' + graphResponse.status + ' ' + graphResponse.statusText);
|
|
7331
|
+
}
|
|
7332
|
+
const graphBuffer = await graphResponse.arrayBuffer();
|
|
7333
|
+
|
|
7334
|
+
// Fetch external data file if present
|
|
7335
|
+
if (externalDataUrl && dataFilename) {
|
|
7336
|
+
const dataResponse = await fetch(externalDataUrl);
|
|
7337
|
+
if (!dataResponse.ok) {
|
|
7338
|
+
throw new Error('Failed to fetch external data: ' + dataResponse.status + ' ' + dataResponse.statusText);
|
|
7339
|
+
}
|
|
7340
|
+
const dataBuffer = await dataResponse.arrayBuffer();
|
|
7341
|
+
sessionOptions.externalData = [{ path: dataFilename, data: new Uint8Array(dataBuffer) }];
|
|
7342
|
+
}
|
|
7343
|
+
|
|
7344
|
+
session = await ort.InferenceSession.create(new Uint8Array(graphBuffer), sessionOptions);
|
|
7345
|
+
}
|
|
7346
|
+
|
|
7347
|
+
// Warmup inference with 16000 silent samples
|
|
7348
|
+
const warmupAudio = new Float32Array(16000);
|
|
7349
|
+
const warmupTensor = new ort.Tensor('float32', warmupAudio, [1, warmupAudio.length]);
|
|
7350
|
+
await session.run({ audio_waveform: warmupTensor });
|
|
7351
|
+
|
|
7352
|
+
return {
|
|
7353
|
+
inputNames: session.inputNames.slice(),
|
|
7354
|
+
outputNames: session.outputNames.slice(),
|
|
7355
|
+
};
|
|
7356
|
+
}
|
|
7357
|
+
|
|
7358
|
+
/**
|
|
7359
|
+
* Run lip sync inference
|
|
7360
|
+
*/
|
|
7361
|
+
async function runInference(audio) {
|
|
7362
|
+
const tensor = new ort.Tensor('float32', audio, [1, audio.length]);
|
|
7363
|
+
const results = await session.run({ audio_waveform: tensor });
|
|
7364
|
+
|
|
7365
|
+
const blendshapeOutput = results['blendshapes'];
|
|
7366
|
+
if (!blendshapeOutput) {
|
|
7367
|
+
throw new Error('Missing blendshapes output from model');
|
|
7368
|
+
}
|
|
7369
|
+
|
|
7370
|
+
const blendshapeData = blendshapeOutput.data;
|
|
7371
|
+
const numFrames = blendshapeOutput.dims[1];
|
|
7372
|
+
const numBlendshapes = blendshapeOutput.dims[2];
|
|
7373
|
+
|
|
7374
|
+
// Symmetrize each frame and flatten into a single Float32Array for transfer
|
|
7375
|
+
const flatBuffer = new Float32Array(numFrames * numBlendshapes);
|
|
7376
|
+
for (let f = 0; f < numFrames; f++) {
|
|
7377
|
+
const offset = f * numBlendshapes;
|
|
7378
|
+
const rawFrame = blendshapeData.slice(offset, offset + numBlendshapes);
|
|
7379
|
+
const symmetrized = symmetrizeBlendshapes(rawFrame);
|
|
7380
|
+
flatBuffer.set(symmetrized, offset);
|
|
7381
|
+
}
|
|
7382
|
+
|
|
7383
|
+
return { flatBuffer, numFrames, numBlendshapes };
|
|
7384
|
+
}
|
|
7385
|
+
|
|
7386
|
+
// Message handler
|
|
7387
|
+
self.onmessage = async function(e) {
|
|
7388
|
+
const msg = e.data;
|
|
7389
|
+
|
|
7390
|
+
try {
|
|
7391
|
+
switch (msg.type) {
|
|
7392
|
+
case 'load': {
|
|
7393
|
+
const startTime = performance.now();
|
|
7394
|
+
await loadOrt(msg.wasmPaths);
|
|
7395
|
+
const { inputNames, outputNames } = await loadModel(msg.modelUrl, msg.externalDataUrl, msg.isIOS);
|
|
7396
|
+
const loadTimeMs = performance.now() - startTime;
|
|
7397
|
+
|
|
7398
|
+
self.postMessage({
|
|
7399
|
+
type: 'loaded',
|
|
7400
|
+
inputNames,
|
|
7401
|
+
outputNames,
|
|
7402
|
+
loadTimeMs,
|
|
7403
|
+
});
|
|
7404
|
+
break;
|
|
7405
|
+
}
|
|
7406
|
+
|
|
7407
|
+
case 'infer': {
|
|
7408
|
+
const startTime = performance.now();
|
|
7409
|
+
const { flatBuffer, numFrames, numBlendshapes } = await runInference(msg.audio);
|
|
7410
|
+
const inferenceTimeMs = performance.now() - startTime;
|
|
7411
|
+
|
|
7412
|
+
self.postMessage({
|
|
7413
|
+
type: 'result',
|
|
7414
|
+
blendshapes: flatBuffer,
|
|
7415
|
+
numFrames,
|
|
7416
|
+
numBlendshapes,
|
|
7417
|
+
inferenceTimeMs,
|
|
7418
|
+
}, [flatBuffer.buffer]);
|
|
7419
|
+
break;
|
|
7420
|
+
}
|
|
7421
|
+
|
|
7422
|
+
case 'dispose': {
|
|
7423
|
+
if (session) {
|
|
7424
|
+
await session.release();
|
|
7425
|
+
session = null;
|
|
7426
|
+
}
|
|
7427
|
+
ort = null;
|
|
7428
|
+
self.postMessage({ type: 'disposed' });
|
|
7429
|
+
break;
|
|
7430
|
+
}
|
|
7431
|
+
|
|
7432
|
+
default:
|
|
7433
|
+
self.postMessage({
|
|
7434
|
+
type: 'error',
|
|
7435
|
+
error: 'Unknown message type: ' + msg.type,
|
|
7436
|
+
});
|
|
7437
|
+
}
|
|
7438
|
+
} catch (err) {
|
|
7439
|
+
let errorMessage;
|
|
7440
|
+
if (typeof err === 'number') {
|
|
7441
|
+
// ORT WASM throws raw C++ exception pointers as bare numbers
|
|
7442
|
+
errorMessage = 'ORT WASM C++ exception pointer (0x' + err.toString(16) + ') \u2014 likely OOM';
|
|
7443
|
+
} else {
|
|
7444
|
+
errorMessage = err.message || String(err);
|
|
7445
|
+
}
|
|
7446
|
+
self.postMessage({
|
|
7447
|
+
type: 'error',
|
|
7448
|
+
error: errorMessage,
|
|
7449
|
+
});
|
|
7450
|
+
}
|
|
7451
|
+
};
|
|
7452
|
+
|
|
7453
|
+
// Error handler
|
|
7454
|
+
self.onerror = function(err) {
|
|
7455
|
+
self.postMessage({
|
|
7456
|
+
type: 'error',
|
|
7457
|
+
error: 'Worker error: ' + (err.message || String(err)),
|
|
7458
|
+
});
|
|
7459
|
+
};
|
|
7460
|
+
`;
|
|
7461
|
+
var Wav2ArkitCpuWorker = class {
|
|
7462
|
+
/**
 * @param config - Worker configuration; `modelUrl` is required and
 *   `externalDataUrl` optionally points at the external-data file
 *   (see load() for how it is defaulted/disabled).
 */
constructor(config) {
  this.modelId = "wav2arkit_cpu";
  // Lazily created in load(); null until then and again after dispose().
  this.worker = null;
  this.isLoading = false;
  this._isLoaded = false;
  // Inference queue for serialization
  this.inferenceQueue = Promise.resolve();
  // Session health: set to true if worker inference times out.
  // A timed-out worker may have a zombie WASM dispatch still running,
  // so all future infer() calls reject immediately to prevent concurrent access.
  this.poisoned = false;
  // Pending message handlers
  // Keyed by expected response type ('loaded', 'result', 'disposed', 'error').
  this.pendingResolvers = /* @__PURE__ */ new Map();
  this.config = config;
}
|
|
7477
|
+
// True once load() has completed and dispose() has not since run.
get isLoaded() {
  return this._isLoaded;
}
/**
 * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
 */
get backend() {
  return this._isLoaded ? "wasm" : null;
}
|
|
4743
7486
|
/**
|
|
4744
|
-
*
|
|
7487
|
+
* Create the worker from inline script
|
|
4745
7488
|
*/
|
|
4746
|
-
|
|
7489
|
+
createWorker() {
|
|
7490
|
+
const blob = new Blob([WORKER_SCRIPT3], { type: "application/javascript" });
|
|
7491
|
+
const blobUrl = URL.createObjectURL(blob);
|
|
7492
|
+
const worker = new Worker(blobUrl);
|
|
7493
|
+
URL.revokeObjectURL(blobUrl);
|
|
7494
|
+
worker.onmessage = (event) => {
|
|
7495
|
+
this.handleWorkerMessage(event.data);
|
|
7496
|
+
};
|
|
7497
|
+
worker.onerror = (error) => {
|
|
7498
|
+
logger9.error("Worker error", { error: error.message });
|
|
7499
|
+
for (const [, resolver] of this.pendingResolvers) {
|
|
7500
|
+
resolver.reject(new Error(`Worker error: ${error.message}`));
|
|
7501
|
+
}
|
|
7502
|
+
this.pendingResolvers.clear();
|
|
7503
|
+
};
|
|
7504
|
+
return worker;
|
|
7505
|
+
}
|
|
7506
|
+
/**
|
|
7507
|
+
* Handle messages from worker
|
|
7508
|
+
*/
|
|
7509
|
+
handleWorkerMessage(result) {
|
|
7510
|
+
const resolver = this.pendingResolvers.get(result.type);
|
|
7511
|
+
if (resolver) {
|
|
7512
|
+
this.pendingResolvers.delete(result.type);
|
|
7513
|
+
if (result.type === "error") {
|
|
7514
|
+
resolver.reject(new Error(result.error));
|
|
7515
|
+
} else {
|
|
7516
|
+
resolver.resolve(result);
|
|
7517
|
+
}
|
|
7518
|
+
}
|
|
7519
|
+
}
|
|
7520
|
+
/**
 * Send message to worker and wait for response
 *
 * Registers a resolver under the expected response type plus a shared
 * "error" resolver, then posts the message. Whichever fires first —
 * the expected response, a worker "error" message, or the local timeout —
 * settles the returned promise; clearTimeout guards against late firing.
 *
 * NOTE(review): the "error" entry is not removed when the timeout fires, so
 * a late worker error after a timeout is consumed by a stale handler (its
 * reject of an already-settled promise is a no-op). Confirm this is intended;
 * it also means only one request per response type can be in flight.
 */
sendMessage(message, expectedType, timeoutMs) {
  return new Promise((resolve, reject) => {
    if (!this.worker) {
      reject(new Error("Worker not initialized"));
      return;
    }
    // Local watchdog: unregister and reject if the worker never answers.
    const timeoutId = setTimeout(() => {
      this.pendingResolvers.delete(expectedType);
      reject(new Error(`Worker operation timed out after ${timeoutMs}ms`));
    }, timeoutMs);
    this.pendingResolvers.set(expectedType, {
      resolve: (value) => {
        clearTimeout(timeoutId);
        resolve(value);
      },
      reject: (error) => {
        clearTimeout(timeoutId);
        reject(error);
      }
    });
    // Shared error channel: an 'error' message from the worker fails the
    // currently pending request as well.
    this.pendingResolvers.set("error", {
      resolve: () => {
      },
      // Never called for errors
      reject: (error) => {
        clearTimeout(timeoutId);
        this.pendingResolvers.delete(expectedType);
        reject(error);
      }
    });
    this.worker.postMessage(message);
  });
}
|
|
7556
|
+
/**
 * Load the ONNX model in the worker
 *
 * Spawns the inline worker, posts a 'load' message (with URLs absolutized —
 * a Blob-URL worker cannot resolve relative paths itself), and waits up to
 * LOAD_TIMEOUT_MS2 for the 'loaded' reply. On failure the worker is
 * terminated so a retry starts clean. Throws if already loading or loaded.
 *
 * Returns { backend, loadTimeMs, inputNames, outputNames }.
 */
async load() {
  if (this.isLoading) {
    throw new Error("Model is already loading");
  }
  if (this._isLoaded) {
    throw new Error("Model already loaded. Call dispose() first.");
  }
  this.isLoading = true;
  const startTime = performance.now();
  const telemetry = getTelemetry();
  const span = telemetry?.startSpan("Wav2ArkitCpuWorker.load", {
    "model.url": this.config.modelUrl,
    "model.backend_requested": "wasm"
  });
  try {
    logger9.info("Creating wav2arkit_cpu worker...");
    this.worker = this.createWorker();
    // externalDataUrl: false disables external data entirely; otherwise use the
    // configured URL or default to "<modelUrl>.data".
    const externalDataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${this.config.modelUrl}.data` : null;
    logger9.info("Loading model in worker...", {
      modelUrl: this.config.modelUrl,
      externalDataUrl,
      isIOS: isIOS()
    });
    const result = await this.sendMessage(
      {
        type: "load",
        modelUrl: resolveUrl3(this.config.modelUrl),
        externalDataUrl: externalDataUrl ? resolveUrl3(externalDataUrl) : null,
        wasmPaths: WASM_CDN_PATH4,
        isIOS: isIOS()
      },
      "loaded",
      LOAD_TIMEOUT_MS2
    );
    this._isLoaded = true;
    const loadTimeMs = performance.now() - startTime;
    logger9.info("Wav2ArkitCpu worker loaded successfully", {
      backend: "wasm",
      loadTimeMs: Math.round(loadTimeMs),
      workerLoadTimeMs: Math.round(result.loadTimeMs),
      inputs: result.inputNames,
      outputs: result.outputNames
    });
    span?.setAttributes({
      "model.backend": "wasm",
      "model.load_time_ms": loadTimeMs,
      "model.worker_load_time_ms": result.loadTimeMs
    });
    span?.end();
    telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
      model: "wav2arkit_cpu-worker",
      backend: "wasm"
    });
    return {
      backend: "wasm",
      loadTimeMs,
      inputNames: result.inputNames,
      outputNames: result.outputNames
    };
  } catch (error) {
    span?.endWithError(error instanceof Error ? error : new Error(String(error)));
    telemetry?.incrementCounter("omote.errors.total", 1, {
      model: "wav2arkit_cpu-worker",
      error_type: "load_failed"
    });
    // Tear down the half-initialized worker so a subsequent load() starts fresh.
    if (this.worker) {
      this.worker.terminate();
      this.worker = null;
    }
    throw error;
  } finally {
    this.isLoading = false;
  }
}
|
|
7633
|
+
/**
|
|
7634
|
+
* Run inference on raw audio
|
|
7635
|
+
*
|
|
7636
|
+
* Accepts variable-length audio (not fixed to 16000 samples).
|
|
7637
|
+
* Output frames = ceil(30 * numSamples / 16000).
|
|
7638
|
+
*
|
|
7639
|
+
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
7640
|
+
* @param _identityIndex - Ignored (identity 11 is baked into the model)
|
|
7641
|
+
*/
|
|
7642
|
+
async infer(audioSamples, _identityIndex) {
|
|
7643
|
+
if (!this._isLoaded || !this.worker) {
|
|
7644
|
+
throw new Error("Model not loaded. Call load() first.");
|
|
7645
|
+
}
|
|
7646
|
+
if (this.poisoned) {
|
|
7647
|
+
throw new Error("Wav2ArkitCpu worker session timed out \u2014 inference unavailable until page reload");
|
|
7648
|
+
}
|
|
7649
|
+
const audioCopy = new Float32Array(audioSamples);
|
|
7650
|
+
return this.queueInference(audioCopy);
|
|
7651
|
+
}
|
|
7652
|
+
/**
|
|
7653
|
+
* Queue inference to serialize worker calls
|
|
7654
|
+
*/
|
|
7655
|
+
queueInference(audioSamples) {
|
|
4747
7656
|
return new Promise((resolve, reject) => {
|
|
4748
7657
|
this.inferenceQueue = this.inferenceQueue.then(async () => {
|
|
4749
7658
|
const telemetry = getTelemetry();
|
|
4750
|
-
const span = telemetry?.startSpan("
|
|
4751
|
-
"inference.backend":
|
|
4752
|
-
"inference.input_samples":
|
|
7659
|
+
const span = telemetry?.startSpan("Wav2ArkitCpuWorker.infer", {
|
|
7660
|
+
"inference.backend": "wasm",
|
|
7661
|
+
"inference.input_samples": audioSamples.length
|
|
4753
7662
|
});
|
|
4754
7663
|
try {
|
|
4755
7664
|
const startTime = performance.now();
|
|
4756
|
-
const
|
|
7665
|
+
const result = await this.sendMessage(
|
|
7666
|
+
{
|
|
7667
|
+
type: "infer",
|
|
7668
|
+
audio: audioSamples
|
|
7669
|
+
},
|
|
7670
|
+
"result",
|
|
7671
|
+
INFERENCE_TIMEOUT_MS2
|
|
7672
|
+
);
|
|
4757
7673
|
const inferenceTimeMs = performance.now() - startTime;
|
|
4758
|
-
const
|
|
4759
|
-
|
|
4760
|
-
throw new Error("Missing blendshapes output from model");
|
|
4761
|
-
}
|
|
4762
|
-
const blendshapeData = blendshapeOutput.data;
|
|
4763
|
-
const numFrames = blendshapeOutput.dims[1];
|
|
4764
|
-
const numBlendshapes = blendshapeOutput.dims[2];
|
|
7674
|
+
const flatBuffer = result.blendshapes;
|
|
7675
|
+
const { numFrames, numBlendshapes } = result;
|
|
4765
7676
|
const blendshapes = [];
|
|
4766
7677
|
for (let f = 0; f < numFrames; f++) {
|
|
4767
|
-
|
|
4768
|
-
const symmetrized = symmetrizeBlendshapes(rawFrame);
|
|
4769
|
-
blendshapes.push(symmetrized);
|
|
7678
|
+
blendshapes.push(flatBuffer.slice(f * numBlendshapes, (f + 1) * numBlendshapes));
|
|
4770
7679
|
}
|
|
4771
|
-
|
|
7680
|
+
logger9.trace("Worker inference completed", {
|
|
4772
7681
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
7682
|
+
workerTimeMs: Math.round(result.inferenceTimeMs * 100) / 100,
|
|
4773
7683
|
numFrames,
|
|
4774
|
-
inputSamples
|
|
7684
|
+
inputSamples: audioSamples.length
|
|
4775
7685
|
});
|
|
4776
7686
|
span?.setAttributes({
|
|
4777
7687
|
"inference.duration_ms": inferenceTimeMs,
|
|
7688
|
+
"inference.worker_duration_ms": result.inferenceTimeMs,
|
|
4778
7689
|
"inference.frames": numFrames
|
|
4779
7690
|
});
|
|
4780
7691
|
span?.end();
|
|
4781
7692
|
telemetry?.recordHistogram("omote.inference.latency", inferenceTimeMs, {
|
|
4782
|
-
model: "wav2arkit_cpu",
|
|
4783
|
-
backend:
|
|
7693
|
+
model: "wav2arkit_cpu-worker",
|
|
7694
|
+
backend: "wasm"
|
|
4784
7695
|
});
|
|
4785
7696
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
4786
|
-
model: "wav2arkit_cpu",
|
|
4787
|
-
backend:
|
|
7697
|
+
model: "wav2arkit_cpu-worker",
|
|
7698
|
+
backend: "wasm",
|
|
4788
7699
|
status: "success"
|
|
4789
7700
|
});
|
|
4790
7701
|
resolve({
|
|
@@ -4793,10 +7704,20 @@ var Wav2ArkitCpuInference = class {
|
|
|
4793
7704
|
inferenceTimeMs
|
|
4794
7705
|
});
|
|
4795
7706
|
} catch (err) {
|
|
7707
|
+
const errMsg = err instanceof Error ? err.message : String(err);
|
|
7708
|
+
if (errMsg.includes("timed out")) {
|
|
7709
|
+
this.poisoned = true;
|
|
7710
|
+
logger9.error("CRITICAL: Worker inference timed out \u2014 Wav2ArkitCpu worker is dead. Page reload required.", {
|
|
7711
|
+
backend: "wasm",
|
|
7712
|
+
timeoutMs: INFERENCE_TIMEOUT_MS2
|
|
7713
|
+
});
|
|
7714
|
+
} else {
|
|
7715
|
+
logger9.error("Worker inference failed", { error: errMsg, backend: "wasm" });
|
|
7716
|
+
}
|
|
4796
7717
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
4797
7718
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
4798
|
-
model: "wav2arkit_cpu",
|
|
4799
|
-
backend:
|
|
7719
|
+
model: "wav2arkit_cpu-worker",
|
|
7720
|
+
backend: "wasm",
|
|
4800
7721
|
status: "error"
|
|
4801
7722
|
});
|
|
4802
7723
|
reject(err);
|
|
@@ -4805,37 +7726,62 @@ var Wav2ArkitCpuInference = class {
|
|
|
4805
7726
|
});
|
|
4806
7727
|
}
|
|
4807
7728
|
/**
|
|
4808
|
-
* Dispose of the
|
|
7729
|
+
* Dispose of the worker and free resources
|
|
4809
7730
|
*/
|
|
4810
7731
|
async dispose() {
|
|
4811
|
-
if (this.
|
|
4812
|
-
|
|
4813
|
-
|
|
7732
|
+
if (this.worker) {
|
|
7733
|
+
try {
|
|
7734
|
+
await this.sendMessage({ type: "dispose" }, "disposed", INFERENCE_TIMEOUT_MS2);
|
|
7735
|
+
} catch {
|
|
7736
|
+
}
|
|
7737
|
+
this.worker.terminate();
|
|
7738
|
+
this.worker = null;
|
|
4814
7739
|
}
|
|
7740
|
+
this._isLoaded = false;
|
|
7741
|
+
this.poisoned = false;
|
|
7742
|
+
this.pendingResolvers.clear();
|
|
7743
|
+
}
|
|
7744
|
+
/**
|
|
7745
|
+
* Check if Web Workers are supported
|
|
7746
|
+
*/
|
|
7747
|
+
static isSupported() {
|
|
7748
|
+
return typeof Worker !== "undefined";
|
|
4815
7749
|
}
|
|
4816
7750
|
};
|
|
4817
7751
|
|
|
4818
7752
|
// src/inference/createLipSync.ts
// Module-scoped logger for the lip-sync factory.
var logger10 = createLogger("createLipSync");
|
|
4820
7754
|
function createLipSync(config) {
|
|
4821
7755
|
const mode = config.mode ?? "auto";
|
|
4822
7756
|
const fallbackOnError = config.fallbackOnError ?? true;
|
|
4823
7757
|
let useCpu;
|
|
4824
7758
|
if (mode === "cpu") {
|
|
4825
7759
|
useCpu = true;
|
|
4826
|
-
|
|
7760
|
+
logger10.info("Forcing CPU lip sync model (wav2arkit_cpu)");
|
|
4827
7761
|
} else if (mode === "gpu") {
|
|
4828
7762
|
useCpu = false;
|
|
4829
|
-
|
|
7763
|
+
logger10.info("Forcing GPU lip sync model (Wav2Vec2)");
|
|
4830
7764
|
} else {
|
|
4831
7765
|
useCpu = shouldUseCpuLipSync();
|
|
4832
|
-
|
|
7766
|
+
logger10.info("Auto-detected lip sync model", {
|
|
4833
7767
|
useCpu,
|
|
4834
7768
|
isSafari: isSafari()
|
|
4835
7769
|
});
|
|
4836
7770
|
}
|
|
4837
7771
|
if (useCpu) {
|
|
4838
|
-
|
|
7772
|
+
if (config.unifiedWorker) {
|
|
7773
|
+
logger10.info("Creating Wav2ArkitCpuUnifiedAdapter (404MB, WASM, shared unified worker)");
|
|
7774
|
+
return new Wav2ArkitCpuUnifiedAdapter(config.unifiedWorker, {
|
|
7775
|
+
modelUrl: config.cpuModelUrl
|
|
7776
|
+
});
|
|
7777
|
+
}
|
|
7778
|
+
if (config.useWorker && Wav2ArkitCpuWorker.isSupported() && !isIOS()) {
|
|
7779
|
+
logger10.info("Creating Wav2ArkitCpuWorker (404MB, WASM, off-main-thread)");
|
|
7780
|
+
return new Wav2ArkitCpuWorker({
|
|
7781
|
+
modelUrl: config.cpuModelUrl
|
|
7782
|
+
});
|
|
7783
|
+
}
|
|
7784
|
+
logger10.info("Creating Wav2ArkitCpuInference (404MB, WASM)");
|
|
4839
7785
|
return new Wav2ArkitCpuInference({
|
|
4840
7786
|
modelUrl: config.cpuModelUrl
|
|
4841
7787
|
});
|
|
@@ -4847,10 +7793,10 @@ function createLipSync(config) {
|
|
|
4847
7793
|
numIdentityClasses: config.numIdentityClasses
|
|
4848
7794
|
});
|
|
4849
7795
|
if (fallbackOnError) {
|
|
4850
|
-
|
|
7796
|
+
logger10.info("Creating Wav2Vec2Inference with CPU fallback");
|
|
4851
7797
|
return new LipSyncWithFallback(gpuInstance, config);
|
|
4852
7798
|
}
|
|
4853
|
-
|
|
7799
|
+
logger10.info("Creating Wav2Vec2Inference (no fallback)");
|
|
4854
7800
|
return gpuInstance;
|
|
4855
7801
|
}
|
|
4856
7802
|
var LipSyncWithFallback = class {
|
|
@@ -4876,16 +7822,28 @@ var LipSyncWithFallback = class {
|
|
|
4876
7822
|
}
|
|
4877
7823
|
}
|
|
4878
7824
|
async fallbackToCpu(reason) {
|
|
4879
|
-
|
|
7825
|
+
logger10.warn("GPU model load failed, falling back to CPU model", { reason });
|
|
4880
7826
|
try {
|
|
4881
7827
|
await this.implementation.dispose();
|
|
4882
7828
|
} catch {
|
|
4883
7829
|
}
|
|
4884
|
-
this.
|
|
4885
|
-
|
|
4886
|
-
|
|
7830
|
+
if (this.config.unifiedWorker) {
|
|
7831
|
+
this.implementation = new Wav2ArkitCpuUnifiedAdapter(this.config.unifiedWorker, {
|
|
7832
|
+
modelUrl: this.config.cpuModelUrl
|
|
7833
|
+
});
|
|
7834
|
+
logger10.info("Fallback to Wav2ArkitCpuUnifiedAdapter successful");
|
|
7835
|
+
} else if (this.config.useWorker && Wav2ArkitCpuWorker.isSupported() && !isIOS()) {
|
|
7836
|
+
this.implementation = new Wav2ArkitCpuWorker({
|
|
7837
|
+
modelUrl: this.config.cpuModelUrl
|
|
7838
|
+
});
|
|
7839
|
+
logger10.info("Fallback to Wav2ArkitCpuWorker successful");
|
|
7840
|
+
} else {
|
|
7841
|
+
this.implementation = new Wav2ArkitCpuInference({
|
|
7842
|
+
modelUrl: this.config.cpuModelUrl
|
|
7843
|
+
});
|
|
7844
|
+
logger10.info("Fallback to Wav2ArkitCpuInference successful");
|
|
7845
|
+
}
|
|
4887
7846
|
this.hasFallenBack = true;
|
|
4888
|
-
logger6.info("Fallback to Wav2ArkitCpuInference successful");
|
|
4889
7847
|
return await this.implementation.load();
|
|
4890
7848
|
}
|
|
4891
7849
|
async infer(audioSamples, identityIndex) {
|
|
@@ -4897,7 +7855,7 @@ var LipSyncWithFallback = class {
|
|
|
4897
7855
|
};
|
|
4898
7856
|
|
|
4899
7857
|
// src/inference/SileroVADInference.ts
|
|
4900
|
-
var
|
|
7858
|
+
var logger11 = createLogger("SileroVAD");
|
|
4901
7859
|
var SileroVADInference = class {
|
|
4902
7860
|
constructor(config) {
|
|
4903
7861
|
this.session = null;
|
|
@@ -4971,23 +7929,23 @@ var SileroVADInference = class {
|
|
|
4971
7929
|
"model.sample_rate": this.config.sampleRate
|
|
4972
7930
|
});
|
|
4973
7931
|
try {
|
|
4974
|
-
|
|
7932
|
+
logger11.info("Loading ONNX Runtime...", { preference: this.config.backend });
|
|
4975
7933
|
const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
|
|
4976
7934
|
this.ort = ort;
|
|
4977
7935
|
this._backend = backend;
|
|
4978
|
-
|
|
7936
|
+
logger11.info("ONNX Runtime loaded", { backend: this._backend });
|
|
4979
7937
|
const cache = getModelCache();
|
|
4980
7938
|
const modelUrl = this.config.modelUrl;
|
|
4981
7939
|
const isCached = await cache.has(modelUrl);
|
|
4982
7940
|
let modelBuffer;
|
|
4983
7941
|
if (isCached) {
|
|
4984
|
-
|
|
7942
|
+
logger11.debug("Loading model from cache", { modelUrl });
|
|
4985
7943
|
modelBuffer = await cache.get(modelUrl);
|
|
4986
7944
|
} else {
|
|
4987
|
-
|
|
7945
|
+
logger11.debug("Fetching and caching model", { modelUrl });
|
|
4988
7946
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
4989
7947
|
}
|
|
4990
|
-
|
|
7948
|
+
logger11.debug("Creating ONNX session", {
|
|
4991
7949
|
size: formatBytes(modelBuffer.byteLength),
|
|
4992
7950
|
backend: this._backend
|
|
4993
7951
|
});
|
|
@@ -4996,7 +7954,7 @@ var SileroVADInference = class {
|
|
|
4996
7954
|
this.session = await ort.InferenceSession.create(modelData, sessionOptions);
|
|
4997
7955
|
this.reset();
|
|
4998
7956
|
const loadTimeMs = performance.now() - startTime;
|
|
4999
|
-
|
|
7957
|
+
logger11.info("Model loaded successfully", {
|
|
5000
7958
|
backend: this._backend,
|
|
5001
7959
|
loadTimeMs: Math.round(loadTimeMs),
|
|
5002
7960
|
sampleRate: this.config.sampleRate,
|
|
@@ -5051,7 +8009,7 @@ var SileroVADInference = class {
|
|
|
5051
8009
|
[]
|
|
5052
8010
|
);
|
|
5053
8011
|
} catch (e) {
|
|
5054
|
-
|
|
8012
|
+
logger11.warn("BigInt64Array not available, using bigint array fallback", {
|
|
5055
8013
|
error: e instanceof Error ? e.message : String(e)
|
|
5056
8014
|
});
|
|
5057
8015
|
this.srTensor = new this.ort.Tensor(
|
|
@@ -5143,23 +8101,13 @@ var SileroVADInference = class {
|
|
|
5143
8101
|
}
|
|
5144
8102
|
return segments;
|
|
5145
8103
|
}
|
|
5146
|
-
/**
|
|
5147
|
-
* Calculate RMS energy of audio chunk
|
|
5148
|
-
*/
|
|
5149
|
-
calculateRMS(samples) {
|
|
5150
|
-
let sum = 0;
|
|
5151
|
-
for (let i = 0; i < samples.length; i++) {
|
|
5152
|
-
sum += samples[i] * samples[i];
|
|
5153
|
-
}
|
|
5154
|
-
return Math.sqrt(sum / samples.length);
|
|
5155
|
-
}
|
|
5156
8104
|
/**
|
|
5157
8105
|
* Queue inference to serialize ONNX session calls
|
|
5158
8106
|
*/
|
|
5159
8107
|
queueInference(audioChunk) {
|
|
5160
8108
|
const audioChunkCopy = new Float32Array(audioChunk);
|
|
5161
8109
|
const MIN_ENERGY_THRESHOLD = 1e-3;
|
|
5162
|
-
const rms =
|
|
8110
|
+
const rms = calculateRMS(audioChunkCopy);
|
|
5163
8111
|
if (rms < MIN_ENERGY_THRESHOLD) {
|
|
5164
8112
|
if (!this.wasSpeaking) {
|
|
5165
8113
|
this.preSpeechBuffer.push(new Float32Array(audioChunkCopy));
|
|
@@ -5167,7 +8115,7 @@ var SileroVADInference = class {
|
|
|
5167
8115
|
this.preSpeechBuffer.shift();
|
|
5168
8116
|
}
|
|
5169
8117
|
}
|
|
5170
|
-
|
|
8118
|
+
logger11.trace("Skipping VAD inference - audio too quiet", {
|
|
5171
8119
|
rms: Math.round(rms * 1e4) / 1e4,
|
|
5172
8120
|
threshold: MIN_ENERGY_THRESHOLD
|
|
5173
8121
|
});
|
|
@@ -5214,19 +8162,19 @@ var SileroVADInference = class {
|
|
|
5214
8162
|
[2, 1, 128]
|
|
5215
8163
|
);
|
|
5216
8164
|
}
|
|
5217
|
-
this.context =
|
|
8165
|
+
this.context = audioChunkCopy.slice(-this.contextSize);
|
|
5218
8166
|
const inferenceTimeMs = performance.now() - startTime;
|
|
5219
8167
|
const isSpeech = probability > this.config.threshold;
|
|
5220
8168
|
let preSpeechChunks;
|
|
5221
8169
|
if (isSpeech && !this.wasSpeaking) {
|
|
5222
8170
|
preSpeechChunks = [...this.preSpeechBuffer];
|
|
5223
8171
|
this.preSpeechBuffer = [];
|
|
5224
|
-
|
|
8172
|
+
logger11.debug("Speech started with pre-speech buffer", {
|
|
5225
8173
|
preSpeechChunks: preSpeechChunks.length,
|
|
5226
8174
|
durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
|
|
5227
8175
|
});
|
|
5228
8176
|
} else if (!isSpeech && !this.wasSpeaking) {
|
|
5229
|
-
this.preSpeechBuffer.push(new Float32Array(
|
|
8177
|
+
this.preSpeechBuffer.push(new Float32Array(audioChunkCopy));
|
|
5230
8178
|
if (this.preSpeechBuffer.length > this.config.preSpeechBufferChunks) {
|
|
5231
8179
|
this.preSpeechBuffer.shift();
|
|
5232
8180
|
}
|
|
@@ -5234,7 +8182,7 @@ var SileroVADInference = class {
|
|
|
5234
8182
|
this.preSpeechBuffer = [];
|
|
5235
8183
|
}
|
|
5236
8184
|
this.wasSpeaking = isSpeech;
|
|
5237
|
-
|
|
8185
|
+
logger11.trace("VAD inference completed", {
|
|
5238
8186
|
probability: Math.round(probability * 1e3) / 1e3,
|
|
5239
8187
|
isSpeech,
|
|
5240
8188
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100
|
|
@@ -5261,13 +8209,30 @@ var SileroVADInference = class {
|
|
|
5261
8209
|
preSpeechChunks
|
|
5262
8210
|
});
|
|
5263
8211
|
} catch (err) {
|
|
5264
|
-
|
|
5265
|
-
|
|
5266
|
-
|
|
5267
|
-
|
|
5268
|
-
|
|
5269
|
-
|
|
5270
|
-
|
|
8212
|
+
if (typeof err === "number") {
|
|
8213
|
+
const oomError = new Error(
|
|
8214
|
+
`SileroVAD inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reducing concurrent model sessions or reloading the page.`
|
|
8215
|
+
);
|
|
8216
|
+
logger11.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
8217
|
+
pointer: `0x${err.toString(16)}`,
|
|
8218
|
+
backend: this._backend
|
|
8219
|
+
});
|
|
8220
|
+
span?.endWithError(oomError);
|
|
8221
|
+
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
8222
|
+
model: "silero-vad",
|
|
8223
|
+
backend: this._backend,
|
|
8224
|
+
status: "error"
|
|
8225
|
+
});
|
|
8226
|
+
reject(oomError);
|
|
8227
|
+
} else {
|
|
8228
|
+
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
8229
|
+
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
8230
|
+
model: "silero-vad",
|
|
8231
|
+
backend: this._backend,
|
|
8232
|
+
status: "error"
|
|
8233
|
+
});
|
|
8234
|
+
reject(err);
|
|
8235
|
+
}
|
|
5271
8236
|
}
|
|
5272
8237
|
});
|
|
5273
8238
|
});
|
|
@@ -5291,19 +8256,27 @@ var SileroVADInference = class {
|
|
|
5291
8256
|
SileroVADInference.isWebGPUAvailable = isWebGPUAvailable;
|
|
5292
8257
|
|
|
5293
8258
|
// src/inference/SileroVADWorker.ts
|
|
5294
|
-
var
|
|
5295
|
-
var
|
|
5296
|
-
var
|
|
5297
|
-
var
|
|
5298
|
-
|
|
8259
|
+
var logger12 = createLogger("SileroVADWorker");
|
|
8260
|
+
var WASM_CDN_PATH5 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
8261
|
+
var LOAD_TIMEOUT_MS3 = 1e4;
|
|
8262
|
+
var INFERENCE_TIMEOUT_MS3 = 1e3;
|
|
8263
|
+
function resolveUrl4(url) {
|
|
8264
|
+
if (/^https?:\/\//i.test(url) || /^blob:/i.test(url)) return url;
|
|
8265
|
+
try {
|
|
8266
|
+
return new URL(url, globalThis.location?.origin ?? "https://localhost").href;
|
|
8267
|
+
} catch {
|
|
8268
|
+
return url;
|
|
8269
|
+
}
|
|
8270
|
+
}
|
|
8271
|
+
var WORKER_SCRIPT4 = `
|
|
5299
8272
|
// Silero VAD Worker Script
|
|
5300
8273
|
// Loaded via Blob URL - no separate file needed
|
|
5301
8274
|
|
|
5302
|
-
|
|
5303
|
-
|
|
5304
|
-
|
|
5305
|
-
|
|
5306
|
-
|
|
8275
|
+
var ort = null;
|
|
8276
|
+
var session = null;
|
|
8277
|
+
var sampleRate = 16000;
|
|
8278
|
+
var chunkSize = 512;
|
|
8279
|
+
var contextSize = 64;
|
|
5307
8280
|
|
|
5308
8281
|
/**
|
|
5309
8282
|
* Load ONNX Runtime from CDN
|
|
@@ -5553,7 +8526,7 @@ var SileroVADWorker = class {
|
|
|
5553
8526
|
* Create the worker from inline script
|
|
5554
8527
|
*/
|
|
5555
8528
|
createWorker() {
|
|
5556
|
-
const blob = new Blob([
|
|
8529
|
+
const blob = new Blob([WORKER_SCRIPT4], { type: "application/javascript" });
|
|
5557
8530
|
const blobUrl = URL.createObjectURL(blob);
|
|
5558
8531
|
const worker = new Worker(blobUrl);
|
|
5559
8532
|
URL.revokeObjectURL(blobUrl);
|
|
@@ -5561,7 +8534,7 @@ var SileroVADWorker = class {
|
|
|
5561
8534
|
this.handleWorkerMessage(event.data);
|
|
5562
8535
|
};
|
|
5563
8536
|
worker.onerror = (error) => {
|
|
5564
|
-
|
|
8537
|
+
logger12.error("Worker error", { error: error.message });
|
|
5565
8538
|
for (const [, resolver] of this.pendingResolvers) {
|
|
5566
8539
|
resolver.reject(new Error(`Worker error: ${error.message}`));
|
|
5567
8540
|
}
|
|
@@ -5637,25 +8610,25 @@ var SileroVADWorker = class {
|
|
|
5637
8610
|
"model.sample_rate": this.config.sampleRate
|
|
5638
8611
|
});
|
|
5639
8612
|
try {
|
|
5640
|
-
|
|
8613
|
+
logger12.info("Creating VAD worker...");
|
|
5641
8614
|
this.worker = this.createWorker();
|
|
5642
|
-
|
|
8615
|
+
logger12.info("Loading model in worker...", {
|
|
5643
8616
|
modelUrl: this.config.modelUrl,
|
|
5644
8617
|
sampleRate: this.config.sampleRate
|
|
5645
8618
|
});
|
|
5646
8619
|
const result = await this.sendMessage(
|
|
5647
8620
|
{
|
|
5648
8621
|
type: "load",
|
|
5649
|
-
modelUrl: this.config.modelUrl,
|
|
8622
|
+
modelUrl: resolveUrl4(this.config.modelUrl),
|
|
5650
8623
|
sampleRate: this.config.sampleRate,
|
|
5651
|
-
wasmPaths:
|
|
8624
|
+
wasmPaths: WASM_CDN_PATH5
|
|
5652
8625
|
},
|
|
5653
8626
|
"loaded",
|
|
5654
|
-
|
|
8627
|
+
LOAD_TIMEOUT_MS3
|
|
5655
8628
|
);
|
|
5656
8629
|
this._isLoaded = true;
|
|
5657
8630
|
const loadTimeMs = performance.now() - startTime;
|
|
5658
|
-
|
|
8631
|
+
logger12.info("VAD worker loaded successfully", {
|
|
5659
8632
|
backend: "wasm",
|
|
5660
8633
|
loadTimeMs: Math.round(loadTimeMs),
|
|
5661
8634
|
workerLoadTimeMs: Math.round(result.loadTimeMs),
|
|
@@ -5706,7 +8679,7 @@ var SileroVADWorker = class {
|
|
|
5706
8679
|
const result = await this.sendMessage(
|
|
5707
8680
|
{ type: "reset" },
|
|
5708
8681
|
"reset",
|
|
5709
|
-
|
|
8682
|
+
INFERENCE_TIMEOUT_MS3
|
|
5710
8683
|
);
|
|
5711
8684
|
this.state = result.state;
|
|
5712
8685
|
this.context = new Float32Array(this.contextSize);
|
|
@@ -5752,7 +8725,7 @@ var SileroVADWorker = class {
|
|
|
5752
8725
|
context: this.context
|
|
5753
8726
|
},
|
|
5754
8727
|
"result",
|
|
5755
|
-
|
|
8728
|
+
INFERENCE_TIMEOUT_MS3
|
|
5756
8729
|
);
|
|
5757
8730
|
this.state = result.state;
|
|
5758
8731
|
this.context = audioChunkCopy.slice(-this.contextSize);
|
|
@@ -5762,7 +8735,7 @@ var SileroVADWorker = class {
|
|
|
5762
8735
|
if (isSpeech && !this.wasSpeaking) {
|
|
5763
8736
|
preSpeechChunks = [...this.preSpeechBuffer];
|
|
5764
8737
|
this.preSpeechBuffer = [];
|
|
5765
|
-
|
|
8738
|
+
logger12.debug("Speech started with pre-speech buffer", {
|
|
5766
8739
|
preSpeechChunks: preSpeechChunks.length,
|
|
5767
8740
|
durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
|
|
5768
8741
|
});
|
|
@@ -5775,7 +8748,7 @@ var SileroVADWorker = class {
|
|
|
5775
8748
|
this.preSpeechBuffer = [];
|
|
5776
8749
|
}
|
|
5777
8750
|
this.wasSpeaking = isSpeech;
|
|
5778
|
-
|
|
8751
|
+
logger12.trace("VAD worker inference completed", {
|
|
5779
8752
|
probability: Math.round(result.probability * 1e3) / 1e3,
|
|
5780
8753
|
isSpeech,
|
|
5781
8754
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
@@ -5821,7 +8794,7 @@ var SileroVADWorker = class {
|
|
|
5821
8794
|
async dispose() {
|
|
5822
8795
|
if (this.worker) {
|
|
5823
8796
|
try {
|
|
5824
|
-
await this.sendMessage({ type: "dispose" }, "disposed",
|
|
8797
|
+
await this.sendMessage({ type: "dispose" }, "disposed", INFERENCE_TIMEOUT_MS3);
|
|
5825
8798
|
} catch {
|
|
5826
8799
|
}
|
|
5827
8800
|
this.worker.terminate();
|
|
@@ -5843,40 +8816,44 @@ var SileroVADWorker = class {
|
|
|
5843
8816
|
};
|
|
5844
8817
|
|
|
5845
8818
|
// src/inference/createSileroVAD.ts
|
|
5846
|
-
var
|
|
8819
|
+
var logger13 = createLogger("createSileroVAD");
|
|
5847
8820
|
function supportsVADWorker() {
|
|
5848
8821
|
if (typeof Worker === "undefined") {
|
|
5849
|
-
|
|
8822
|
+
logger13.debug("Worker not supported: Worker constructor undefined");
|
|
5850
8823
|
return false;
|
|
5851
8824
|
}
|
|
5852
8825
|
if (typeof URL === "undefined" || typeof URL.createObjectURL === "undefined") {
|
|
5853
|
-
|
|
8826
|
+
logger13.debug("Worker not supported: URL.createObjectURL unavailable");
|
|
5854
8827
|
return false;
|
|
5855
8828
|
}
|
|
5856
8829
|
if (typeof Blob === "undefined") {
|
|
5857
|
-
|
|
8830
|
+
logger13.debug("Worker not supported: Blob constructor unavailable");
|
|
5858
8831
|
return false;
|
|
5859
8832
|
}
|
|
5860
8833
|
return true;
|
|
5861
8834
|
}
|
|
5862
8835
|
function createSileroVAD(config) {
|
|
8836
|
+
if (config.unifiedWorker) {
|
|
8837
|
+
logger13.info("Creating SileroVADUnifiedAdapter (shared unified worker)");
|
|
8838
|
+
return new SileroVADUnifiedAdapter(config.unifiedWorker, config);
|
|
8839
|
+
}
|
|
5863
8840
|
const fallbackOnError = config.fallbackOnError ?? true;
|
|
5864
8841
|
let useWorker;
|
|
5865
8842
|
if (config.useWorker !== void 0) {
|
|
5866
8843
|
useWorker = config.useWorker;
|
|
5867
|
-
|
|
8844
|
+
logger13.debug("Worker preference explicitly set", { useWorker });
|
|
5868
8845
|
} else {
|
|
5869
8846
|
const workerSupported = supportsVADWorker();
|
|
5870
8847
|
const onMobile = isMobile();
|
|
5871
8848
|
useWorker = workerSupported && !onMobile;
|
|
5872
|
-
|
|
8849
|
+
logger13.debug("Auto-detected Worker preference", {
|
|
5873
8850
|
useWorker,
|
|
5874
8851
|
workerSupported,
|
|
5875
8852
|
onMobile
|
|
5876
8853
|
});
|
|
5877
8854
|
}
|
|
5878
8855
|
if (useWorker) {
|
|
5879
|
-
|
|
8856
|
+
logger13.info("Creating SileroVADWorker (off-main-thread)");
|
|
5880
8857
|
const worker = new SileroVADWorker({
|
|
5881
8858
|
modelUrl: config.modelUrl,
|
|
5882
8859
|
sampleRate: config.sampleRate,
|
|
@@ -5888,7 +8865,7 @@ function createSileroVAD(config) {
|
|
|
5888
8865
|
}
|
|
5889
8866
|
return worker;
|
|
5890
8867
|
}
|
|
5891
|
-
|
|
8868
|
+
logger13.info("Creating SileroVADInference (main thread)");
|
|
5892
8869
|
return new SileroVADInference(config);
|
|
5893
8870
|
}
|
|
5894
8871
|
var VADWorkerWithFallback = class {
|
|
@@ -5914,7 +8891,7 @@ var VADWorkerWithFallback = class {
|
|
|
5914
8891
|
try {
|
|
5915
8892
|
return await this.implementation.load();
|
|
5916
8893
|
} catch (error) {
|
|
5917
|
-
|
|
8894
|
+
logger13.warn("Worker load failed, falling back to main thread", {
|
|
5918
8895
|
error: error instanceof Error ? error.message : String(error)
|
|
5919
8896
|
});
|
|
5920
8897
|
try {
|
|
@@ -5923,7 +8900,7 @@ var VADWorkerWithFallback = class {
|
|
|
5923
8900
|
}
|
|
5924
8901
|
this.implementation = new SileroVADInference(this.config);
|
|
5925
8902
|
this.hasFallenBack = true;
|
|
5926
|
-
|
|
8903
|
+
logger13.info("Fallback to SileroVADInference successful");
|
|
5927
8904
|
return await this.implementation.load();
|
|
5928
8905
|
}
|
|
5929
8906
|
}
|
|
@@ -5945,7 +8922,7 @@ var VADWorkerWithFallback = class {
|
|
|
5945
8922
|
};
|
|
5946
8923
|
|
|
5947
8924
|
// src/inference/SafariSpeechRecognition.ts
|
|
5948
|
-
var
|
|
8925
|
+
var logger14 = createLogger("SafariSpeech");
|
|
5949
8926
|
var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
5950
8927
|
constructor(config = {}) {
|
|
5951
8928
|
this.recognition = null;
|
|
@@ -5964,7 +8941,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
5964
8941
|
interimResults: config.interimResults ?? true,
|
|
5965
8942
|
maxAlternatives: config.maxAlternatives ?? 1
|
|
5966
8943
|
};
|
|
5967
|
-
|
|
8944
|
+
logger14.debug("SafariSpeechRecognition created", {
|
|
5968
8945
|
language: this.config.language,
|
|
5969
8946
|
continuous: this.config.continuous
|
|
5970
8947
|
});
|
|
@@ -6025,7 +9002,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
6025
9002
|
*/
|
|
6026
9003
|
async start() {
|
|
6027
9004
|
if (this.isListening) {
|
|
6028
|
-
|
|
9005
|
+
logger14.warn("Already listening");
|
|
6029
9006
|
return;
|
|
6030
9007
|
}
|
|
6031
9008
|
if (!_SafariSpeechRecognition.isAvailable()) {
|
|
@@ -6055,7 +9032,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
6055
9032
|
this.isListening = true;
|
|
6056
9033
|
this.startTime = performance.now();
|
|
6057
9034
|
this.accumulatedText = "";
|
|
6058
|
-
|
|
9035
|
+
logger14.info("Speech recognition started", {
|
|
6059
9036
|
language: this.config.language
|
|
6060
9037
|
});
|
|
6061
9038
|
span?.end();
|
|
@@ -6070,7 +9047,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
6070
9047
|
*/
|
|
6071
9048
|
async stop() {
|
|
6072
9049
|
if (!this.isListening || !this.recognition) {
|
|
6073
|
-
|
|
9050
|
+
logger14.warn("Not currently listening");
|
|
6074
9051
|
return {
|
|
6075
9052
|
text: this.accumulatedText,
|
|
6076
9053
|
language: this.config.language,
|
|
@@ -6099,7 +9076,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
6099
9076
|
if (this.recognition && this.isListening) {
|
|
6100
9077
|
this.recognition.abort();
|
|
6101
9078
|
this.isListening = false;
|
|
6102
|
-
|
|
9079
|
+
logger14.info("Speech recognition aborted");
|
|
6103
9080
|
}
|
|
6104
9081
|
}
|
|
6105
9082
|
/**
|
|
@@ -6130,7 +9107,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
6130
9107
|
this.isListening = false;
|
|
6131
9108
|
this.resultCallbacks = [];
|
|
6132
9109
|
this.errorCallbacks = [];
|
|
6133
|
-
|
|
9110
|
+
logger14.debug("SafariSpeechRecognition disposed");
|
|
6134
9111
|
}
|
|
6135
9112
|
/**
|
|
6136
9113
|
* Set up event handlers for the recognition instance
|
|
@@ -6158,7 +9135,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
6158
9135
|
confidence: alternative.confidence
|
|
6159
9136
|
};
|
|
6160
9137
|
this.emitResult(speechResult);
|
|
6161
|
-
|
|
9138
|
+
logger14.trace("Speech result", {
|
|
6162
9139
|
text: text.substring(0, 50),
|
|
6163
9140
|
isFinal,
|
|
6164
9141
|
confidence: alternative.confidence
|
|
@@ -6168,12 +9145,12 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
6168
9145
|
span?.end();
|
|
6169
9146
|
} catch (error) {
|
|
6170
9147
|
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
6171
|
-
|
|
9148
|
+
logger14.error("Error processing speech result", { error });
|
|
6172
9149
|
}
|
|
6173
9150
|
};
|
|
6174
9151
|
this.recognition.onerror = (event) => {
|
|
6175
9152
|
const error = new Error(`Speech recognition error: ${event.error} - ${event.message}`);
|
|
6176
|
-
|
|
9153
|
+
logger14.error("Speech recognition error", { error: event.error, message: event.message });
|
|
6177
9154
|
this.emitError(error);
|
|
6178
9155
|
if (this.stopRejecter) {
|
|
6179
9156
|
this.stopRejecter(error);
|
|
@@ -6183,7 +9160,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
6183
9160
|
};
|
|
6184
9161
|
this.recognition.onend = () => {
|
|
6185
9162
|
this.isListening = false;
|
|
6186
|
-
|
|
9163
|
+
logger14.info("Speech recognition ended", {
|
|
6187
9164
|
totalText: this.accumulatedText.length,
|
|
6188
9165
|
durationMs: performance.now() - this.startTime
|
|
6189
9166
|
});
|
|
@@ -6200,13 +9177,13 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
6200
9177
|
}
|
|
6201
9178
|
};
|
|
6202
9179
|
this.recognition.onstart = () => {
|
|
6203
|
-
|
|
9180
|
+
logger14.debug("Speech recognition started by browser");
|
|
6204
9181
|
};
|
|
6205
9182
|
this.recognition.onspeechstart = () => {
|
|
6206
|
-
|
|
9183
|
+
logger14.debug("Speech detected");
|
|
6207
9184
|
};
|
|
6208
9185
|
this.recognition.onspeechend = () => {
|
|
6209
|
-
|
|
9186
|
+
logger14.debug("Speech ended");
|
|
6210
9187
|
};
|
|
6211
9188
|
}
|
|
6212
9189
|
/**
|
|
@@ -6217,7 +9194,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
6217
9194
|
try {
|
|
6218
9195
|
callback(result);
|
|
6219
9196
|
} catch (error) {
|
|
6220
|
-
|
|
9197
|
+
logger14.error("Error in result callback", { error });
|
|
6221
9198
|
}
|
|
6222
9199
|
}
|
|
6223
9200
|
}
|
|
@@ -6229,7 +9206,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
6229
9206
|
try {
|
|
6230
9207
|
callback(error);
|
|
6231
9208
|
} catch (callbackError) {
|
|
6232
|
-
|
|
9209
|
+
logger14.error("Error in error callback", { error: callbackError });
|
|
6233
9210
|
}
|
|
6234
9211
|
}
|
|
6235
9212
|
}
|
|
@@ -6494,7 +9471,7 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
6494
9471
|
console.error("[AgentCore] VAD error during interruption detection:", error);
|
|
6495
9472
|
});
|
|
6496
9473
|
}
|
|
6497
|
-
const float32 = audio instanceof Float32Array ? audio :
|
|
9474
|
+
const float32 = audio instanceof Float32Array ? audio : int16ToFloat32(audio);
|
|
6498
9475
|
this.audioBuffer.push(float32);
|
|
6499
9476
|
this.scheduleTranscription();
|
|
6500
9477
|
}
|
|
@@ -6826,7 +9803,7 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
6826
9803
|
* Falls back to simple RMS if VAD not available
|
|
6827
9804
|
*/
|
|
6828
9805
|
async detectVoiceActivity(audio) {
|
|
6829
|
-
const float32 = audio instanceof Float32Array ? audio :
|
|
9806
|
+
const float32 = audio instanceof Float32Array ? audio : int16ToFloat32(audio);
|
|
6830
9807
|
if (this.vad) {
|
|
6831
9808
|
const chunkSize = this.vad.getChunkSize();
|
|
6832
9809
|
for (let i = 0; i + chunkSize <= float32.length; i += chunkSize) {
|
|
@@ -6845,13 +9822,6 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
6845
9822
|
const rms = Math.sqrt(sum / float32.length);
|
|
6846
9823
|
return rms > 0.02;
|
|
6847
9824
|
}
|
|
6848
|
-
int16ToFloat32(int16) {
|
|
6849
|
-
const float32 = new Float32Array(int16.length);
|
|
6850
|
-
for (let i = 0; i < int16.length; i++) {
|
|
6851
|
-
float32[i] = int16[i] / 32768;
|
|
6852
|
-
}
|
|
6853
|
-
return float32;
|
|
6854
|
-
}
|
|
6855
9825
|
base64ToArrayBuffer(base64) {
|
|
6856
9826
|
const binaryString = atob(base64);
|
|
6857
9827
|
const bytes = new Uint8Array(binaryString.length);
|