@omote/core 0.4.4 → 0.4.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +42 -33
- package/dist/index.d.ts +42 -33
- package/dist/index.js +156 -50
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +156 -50
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -469,7 +469,14 @@ var AudioScheduler = class {
|
|
|
469
469
|
source.connect(gainNode);
|
|
470
470
|
const scheduleTime = this.nextPlayTime;
|
|
471
471
|
source.start(scheduleTime);
|
|
472
|
-
|
|
472
|
+
const entry = { source, gainNode };
|
|
473
|
+
this.scheduledSources.push(entry);
|
|
474
|
+
source.onended = () => {
|
|
475
|
+
const idx = this.scheduledSources.indexOf(entry);
|
|
476
|
+
if (idx !== -1) {
|
|
477
|
+
this.scheduledSources.splice(idx, 1);
|
|
478
|
+
}
|
|
479
|
+
};
|
|
473
480
|
const duration = audioData.length / ctx.sampleRate;
|
|
474
481
|
this.nextPlayTime = scheduleTime + duration;
|
|
475
482
|
return scheduleTime;
|
|
@@ -825,7 +832,7 @@ var LAMPipeline = class {
|
|
|
825
832
|
}
|
|
826
833
|
};
|
|
827
834
|
|
|
828
|
-
// src/audio/
|
|
835
|
+
// src/audio/audioUtils.ts
|
|
829
836
|
function pcm16ToFloat32(buffer) {
|
|
830
837
|
const byteLen = buffer.byteLength & ~1;
|
|
831
838
|
const int16 = byteLen === buffer.byteLength ? new Int16Array(buffer) : new Int16Array(buffer, 0, byteLen / 2);
|
|
@@ -835,6 +842,15 @@ function pcm16ToFloat32(buffer) {
|
|
|
835
842
|
}
|
|
836
843
|
return float32;
|
|
837
844
|
}
|
|
845
|
+
function int16ToFloat32(int16) {
|
|
846
|
+
const float32 = new Float32Array(int16.length);
|
|
847
|
+
for (let i = 0; i < int16.length; i++) {
|
|
848
|
+
float32[i] = int16[i] / 32768;
|
|
849
|
+
}
|
|
850
|
+
return float32;
|
|
851
|
+
}
|
|
852
|
+
|
|
853
|
+
// src/audio/SyncedAudioPipeline.ts
|
|
838
854
|
var SyncedAudioPipeline = class extends EventEmitter {
|
|
839
855
|
constructor(options) {
|
|
840
856
|
super();
|
|
@@ -3536,14 +3552,18 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3536
3552
|
});
|
|
3537
3553
|
try {
|
|
3538
3554
|
const startTime = performance.now();
|
|
3555
|
+
let timeoutId;
|
|
3539
3556
|
const results = await Promise.race([
|
|
3540
|
-
this.session.run(feeds)
|
|
3541
|
-
|
|
3542
|
-
|
|
3557
|
+
this.session.run(feeds).then((r) => {
|
|
3558
|
+
clearTimeout(timeoutId);
|
|
3559
|
+
return r;
|
|
3560
|
+
}),
|
|
3561
|
+
new Promise((_, rej) => {
|
|
3562
|
+
timeoutId = setTimeout(
|
|
3543
3563
|
() => rej(new Error(`Wav2Vec2 inference timed out after ${_Wav2Vec2Inference.INFERENCE_TIMEOUT_MS}ms`)),
|
|
3544
3564
|
_Wav2Vec2Inference.INFERENCE_TIMEOUT_MS
|
|
3545
|
-
)
|
|
3546
|
-
)
|
|
3565
|
+
);
|
|
3566
|
+
})
|
|
3547
3567
|
]);
|
|
3548
3568
|
const inferenceTimeMs = performance.now() - startTime;
|
|
3549
3569
|
const asrOutput = results["asr_logits"];
|
|
@@ -3649,15 +3669,6 @@ var Wav2Vec2Inference = _Wav2Vec2Inference;
|
|
|
3649
3669
|
|
|
3650
3670
|
// src/audio/FullFacePipeline.ts
|
|
3651
3671
|
var logger3 = createLogger("FullFacePipeline");
|
|
3652
|
-
function pcm16ToFloat322(buffer) {
|
|
3653
|
-
const byteLen = buffer.byteLength & ~1;
|
|
3654
|
-
const int16 = byteLen === buffer.byteLength ? new Int16Array(buffer) : new Int16Array(buffer, 0, byteLen / 2);
|
|
3655
|
-
const float32 = new Float32Array(int16.length);
|
|
3656
|
-
for (let i = 0; i < int16.length; i++) {
|
|
3657
|
-
float32[i] = int16[i] / 32768;
|
|
3658
|
-
}
|
|
3659
|
-
return float32;
|
|
3660
|
-
}
|
|
3661
3672
|
var BLENDSHAPE_INDEX_MAP = /* @__PURE__ */ new Map();
|
|
3662
3673
|
LAM_BLENDSHAPES.forEach((name, index) => {
|
|
3663
3674
|
BLENDSHAPE_INDEX_MAP.set(name, index);
|
|
@@ -3807,7 +3818,7 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3807
3818
|
if (!combined) {
|
|
3808
3819
|
return;
|
|
3809
3820
|
}
|
|
3810
|
-
const float32 =
|
|
3821
|
+
const float32 = pcm16ToFloat32(combined);
|
|
3811
3822
|
const scheduleTime = await this.scheduler.schedule(float32);
|
|
3812
3823
|
if (!this.playbackStarted) {
|
|
3813
3824
|
this.playbackStarted = true;
|
|
@@ -4290,13 +4301,18 @@ function ctcGreedyDecode(logits, seqLen, vocabSize, tokenMap) {
|
|
|
4290
4301
|
|
|
4291
4302
|
// src/inference/SenseVoiceInference.ts
|
|
4292
4303
|
var logger4 = createLogger("SenseVoice");
|
|
4293
|
-
var
|
|
4304
|
+
var _SenseVoiceInference = class _SenseVoiceInference {
|
|
4294
4305
|
constructor(config) {
|
|
4295
4306
|
this.session = null;
|
|
4296
4307
|
this.ort = null;
|
|
4297
4308
|
this._backend = "wasm";
|
|
4298
4309
|
this.isLoading = false;
|
|
4299
4310
|
this.inferenceQueue = Promise.resolve();
|
|
4311
|
+
// Session health: set to true if session.run() times out.
|
|
4312
|
+
// A timed-out session may have a zombie WASM dispatch still running,
|
|
4313
|
+
// so all future transcribe() calls reject immediately to prevent concurrent access.
|
|
4314
|
+
this.poisoned = false;
|
|
4315
|
+
// 10s for SenseVoice (heavier preprocessing)
|
|
4300
4316
|
// Preprocessing state (loaded once)
|
|
4301
4317
|
this.tokenMap = null;
|
|
4302
4318
|
this.negMean = null;
|
|
@@ -4444,6 +4460,9 @@ var SenseVoiceInference = class {
|
|
|
4444
4460
|
if (!this.session || !this.ort || !this.tokenMap) {
|
|
4445
4461
|
throw new Error("Model not loaded. Call load() first.");
|
|
4446
4462
|
}
|
|
4463
|
+
if (this.poisoned) {
|
|
4464
|
+
throw new Error("SenseVoice session timed out \u2014 inference unavailable until page reload");
|
|
4465
|
+
}
|
|
4447
4466
|
const audio = new Float32Array(audioSamples);
|
|
4448
4467
|
return this.queueInference(audio);
|
|
4449
4468
|
}
|
|
@@ -4481,7 +4500,19 @@ var SenseVoiceInference = class {
|
|
|
4481
4500
|
language: new ort.Tensor("int32", new Int32Array([this.languageId]), [1]),
|
|
4482
4501
|
text_norm: new ort.Tensor("int32", new Int32Array([this.textNormId]), [1])
|
|
4483
4502
|
};
|
|
4484
|
-
|
|
4503
|
+
let timeoutId;
|
|
4504
|
+
const results = await Promise.race([
|
|
4505
|
+
this.session.run(feeds).then((r) => {
|
|
4506
|
+
clearTimeout(timeoutId);
|
|
4507
|
+
return r;
|
|
4508
|
+
}),
|
|
4509
|
+
new Promise((_, rej) => {
|
|
4510
|
+
timeoutId = setTimeout(
|
|
4511
|
+
() => rej(new Error(`SenseVoice inference timed out after ${_SenseVoiceInference.INFERENCE_TIMEOUT_MS}ms`)),
|
|
4512
|
+
_SenseVoiceInference.INFERENCE_TIMEOUT_MS
|
|
4513
|
+
);
|
|
4514
|
+
})
|
|
4515
|
+
]);
|
|
4485
4516
|
const logitsOutput = results["logits"];
|
|
4486
4517
|
if (!logitsOutput) {
|
|
4487
4518
|
throw new Error('Model output missing "logits" tensor');
|
|
@@ -4527,6 +4558,32 @@ var SenseVoiceInference = class {
|
|
|
4527
4558
|
preprocessTimeMs
|
|
4528
4559
|
});
|
|
4529
4560
|
} catch (err) {
|
|
4561
|
+
const errMsg = err instanceof Error ? err.message : String(err);
|
|
4562
|
+
if (errMsg.includes("timed out")) {
|
|
4563
|
+
this.poisoned = true;
|
|
4564
|
+
logger4.error("CRITICAL: Inference session timed out \u2014 SenseVoice is dead. Page reload required.", {
|
|
4565
|
+
backend: this._backend,
|
|
4566
|
+
timeoutMs: _SenseVoiceInference.INFERENCE_TIMEOUT_MS
|
|
4567
|
+
});
|
|
4568
|
+
} else if (typeof err === "number") {
|
|
4569
|
+
const oomError = new Error(
|
|
4570
|
+
`SenseVoice inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
|
|
4571
|
+
);
|
|
4572
|
+
logger4.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
4573
|
+
pointer: `0x${err.toString(16)}`,
|
|
4574
|
+
backend: this._backend
|
|
4575
|
+
});
|
|
4576
|
+
span?.endWithError(oomError);
|
|
4577
|
+
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
4578
|
+
model: "sensevoice",
|
|
4579
|
+
backend: this._backend,
|
|
4580
|
+
status: "error"
|
|
4581
|
+
});
|
|
4582
|
+
reject(oomError);
|
|
4583
|
+
return;
|
|
4584
|
+
} else {
|
|
4585
|
+
logger4.error("Inference failed", { error: errMsg, backend: this._backend });
|
|
4586
|
+
}
|
|
4530
4587
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
4531
4588
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
4532
4589
|
model: "sensevoice",
|
|
@@ -4550,10 +4607,12 @@ var SenseVoiceInference = class {
|
|
|
4550
4607
|
this.invStddev = null;
|
|
4551
4608
|
}
|
|
4552
4609
|
};
|
|
4610
|
+
_SenseVoiceInference.INFERENCE_TIMEOUT_MS = 1e4;
|
|
4611
|
+
var SenseVoiceInference = _SenseVoiceInference;
|
|
4553
4612
|
|
|
4554
4613
|
// src/inference/Wav2ArkitCpuInference.ts
|
|
4555
4614
|
var logger5 = createLogger("Wav2ArkitCpu");
|
|
4556
|
-
var
|
|
4615
|
+
var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
4557
4616
|
constructor(config) {
|
|
4558
4617
|
this.modelId = "wav2arkit_cpu";
|
|
4559
4618
|
this.session = null;
|
|
@@ -4562,6 +4621,10 @@ var Wav2ArkitCpuInference = class {
|
|
|
4562
4621
|
this.isLoading = false;
|
|
4563
4622
|
// Inference queue for handling concurrent calls
|
|
4564
4623
|
this.inferenceQueue = Promise.resolve();
|
|
4624
|
+
// Session health: set to true if session.run() times out.
|
|
4625
|
+
// A timed-out session may have a zombie WASM dispatch still running,
|
|
4626
|
+
// so all future infer() calls reject immediately to prevent concurrent access.
|
|
4627
|
+
this.poisoned = false;
|
|
4565
4628
|
this.config = config;
|
|
4566
4629
|
}
|
|
4567
4630
|
get backend() {
|
|
@@ -4734,6 +4797,9 @@ var Wav2ArkitCpuInference = class {
|
|
|
4734
4797
|
if (!this.session) {
|
|
4735
4798
|
throw new Error("Model not loaded. Call load() first.");
|
|
4736
4799
|
}
|
|
4800
|
+
if (this.poisoned) {
|
|
4801
|
+
throw new Error("Wav2ArkitCpu session timed out \u2014 inference unavailable until page reload");
|
|
4802
|
+
}
|
|
4737
4803
|
const audioCopy = new Float32Array(audioSamples);
|
|
4738
4804
|
const feeds = {
|
|
4739
4805
|
"audio_waveform": new this.ort.Tensor("float32", audioCopy, [1, audioCopy.length])
|
|
@@ -4753,7 +4819,19 @@ var Wav2ArkitCpuInference = class {
|
|
|
4753
4819
|
});
|
|
4754
4820
|
try {
|
|
4755
4821
|
const startTime = performance.now();
|
|
4756
|
-
|
|
4822
|
+
let timeoutId;
|
|
4823
|
+
const results = await Promise.race([
|
|
4824
|
+
this.session.run(feeds).then((r) => {
|
|
4825
|
+
clearTimeout(timeoutId);
|
|
4826
|
+
return r;
|
|
4827
|
+
}),
|
|
4828
|
+
new Promise((_, rej) => {
|
|
4829
|
+
timeoutId = setTimeout(
|
|
4830
|
+
() => rej(new Error(`Wav2ArkitCpu inference timed out after ${_Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS}ms`)),
|
|
4831
|
+
_Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS
|
|
4832
|
+
);
|
|
4833
|
+
})
|
|
4834
|
+
]);
|
|
4757
4835
|
const inferenceTimeMs = performance.now() - startTime;
|
|
4758
4836
|
const blendshapeOutput = results["blendshapes"];
|
|
4759
4837
|
if (!blendshapeOutput) {
|
|
@@ -4793,6 +4871,32 @@ var Wav2ArkitCpuInference = class {
|
|
|
4793
4871
|
inferenceTimeMs
|
|
4794
4872
|
});
|
|
4795
4873
|
} catch (err) {
|
|
4874
|
+
const errMsg = err instanceof Error ? err.message : String(err);
|
|
4875
|
+
if (errMsg.includes("timed out")) {
|
|
4876
|
+
this.poisoned = true;
|
|
4877
|
+
logger5.error("CRITICAL: Inference session timed out \u2014 Wav2ArkitCpu is dead. Page reload required.", {
|
|
4878
|
+
backend: this._backend,
|
|
4879
|
+
timeoutMs: _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS
|
|
4880
|
+
});
|
|
4881
|
+
} else if (typeof err === "number") {
|
|
4882
|
+
const oomError = new Error(
|
|
4883
|
+
`Wav2ArkitCpu inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
|
|
4884
|
+
);
|
|
4885
|
+
logger5.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
4886
|
+
pointer: `0x${err.toString(16)}`,
|
|
4887
|
+
backend: this._backend
|
|
4888
|
+
});
|
|
4889
|
+
span?.endWithError(oomError);
|
|
4890
|
+
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
4891
|
+
model: "wav2arkit_cpu",
|
|
4892
|
+
backend: this._backend,
|
|
4893
|
+
status: "error"
|
|
4894
|
+
});
|
|
4895
|
+
reject(oomError);
|
|
4896
|
+
return;
|
|
4897
|
+
} else {
|
|
4898
|
+
logger5.error("Inference failed", { error: errMsg, backend: this._backend });
|
|
4899
|
+
}
|
|
4796
4900
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
4797
4901
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
4798
4902
|
model: "wav2arkit_cpu",
|
|
@@ -4814,6 +4918,8 @@ var Wav2ArkitCpuInference = class {
|
|
|
4814
4918
|
}
|
|
4815
4919
|
}
|
|
4816
4920
|
};
|
|
4921
|
+
_Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS = 5e3;
|
|
4922
|
+
var Wav2ArkitCpuInference = _Wav2ArkitCpuInference;
|
|
4817
4923
|
|
|
4818
4924
|
// src/inference/createLipSync.ts
|
|
4819
4925
|
var logger6 = createLogger("createLipSync");
|
|
@@ -5143,23 +5249,13 @@ var SileroVADInference = class {
|
|
|
5143
5249
|
}
|
|
5144
5250
|
return segments;
|
|
5145
5251
|
}
|
|
5146
|
-
/**
|
|
5147
|
-
* Calculate RMS energy of audio chunk
|
|
5148
|
-
*/
|
|
5149
|
-
calculateRMS(samples) {
|
|
5150
|
-
let sum = 0;
|
|
5151
|
-
for (let i = 0; i < samples.length; i++) {
|
|
5152
|
-
sum += samples[i] * samples[i];
|
|
5153
|
-
}
|
|
5154
|
-
return Math.sqrt(sum / samples.length);
|
|
5155
|
-
}
|
|
5156
5252
|
/**
|
|
5157
5253
|
* Queue inference to serialize ONNX session calls
|
|
5158
5254
|
*/
|
|
5159
5255
|
queueInference(audioChunk) {
|
|
5160
5256
|
const audioChunkCopy = new Float32Array(audioChunk);
|
|
5161
5257
|
const MIN_ENERGY_THRESHOLD = 1e-3;
|
|
5162
|
-
const rms =
|
|
5258
|
+
const rms = calculateRMS(audioChunkCopy);
|
|
5163
5259
|
if (rms < MIN_ENERGY_THRESHOLD) {
|
|
5164
5260
|
if (!this.wasSpeaking) {
|
|
5165
5261
|
this.preSpeechBuffer.push(new Float32Array(audioChunkCopy));
|
|
@@ -5214,7 +5310,7 @@ var SileroVADInference = class {
|
|
|
5214
5310
|
[2, 1, 128]
|
|
5215
5311
|
);
|
|
5216
5312
|
}
|
|
5217
|
-
this.context =
|
|
5313
|
+
this.context = audioChunkCopy.slice(-this.contextSize);
|
|
5218
5314
|
const inferenceTimeMs = performance.now() - startTime;
|
|
5219
5315
|
const isSpeech = probability > this.config.threshold;
|
|
5220
5316
|
let preSpeechChunks;
|
|
@@ -5226,7 +5322,7 @@ var SileroVADInference = class {
|
|
|
5226
5322
|
durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
|
|
5227
5323
|
});
|
|
5228
5324
|
} else if (!isSpeech && !this.wasSpeaking) {
|
|
5229
|
-
this.preSpeechBuffer.push(new Float32Array(
|
|
5325
|
+
this.preSpeechBuffer.push(new Float32Array(audioChunkCopy));
|
|
5230
5326
|
if (this.preSpeechBuffer.length > this.config.preSpeechBufferChunks) {
|
|
5231
5327
|
this.preSpeechBuffer.shift();
|
|
5232
5328
|
}
|
|
@@ -5261,13 +5357,30 @@ var SileroVADInference = class {
|
|
|
5261
5357
|
preSpeechChunks
|
|
5262
5358
|
});
|
|
5263
5359
|
} catch (err) {
|
|
5264
|
-
|
|
5265
|
-
|
|
5266
|
-
|
|
5267
|
-
|
|
5268
|
-
|
|
5269
|
-
|
|
5270
|
-
|
|
5360
|
+
if (typeof err === "number") {
|
|
5361
|
+
const oomError = new Error(
|
|
5362
|
+
`SileroVAD inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reducing concurrent model sessions or reloading the page.`
|
|
5363
|
+
);
|
|
5364
|
+
logger7.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
5365
|
+
pointer: `0x${err.toString(16)}`,
|
|
5366
|
+
backend: this._backend
|
|
5367
|
+
});
|
|
5368
|
+
span?.endWithError(oomError);
|
|
5369
|
+
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
5370
|
+
model: "silero-vad",
|
|
5371
|
+
backend: this._backend,
|
|
5372
|
+
status: "error"
|
|
5373
|
+
});
|
|
5374
|
+
reject(oomError);
|
|
5375
|
+
} else {
|
|
5376
|
+
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
5377
|
+
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
5378
|
+
model: "silero-vad",
|
|
5379
|
+
backend: this._backend,
|
|
5380
|
+
status: "error"
|
|
5381
|
+
});
|
|
5382
|
+
reject(err);
|
|
5383
|
+
}
|
|
5271
5384
|
}
|
|
5272
5385
|
});
|
|
5273
5386
|
});
|
|
@@ -6494,7 +6607,7 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
6494
6607
|
console.error("[AgentCore] VAD error during interruption detection:", error);
|
|
6495
6608
|
});
|
|
6496
6609
|
}
|
|
6497
|
-
const float32 = audio instanceof Float32Array ? audio :
|
|
6610
|
+
const float32 = audio instanceof Float32Array ? audio : int16ToFloat32(audio);
|
|
6498
6611
|
this.audioBuffer.push(float32);
|
|
6499
6612
|
this.scheduleTranscription();
|
|
6500
6613
|
}
|
|
@@ -6826,7 +6939,7 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
6826
6939
|
* Falls back to simple RMS if VAD not available
|
|
6827
6940
|
*/
|
|
6828
6941
|
async detectVoiceActivity(audio) {
|
|
6829
|
-
const float32 = audio instanceof Float32Array ? audio :
|
|
6942
|
+
const float32 = audio instanceof Float32Array ? audio : int16ToFloat32(audio);
|
|
6830
6943
|
if (this.vad) {
|
|
6831
6944
|
const chunkSize = this.vad.getChunkSize();
|
|
6832
6945
|
for (let i = 0; i + chunkSize <= float32.length; i += chunkSize) {
|
|
@@ -6845,13 +6958,6 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
6845
6958
|
const rms = Math.sqrt(sum / float32.length);
|
|
6846
6959
|
return rms > 0.02;
|
|
6847
6960
|
}
|
|
6848
|
-
int16ToFloat32(int16) {
|
|
6849
|
-
const float32 = new Float32Array(int16.length);
|
|
6850
|
-
for (let i = 0; i < int16.length; i++) {
|
|
6851
|
-
float32[i] = int16[i] / 32768;
|
|
6852
|
-
}
|
|
6853
|
-
return float32;
|
|
6854
|
-
}
|
|
6855
6961
|
base64ToArrayBuffer(base64) {
|
|
6856
6962
|
const binaryString = atob(base64);
|
|
6857
6963
|
const bytes = new Uint8Array(binaryString.length);
|