@omote/core 0.4.3 → 0.4.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -469,7 +469,14 @@ var AudioScheduler = class {
469
469
  source.connect(gainNode);
470
470
  const scheduleTime = this.nextPlayTime;
471
471
  source.start(scheduleTime);
472
- this.scheduledSources.push({ source, gainNode });
472
+ const entry = { source, gainNode };
473
+ this.scheduledSources.push(entry);
474
+ source.onended = () => {
475
+ const idx = this.scheduledSources.indexOf(entry);
476
+ if (idx !== -1) {
477
+ this.scheduledSources.splice(idx, 1);
478
+ }
479
+ };
473
480
  const duration = audioData.length / ctx.sampleRate;
474
481
  this.nextPlayTime = scheduleTime + duration;
475
482
  return scheduleTime;
@@ -825,7 +832,7 @@ var LAMPipeline = class {
825
832
  }
826
833
  };
827
834
 
828
- // src/audio/SyncedAudioPipeline.ts
835
+ // src/audio/audioUtils.ts
829
836
  function pcm16ToFloat32(buffer) {
830
837
  const byteLen = buffer.byteLength & ~1;
831
838
  const int16 = byteLen === buffer.byteLength ? new Int16Array(buffer) : new Int16Array(buffer, 0, byteLen / 2);
@@ -835,6 +842,15 @@ function pcm16ToFloat32(buffer) {
835
842
  }
836
843
  return float32;
837
844
  }
845
+ function int16ToFloat32(int16) {
846
+ const float32 = new Float32Array(int16.length);
847
+ for (let i = 0; i < int16.length; i++) {
848
+ float32[i] = int16[i] / 32768;
849
+ }
850
+ return float32;
851
+ }
852
+
853
+ // src/audio/SyncedAudioPipeline.ts
838
854
  var SyncedAudioPipeline = class extends EventEmitter {
839
855
  constructor(options) {
840
856
  super();
@@ -3536,14 +3552,18 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
3536
3552
  });
3537
3553
  try {
3538
3554
  const startTime = performance.now();
3555
+ let timeoutId;
3539
3556
  const results = await Promise.race([
3540
- this.session.run(feeds),
3541
- new Promise(
3542
- (_, rej) => setTimeout(
3557
+ this.session.run(feeds).then((r) => {
3558
+ clearTimeout(timeoutId);
3559
+ return r;
3560
+ }),
3561
+ new Promise((_, rej) => {
3562
+ timeoutId = setTimeout(
3543
3563
  () => rej(new Error(`Wav2Vec2 inference timed out after ${_Wav2Vec2Inference.INFERENCE_TIMEOUT_MS}ms`)),
3544
3564
  _Wav2Vec2Inference.INFERENCE_TIMEOUT_MS
3545
- )
3546
- )
3565
+ );
3566
+ })
3547
3567
  ]);
3548
3568
  const inferenceTimeMs = performance.now() - startTime;
3549
3569
  const asrOutput = results["asr_logits"];
@@ -3649,15 +3669,6 @@ var Wav2Vec2Inference = _Wav2Vec2Inference;
3649
3669
 
3650
3670
  // src/audio/FullFacePipeline.ts
3651
3671
  var logger3 = createLogger("FullFacePipeline");
3652
- function pcm16ToFloat322(buffer) {
3653
- const byteLen = buffer.byteLength & ~1;
3654
- const int16 = byteLen === buffer.byteLength ? new Int16Array(buffer) : new Int16Array(buffer, 0, byteLen / 2);
3655
- const float32 = new Float32Array(int16.length);
3656
- for (let i = 0; i < int16.length; i++) {
3657
- float32[i] = int16[i] / 32768;
3658
- }
3659
- return float32;
3660
- }
3661
3672
  var BLENDSHAPE_INDEX_MAP = /* @__PURE__ */ new Map();
3662
3673
  LAM_BLENDSHAPES.forEach((name, index) => {
3663
3674
  BLENDSHAPE_INDEX_MAP.set(name, index);
@@ -3807,7 +3818,7 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
3807
3818
  if (!combined) {
3808
3819
  return;
3809
3820
  }
3810
- const float32 = pcm16ToFloat322(combined);
3821
+ const float32 = pcm16ToFloat32(combined);
3811
3822
  const scheduleTime = await this.scheduler.schedule(float32);
3812
3823
  if (!this.playbackStarted) {
3813
3824
  this.playbackStarted = true;
@@ -4290,13 +4301,18 @@ function ctcGreedyDecode(logits, seqLen, vocabSize, tokenMap) {
4290
4301
 
4291
4302
  // src/inference/SenseVoiceInference.ts
4292
4303
  var logger4 = createLogger("SenseVoice");
4293
- var SenseVoiceInference = class {
4304
+ var _SenseVoiceInference = class _SenseVoiceInference {
4294
4305
  constructor(config) {
4295
4306
  this.session = null;
4296
4307
  this.ort = null;
4297
4308
  this._backend = "wasm";
4298
4309
  this.isLoading = false;
4299
4310
  this.inferenceQueue = Promise.resolve();
4311
+ // Session health: set to true if session.run() times out.
4312
+ // A timed-out session may have a zombie WASM dispatch still running,
4313
+ // so all future transcribe() calls reject immediately to prevent concurrent access.
4314
+ this.poisoned = false;
4315
+ // 10s for SenseVoice (heavier preprocessing)
4300
4316
  // Preprocessing state (loaded once)
4301
4317
  this.tokenMap = null;
4302
4318
  this.negMean = null;
@@ -4444,6 +4460,9 @@ var SenseVoiceInference = class {
4444
4460
  if (!this.session || !this.ort || !this.tokenMap) {
4445
4461
  throw new Error("Model not loaded. Call load() first.");
4446
4462
  }
4463
+ if (this.poisoned) {
4464
+ throw new Error("SenseVoice session timed out \u2014 inference unavailable until page reload");
4465
+ }
4447
4466
  const audio = new Float32Array(audioSamples);
4448
4467
  return this.queueInference(audio);
4449
4468
  }
@@ -4481,7 +4500,19 @@ var SenseVoiceInference = class {
4481
4500
  language: new ort.Tensor("int32", new Int32Array([this.languageId]), [1]),
4482
4501
  text_norm: new ort.Tensor("int32", new Int32Array([this.textNormId]), [1])
4483
4502
  };
4484
- const results = await this.session.run(feeds);
4503
+ let timeoutId;
4504
+ const results = await Promise.race([
4505
+ this.session.run(feeds).then((r) => {
4506
+ clearTimeout(timeoutId);
4507
+ return r;
4508
+ }),
4509
+ new Promise((_, rej) => {
4510
+ timeoutId = setTimeout(
4511
+ () => rej(new Error(`SenseVoice inference timed out after ${_SenseVoiceInference.INFERENCE_TIMEOUT_MS}ms`)),
4512
+ _SenseVoiceInference.INFERENCE_TIMEOUT_MS
4513
+ );
4514
+ })
4515
+ ]);
4485
4516
  const logitsOutput = results["logits"];
4486
4517
  if (!logitsOutput) {
4487
4518
  throw new Error('Model output missing "logits" tensor');
@@ -4527,6 +4558,32 @@ var SenseVoiceInference = class {
4527
4558
  preprocessTimeMs
4528
4559
  });
4529
4560
  } catch (err) {
4561
+ const errMsg = err instanceof Error ? err.message : String(err);
4562
+ if (errMsg.includes("timed out")) {
4563
+ this.poisoned = true;
4564
+ logger4.error("CRITICAL: Inference session timed out \u2014 SenseVoice is dead. Page reload required.", {
4565
+ backend: this._backend,
4566
+ timeoutMs: _SenseVoiceInference.INFERENCE_TIMEOUT_MS
4567
+ });
4568
+ } else if (typeof err === "number") {
4569
+ const oomError = new Error(
4570
+ `SenseVoice inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
4571
+ );
4572
+ logger4.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
4573
+ pointer: `0x${err.toString(16)}`,
4574
+ backend: this._backend
4575
+ });
4576
+ span?.endWithError(oomError);
4577
+ telemetry?.incrementCounter("omote.inference.total", 1, {
4578
+ model: "sensevoice",
4579
+ backend: this._backend,
4580
+ status: "error"
4581
+ });
4582
+ reject(oomError);
4583
+ return;
4584
+ } else {
4585
+ logger4.error("Inference failed", { error: errMsg, backend: this._backend });
4586
+ }
4530
4587
  span?.endWithError(err instanceof Error ? err : new Error(String(err)));
4531
4588
  telemetry?.incrementCounter("omote.inference.total", 1, {
4532
4589
  model: "sensevoice",
@@ -4550,10 +4607,12 @@ var SenseVoiceInference = class {
4550
4607
  this.invStddev = null;
4551
4608
  }
4552
4609
  };
4610
+ _SenseVoiceInference.INFERENCE_TIMEOUT_MS = 1e4;
4611
+ var SenseVoiceInference = _SenseVoiceInference;
4553
4612
 
4554
4613
  // src/inference/Wav2ArkitCpuInference.ts
4555
4614
  var logger5 = createLogger("Wav2ArkitCpu");
4556
- var Wav2ArkitCpuInference = class {
4615
+ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
4557
4616
  constructor(config) {
4558
4617
  this.modelId = "wav2arkit_cpu";
4559
4618
  this.session = null;
@@ -4562,6 +4621,10 @@ var Wav2ArkitCpuInference = class {
4562
4621
  this.isLoading = false;
4563
4622
  // Inference queue for handling concurrent calls
4564
4623
  this.inferenceQueue = Promise.resolve();
4624
+ // Session health: set to true if session.run() times out.
4625
+ // A timed-out session may have a zombie WASM dispatch still running,
4626
+ // so all future infer() calls reject immediately to prevent concurrent access.
4627
+ this.poisoned = false;
4565
4628
  this.config = config;
4566
4629
  }
4567
4630
  get backend() {
@@ -4734,6 +4797,9 @@ var Wav2ArkitCpuInference = class {
4734
4797
  if (!this.session) {
4735
4798
  throw new Error("Model not loaded. Call load() first.");
4736
4799
  }
4800
+ if (this.poisoned) {
4801
+ throw new Error("Wav2ArkitCpu session timed out \u2014 inference unavailable until page reload");
4802
+ }
4737
4803
  const audioCopy = new Float32Array(audioSamples);
4738
4804
  const feeds = {
4739
4805
  "audio_waveform": new this.ort.Tensor("float32", audioCopy, [1, audioCopy.length])
@@ -4753,7 +4819,19 @@ var Wav2ArkitCpuInference = class {
4753
4819
  });
4754
4820
  try {
4755
4821
  const startTime = performance.now();
4756
- const results = await this.session.run(feeds);
4822
+ let timeoutId;
4823
+ const results = await Promise.race([
4824
+ this.session.run(feeds).then((r) => {
4825
+ clearTimeout(timeoutId);
4826
+ return r;
4827
+ }),
4828
+ new Promise((_, rej) => {
4829
+ timeoutId = setTimeout(
4830
+ () => rej(new Error(`Wav2ArkitCpu inference timed out after ${_Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS}ms`)),
4831
+ _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS
4832
+ );
4833
+ })
4834
+ ]);
4757
4835
  const inferenceTimeMs = performance.now() - startTime;
4758
4836
  const blendshapeOutput = results["blendshapes"];
4759
4837
  if (!blendshapeOutput) {
@@ -4793,6 +4871,32 @@ var Wav2ArkitCpuInference = class {
4793
4871
  inferenceTimeMs
4794
4872
  });
4795
4873
  } catch (err) {
4874
+ const errMsg = err instanceof Error ? err.message : String(err);
4875
+ if (errMsg.includes("timed out")) {
4876
+ this.poisoned = true;
4877
+ logger5.error("CRITICAL: Inference session timed out \u2014 Wav2ArkitCpu is dead. Page reload required.", {
4878
+ backend: this._backend,
4879
+ timeoutMs: _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS
4880
+ });
4881
+ } else if (typeof err === "number") {
4882
+ const oomError = new Error(
4883
+ `Wav2ArkitCpu inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
4884
+ );
4885
+ logger5.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
4886
+ pointer: `0x${err.toString(16)}`,
4887
+ backend: this._backend
4888
+ });
4889
+ span?.endWithError(oomError);
4890
+ telemetry?.incrementCounter("omote.inference.total", 1, {
4891
+ model: "wav2arkit_cpu",
4892
+ backend: this._backend,
4893
+ status: "error"
4894
+ });
4895
+ reject(oomError);
4896
+ return;
4897
+ } else {
4898
+ logger5.error("Inference failed", { error: errMsg, backend: this._backend });
4899
+ }
4796
4900
  span?.endWithError(err instanceof Error ? err : new Error(String(err)));
4797
4901
  telemetry?.incrementCounter("omote.inference.total", 1, {
4798
4902
  model: "wav2arkit_cpu",
@@ -4814,6 +4918,8 @@ var Wav2ArkitCpuInference = class {
4814
4918
  }
4815
4919
  }
4816
4920
  };
4921
+ _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS = 5e3;
4922
+ var Wav2ArkitCpuInference = _Wav2ArkitCpuInference;
4817
4923
 
4818
4924
  // src/inference/createLipSync.ts
4819
4925
  var logger6 = createLogger("createLipSync");
@@ -5143,23 +5249,13 @@ var SileroVADInference = class {
5143
5249
  }
5144
5250
  return segments;
5145
5251
  }
5146
- /**
5147
- * Calculate RMS energy of audio chunk
5148
- */
5149
- calculateRMS(samples) {
5150
- let sum = 0;
5151
- for (let i = 0; i < samples.length; i++) {
5152
- sum += samples[i] * samples[i];
5153
- }
5154
- return Math.sqrt(sum / samples.length);
5155
- }
5156
5252
  /**
5157
5253
  * Queue inference to serialize ONNX session calls
5158
5254
  */
5159
5255
  queueInference(audioChunk) {
5160
5256
  const audioChunkCopy = new Float32Array(audioChunk);
5161
5257
  const MIN_ENERGY_THRESHOLD = 1e-3;
5162
- const rms = this.calculateRMS(audioChunkCopy);
5258
+ const rms = calculateRMS(audioChunkCopy);
5163
5259
  if (rms < MIN_ENERGY_THRESHOLD) {
5164
5260
  if (!this.wasSpeaking) {
5165
5261
  this.preSpeechBuffer.push(new Float32Array(audioChunkCopy));
@@ -5214,7 +5310,7 @@ var SileroVADInference = class {
5214
5310
  [2, 1, 128]
5215
5311
  );
5216
5312
  }
5217
- this.context = audioChunk.slice(-this.contextSize);
5313
+ this.context = audioChunkCopy.slice(-this.contextSize);
5218
5314
  const inferenceTimeMs = performance.now() - startTime;
5219
5315
  const isSpeech = probability > this.config.threshold;
5220
5316
  let preSpeechChunks;
@@ -5226,7 +5322,7 @@ var SileroVADInference = class {
5226
5322
  durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
5227
5323
  });
5228
5324
  } else if (!isSpeech && !this.wasSpeaking) {
5229
- this.preSpeechBuffer.push(new Float32Array(audioChunk));
5325
+ this.preSpeechBuffer.push(new Float32Array(audioChunkCopy));
5230
5326
  if (this.preSpeechBuffer.length > this.config.preSpeechBufferChunks) {
5231
5327
  this.preSpeechBuffer.shift();
5232
5328
  }
@@ -5261,13 +5357,30 @@ var SileroVADInference = class {
5261
5357
  preSpeechChunks
5262
5358
  });
5263
5359
  } catch (err) {
5264
- span?.endWithError(err instanceof Error ? err : new Error(String(err)));
5265
- telemetry?.incrementCounter("omote.inference.total", 1, {
5266
- model: "silero-vad",
5267
- backend: this._backend,
5268
- status: "error"
5269
- });
5270
- reject(err);
5360
+ if (typeof err === "number") {
5361
+ const oomError = new Error(
5362
+ `SileroVAD inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reducing concurrent model sessions or reloading the page.`
5363
+ );
5364
+ logger7.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
5365
+ pointer: `0x${err.toString(16)}`,
5366
+ backend: this._backend
5367
+ });
5368
+ span?.endWithError(oomError);
5369
+ telemetry?.incrementCounter("omote.inference.total", 1, {
5370
+ model: "silero-vad",
5371
+ backend: this._backend,
5372
+ status: "error"
5373
+ });
5374
+ reject(oomError);
5375
+ } else {
5376
+ span?.endWithError(err instanceof Error ? err : new Error(String(err)));
5377
+ telemetry?.incrementCounter("omote.inference.total", 1, {
5378
+ model: "silero-vad",
5379
+ backend: this._backend,
5380
+ status: "error"
5381
+ });
5382
+ reject(err);
5383
+ }
5271
5384
  }
5272
5385
  });
5273
5386
  });
@@ -6494,7 +6607,7 @@ var AgentCoreAdapter = class extends EventEmitter {
6494
6607
  console.error("[AgentCore] VAD error during interruption detection:", error);
6495
6608
  });
6496
6609
  }
6497
- const float32 = audio instanceof Float32Array ? audio : this.int16ToFloat32(audio);
6610
+ const float32 = audio instanceof Float32Array ? audio : int16ToFloat32(audio);
6498
6611
  this.audioBuffer.push(float32);
6499
6612
  this.scheduleTranscription();
6500
6613
  }
@@ -6826,7 +6939,7 @@ var AgentCoreAdapter = class extends EventEmitter {
6826
6939
  * Falls back to simple RMS if VAD not available
6827
6940
  */
6828
6941
  async detectVoiceActivity(audio) {
6829
- const float32 = audio instanceof Float32Array ? audio : this.int16ToFloat32(audio);
6942
+ const float32 = audio instanceof Float32Array ? audio : int16ToFloat32(audio);
6830
6943
  if (this.vad) {
6831
6944
  const chunkSize = this.vad.getChunkSize();
6832
6945
  for (let i = 0; i + chunkSize <= float32.length; i += chunkSize) {
@@ -6845,13 +6958,6 @@ var AgentCoreAdapter = class extends EventEmitter {
6845
6958
  const rms = Math.sqrt(sum / float32.length);
6846
6959
  return rms > 0.02;
6847
6960
  }
6848
- int16ToFloat32(int16) {
6849
- const float32 = new Float32Array(int16.length);
6850
- for (let i = 0; i < int16.length; i++) {
6851
- float32[i] = int16[i] / 32768;
6852
- }
6853
- return float32;
6854
- }
6855
6961
  base64ToArrayBuffer(base64) {
6856
6962
  const binaryString = atob(base64);
6857
6963
  const bytes = new Uint8Array(binaryString.length);