@omote/core 0.4.3 → 0.4.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -312,7 +312,14 @@ var AudioScheduler = class {
312
312
  source.connect(gainNode);
313
313
  const scheduleTime = this.nextPlayTime;
314
314
  source.start(scheduleTime);
315
- this.scheduledSources.push({ source, gainNode });
315
+ const entry = { source, gainNode };
316
+ this.scheduledSources.push(entry);
317
+ source.onended = () => {
318
+ const idx = this.scheduledSources.indexOf(entry);
319
+ if (idx !== -1) {
320
+ this.scheduledSources.splice(idx, 1);
321
+ }
322
+ };
316
323
  const duration = audioData.length / ctx.sampleRate;
317
324
  this.nextPlayTime = scheduleTime + duration;
318
325
  return scheduleTime;
@@ -668,7 +675,7 @@ var LAMPipeline = class {
668
675
  }
669
676
  };
670
677
 
671
- // src/audio/SyncedAudioPipeline.ts
678
+ // src/audio/audioUtils.ts
672
679
  function pcm16ToFloat32(buffer) {
673
680
  const byteLen = buffer.byteLength & ~1;
674
681
  const int16 = byteLen === buffer.byteLength ? new Int16Array(buffer) : new Int16Array(buffer, 0, byteLen / 2);
@@ -678,6 +685,15 @@ function pcm16ToFloat32(buffer) {
678
685
  }
679
686
  return float32;
680
687
  }
688
+ function int16ToFloat32(int16) {
689
+ const float32 = new Float32Array(int16.length);
690
+ for (let i = 0; i < int16.length; i++) {
691
+ float32[i] = int16[i] / 32768;
692
+ }
693
+ return float32;
694
+ }
695
+
696
+ // src/audio/SyncedAudioPipeline.ts
681
697
  var SyncedAudioPipeline = class extends EventEmitter {
682
698
  constructor(options) {
683
699
  super();
@@ -3115,14 +3131,18 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
3115
3131
  });
3116
3132
  try {
3117
3133
  const startTime = performance.now();
3134
+ let timeoutId;
3118
3135
  const results = await Promise.race([
3119
- this.session.run(feeds),
3120
- new Promise(
3121
- (_, rej) => setTimeout(
3136
+ this.session.run(feeds).then((r) => {
3137
+ clearTimeout(timeoutId);
3138
+ return r;
3139
+ }),
3140
+ new Promise((_, rej) => {
3141
+ timeoutId = setTimeout(
3122
3142
  () => rej(new Error(`Wav2Vec2 inference timed out after ${_Wav2Vec2Inference.INFERENCE_TIMEOUT_MS}ms`)),
3123
3143
  _Wav2Vec2Inference.INFERENCE_TIMEOUT_MS
3124
- )
3125
- )
3144
+ );
3145
+ })
3126
3146
  ]);
3127
3147
  const inferenceTimeMs = performance.now() - startTime;
3128
3148
  const asrOutput = results["asr_logits"];
@@ -3228,15 +3248,6 @@ var Wav2Vec2Inference = _Wav2Vec2Inference;
3228
3248
 
3229
3249
  // src/audio/FullFacePipeline.ts
3230
3250
  var logger3 = createLogger("FullFacePipeline");
3231
- function pcm16ToFloat322(buffer) {
3232
- const byteLen = buffer.byteLength & ~1;
3233
- const int16 = byteLen === buffer.byteLength ? new Int16Array(buffer) : new Int16Array(buffer, 0, byteLen / 2);
3234
- const float32 = new Float32Array(int16.length);
3235
- for (let i = 0; i < int16.length; i++) {
3236
- float32[i] = int16[i] / 32768;
3237
- }
3238
- return float32;
3239
- }
3240
3251
  var BLENDSHAPE_INDEX_MAP = /* @__PURE__ */ new Map();
3241
3252
  LAM_BLENDSHAPES.forEach((name, index) => {
3242
3253
  BLENDSHAPE_INDEX_MAP.set(name, index);
@@ -3386,7 +3397,7 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
3386
3397
  if (!combined) {
3387
3398
  return;
3388
3399
  }
3389
- const float32 = pcm16ToFloat322(combined);
3400
+ const float32 = pcm16ToFloat32(combined);
3390
3401
  const scheduleTime = await this.scheduler.schedule(float32);
3391
3402
  if (!this.playbackStarted) {
3392
3403
  this.playbackStarted = true;
@@ -3869,13 +3880,18 @@ function ctcGreedyDecode(logits, seqLen, vocabSize, tokenMap) {
3869
3880
 
3870
3881
  // src/inference/SenseVoiceInference.ts
3871
3882
  var logger4 = createLogger("SenseVoice");
3872
- var SenseVoiceInference = class {
3883
+ var _SenseVoiceInference = class _SenseVoiceInference {
3873
3884
  constructor(config) {
3874
3885
  this.session = null;
3875
3886
  this.ort = null;
3876
3887
  this._backend = "wasm";
3877
3888
  this.isLoading = false;
3878
3889
  this.inferenceQueue = Promise.resolve();
3890
+ // Session health: set to true if session.run() times out.
3891
+ // A timed-out session may have a zombie WASM dispatch still running,
3892
+ // so all future transcribe() calls reject immediately to prevent concurrent access.
3893
+ this.poisoned = false;
3894
+ // 10s for SenseVoice (heavier preprocessing)
3879
3895
  // Preprocessing state (loaded once)
3880
3896
  this.tokenMap = null;
3881
3897
  this.negMean = null;
@@ -4023,6 +4039,9 @@ var SenseVoiceInference = class {
4023
4039
  if (!this.session || !this.ort || !this.tokenMap) {
4024
4040
  throw new Error("Model not loaded. Call load() first.");
4025
4041
  }
4042
+ if (this.poisoned) {
4043
+ throw new Error("SenseVoice session timed out \u2014 inference unavailable until page reload");
4044
+ }
4026
4045
  const audio = new Float32Array(audioSamples);
4027
4046
  return this.queueInference(audio);
4028
4047
  }
@@ -4060,7 +4079,19 @@ var SenseVoiceInference = class {
4060
4079
  language: new ort.Tensor("int32", new Int32Array([this.languageId]), [1]),
4061
4080
  text_norm: new ort.Tensor("int32", new Int32Array([this.textNormId]), [1])
4062
4081
  };
4063
- const results = await this.session.run(feeds);
4082
+ let timeoutId;
4083
+ const results = await Promise.race([
4084
+ this.session.run(feeds).then((r) => {
4085
+ clearTimeout(timeoutId);
4086
+ return r;
4087
+ }),
4088
+ new Promise((_, rej) => {
4089
+ timeoutId = setTimeout(
4090
+ () => rej(new Error(`SenseVoice inference timed out after ${_SenseVoiceInference.INFERENCE_TIMEOUT_MS}ms`)),
4091
+ _SenseVoiceInference.INFERENCE_TIMEOUT_MS
4092
+ );
4093
+ })
4094
+ ]);
4064
4095
  const logitsOutput = results["logits"];
4065
4096
  if (!logitsOutput) {
4066
4097
  throw new Error('Model output missing "logits" tensor');
@@ -4106,6 +4137,32 @@ var SenseVoiceInference = class {
4106
4137
  preprocessTimeMs
4107
4138
  });
4108
4139
  } catch (err) {
4140
+ const errMsg = err instanceof Error ? err.message : String(err);
4141
+ if (errMsg.includes("timed out")) {
4142
+ this.poisoned = true;
4143
+ logger4.error("CRITICAL: Inference session timed out \u2014 SenseVoice is dead. Page reload required.", {
4144
+ backend: this._backend,
4145
+ timeoutMs: _SenseVoiceInference.INFERENCE_TIMEOUT_MS
4146
+ });
4147
+ } else if (typeof err === "number") {
4148
+ const oomError = new Error(
4149
+ `SenseVoice inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
4150
+ );
4151
+ logger4.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
4152
+ pointer: `0x${err.toString(16)}`,
4153
+ backend: this._backend
4154
+ });
4155
+ span?.endWithError(oomError);
4156
+ telemetry?.incrementCounter("omote.inference.total", 1, {
4157
+ model: "sensevoice",
4158
+ backend: this._backend,
4159
+ status: "error"
4160
+ });
4161
+ reject(oomError);
4162
+ return;
4163
+ } else {
4164
+ logger4.error("Inference failed", { error: errMsg, backend: this._backend });
4165
+ }
4109
4166
  span?.endWithError(err instanceof Error ? err : new Error(String(err)));
4110
4167
  telemetry?.incrementCounter("omote.inference.total", 1, {
4111
4168
  model: "sensevoice",
@@ -4129,10 +4186,12 @@ var SenseVoiceInference = class {
4129
4186
  this.invStddev = null;
4130
4187
  }
4131
4188
  };
4189
+ _SenseVoiceInference.INFERENCE_TIMEOUT_MS = 1e4;
4190
+ var SenseVoiceInference = _SenseVoiceInference;
4132
4191
 
4133
4192
  // src/inference/Wav2ArkitCpuInference.ts
4134
4193
  var logger5 = createLogger("Wav2ArkitCpu");
4135
- var Wav2ArkitCpuInference = class {
4194
+ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
4136
4195
  constructor(config) {
4137
4196
  this.modelId = "wav2arkit_cpu";
4138
4197
  this.session = null;
@@ -4141,6 +4200,10 @@ var Wav2ArkitCpuInference = class {
4141
4200
  this.isLoading = false;
4142
4201
  // Inference queue for handling concurrent calls
4143
4202
  this.inferenceQueue = Promise.resolve();
4203
+ // Session health: set to true if session.run() times out.
4204
+ // A timed-out session may have a zombie WASM dispatch still running,
4205
+ // so all future infer() calls reject immediately to prevent concurrent access.
4206
+ this.poisoned = false;
4144
4207
  this.config = config;
4145
4208
  }
4146
4209
  get backend() {
@@ -4313,6 +4376,9 @@ var Wav2ArkitCpuInference = class {
4313
4376
  if (!this.session) {
4314
4377
  throw new Error("Model not loaded. Call load() first.");
4315
4378
  }
4379
+ if (this.poisoned) {
4380
+ throw new Error("Wav2ArkitCpu session timed out \u2014 inference unavailable until page reload");
4381
+ }
4316
4382
  const audioCopy = new Float32Array(audioSamples);
4317
4383
  const feeds = {
4318
4384
  "audio_waveform": new this.ort.Tensor("float32", audioCopy, [1, audioCopy.length])
@@ -4332,7 +4398,19 @@ var Wav2ArkitCpuInference = class {
4332
4398
  });
4333
4399
  try {
4334
4400
  const startTime = performance.now();
4335
- const results = await this.session.run(feeds);
4401
+ let timeoutId;
4402
+ const results = await Promise.race([
4403
+ this.session.run(feeds).then((r) => {
4404
+ clearTimeout(timeoutId);
4405
+ return r;
4406
+ }),
4407
+ new Promise((_, rej) => {
4408
+ timeoutId = setTimeout(
4409
+ () => rej(new Error(`Wav2ArkitCpu inference timed out after ${_Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS}ms`)),
4410
+ _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS
4411
+ );
4412
+ })
4413
+ ]);
4336
4414
  const inferenceTimeMs = performance.now() - startTime;
4337
4415
  const blendshapeOutput = results["blendshapes"];
4338
4416
  if (!blendshapeOutput) {
@@ -4372,6 +4450,32 @@ var Wav2ArkitCpuInference = class {
4372
4450
  inferenceTimeMs
4373
4451
  });
4374
4452
  } catch (err) {
4453
+ const errMsg = err instanceof Error ? err.message : String(err);
4454
+ if (errMsg.includes("timed out")) {
4455
+ this.poisoned = true;
4456
+ logger5.error("CRITICAL: Inference session timed out \u2014 Wav2ArkitCpu is dead. Page reload required.", {
4457
+ backend: this._backend,
4458
+ timeoutMs: _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS
4459
+ });
4460
+ } else if (typeof err === "number") {
4461
+ const oomError = new Error(
4462
+ `Wav2ArkitCpu inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
4463
+ );
4464
+ logger5.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
4465
+ pointer: `0x${err.toString(16)}`,
4466
+ backend: this._backend
4467
+ });
4468
+ span?.endWithError(oomError);
4469
+ telemetry?.incrementCounter("omote.inference.total", 1, {
4470
+ model: "wav2arkit_cpu",
4471
+ backend: this._backend,
4472
+ status: "error"
4473
+ });
4474
+ reject(oomError);
4475
+ return;
4476
+ } else {
4477
+ logger5.error("Inference failed", { error: errMsg, backend: this._backend });
4478
+ }
4375
4479
  span?.endWithError(err instanceof Error ? err : new Error(String(err)));
4376
4480
  telemetry?.incrementCounter("omote.inference.total", 1, {
4377
4481
  model: "wav2arkit_cpu",
@@ -4393,6 +4497,8 @@ var Wav2ArkitCpuInference = class {
4393
4497
  }
4394
4498
  }
4395
4499
  };
4500
+ _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS = 5e3;
4501
+ var Wav2ArkitCpuInference = _Wav2ArkitCpuInference;
4396
4502
 
4397
4503
  // src/inference/createLipSync.ts
4398
4504
  var logger6 = createLogger("createLipSync");
@@ -4722,23 +4828,13 @@ var SileroVADInference = class {
4722
4828
  }
4723
4829
  return segments;
4724
4830
  }
4725
- /**
4726
- * Calculate RMS energy of audio chunk
4727
- */
4728
- calculateRMS(samples) {
4729
- let sum = 0;
4730
- for (let i = 0; i < samples.length; i++) {
4731
- sum += samples[i] * samples[i];
4732
- }
4733
- return Math.sqrt(sum / samples.length);
4734
- }
4735
4831
  /**
4736
4832
  * Queue inference to serialize ONNX session calls
4737
4833
  */
4738
4834
  queueInference(audioChunk) {
4739
4835
  const audioChunkCopy = new Float32Array(audioChunk);
4740
4836
  const MIN_ENERGY_THRESHOLD = 1e-3;
4741
- const rms = this.calculateRMS(audioChunkCopy);
4837
+ const rms = calculateRMS(audioChunkCopy);
4742
4838
  if (rms < MIN_ENERGY_THRESHOLD) {
4743
4839
  if (!this.wasSpeaking) {
4744
4840
  this.preSpeechBuffer.push(new Float32Array(audioChunkCopy));
@@ -4793,7 +4889,7 @@ var SileroVADInference = class {
4793
4889
  [2, 1, 128]
4794
4890
  );
4795
4891
  }
4796
- this.context = audioChunk.slice(-this.contextSize);
4892
+ this.context = audioChunkCopy.slice(-this.contextSize);
4797
4893
  const inferenceTimeMs = performance.now() - startTime;
4798
4894
  const isSpeech = probability > this.config.threshold;
4799
4895
  let preSpeechChunks;
@@ -4805,7 +4901,7 @@ var SileroVADInference = class {
4805
4901
  durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
4806
4902
  });
4807
4903
  } else if (!isSpeech && !this.wasSpeaking) {
4808
- this.preSpeechBuffer.push(new Float32Array(audioChunk));
4904
+ this.preSpeechBuffer.push(new Float32Array(audioChunkCopy));
4809
4905
  if (this.preSpeechBuffer.length > this.config.preSpeechBufferChunks) {
4810
4906
  this.preSpeechBuffer.shift();
4811
4907
  }
@@ -4840,13 +4936,30 @@ var SileroVADInference = class {
4840
4936
  preSpeechChunks
4841
4937
  });
4842
4938
  } catch (err) {
4843
- span?.endWithError(err instanceof Error ? err : new Error(String(err)));
4844
- telemetry?.incrementCounter("omote.inference.total", 1, {
4845
- model: "silero-vad",
4846
- backend: this._backend,
4847
- status: "error"
4848
- });
4849
- reject(err);
4939
+ if (typeof err === "number") {
4940
+ const oomError = new Error(
4941
+ `SileroVAD inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reducing concurrent model sessions or reloading the page.`
4942
+ );
4943
+ logger7.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
4944
+ pointer: `0x${err.toString(16)}`,
4945
+ backend: this._backend
4946
+ });
4947
+ span?.endWithError(oomError);
4948
+ telemetry?.incrementCounter("omote.inference.total", 1, {
4949
+ model: "silero-vad",
4950
+ backend: this._backend,
4951
+ status: "error"
4952
+ });
4953
+ reject(oomError);
4954
+ } else {
4955
+ span?.endWithError(err instanceof Error ? err : new Error(String(err)));
4956
+ telemetry?.incrementCounter("omote.inference.total", 1, {
4957
+ model: "silero-vad",
4958
+ backend: this._backend,
4959
+ status: "error"
4960
+ });
4961
+ reject(err);
4962
+ }
4850
4963
  }
4851
4964
  });
4852
4965
  });
@@ -6073,7 +6186,7 @@ var AgentCoreAdapter = class extends EventEmitter {
6073
6186
  console.error("[AgentCore] VAD error during interruption detection:", error);
6074
6187
  });
6075
6188
  }
6076
- const float32 = audio instanceof Float32Array ? audio : this.int16ToFloat32(audio);
6189
+ const float32 = audio instanceof Float32Array ? audio : int16ToFloat32(audio);
6077
6190
  this.audioBuffer.push(float32);
6078
6191
  this.scheduleTranscription();
6079
6192
  }
@@ -6405,7 +6518,7 @@ var AgentCoreAdapter = class extends EventEmitter {
6405
6518
  * Falls back to simple RMS if VAD not available
6406
6519
  */
6407
6520
  async detectVoiceActivity(audio) {
6408
- const float32 = audio instanceof Float32Array ? audio : this.int16ToFloat32(audio);
6521
+ const float32 = audio instanceof Float32Array ? audio : int16ToFloat32(audio);
6409
6522
  if (this.vad) {
6410
6523
  const chunkSize = this.vad.getChunkSize();
6411
6524
  for (let i = 0; i + chunkSize <= float32.length; i += chunkSize) {
@@ -6424,13 +6537,6 @@ var AgentCoreAdapter = class extends EventEmitter {
6424
6537
  const rms = Math.sqrt(sum / float32.length);
6425
6538
  return rms > 0.02;
6426
6539
  }
6427
- int16ToFloat32(int16) {
6428
- const float32 = new Float32Array(int16.length);
6429
- for (let i = 0; i < int16.length; i++) {
6430
- float32[i] = int16[i] / 32768;
6431
- }
6432
- return float32;
6433
- }
6434
6540
  base64ToArrayBuffer(base64) {
6435
6541
  const binaryString = atob(base64);
6436
6542
  const bytes = new Uint8Array(binaryString.length);