@omote/core 0.4.4 → 0.4.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +42 -33
- package/dist/index.d.ts +42 -33
- package/dist/index.js +156 -50
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +156 -50
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.mjs
CHANGED
|
@@ -312,7 +312,14 @@ var AudioScheduler = class {
|
|
|
312
312
|
source.connect(gainNode);
|
|
313
313
|
const scheduleTime = this.nextPlayTime;
|
|
314
314
|
source.start(scheduleTime);
|
|
315
|
-
|
|
315
|
+
const entry = { source, gainNode };
|
|
316
|
+
this.scheduledSources.push(entry);
|
|
317
|
+
source.onended = () => {
|
|
318
|
+
const idx = this.scheduledSources.indexOf(entry);
|
|
319
|
+
if (idx !== -1) {
|
|
320
|
+
this.scheduledSources.splice(idx, 1);
|
|
321
|
+
}
|
|
322
|
+
};
|
|
316
323
|
const duration = audioData.length / ctx.sampleRate;
|
|
317
324
|
this.nextPlayTime = scheduleTime + duration;
|
|
318
325
|
return scheduleTime;
|
|
@@ -668,7 +675,7 @@ var LAMPipeline = class {
|
|
|
668
675
|
}
|
|
669
676
|
};
|
|
670
677
|
|
|
671
|
-
// src/audio/
|
|
678
|
+
// src/audio/audioUtils.ts
|
|
672
679
|
function pcm16ToFloat32(buffer) {
|
|
673
680
|
const byteLen = buffer.byteLength & ~1;
|
|
674
681
|
const int16 = byteLen === buffer.byteLength ? new Int16Array(buffer) : new Int16Array(buffer, 0, byteLen / 2);
|
|
@@ -678,6 +685,15 @@ function pcm16ToFloat32(buffer) {
|
|
|
678
685
|
}
|
|
679
686
|
return float32;
|
|
680
687
|
}
|
|
688
|
+
function int16ToFloat32(int16) {
|
|
689
|
+
const float32 = new Float32Array(int16.length);
|
|
690
|
+
for (let i = 0; i < int16.length; i++) {
|
|
691
|
+
float32[i] = int16[i] / 32768;
|
|
692
|
+
}
|
|
693
|
+
return float32;
|
|
694
|
+
}
|
|
695
|
+
|
|
696
|
+
// src/audio/SyncedAudioPipeline.ts
|
|
681
697
|
var SyncedAudioPipeline = class extends EventEmitter {
|
|
682
698
|
constructor(options) {
|
|
683
699
|
super();
|
|
@@ -3115,14 +3131,18 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3115
3131
|
});
|
|
3116
3132
|
try {
|
|
3117
3133
|
const startTime = performance.now();
|
|
3134
|
+
let timeoutId;
|
|
3118
3135
|
const results = await Promise.race([
|
|
3119
|
-
this.session.run(feeds)
|
|
3120
|
-
|
|
3121
|
-
|
|
3136
|
+
this.session.run(feeds).then((r) => {
|
|
3137
|
+
clearTimeout(timeoutId);
|
|
3138
|
+
return r;
|
|
3139
|
+
}),
|
|
3140
|
+
new Promise((_, rej) => {
|
|
3141
|
+
timeoutId = setTimeout(
|
|
3122
3142
|
() => rej(new Error(`Wav2Vec2 inference timed out after ${_Wav2Vec2Inference.INFERENCE_TIMEOUT_MS}ms`)),
|
|
3123
3143
|
_Wav2Vec2Inference.INFERENCE_TIMEOUT_MS
|
|
3124
|
-
)
|
|
3125
|
-
)
|
|
3144
|
+
);
|
|
3145
|
+
})
|
|
3126
3146
|
]);
|
|
3127
3147
|
const inferenceTimeMs = performance.now() - startTime;
|
|
3128
3148
|
const asrOutput = results["asr_logits"];
|
|
@@ -3228,15 +3248,6 @@ var Wav2Vec2Inference = _Wav2Vec2Inference;
|
|
|
3228
3248
|
|
|
3229
3249
|
// src/audio/FullFacePipeline.ts
|
|
3230
3250
|
var logger3 = createLogger("FullFacePipeline");
|
|
3231
|
-
function pcm16ToFloat322(buffer) {
|
|
3232
|
-
const byteLen = buffer.byteLength & ~1;
|
|
3233
|
-
const int16 = byteLen === buffer.byteLength ? new Int16Array(buffer) : new Int16Array(buffer, 0, byteLen / 2);
|
|
3234
|
-
const float32 = new Float32Array(int16.length);
|
|
3235
|
-
for (let i = 0; i < int16.length; i++) {
|
|
3236
|
-
float32[i] = int16[i] / 32768;
|
|
3237
|
-
}
|
|
3238
|
-
return float32;
|
|
3239
|
-
}
|
|
3240
3251
|
var BLENDSHAPE_INDEX_MAP = /* @__PURE__ */ new Map();
|
|
3241
3252
|
LAM_BLENDSHAPES.forEach((name, index) => {
|
|
3242
3253
|
BLENDSHAPE_INDEX_MAP.set(name, index);
|
|
@@ -3386,7 +3397,7 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3386
3397
|
if (!combined) {
|
|
3387
3398
|
return;
|
|
3388
3399
|
}
|
|
3389
|
-
const float32 =
|
|
3400
|
+
const float32 = pcm16ToFloat32(combined);
|
|
3390
3401
|
const scheduleTime = await this.scheduler.schedule(float32);
|
|
3391
3402
|
if (!this.playbackStarted) {
|
|
3392
3403
|
this.playbackStarted = true;
|
|
@@ -3869,13 +3880,18 @@ function ctcGreedyDecode(logits, seqLen, vocabSize, tokenMap) {
|
|
|
3869
3880
|
|
|
3870
3881
|
// src/inference/SenseVoiceInference.ts
|
|
3871
3882
|
var logger4 = createLogger("SenseVoice");
|
|
3872
|
-
var
|
|
3883
|
+
var _SenseVoiceInference = class _SenseVoiceInference {
|
|
3873
3884
|
constructor(config) {
|
|
3874
3885
|
this.session = null;
|
|
3875
3886
|
this.ort = null;
|
|
3876
3887
|
this._backend = "wasm";
|
|
3877
3888
|
this.isLoading = false;
|
|
3878
3889
|
this.inferenceQueue = Promise.resolve();
|
|
3890
|
+
// Session health: set to true if session.run() times out.
|
|
3891
|
+
// A timed-out session may have a zombie WASM dispatch still running,
|
|
3892
|
+
// so all future transcribe() calls reject immediately to prevent concurrent access.
|
|
3893
|
+
this.poisoned = false;
|
|
3894
|
+
// 10s for SenseVoice (heavier preprocessing)
|
|
3879
3895
|
// Preprocessing state (loaded once)
|
|
3880
3896
|
this.tokenMap = null;
|
|
3881
3897
|
this.negMean = null;
|
|
@@ -4023,6 +4039,9 @@ var SenseVoiceInference = class {
|
|
|
4023
4039
|
if (!this.session || !this.ort || !this.tokenMap) {
|
|
4024
4040
|
throw new Error("Model not loaded. Call load() first.");
|
|
4025
4041
|
}
|
|
4042
|
+
if (this.poisoned) {
|
|
4043
|
+
throw new Error("SenseVoice session timed out \u2014 inference unavailable until page reload");
|
|
4044
|
+
}
|
|
4026
4045
|
const audio = new Float32Array(audioSamples);
|
|
4027
4046
|
return this.queueInference(audio);
|
|
4028
4047
|
}
|
|
@@ -4060,7 +4079,19 @@ var SenseVoiceInference = class {
|
|
|
4060
4079
|
language: new ort.Tensor("int32", new Int32Array([this.languageId]), [1]),
|
|
4061
4080
|
text_norm: new ort.Tensor("int32", new Int32Array([this.textNormId]), [1])
|
|
4062
4081
|
};
|
|
4063
|
-
|
|
4082
|
+
let timeoutId;
|
|
4083
|
+
const results = await Promise.race([
|
|
4084
|
+
this.session.run(feeds).then((r) => {
|
|
4085
|
+
clearTimeout(timeoutId);
|
|
4086
|
+
return r;
|
|
4087
|
+
}),
|
|
4088
|
+
new Promise((_, rej) => {
|
|
4089
|
+
timeoutId = setTimeout(
|
|
4090
|
+
() => rej(new Error(`SenseVoice inference timed out after ${_SenseVoiceInference.INFERENCE_TIMEOUT_MS}ms`)),
|
|
4091
|
+
_SenseVoiceInference.INFERENCE_TIMEOUT_MS
|
|
4092
|
+
);
|
|
4093
|
+
})
|
|
4094
|
+
]);
|
|
4064
4095
|
const logitsOutput = results["logits"];
|
|
4065
4096
|
if (!logitsOutput) {
|
|
4066
4097
|
throw new Error('Model output missing "logits" tensor');
|
|
@@ -4106,6 +4137,32 @@ var SenseVoiceInference = class {
|
|
|
4106
4137
|
preprocessTimeMs
|
|
4107
4138
|
});
|
|
4108
4139
|
} catch (err) {
|
|
4140
|
+
const errMsg = err instanceof Error ? err.message : String(err);
|
|
4141
|
+
if (errMsg.includes("timed out")) {
|
|
4142
|
+
this.poisoned = true;
|
|
4143
|
+
logger4.error("CRITICAL: Inference session timed out \u2014 SenseVoice is dead. Page reload required.", {
|
|
4144
|
+
backend: this._backend,
|
|
4145
|
+
timeoutMs: _SenseVoiceInference.INFERENCE_TIMEOUT_MS
|
|
4146
|
+
});
|
|
4147
|
+
} else if (typeof err === "number") {
|
|
4148
|
+
const oomError = new Error(
|
|
4149
|
+
`SenseVoice inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
|
|
4150
|
+
);
|
|
4151
|
+
logger4.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
4152
|
+
pointer: `0x${err.toString(16)}`,
|
|
4153
|
+
backend: this._backend
|
|
4154
|
+
});
|
|
4155
|
+
span?.endWithError(oomError);
|
|
4156
|
+
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
4157
|
+
model: "sensevoice",
|
|
4158
|
+
backend: this._backend,
|
|
4159
|
+
status: "error"
|
|
4160
|
+
});
|
|
4161
|
+
reject(oomError);
|
|
4162
|
+
return;
|
|
4163
|
+
} else {
|
|
4164
|
+
logger4.error("Inference failed", { error: errMsg, backend: this._backend });
|
|
4165
|
+
}
|
|
4109
4166
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
4110
4167
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
4111
4168
|
model: "sensevoice",
|
|
@@ -4129,10 +4186,12 @@ var SenseVoiceInference = class {
|
|
|
4129
4186
|
this.invStddev = null;
|
|
4130
4187
|
}
|
|
4131
4188
|
};
|
|
4189
|
+
_SenseVoiceInference.INFERENCE_TIMEOUT_MS = 1e4;
|
|
4190
|
+
var SenseVoiceInference = _SenseVoiceInference;
|
|
4132
4191
|
|
|
4133
4192
|
// src/inference/Wav2ArkitCpuInference.ts
|
|
4134
4193
|
var logger5 = createLogger("Wav2ArkitCpu");
|
|
4135
|
-
var
|
|
4194
|
+
var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
4136
4195
|
constructor(config) {
|
|
4137
4196
|
this.modelId = "wav2arkit_cpu";
|
|
4138
4197
|
this.session = null;
|
|
@@ -4141,6 +4200,10 @@ var Wav2ArkitCpuInference = class {
|
|
|
4141
4200
|
this.isLoading = false;
|
|
4142
4201
|
// Inference queue for handling concurrent calls
|
|
4143
4202
|
this.inferenceQueue = Promise.resolve();
|
|
4203
|
+
// Session health: set to true if session.run() times out.
|
|
4204
|
+
// A timed-out session may have a zombie WASM dispatch still running,
|
|
4205
|
+
// so all future infer() calls reject immediately to prevent concurrent access.
|
|
4206
|
+
this.poisoned = false;
|
|
4144
4207
|
this.config = config;
|
|
4145
4208
|
}
|
|
4146
4209
|
get backend() {
|
|
@@ -4313,6 +4376,9 @@ var Wav2ArkitCpuInference = class {
|
|
|
4313
4376
|
if (!this.session) {
|
|
4314
4377
|
throw new Error("Model not loaded. Call load() first.");
|
|
4315
4378
|
}
|
|
4379
|
+
if (this.poisoned) {
|
|
4380
|
+
throw new Error("Wav2ArkitCpu session timed out \u2014 inference unavailable until page reload");
|
|
4381
|
+
}
|
|
4316
4382
|
const audioCopy = new Float32Array(audioSamples);
|
|
4317
4383
|
const feeds = {
|
|
4318
4384
|
"audio_waveform": new this.ort.Tensor("float32", audioCopy, [1, audioCopy.length])
|
|
@@ -4332,7 +4398,19 @@ var Wav2ArkitCpuInference = class {
|
|
|
4332
4398
|
});
|
|
4333
4399
|
try {
|
|
4334
4400
|
const startTime = performance.now();
|
|
4335
|
-
|
|
4401
|
+
let timeoutId;
|
|
4402
|
+
const results = await Promise.race([
|
|
4403
|
+
this.session.run(feeds).then((r) => {
|
|
4404
|
+
clearTimeout(timeoutId);
|
|
4405
|
+
return r;
|
|
4406
|
+
}),
|
|
4407
|
+
new Promise((_, rej) => {
|
|
4408
|
+
timeoutId = setTimeout(
|
|
4409
|
+
() => rej(new Error(`Wav2ArkitCpu inference timed out after ${_Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS}ms`)),
|
|
4410
|
+
_Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS
|
|
4411
|
+
);
|
|
4412
|
+
})
|
|
4413
|
+
]);
|
|
4336
4414
|
const inferenceTimeMs = performance.now() - startTime;
|
|
4337
4415
|
const blendshapeOutput = results["blendshapes"];
|
|
4338
4416
|
if (!blendshapeOutput) {
|
|
@@ -4372,6 +4450,32 @@ var Wav2ArkitCpuInference = class {
|
|
|
4372
4450
|
inferenceTimeMs
|
|
4373
4451
|
});
|
|
4374
4452
|
} catch (err) {
|
|
4453
|
+
const errMsg = err instanceof Error ? err.message : String(err);
|
|
4454
|
+
if (errMsg.includes("timed out")) {
|
|
4455
|
+
this.poisoned = true;
|
|
4456
|
+
logger5.error("CRITICAL: Inference session timed out \u2014 Wav2ArkitCpu is dead. Page reload required.", {
|
|
4457
|
+
backend: this._backend,
|
|
4458
|
+
timeoutMs: _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS
|
|
4459
|
+
});
|
|
4460
|
+
} else if (typeof err === "number") {
|
|
4461
|
+
const oomError = new Error(
|
|
4462
|
+
`Wav2ArkitCpu inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
|
|
4463
|
+
);
|
|
4464
|
+
logger5.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
4465
|
+
pointer: `0x${err.toString(16)}`,
|
|
4466
|
+
backend: this._backend
|
|
4467
|
+
});
|
|
4468
|
+
span?.endWithError(oomError);
|
|
4469
|
+
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
4470
|
+
model: "wav2arkit_cpu",
|
|
4471
|
+
backend: this._backend,
|
|
4472
|
+
status: "error"
|
|
4473
|
+
});
|
|
4474
|
+
reject(oomError);
|
|
4475
|
+
return;
|
|
4476
|
+
} else {
|
|
4477
|
+
logger5.error("Inference failed", { error: errMsg, backend: this._backend });
|
|
4478
|
+
}
|
|
4375
4479
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
4376
4480
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
4377
4481
|
model: "wav2arkit_cpu",
|
|
@@ -4393,6 +4497,8 @@ var Wav2ArkitCpuInference = class {
|
|
|
4393
4497
|
}
|
|
4394
4498
|
}
|
|
4395
4499
|
};
|
|
4500
|
+
_Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS = 5e3;
|
|
4501
|
+
var Wav2ArkitCpuInference = _Wav2ArkitCpuInference;
|
|
4396
4502
|
|
|
4397
4503
|
// src/inference/createLipSync.ts
|
|
4398
4504
|
var logger6 = createLogger("createLipSync");
|
|
@@ -4722,23 +4828,13 @@ var SileroVADInference = class {
|
|
|
4722
4828
|
}
|
|
4723
4829
|
return segments;
|
|
4724
4830
|
}
|
|
4725
|
-
/**
|
|
4726
|
-
* Calculate RMS energy of audio chunk
|
|
4727
|
-
*/
|
|
4728
|
-
calculateRMS(samples) {
|
|
4729
|
-
let sum = 0;
|
|
4730
|
-
for (let i = 0; i < samples.length; i++) {
|
|
4731
|
-
sum += samples[i] * samples[i];
|
|
4732
|
-
}
|
|
4733
|
-
return Math.sqrt(sum / samples.length);
|
|
4734
|
-
}
|
|
4735
4831
|
/**
|
|
4736
4832
|
* Queue inference to serialize ONNX session calls
|
|
4737
4833
|
*/
|
|
4738
4834
|
queueInference(audioChunk) {
|
|
4739
4835
|
const audioChunkCopy = new Float32Array(audioChunk);
|
|
4740
4836
|
const MIN_ENERGY_THRESHOLD = 1e-3;
|
|
4741
|
-
const rms =
|
|
4837
|
+
const rms = calculateRMS(audioChunkCopy);
|
|
4742
4838
|
if (rms < MIN_ENERGY_THRESHOLD) {
|
|
4743
4839
|
if (!this.wasSpeaking) {
|
|
4744
4840
|
this.preSpeechBuffer.push(new Float32Array(audioChunkCopy));
|
|
@@ -4793,7 +4889,7 @@ var SileroVADInference = class {
|
|
|
4793
4889
|
[2, 1, 128]
|
|
4794
4890
|
);
|
|
4795
4891
|
}
|
|
4796
|
-
this.context =
|
|
4892
|
+
this.context = audioChunkCopy.slice(-this.contextSize);
|
|
4797
4893
|
const inferenceTimeMs = performance.now() - startTime;
|
|
4798
4894
|
const isSpeech = probability > this.config.threshold;
|
|
4799
4895
|
let preSpeechChunks;
|
|
@@ -4805,7 +4901,7 @@ var SileroVADInference = class {
|
|
|
4805
4901
|
durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
|
|
4806
4902
|
});
|
|
4807
4903
|
} else if (!isSpeech && !this.wasSpeaking) {
|
|
4808
|
-
this.preSpeechBuffer.push(new Float32Array(
|
|
4904
|
+
this.preSpeechBuffer.push(new Float32Array(audioChunkCopy));
|
|
4809
4905
|
if (this.preSpeechBuffer.length > this.config.preSpeechBufferChunks) {
|
|
4810
4906
|
this.preSpeechBuffer.shift();
|
|
4811
4907
|
}
|
|
@@ -4840,13 +4936,30 @@ var SileroVADInference = class {
|
|
|
4840
4936
|
preSpeechChunks
|
|
4841
4937
|
});
|
|
4842
4938
|
} catch (err) {
|
|
4843
|
-
|
|
4844
|
-
|
|
4845
|
-
|
|
4846
|
-
|
|
4847
|
-
|
|
4848
|
-
|
|
4849
|
-
|
|
4939
|
+
if (typeof err === "number") {
|
|
4940
|
+
const oomError = new Error(
|
|
4941
|
+
`SileroVAD inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reducing concurrent model sessions or reloading the page.`
|
|
4942
|
+
);
|
|
4943
|
+
logger7.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
4944
|
+
pointer: `0x${err.toString(16)}`,
|
|
4945
|
+
backend: this._backend
|
|
4946
|
+
});
|
|
4947
|
+
span?.endWithError(oomError);
|
|
4948
|
+
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
4949
|
+
model: "silero-vad",
|
|
4950
|
+
backend: this._backend,
|
|
4951
|
+
status: "error"
|
|
4952
|
+
});
|
|
4953
|
+
reject(oomError);
|
|
4954
|
+
} else {
|
|
4955
|
+
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
4956
|
+
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
4957
|
+
model: "silero-vad",
|
|
4958
|
+
backend: this._backend,
|
|
4959
|
+
status: "error"
|
|
4960
|
+
});
|
|
4961
|
+
reject(err);
|
|
4962
|
+
}
|
|
4850
4963
|
}
|
|
4851
4964
|
});
|
|
4852
4965
|
});
|
|
@@ -6073,7 +6186,7 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
6073
6186
|
console.error("[AgentCore] VAD error during interruption detection:", error);
|
|
6074
6187
|
});
|
|
6075
6188
|
}
|
|
6076
|
-
const float32 = audio instanceof Float32Array ? audio :
|
|
6189
|
+
const float32 = audio instanceof Float32Array ? audio : int16ToFloat32(audio);
|
|
6077
6190
|
this.audioBuffer.push(float32);
|
|
6078
6191
|
this.scheduleTranscription();
|
|
6079
6192
|
}
|
|
@@ -6405,7 +6518,7 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
6405
6518
|
* Falls back to simple RMS if VAD not available
|
|
6406
6519
|
*/
|
|
6407
6520
|
async detectVoiceActivity(audio) {
|
|
6408
|
-
const float32 = audio instanceof Float32Array ? audio :
|
|
6521
|
+
const float32 = audio instanceof Float32Array ? audio : int16ToFloat32(audio);
|
|
6409
6522
|
if (this.vad) {
|
|
6410
6523
|
const chunkSize = this.vad.getChunkSize();
|
|
6411
6524
|
for (let i = 0; i + chunkSize <= float32.length; i += chunkSize) {
|
|
@@ -6424,13 +6537,6 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
6424
6537
|
const rms = Math.sqrt(sum / float32.length);
|
|
6425
6538
|
return rms > 0.02;
|
|
6426
6539
|
}
|
|
6427
|
-
int16ToFloat32(int16) {
|
|
6428
|
-
const float32 = new Float32Array(int16.length);
|
|
6429
|
-
for (let i = 0; i < int16.length; i++) {
|
|
6430
|
-
float32[i] = int16[i] / 32768;
|
|
6431
|
-
}
|
|
6432
|
-
return float32;
|
|
6433
|
-
}
|
|
6434
6540
|
base64ToArrayBuffer(base64) {
|
|
6435
6541
|
const binaryString = atob(base64);
|
|
6436
6542
|
const bytes = new Uint8Array(binaryString.length);
|