@omote/core 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +5 -3
- package/dist/index.d.ts +5 -3
- package/dist/index.js +32 -4
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +32 -4
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.d.mts
CHANGED
|
@@ -366,12 +366,13 @@ declare function isSafari(): boolean;
|
|
|
366
366
|
/**
|
|
367
367
|
* Recommend using CPU-optimized lip sync model (wav2arkit_cpu)
|
|
368
368
|
*
|
|
369
|
-
* Safari
|
|
370
|
-
*
|
|
369
|
+
* All WebKit browsers (Safari macOS, Safari iOS, Chrome iOS, Firefox iOS)
|
|
370
|
+
* have ONNX Runtime WebGPU JSEP bugs that crash session creation, and the
|
|
371
|
+
* 384MB LAM model stack-overflows in WASM mode.
|
|
371
372
|
* The wav2arkit_cpu model (1.8MB) provides identical 52 ARKit blendshape
|
|
372
373
|
* output at 22x real-time on CPU/WASM.
|
|
373
374
|
*
|
|
374
|
-
* @returns true if on Safari (should use CPU lip sync model)
|
|
375
|
+
* @returns true if on Safari or any iOS browser (should use CPU lip sync model)
|
|
375
376
|
*/
|
|
376
377
|
declare function shouldUseCpuLipSync(): boolean;
|
|
377
378
|
/**
|
|
@@ -1366,6 +1367,7 @@ declare class SileroVADInference {
|
|
|
1366
1367
|
private inferenceQueue;
|
|
1367
1368
|
private preSpeechBuffer;
|
|
1368
1369
|
private wasSpeaking;
|
|
1370
|
+
private srTensor;
|
|
1369
1371
|
constructor(config: SileroVADConfig);
|
|
1370
1372
|
get backend(): RuntimeBackend | null;
|
|
1371
1373
|
get isLoaded(): boolean;
|
package/dist/index.d.ts
CHANGED
|
@@ -366,12 +366,13 @@ declare function isSafari(): boolean;
|
|
|
366
366
|
/**
|
|
367
367
|
* Recommend using CPU-optimized lip sync model (wav2arkit_cpu)
|
|
368
368
|
*
|
|
369
|
-
* Safari
|
|
370
|
-
*
|
|
369
|
+
* All WebKit browsers (Safari macOS, Safari iOS, Chrome iOS, Firefox iOS)
|
|
370
|
+
* have ONNX Runtime WebGPU JSEP bugs that crash session creation, and the
|
|
371
|
+
* 384MB LAM model stack-overflows in WASM mode.
|
|
371
372
|
* The wav2arkit_cpu model (1.8MB) provides identical 52 ARKit blendshape
|
|
372
373
|
* output at 22x real-time on CPU/WASM.
|
|
373
374
|
*
|
|
374
|
-
* @returns true if on Safari (should use CPU lip sync model)
|
|
375
|
+
* @returns true if on Safari or any iOS browser (should use CPU lip sync model)
|
|
375
376
|
*/
|
|
376
377
|
declare function shouldUseCpuLipSync(): boolean;
|
|
377
378
|
/**
|
|
@@ -1366,6 +1367,7 @@ declare class SileroVADInference {
|
|
|
1366
1367
|
private inferenceQueue;
|
|
1367
1368
|
private preSpeechBuffer;
|
|
1368
1369
|
private wasSpeaking;
|
|
1370
|
+
private srTensor;
|
|
1369
1371
|
constructor(config: SileroVADConfig);
|
|
1370
1372
|
get backend(): RuntimeBackend | null;
|
|
1371
1373
|
get isLoaded(): boolean;
|
package/dist/index.js
CHANGED
|
@@ -28970,7 +28970,7 @@ function isSafari() {
|
|
|
28970
28970
|
return /safari/.test(ua) && !/chrome|crios|fxios|chromium|edg/.test(ua);
|
|
28971
28971
|
}
|
|
28972
28972
|
function shouldUseCpuLipSync() {
|
|
28973
|
-
return isSafari();
|
|
28973
|
+
return isSafari() || isIOS();
|
|
28974
28974
|
}
|
|
28975
28975
|
function isSpeechRecognitionAvailable() {
|
|
28976
28976
|
if (typeof window === "undefined") return false;
|
|
@@ -30138,7 +30138,7 @@ function createLipSync(config) {
|
|
|
30138
30138
|
useCpu = false;
|
|
30139
30139
|
logger6.info("Forcing GPU lip sync model (Wav2Vec2)");
|
|
30140
30140
|
} else {
|
|
30141
|
-
useCpu =
|
|
30141
|
+
useCpu = shouldUseCpuLipSync();
|
|
30142
30142
|
logger6.info("Auto-detected lip sync model", {
|
|
30143
30143
|
useCpu,
|
|
30144
30144
|
isSafari: isSafari()
|
|
@@ -30216,6 +30216,8 @@ var SileroVADInference = class {
|
|
|
30216
30216
|
// Pre-speech buffer for capturing beginning of speech
|
|
30217
30217
|
this.preSpeechBuffer = [];
|
|
30218
30218
|
this.wasSpeaking = false;
|
|
30219
|
+
// Cached sample rate tensor (int64 scalar, never changes per instance)
|
|
30220
|
+
this.srTensor = null;
|
|
30219
30221
|
const sampleRate = config.sampleRate ?? 16e3;
|
|
30220
30222
|
if (sampleRate !== 8e3 && sampleRate !== 16e3) {
|
|
30221
30223
|
throw new Error("Silero VAD only supports 8000 or 16000 Hz sample rates");
|
|
@@ -30346,6 +30348,24 @@ var SileroVADInference = class {
|
|
|
30346
30348
|
this.context = new Float32Array(this.contextSize);
|
|
30347
30349
|
this.preSpeechBuffer = [];
|
|
30348
30350
|
this.wasSpeaking = false;
|
|
30351
|
+
if (!this.srTensor) {
|
|
30352
|
+
try {
|
|
30353
|
+
this.srTensor = new this.ort.Tensor(
|
|
30354
|
+
"int64",
|
|
30355
|
+
new BigInt64Array([BigInt(this.config.sampleRate)]),
|
|
30356
|
+
[]
|
|
30357
|
+
);
|
|
30358
|
+
} catch (e) {
|
|
30359
|
+
logger7.warn("BigInt64Array not available, using bigint array fallback", {
|
|
30360
|
+
error: e instanceof Error ? e.message : String(e)
|
|
30361
|
+
});
|
|
30362
|
+
this.srTensor = new this.ort.Tensor(
|
|
30363
|
+
"int64",
|
|
30364
|
+
[BigInt(this.config.sampleRate)],
|
|
30365
|
+
[]
|
|
30366
|
+
);
|
|
30367
|
+
}
|
|
30368
|
+
}
|
|
30349
30369
|
}
|
|
30350
30370
|
/**
|
|
30351
30371
|
* Process a single audio chunk
|
|
@@ -30477,7 +30497,7 @@ var SileroVADInference = class {
|
|
|
30477
30497
|
inputBuffer.set(audioChunkCopy, this.contextSize);
|
|
30478
30498
|
const inputBufferCopy = new Float32Array(inputBuffer);
|
|
30479
30499
|
const inputTensor = new this.ort.Tensor("float32", inputBufferCopy, [1, inputSize]);
|
|
30480
|
-
const srTensor =
|
|
30500
|
+
const srTensor = this.srTensor;
|
|
30481
30501
|
const stateCopy = new Float32Array(this.state.data);
|
|
30482
30502
|
const stateTensor = new this.ort.Tensor("float32", stateCopy, this.state.dims);
|
|
30483
30503
|
const feeds = {
|
|
@@ -30566,6 +30586,7 @@ var SileroVADInference = class {
|
|
|
30566
30586
|
this.session = null;
|
|
30567
30587
|
}
|
|
30568
30588
|
this.state = null;
|
|
30589
|
+
this.srTensor = null;
|
|
30569
30590
|
}
|
|
30570
30591
|
};
|
|
30571
30592
|
/**
|
|
@@ -30670,7 +30691,14 @@ async function runInference(audio, state, context) {
|
|
|
30670
30691
|
// Create tensors
|
|
30671
30692
|
const inputTensor = new ort.Tensor('float32', new Float32Array(inputBuffer), [1, inputSize]);
|
|
30672
30693
|
const stateTensor = new ort.Tensor('float32', new Float32Array(state), [2, 1, 128]);
|
|
30673
|
-
|
|
30694
|
+
// Use BigInt64Array constructor (not .from()) for broader compatibility
|
|
30695
|
+
let srTensor;
|
|
30696
|
+
try {
|
|
30697
|
+
srTensor = new ort.Tensor('int64', new BigInt64Array([BigInt(sampleRate)]), []);
|
|
30698
|
+
} catch (e) {
|
|
30699
|
+
// Fallback for environments without BigInt64Array support
|
|
30700
|
+
srTensor = new ort.Tensor('int64', [BigInt(sampleRate)], []);
|
|
30701
|
+
}
|
|
30674
30702
|
|
|
30675
30703
|
const feeds = {
|
|
30676
30704
|
'input': inputTensor,
|