@omote/core 0.2.3 → 0.3.0
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/dist/chunk-6W7G6WE7.mjs +13 -0
- package/dist/chunk-6W7G6WE7.mjs.map +1 -0
- package/dist/chunk-T465MTDX.mjs +38869 -0
- package/dist/chunk-T465MTDX.mjs.map +1 -0
- package/dist/events/index.mjs +1 -1
- package/dist/index.d.mts +32 -12
- package/dist/index.d.ts +32 -12
- package/dist/index.js +38188 -25599
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +152 -107
- package/dist/index.mjs.map +1 -1
- package/dist/logging/index.mjs +1 -1
- package/dist/transformers.web-MHLR33H6.mjs +1718 -0
- package/dist/transformers.web-MHLR33H6.mjs.map +1 -0
- package/package.json +3 -2
package/dist/events/index.mjs
CHANGED
package/dist/index.d.mts
CHANGED
|
@@ -97,6 +97,13 @@ interface AudioSchedulerOptions {
|
|
|
97
97
|
sampleRate?: number;
|
|
98
98
|
/** Number of audio channels (default: 1 for mono) */
|
|
99
99
|
channels?: number;
|
|
100
|
+
/**
|
|
101
|
+
* Delay before first audio chunk plays (seconds).
|
|
102
|
+
* Gives slow inference backends (WASM) a head start so lip sync
|
|
103
|
+
* frames are ready by the time audio reaches the listener.
|
|
104
|
+
* Default: 0.05 (50ms — just enough to enqueue the first node)
|
|
105
|
+
*/
|
|
106
|
+
initialDelayS?: number;
|
|
100
107
|
}
|
|
101
108
|
declare class AudioScheduler {
|
|
102
109
|
private readonly options;
|
|
@@ -454,6 +461,12 @@ interface LipSyncBackend {
|
|
|
454
461
|
readonly backend: RuntimeBackend | null;
|
|
455
462
|
/** Whether the model is loaded and ready for inference */
|
|
456
463
|
readonly isLoaded: boolean;
|
|
464
|
+
/**
|
|
465
|
+
* Preferred number of audio samples per inference chunk.
|
|
466
|
+
* Models with variable-length input can use smaller values for lower latency.
|
|
467
|
+
* Default (if undefined): 16000 (1.0s at 16kHz, required by Wav2Vec2).
|
|
468
|
+
*/
|
|
469
|
+
readonly chunkSamples?: number;
|
|
457
470
|
/**
|
|
458
471
|
* Load the ONNX model
|
|
459
472
|
* @returns Model loading information
|
|
@@ -516,7 +529,7 @@ interface LAMPipelineOptions {
|
|
|
516
529
|
}
|
|
517
530
|
declare class LAMPipeline {
|
|
518
531
|
private readonly options;
|
|
519
|
-
private readonly
|
|
532
|
+
private readonly DEFAULT_CHUNK_SAMPLES;
|
|
520
533
|
private readonly FRAME_RATE;
|
|
521
534
|
private buffer;
|
|
522
535
|
private bufferStartTime;
|
|
@@ -545,15 +558,13 @@ declare class LAMPipeline {
|
|
|
545
558
|
/**
|
|
546
559
|
* Get the frame that should be displayed at the current time
|
|
547
560
|
*
|
|
548
|
-
*
|
|
549
|
-
*
|
|
561
|
+
* Timestamp-synced playback for all backends. Audio playback is delayed
|
|
562
|
+
* for slow backends (WASM gets 1s head start via AudioScheduler) so
|
|
563
|
+
* frames are ready by the time their corresponding audio plays.
|
|
550
564
|
*
|
|
551
|
-
* Discard
|
|
552
|
-
*
|
|
553
|
-
*
|
|
554
|
-
*
|
|
555
|
-
* Last-Frame-Hold: Returns last valid frame instead of null to prevent
|
|
556
|
-
* avatar freezing when between frames (RAF at 60fps vs LAM at 30fps).
|
|
565
|
+
* Discard window is generous for WASM to handle inference jitter.
|
|
566
|
+
* Late frames play at RAF rate (~60fps) until caught up, then settle
|
|
567
|
+
* to natural 30fps pacing via timestamp gating.
|
|
557
568
|
*
|
|
558
569
|
* @param currentTime - Current AudioContext time
|
|
559
570
|
* @param lam - LAM inference engine (optional, for backend detection)
|
|
@@ -581,7 +592,7 @@ declare class LAMPipeline {
|
|
|
581
592
|
/**
|
|
582
593
|
* Flush remaining buffered audio
|
|
583
594
|
*
|
|
584
|
-
* Processes any remaining audio in the buffer, even if less than
|
|
595
|
+
* Processes any remaining audio in the buffer, even if less than the chunk size.
|
|
585
596
|
* This ensures the final audio chunk generates blendshape frames.
|
|
586
597
|
*
|
|
587
598
|
* Should be called when audio stream ends to prevent losing the last 0-1 seconds.
|
|
@@ -1133,8 +1144,10 @@ declare class Wav2Vec2Inference {
|
|
|
1133
1144
|
*/
|
|
1134
1145
|
|
|
1135
1146
|
interface Wav2ArkitCpuConfig {
|
|
1136
|
-
/** Path or URL to the wav2arkit_cpu ONNX model */
|
|
1147
|
+
/** Path or URL to the wav2arkit_cpu ONNX model (.onnx graph file) */
|
|
1137
1148
|
modelUrl: string;
|
|
1149
|
+
/** Path or URL to the external data file (.onnx.data weights file) */
|
|
1150
|
+
modelDataUrl?: string;
|
|
1138
1151
|
/** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
|
|
1139
1152
|
backend?: BackendPreference;
|
|
1140
1153
|
}
|
|
@@ -1148,6 +1161,12 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
|
|
|
1148
1161
|
constructor(config: Wav2ArkitCpuConfig);
|
|
1149
1162
|
get backend(): RuntimeBackend | null;
|
|
1150
1163
|
get isLoaded(): boolean;
|
|
1164
|
+
/**
|
|
1165
|
+
* Preferred chunk size: 4000 samples (250ms at 16kHz).
|
|
1166
|
+
* wav2arkit_cpu accepts variable-length input, so we use smaller chunks
|
|
1167
|
+
* for lower latency on WASM (vs 16000 for Wav2Vec2's fixed requirement).
|
|
1168
|
+
*/
|
|
1169
|
+
readonly chunkSamples = 4000;
|
|
1151
1170
|
/**
|
|
1152
1171
|
* Load the ONNX model
|
|
1153
1172
|
*/
|
|
@@ -1213,6 +1232,8 @@ interface CreateLipSyncConfig {
|
|
|
1213
1232
|
gpuModelUrl: string;
|
|
1214
1233
|
/** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS) */
|
|
1215
1234
|
cpuModelUrl: string;
|
|
1235
|
+
/** URL for the CPU model's external data file (.onnx.data weights) */
|
|
1236
|
+
cpuModelDataUrl?: string;
|
|
1216
1237
|
/**
|
|
1217
1238
|
* Model selection mode:
|
|
1218
1239
|
* - 'auto': Safari/iOS → CPU, everything else → GPU (default)
|
|
@@ -1367,7 +1388,6 @@ declare class SileroVADInference {
|
|
|
1367
1388
|
private inferenceQueue;
|
|
1368
1389
|
private preSpeechBuffer;
|
|
1369
1390
|
private wasSpeaking;
|
|
1370
|
-
private srTensor;
|
|
1371
1391
|
constructor(config: SileroVADConfig);
|
|
1372
1392
|
get backend(): RuntimeBackend | null;
|
|
1373
1393
|
get isLoaded(): boolean;
|
package/dist/index.d.ts
CHANGED
|
@@ -97,6 +97,13 @@ interface AudioSchedulerOptions {
|
|
|
97
97
|
sampleRate?: number;
|
|
98
98
|
/** Number of audio channels (default: 1 for mono) */
|
|
99
99
|
channels?: number;
|
|
100
|
+
/**
|
|
101
|
+
* Delay before first audio chunk plays (seconds).
|
|
102
|
+
* Gives slow inference backends (WASM) a head start so lip sync
|
|
103
|
+
* frames are ready by the time audio reaches the listener.
|
|
104
|
+
* Default: 0.05 (50ms — just enough to enqueue the first node)
|
|
105
|
+
*/
|
|
106
|
+
initialDelayS?: number;
|
|
100
107
|
}
|
|
101
108
|
declare class AudioScheduler {
|
|
102
109
|
private readonly options;
|
|
@@ -454,6 +461,12 @@ interface LipSyncBackend {
|
|
|
454
461
|
readonly backend: RuntimeBackend | null;
|
|
455
462
|
/** Whether the model is loaded and ready for inference */
|
|
456
463
|
readonly isLoaded: boolean;
|
|
464
|
+
/**
|
|
465
|
+
* Preferred number of audio samples per inference chunk.
|
|
466
|
+
* Models with variable-length input can use smaller values for lower latency.
|
|
467
|
+
* Default (if undefined): 16000 (1.0s at 16kHz, required by Wav2Vec2).
|
|
468
|
+
*/
|
|
469
|
+
readonly chunkSamples?: number;
|
|
457
470
|
/**
|
|
458
471
|
* Load the ONNX model
|
|
459
472
|
* @returns Model loading information
|
|
@@ -516,7 +529,7 @@ interface LAMPipelineOptions {
|
|
|
516
529
|
}
|
|
517
530
|
declare class LAMPipeline {
|
|
518
531
|
private readonly options;
|
|
519
|
-
private readonly
|
|
532
|
+
private readonly DEFAULT_CHUNK_SAMPLES;
|
|
520
533
|
private readonly FRAME_RATE;
|
|
521
534
|
private buffer;
|
|
522
535
|
private bufferStartTime;
|
|
@@ -545,15 +558,13 @@ declare class LAMPipeline {
|
|
|
545
558
|
/**
|
|
546
559
|
* Get the frame that should be displayed at the current time
|
|
547
560
|
*
|
|
548
|
-
*
|
|
549
|
-
*
|
|
561
|
+
* Timestamp-synced playback for all backends. Audio playback is delayed
|
|
562
|
+
* for slow backends (WASM gets 1s head start via AudioScheduler) so
|
|
563
|
+
* frames are ready by the time their corresponding audio plays.
|
|
550
564
|
*
|
|
551
|
-
* Discard
|
|
552
|
-
*
|
|
553
|
-
*
|
|
554
|
-
*
|
|
555
|
-
* Last-Frame-Hold: Returns last valid frame instead of null to prevent
|
|
556
|
-
* avatar freezing when between frames (RAF at 60fps vs LAM at 30fps).
|
|
565
|
+
* Discard window is generous for WASM to handle inference jitter.
|
|
566
|
+
* Late frames play at RAF rate (~60fps) until caught up, then settle
|
|
567
|
+
* to natural 30fps pacing via timestamp gating.
|
|
557
568
|
*
|
|
558
569
|
* @param currentTime - Current AudioContext time
|
|
559
570
|
* @param lam - LAM inference engine (optional, for backend detection)
|
|
@@ -581,7 +592,7 @@ declare class LAMPipeline {
|
|
|
581
592
|
/**
|
|
582
593
|
* Flush remaining buffered audio
|
|
583
594
|
*
|
|
584
|
-
* Processes any remaining audio in the buffer, even if less than
|
|
595
|
+
* Processes any remaining audio in the buffer, even if less than the chunk size.
|
|
585
596
|
* This ensures the final audio chunk generates blendshape frames.
|
|
586
597
|
*
|
|
587
598
|
* Should be called when audio stream ends to prevent losing the last 0-1 seconds.
|
|
@@ -1133,8 +1144,10 @@ declare class Wav2Vec2Inference {
|
|
|
1133
1144
|
*/
|
|
1134
1145
|
|
|
1135
1146
|
interface Wav2ArkitCpuConfig {
|
|
1136
|
-
/** Path or URL to the wav2arkit_cpu ONNX model */
|
|
1147
|
+
/** Path or URL to the wav2arkit_cpu ONNX model (.onnx graph file) */
|
|
1137
1148
|
modelUrl: string;
|
|
1149
|
+
/** Path or URL to the external data file (.onnx.data weights file) */
|
|
1150
|
+
modelDataUrl?: string;
|
|
1138
1151
|
/** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
|
|
1139
1152
|
backend?: BackendPreference;
|
|
1140
1153
|
}
|
|
@@ -1148,6 +1161,12 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
|
|
|
1148
1161
|
constructor(config: Wav2ArkitCpuConfig);
|
|
1149
1162
|
get backend(): RuntimeBackend | null;
|
|
1150
1163
|
get isLoaded(): boolean;
|
|
1164
|
+
/**
|
|
1165
|
+
* Preferred chunk size: 4000 samples (250ms at 16kHz).
|
|
1166
|
+
* wav2arkit_cpu accepts variable-length input, so we use smaller chunks
|
|
1167
|
+
* for lower latency on WASM (vs 16000 for Wav2Vec2's fixed requirement).
|
|
1168
|
+
*/
|
|
1169
|
+
readonly chunkSamples = 4000;
|
|
1151
1170
|
/**
|
|
1152
1171
|
* Load the ONNX model
|
|
1153
1172
|
*/
|
|
@@ -1213,6 +1232,8 @@ interface CreateLipSyncConfig {
|
|
|
1213
1232
|
gpuModelUrl: string;
|
|
1214
1233
|
/** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS) */
|
|
1215
1234
|
cpuModelUrl: string;
|
|
1235
|
+
/** URL for the CPU model's external data file (.onnx.data weights) */
|
|
1236
|
+
cpuModelDataUrl?: string;
|
|
1216
1237
|
/**
|
|
1217
1238
|
* Model selection mode:
|
|
1218
1239
|
* - 'auto': Safari/iOS → CPU, everything else → GPU (default)
|
|
@@ -1367,7 +1388,6 @@ declare class SileroVADInference {
|
|
|
1367
1388
|
private inferenceQueue;
|
|
1368
1389
|
private preSpeechBuffer;
|
|
1369
1390
|
private wasSpeaking;
|
|
1370
|
-
private srTensor;
|
|
1371
1391
|
constructor(config: SileroVADConfig);
|
|
1372
1392
|
get backend(): RuntimeBackend | null;
|
|
1373
1393
|
get isLoaded(): boolean;
|