@omote/core 0.2.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  import {
2
2
  EventEmitter
3
3
  } from "../chunk-XK22BRG4.mjs";
4
- import "../chunk-NSSMTXJJ.mjs";
4
+ import "../chunk-6W7G6WE7.mjs";
5
5
  export {
6
6
  EventEmitter
7
7
  };
package/dist/index.d.mts CHANGED
@@ -97,6 +97,13 @@ interface AudioSchedulerOptions {
97
97
  sampleRate?: number;
98
98
  /** Number of audio channels (default: 1 for mono) */
99
99
  channels?: number;
100
+ /**
101
+ * Delay before first audio chunk plays (seconds).
102
+ * Gives slow inference backends (WASM) a head start so lip sync
103
+ * frames are ready by the time audio reaches the listener.
104
+ * Default: 0.05 (50ms — just enough to enqueue the first node)
105
+ */
106
+ initialDelayS?: number;
100
107
  }
101
108
  declare class AudioScheduler {
102
109
  private readonly options;
@@ -454,6 +461,12 @@ interface LipSyncBackend {
454
461
  readonly backend: RuntimeBackend | null;
455
462
  /** Whether the model is loaded and ready for inference */
456
463
  readonly isLoaded: boolean;
464
+ /**
465
+ * Preferred number of audio samples per inference chunk.
466
+ * Models with variable-length input can use smaller values for lower latency.
467
+ * Default (if undefined): 16000 (1.0s at 16kHz, required by Wav2Vec2).
468
+ */
469
+ readonly chunkSamples?: number;
457
470
  /**
458
471
  * Load the ONNX model
459
472
  * @returns Model loading information
@@ -516,7 +529,7 @@ interface LAMPipelineOptions {
516
529
  }
517
530
  declare class LAMPipeline {
518
531
  private readonly options;
519
- private readonly REQUIRED_SAMPLES;
532
+ private readonly DEFAULT_CHUNK_SAMPLES;
520
533
  private readonly FRAME_RATE;
521
534
  private buffer;
522
535
  private bufferStartTime;
@@ -545,15 +558,13 @@ declare class LAMPipeline {
545
558
  /**
546
559
  * Get the frame that should be displayed at the current time
547
560
  *
548
- * Automatically removes frames that have already been displayed.
549
- * This prevents memory leaks from accumulating old frames.
561
+ * Timestamp-synced playback for all backends. Audio playback is delayed
562
+ * for slow backends (WASM gets 1s head start via AudioScheduler) so
563
+ * frames are ready by the time their corresponding audio plays.
550
564
  *
551
- * Discard Window (prevents premature frame discarding):
552
- * - WebGPU: 0.5s (LAM inference 20-100ms + RAF jitter + React stalls)
553
- * - WASM: 1.0s (LAM inference 50-500ms + higher variability)
554
- *
555
- * Last-Frame-Hold: Returns last valid frame instead of null to prevent
556
- * avatar freezing when between frames (RAF at 60fps vs LAM at 30fps).
565
+ * Discard window is generous for WASM to handle inference jitter.
566
+ * Late frames play at RAF rate (~60fps) until caught up, then settle
567
+ * to natural 30fps pacing via timestamp gating.
557
568
  *
558
569
  * @param currentTime - Current AudioContext time
559
570
  * @param lam - LAM inference engine (optional, for backend detection)
@@ -581,7 +592,7 @@ declare class LAMPipeline {
581
592
  /**
582
593
  * Flush remaining buffered audio
583
594
  *
584
- * Processes any remaining audio in the buffer, even if less than REQUIRED_SAMPLES.
595
+ * Processes any remaining audio in the buffer, even if less than the chunk size.
585
596
  * This ensures the final audio chunk generates blendshape frames.
586
597
  *
587
598
  * Should be called when audio stream ends to prevent losing the last 0-1 seconds.
@@ -1133,8 +1144,10 @@ declare class Wav2Vec2Inference {
1133
1144
  */
1134
1145
 
1135
1146
  interface Wav2ArkitCpuConfig {
1136
- /** Path or URL to the wav2arkit_cpu ONNX model */
1147
+ /** Path or URL to the wav2arkit_cpu ONNX model (.onnx graph file) */
1137
1148
  modelUrl: string;
1149
+ /** Path or URL to the external data file (.onnx.data weights file) */
1150
+ modelDataUrl?: string;
1138
1151
  /** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
1139
1152
  backend?: BackendPreference;
1140
1153
  }
@@ -1148,6 +1161,12 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
1148
1161
  constructor(config: Wav2ArkitCpuConfig);
1149
1162
  get backend(): RuntimeBackend | null;
1150
1163
  get isLoaded(): boolean;
1164
+ /**
1165
+ * Preferred chunk size: 4000 samples (250ms at 16kHz).
1166
+ * wav2arkit_cpu accepts variable-length input, so we use smaller chunks
1167
+ * for lower latency on WASM (vs 16000 for Wav2Vec2's fixed requirement).
1168
+ */
1169
+ readonly chunkSamples = 4000;
1151
1170
  /**
1152
1171
  * Load the ONNX model
1153
1172
  */
@@ -1213,6 +1232,8 @@ interface CreateLipSyncConfig {
1213
1232
  gpuModelUrl: string;
1214
1233
  /** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS) */
1215
1234
  cpuModelUrl: string;
1235
+ /** URL for the CPU model's external data file (.onnx.data weights) */
1236
+ cpuModelDataUrl?: string;
1216
1237
  /**
1217
1238
  * Model selection mode:
1218
1239
  * - 'auto': Safari/iOS → CPU, everything else → GPU (default)
@@ -1367,7 +1388,6 @@ declare class SileroVADInference {
1367
1388
  private inferenceQueue;
1368
1389
  private preSpeechBuffer;
1369
1390
  private wasSpeaking;
1370
- private srTensor;
1371
1391
  constructor(config: SileroVADConfig);
1372
1392
  get backend(): RuntimeBackend | null;
1373
1393
  get isLoaded(): boolean;
package/dist/index.d.ts CHANGED
@@ -97,6 +97,13 @@ interface AudioSchedulerOptions {
97
97
  sampleRate?: number;
98
98
  /** Number of audio channels (default: 1 for mono) */
99
99
  channels?: number;
100
+ /**
101
+ * Delay before first audio chunk plays (seconds).
102
+ * Gives slow inference backends (WASM) a head start so lip sync
103
+ * frames are ready by the time audio reaches the listener.
104
+ * Default: 0.05 (50ms — just enough to enqueue the first node)
105
+ */
106
+ initialDelayS?: number;
100
107
  }
101
108
  declare class AudioScheduler {
102
109
  private readonly options;
@@ -454,6 +461,12 @@ interface LipSyncBackend {
454
461
  readonly backend: RuntimeBackend | null;
455
462
  /** Whether the model is loaded and ready for inference */
456
463
  readonly isLoaded: boolean;
464
+ /**
465
+ * Preferred number of audio samples per inference chunk.
466
+ * Models with variable-length input can use smaller values for lower latency.
467
+ * Default (if undefined): 16000 (1.0s at 16kHz, required by Wav2Vec2).
468
+ */
469
+ readonly chunkSamples?: number;
457
470
  /**
458
471
  * Load the ONNX model
459
472
  * @returns Model loading information
@@ -516,7 +529,7 @@ interface LAMPipelineOptions {
516
529
  }
517
530
  declare class LAMPipeline {
518
531
  private readonly options;
519
- private readonly REQUIRED_SAMPLES;
532
+ private readonly DEFAULT_CHUNK_SAMPLES;
520
533
  private readonly FRAME_RATE;
521
534
  private buffer;
522
535
  private bufferStartTime;
@@ -545,15 +558,13 @@ declare class LAMPipeline {
545
558
  /**
546
559
  * Get the frame that should be displayed at the current time
547
560
  *
548
- * Automatically removes frames that have already been displayed.
549
- * This prevents memory leaks from accumulating old frames.
561
+ * Timestamp-synced playback for all backends. Audio playback is delayed
562
+ * for slow backends (WASM gets 1s head start via AudioScheduler) so
563
+ * frames are ready by the time their corresponding audio plays.
550
564
  *
551
- * Discard Window (prevents premature frame discarding):
552
- * - WebGPU: 0.5s (LAM inference 20-100ms + RAF jitter + React stalls)
553
- * - WASM: 1.0s (LAM inference 50-500ms + higher variability)
554
- *
555
- * Last-Frame-Hold: Returns last valid frame instead of null to prevent
556
- * avatar freezing when between frames (RAF at 60fps vs LAM at 30fps).
565
+ * Discard window is generous for WASM to handle inference jitter.
566
+ * Late frames play at RAF rate (~60fps) until caught up, then settle
567
+ * to natural 30fps pacing via timestamp gating.
557
568
  *
558
569
  * @param currentTime - Current AudioContext time
559
570
  * @param lam - LAM inference engine (optional, for backend detection)
@@ -581,7 +592,7 @@ declare class LAMPipeline {
581
592
  /**
582
593
  * Flush remaining buffered audio
583
594
  *
584
- * Processes any remaining audio in the buffer, even if less than REQUIRED_SAMPLES.
595
+ * Processes any remaining audio in the buffer, even if less than the chunk size.
585
596
  * This ensures the final audio chunk generates blendshape frames.
586
597
  *
587
598
  * Should be called when audio stream ends to prevent losing the last 0-1 seconds.
@@ -1133,8 +1144,10 @@ declare class Wav2Vec2Inference {
1133
1144
  */
1134
1145
 
1135
1146
  interface Wav2ArkitCpuConfig {
1136
- /** Path or URL to the wav2arkit_cpu ONNX model */
1147
+ /** Path or URL to the wav2arkit_cpu ONNX model (.onnx graph file) */
1137
1148
  modelUrl: string;
1149
+ /** Path or URL to the external data file (.onnx.data weights file) */
1150
+ modelDataUrl?: string;
1138
1151
  /** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
1139
1152
  backend?: BackendPreference;
1140
1153
  }
@@ -1148,6 +1161,12 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
1148
1161
  constructor(config: Wav2ArkitCpuConfig);
1149
1162
  get backend(): RuntimeBackend | null;
1150
1163
  get isLoaded(): boolean;
1164
+ /**
1165
+ * Preferred chunk size: 4000 samples (250ms at 16kHz).
1166
+ * wav2arkit_cpu accepts variable-length input, so we use smaller chunks
1167
+ * for lower latency on WASM (vs 16000 for Wav2Vec2's fixed requirement).
1168
+ */
1169
+ readonly chunkSamples = 4000;
1151
1170
  /**
1152
1171
  * Load the ONNX model
1153
1172
  */
@@ -1213,6 +1232,8 @@ interface CreateLipSyncConfig {
1213
1232
  gpuModelUrl: string;
1214
1233
  /** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS) */
1215
1234
  cpuModelUrl: string;
1235
+ /** URL for the CPU model's external data file (.onnx.data weights) */
1236
+ cpuModelDataUrl?: string;
1216
1237
  /**
1217
1238
  * Model selection mode:
1218
1239
  * - 'auto': Safari/iOS → CPU, everything else → GPU (default)
@@ -1367,7 +1388,6 @@ declare class SileroVADInference {
1367
1388
  private inferenceQueue;
1368
1389
  private preSpeechBuffer;
1369
1390
  private wasSpeaking;
1370
- private srTensor;
1371
1391
  constructor(config: SileroVADConfig);
1372
1392
  get backend(): RuntimeBackend | null;
1373
1393
  get isLoaded(): boolean;