@omote/core 0.6.2 → 0.6.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -35,11 +35,7 @@ The most common use case: feed TTS audio chunks and get back 52 ARKit blendshape
35
35
  import { FullFacePipeline, createA2E } from '@omote/core';
36
36
 
37
37
  // 1. Create A2E backend (auto-detects GPU vs CPU)
38
- const lam = createA2E({
39
- gpuModelUrl: '/models/lam-wav2vec2.onnx',
40
- cpuModelUrl: '/models/wav2arkit_cpu.onnx',
41
- mode: 'auto',
42
- });
38
+ const lam = createA2E(); // auto-detects GPU vs CPU, fetches from HF CDN (192MB fp16)
43
39
  await lam.load();
44
40
 
45
41
  // 2. Create pipeline with expression profile
@@ -72,12 +68,7 @@ Auto-detects platform: Chrome/Edge/Android use WebGPU, Safari/iOS use WASM CPU f
72
68
  ```typescript
73
69
  import { createA2E } from '@omote/core';
74
70
 
75
- const a2e = createA2E({
76
- gpuModelUrl: '/models/lam-wav2vec2.onnx', // 384MB, WebGPU
77
- cpuModelUrl: '/models/wav2arkit_cpu.onnx', // 404MB, WASM
78
- mode: 'auto', // 'auto' | 'gpu' | 'cpu'
79
- fallbackOnError: true, // GPU failure → auto-switch to CPU
80
- });
71
+ const a2e = createA2E(); // auto-detects: GPU (192MB fp16) or CPU (404MB WASM)
81
72
  await a2e.load();
82
73
 
83
74
  const { blendshapes } = await a2e.infer(audioSamples); // Float32Array (16kHz)
@@ -89,7 +80,7 @@ const { blendshapes } = await a2e.infer(audioSamples); // Float32Array (16kHz)
89
80
  ```typescript
90
81
  import { Wav2Vec2Inference, LAM_BLENDSHAPES } from '@omote/core';
91
82
 
92
- const lam = new Wav2Vec2Inference({ modelUrl: '/models/lam-wav2vec2.onnx' });
83
+ const lam = new Wav2Vec2Inference({ modelUrl: '/models/model_fp16.onnx' });
93
84
  await lam.load();
94
85
 
95
86
  const { blendshapes } = await lam.infer(audioSamples);
@@ -317,7 +308,7 @@ Place models in your public assets directory:
317
308
 
318
309
  ```
319
310
  public/models/
320
- lam-wav2vec2.onnx # A2E lip sync — WebGPU (384MB)
311
+ model_fp16.onnx # A2E lip sync — WebGPU (192MB fp16, from omote-ai/lam-a2e)
321
312
  wav2arkit_cpu.onnx # A2E lip sync — WASM fallback (1.86MB graph)
322
313
  wav2arkit_cpu.onnx.data # A2E lip sync — WASM fallback (402MB weights)
323
314
  sensevoice/model.int8.onnx # SenseVoice ASR (239MB)
package/dist/index.d.mts CHANGED
@@ -380,7 +380,7 @@ declare function isSafari(): boolean;
380
380
  /**
381
381
  * Recommend using CPU-optimized A2E model (wav2arkit_cpu)
382
382
  *
383
- * All iOS browsers use WebKit and have tight memory limits — the 384MB
383
+ * All iOS browsers use WebKit and have tight memory limits — the 192MB fp16
384
384
  * LAM model causes silent crashes. wav2arkit_cpu uses URL pass-through
385
385
  * (ORT fetches the 402MB weights directly into WASM, no JS heap copy).
386
386
  *
@@ -427,8 +427,8 @@ declare function shouldUseServerA2E(): boolean;
427
427
  /**
428
428
  * Common interface for audio-to-expression (A2E) inference backends
429
429
  *
430
- * Both Wav2Vec2Inference (GPU, 384MB) and Wav2ArkitCpuInference (CPU, 404MB)
431
- * implement this interface, allowing SyncedAudioPipeline and LAMPipeline to
430
+ * Both Wav2Vec2Inference (GPU, 192MB fp16) and Wav2ArkitCpuInference (CPU, 404MB)
431
+ * implement this interface, allowing FullFacePipeline and A2EProcessor to
432
432
  * work with either model transparently.
433
433
  *
434
434
  * @category Inference
@@ -461,7 +461,7 @@ interface A2EResult {
461
461
  * Common interface for A2E (audio-to-expression) inference engines
462
462
  *
463
463
  * Implemented by:
464
- * - Wav2Vec2Inference (WebGPU/WASM, 384MB, ASR + A2E)
464
+ * - Wav2Vec2Inference (WebGPU/WASM, 192MB fp16, A2E)
465
465
  * - Wav2ArkitCpuInference (WASM-only, 404MB, A2E only)
466
466
  */
467
467
  interface A2EBackend {
@@ -1616,7 +1616,9 @@ interface SileroVADBackend {
1616
1616
  *
1617
1617
  * Extends SileroVADConfig with worker-specific options.
1618
1618
  */
1619
- interface SileroVADFactoryConfig extends SileroVADConfig {
1619
+ interface SileroVADFactoryConfig extends Omit<SileroVADConfig, 'modelUrl'> {
1620
+ /** Path or URL to the ONNX model. Default: HuggingFace CDN */
1621
+ modelUrl?: string;
1620
1622
  /**
1621
1623
  * Force worker usage (true), main thread (false), or auto-detect (undefined).
1622
1624
  *
@@ -1689,7 +1691,7 @@ declare function supportsVADWorker(): boolean;
1689
1691
  * const vadMain = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: false });
1690
1692
  * ```
1691
1693
  */
1692
- declare function createSileroVAD(config: SileroVADFactoryConfig): SileroVADBackend;
1694
+ declare function createSileroVAD(config?: SileroVADFactoryConfig): SileroVADBackend;
1693
1695
 
1694
1696
  /**
1695
1697
  * Web Worker-based wav2arkit_cpu lip sync inference
@@ -2012,8 +2014,8 @@ interface SenseVoiceBackend {
2012
2014
  * Configuration for the SenseVoice factory
2013
2015
  */
2014
2016
  interface CreateSenseVoiceConfig {
2015
- /** Path or URL to model.int8.onnx (239MB) */
2016
- modelUrl: string;
2017
+ /** Path or URL to model.int8.onnx (239MB). Default: HuggingFace CDN */
2018
+ modelUrl?: string;
2017
2019
  /** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
2018
2020
  tokensUrl?: string;
2019
2021
  /** Language hint (default: 'auto') */
@@ -2040,7 +2042,7 @@ interface CreateSenseVoiceConfig {
2040
2042
  * @param config - Factory configuration
2041
2043
  * @returns A SenseVoiceBackend instance (either Worker or main thread)
2042
2044
  */
2043
- declare function createSenseVoice(config: CreateSenseVoiceConfig): SenseVoiceBackend;
2045
+ declare function createSenseVoice(config?: CreateSenseVoiceConfig): SenseVoiceBackend;
2044
2046
 
2045
2047
  /**
2046
2048
  * Shared blendshape constants and utilities for lip sync inference
@@ -2075,12 +2077,10 @@ declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "bro
2075
2077
  declare function lerpBlendshapes(current: Float32Array | number[], target: Float32Array | number[], factor?: number): number[];
2076
2078
 
2077
2079
  /**
2078
- * Unified Wav2Vec2 inference engine for Audio-to-Expression + ASR
2080
+ * Wav2Vec2 inference engine for Audio-to-Expression (A2E)
2079
2081
  *
2080
2082
  * Runs entirely in the browser using WebGPU or WASM.
2081
- * Takes raw 16kHz audio and outputs:
2082
- * - 52 ARKit blendshapes (lip sync)
2083
- * - 32-token CTC logits (speech recognition)
2083
+ * Takes raw 16kHz audio and outputs 52 ARKit blendshapes for lip sync.
2084
2084
  *
2085
2085
  * @category Inference
2086
2086
  *
@@ -2088,14 +2088,12 @@ declare function lerpBlendshapes(current: Float32Array | number[], target: Float
2088
2088
  * ```typescript
2089
2089
  * import { Wav2Vec2Inference } from '@omote/core';
2090
2090
  *
2091
- * const wav2vec = new Wav2Vec2Inference({ modelUrl: '/models/unified_wav2vec2_asr_a2e.onnx' });
2091
+ * const wav2vec = new Wav2Vec2Inference({ modelUrl: '/models/model.onnx' });
2092
2092
  * await wav2vec.load();
2093
2093
  *
2094
2094
  * // Process 1 second of audio (16kHz = 16000 samples)
2095
2095
  * const result = await wav2vec.infer(audioSamples);
2096
- *
2097
2096
  * console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
2098
- * console.log('ASR text:', result.text); // Decoded transcription
2099
2097
  * ```
2100
2098
  */
2101
2099
 
@@ -2128,21 +2126,16 @@ interface ModelInfo {
2128
2126
  outputNames: string[];
2129
2127
  }
2130
2128
 
2131
- /** CTC vocabulary (32 tokens from wav2vec2-base-960h) */
2129
+ /**
2130
+ * CTC vocabulary (32 tokens from wav2vec2-base-960h)
2131
+ * @deprecated ASR is handled by SenseVoice. This will be removed in a future release.
2132
+ */
2132
2133
  declare const CTC_VOCAB: string[];
2133
2134
  interface Wav2Vec2Result {
2134
2135
  /** Blendshape weights [frames, 52] - 30fps */
2135
2136
  blendshapes: Float32Array[];
2136
- /** Raw CTC logits [frames, 32] - 50fps */
2137
- asrLogits: Float32Array[];
2138
- /** Decoded text from CTC */
2139
- text: string;
2140
- /** Number of blendshape frames (30fps) — alias for numA2EFrames */
2137
+ /** Number of blendshape frames (30fps) */
2141
2138
  numFrames: number;
2142
- /** Number of A2E frames (30fps) */
2143
- numA2EFrames: number;
2144
- /** Number of ASR frames (50fps) */
2145
- numASRFrames: number;
2146
2139
  /** Inference time in ms */
2147
2140
  inferenceTimeMs: number;
2148
2141
  }
@@ -2180,10 +2173,6 @@ declare class Wav2Vec2Inference implements A2EBackend {
2180
2173
  * Audio will be zero-padded or truncated to chunkSize samples.
2181
2174
  */
2182
2175
  infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
2183
- /**
2184
- * Decode CTC logits to text using greedy decoding
2185
- */
2186
- private decodeCTC;
2187
2176
  /**
2188
2177
  * Queue inference to serialize ONNX session calls
2189
2178
  */
@@ -2198,10 +2187,85 @@ declare class Wav2Vec2Inference implements A2EBackend {
2198
2187
  dispose(): Promise<void>;
2199
2188
  }
2200
2189
 
2190
+ /**
2191
+ * Default and user-configurable model URLs for all ONNX models
2192
+ *
2193
+ * Out of the box, models are served from HuggingFace CDN (`/resolve/main/`
2194
+ * endpoint with `Access-Control-Allow-Origin: *`). For production apps that
2195
+ * need faster or more reliable delivery, call {@link configureModelUrls} once
2196
+ * at startup to point any or all models at your own CDN.
2197
+ *
2198
+ * @category Inference
2199
+ *
2200
+ * @example Use HuggingFace defaults (zero-config)
2201
+ * ```typescript
2202
+ * import { createA2E } from '@omote/core';
2203
+ * const a2e = createA2E(); // fetches from HuggingFace CDN
2204
+ * ```
2205
+ *
2206
+ * @example Self-host on your own CDN
2207
+ * ```typescript
2208
+ * import { configureModelUrls, createA2E } from '@omote/core';
2209
+ *
2210
+ * configureModelUrls({
2211
+ * lam: 'https://cdn.example.com/models/model_fp16.onnx',
2212
+ * senseVoice: 'https://cdn.example.com/models/sensevoice.int8.onnx',
2213
+ * // omitted keys keep HuggingFace defaults
2214
+ * });
2215
+ *
2216
+ * const a2e = createA2E(); // now fetches from your CDN
2217
+ * ```
2218
+ */
2219
+ /** Model URL keys that can be configured */
2220
+ type ModelUrlKey = 'lam' | 'wav2arkitCpu' | 'senseVoice' | 'sileroVad';
2221
+ /**
2222
+ * Resolved model URLs — user overrides take priority, HuggingFace CDN is fallback.
2223
+ *
2224
+ * All SDK factories (`createA2E`, `createSenseVoice`, `createSileroVAD`) and
2225
+ * orchestrators (`VoicePipeline`) read from this object. Call
2226
+ * {@link configureModelUrls} before constructing any pipelines to point
2227
+ * models at your own CDN.
2228
+ */
2229
+ declare const DEFAULT_MODEL_URLS: Readonly<Record<ModelUrlKey, string>>;
2230
+ /**
2231
+ * Configure custom model URLs. Overrides persist for the lifetime of the page.
2232
+ * Omitted keys keep their HuggingFace CDN defaults.
2233
+ *
2234
+ * Call this **once** at app startup, before constructing any pipelines.
2235
+ *
2236
+ * @example Self-host all models
2237
+ * ```typescript
2238
+ * configureModelUrls({
2239
+ * lam: 'https://cdn.example.com/models/model_fp16.onnx',
2240
+ * wav2arkitCpu: 'https://cdn.example.com/models/wav2arkit_cpu.onnx',
2241
+ * senseVoice: 'https://cdn.example.com/models/sensevoice.int8.onnx',
2242
+ * sileroVad: 'https://cdn.example.com/models/silero-vad.onnx',
2243
+ * });
2244
+ * ```
2245
+ *
2246
+ * @example Override only one model
2247
+ * ```typescript
2248
+ * configureModelUrls({
2249
+ * lam: '/models/model_fp16.onnx', // self-hosted, same origin
2250
+ * });
2251
+ * ```
2252
+ */
2253
+ declare function configureModelUrls(urls: Partial<Record<ModelUrlKey, string>>): void;
2254
+ /**
2255
+ * Reset all model URL overrides back to HuggingFace CDN defaults.
2256
+ * Mainly useful for testing.
2257
+ */
2258
+ declare function resetModelUrls(): void;
2259
+ /**
2260
+ * The immutable HuggingFace CDN URLs (unaffected by any overrides).
2261
+ * Useful for documentation or fallback logic.
2262
+ */
2263
+ declare const HF_CDN_URLS: Readonly<Record<ModelUrlKey, string>>;
2264
+
2201
2265
  /**
2202
2266
  * CPU-optimized lip sync inference using wav2arkit_cpu model
2203
2267
  *
2204
- * A Safari/iOS-compatible alternative to Wav2Vec2Inference (384MB) designed
2268
+ * A Safari/iOS-compatible alternative to Wav2Vec2Inference (192MB fp16) designed
2205
2269
  * for platforms where WebGPU crashes due to ONNX Runtime JSEP bugs.
2206
2270
  *
2207
2271
  * The model uses ONNX external data format:
@@ -2286,43 +2350,30 @@ declare class Wav2ArkitCpuInference implements A2EBackend {
2286
2350
  /**
2287
2351
  * Factory function for A2E with automatic GPU/CPU model selection
2288
2352
  *
2289
- * Provides a unified API that automatically selects the optimal model:
2290
- * - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (404MB, WASM)
2291
- * - Chrome/Firefox/Edge: Uses Wav2Vec2Inference (384MB, WebGPU)
2292
- * - Fallback: Gracefully falls back to CPU model if GPU model fails to load
2293
- *
2294
- * Why two separate models?
2295
- * Wav2Vec2 (LAM) cannot run on Safari/iOS for two reasons:
2296
- * 1. Its dual-head transformer graph needs ~750-950MB peak during ORT session
2297
- * creation (graph optimization), exceeding iOS WebKit's ~1-1.5GB tab limit.
2298
- * 2. It ships as a single 384MB .onnx file that must load into JS heap before
2299
- * ORT can consume it. iOS WebKit OOMs on this allocation.
2300
- * wav2arkit_cpu solves both: external data format (1.86MB graph + 402MB weights)
2301
- * lets ORT load only the tiny graph, then stream weights via URL pass-through
2302
- * directly into WASM memory. JS heap stays at ~2MB.
2353
+ * Provides a unified API that always tries Wav2Vec2 (LAM fp16) first:
2354
+ * - All platforms: Tries Wav2Vec2Inference (192MB fp16, external data format)
2355
+ * - Fallback: Gracefully falls back to wav2arkit_cpu if GPU model fails to load
2356
+ *
2357
+ * The fp16 external data format (385KB graph + 192MB weights) enables iOS support:
2358
+ * - URL pass-through: ORT streams weights directly into WASM memory (~2MB JS heap)
2359
+ * - Basic graph optimization: avoids ~750-950MB peak from 'all' optimization
2360
+ * - If iOS OOMs during session creation, A2EWithFallback catches it and loads
2361
+ * wav2arkit_cpu (1.86MB graph + 402MB weights) as a safe fallback.
2303
2362
  *
2304
2363
  * @category Inference
2305
2364
  *
2306
- * @example Auto-detect (recommended)
2365
+ * @example Auto-detect (recommended, zero-config)
2307
2366
  * ```typescript
2308
2367
  * import { createA2E } from '@omote/core';
2309
2368
  *
2310
- * const a2e = createA2E({
2311
- * gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
2312
- * cpuModelUrl: '/models/wav2arkit_cpu.onnx',
2313
- * });
2314
- *
2369
+ * const a2e = createA2E(); // uses HF CDN defaults (192MB fp16 GPU, 404MB CPU fallback)
2315
2370
  * await a2e.load();
2316
2371
  * const { blendshapes } = await a2e.infer(audioSamples);
2317
2372
  * ```
2318
2373
  *
2319
2374
  * @example Force CPU model
2320
2375
  * ```typescript
2321
- * const a2e = createA2E({
2322
- * gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
2323
- * cpuModelUrl: '/models/wav2arkit_cpu.onnx',
2324
- * mode: 'cpu',
2325
- * });
2376
+ * const a2e = createA2E({ mode: 'cpu' });
2326
2377
  * ```
2327
2378
  */
2328
2379
 
@@ -2330,8 +2381,8 @@ declare class Wav2ArkitCpuInference implements A2EBackend {
2330
2381
  * Configuration for the A2E factory
2331
2382
  */
2332
2383
  interface CreateA2EConfig {
2333
- /** URL for the GPU model (Wav2Vec2, used on Chrome/Firefox/Edge) */
2334
- gpuModelUrl: string;
2384
+ /** URL for the GPU model (Wav2Vec2, used on Chrome/Firefox/Edge). Default: HuggingFace CDN */
2385
+ gpuModelUrl?: string;
2335
2386
  /**
2336
2387
  * URL for GPU model external data file (.onnx.data weights).
2337
2388
  * Default: `${gpuModelUrl}.data`
@@ -2339,8 +2390,8 @@ interface CreateA2EConfig {
2339
2390
  * Set to `false` to skip external data loading (single-file models only).
2340
2391
  */
2341
2392
  gpuExternalDataUrl?: string | false;
2342
- /** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS) */
2343
- cpuModelUrl: string;
2393
+ /** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS). Default: HuggingFace CDN */
2394
+ cpuModelUrl?: string;
2344
2395
  /**
2345
2396
  * Model selection mode:
2346
2397
  * - 'auto': Safari/iOS -> CPU, everything else -> GPU (default)
@@ -2382,7 +2433,7 @@ interface CreateA2EConfig {
2382
2433
  * @param config - Factory configuration
2383
2434
  * @returns An A2EBackend instance (either GPU or CPU model)
2384
2435
  */
2385
- declare function createA2E(config: CreateA2EConfig): A2EBackend;
2436
+ declare function createA2E(config?: CreateA2EConfig): A2EBackend;
2386
2437
 
2387
2438
  /**
2388
2439
  * A2EProcessor — Engine-agnostic audio-to-expression processor
@@ -4111,10 +4162,12 @@ declare class EmphasisDetector {
4111
4162
  * breathing/postural sway, and simplex noise-driven brow drift.
4112
4163
  *
4113
4164
  * Research sources:
4114
- * - Blink frequency: 15-20/min (every 3-4s), PMC4043155
4165
+ * - Blink frequency: log-normal inter-blink interval (IBI; mean=5.97s, SD(log)=0.89), PMC3565584
4166
+ * - Blink shape: asymmetric (92ms close, 242ms open, 3:1 ratio), PMC4043155
4115
4167
  * - Saccade latency: ~200ms, duration 20-200ms
4116
4168
  * - Microsaccades: ~1/second, amplitude 0.02-0.05, Scholarpedia
4117
4169
  * - Fixation duration: 200-350ms, Nature Scientific Reports
4170
+ * - Conversational gaze: Kendon (1967), Argyle & Cook (1976)
4118
4171
  * - Brow noise: NVIDIA Audio2Face, Unreal MetaHuman layered procedural animation
4119
4172
  *
4120
4173
  * @category Animation
@@ -4131,6 +4184,7 @@ declare class EmphasisDetector {
4131
4184
  * eyeTargetY: normalizedY,
4132
4185
  * audioEnergy: energy, // 0-1 from AudioEnergyAnalyzer
4133
4186
  * isSpeaking: true,
4187
+ * state: 'speaking', // conversational state for gaze behavior
4134
4188
  * });
4135
4189
  *
4136
4190
  * // Apply blendshapes to mesh
@@ -4169,6 +4223,8 @@ interface LifeLayerConfig {
4169
4223
  /** Eye smoothing factor (higher = faster response). Default: 15 */
4170
4224
  eyeSmoothing?: number;
4171
4225
  }
4226
+ /** Conversational state for state-dependent gaze behavior */
4227
+ type ConversationalState = 'idle' | 'listening' | 'thinking' | 'speaking';
4172
4228
  /**
4173
4229
  * Per-frame input to the life layer
4174
4230
  */
@@ -4181,6 +4237,8 @@ interface LifeLayerInput {
4181
4237
  audioEnergy?: number;
4182
4238
  /** Whether avatar is speaking. Multiplies brow noise amplitude. */
4183
4239
  isSpeaking?: boolean;
4240
+ /** Conversational state for gaze behavior (idle/listening/thinking/speaking) */
4241
+ state?: ConversationalState;
4184
4242
  }
4185
4243
  /**
4186
4244
  * Per-frame output from the life layer
@@ -4202,6 +4260,7 @@ interface LifeLayerOutput {
4202
4260
  */
4203
4261
  declare class ProceduralLifeLayer {
4204
4262
  private blinkIntervalRange;
4263
+ private useLogNormalBlinks;
4205
4264
  private gazeBreakIntervalRange;
4206
4265
  private gazeBreakAmplitudeRange;
4207
4266
  private eyeNoiseAmplitude;
@@ -4229,6 +4288,7 @@ declare class ProceduralLifeLayer {
4229
4288
  private gazeBreakTargetY;
4230
4289
  private gazeBreakCurrentX;
4231
4290
  private gazeBreakCurrentY;
4291
+ private currentState;
4232
4292
  private microMotionTime;
4233
4293
  private breathingPhase;
4234
4294
  private noiseTime;
@@ -4243,17 +4303,258 @@ declare class ProceduralLifeLayer {
4243
4303
  * @returns Blendshape values and head rotation deltas
4244
4304
  */
4245
4305
  update(delta: number, input?: LifeLayerInput): LifeLayerOutput;
4306
+ /**
4307
+ * Write life layer output directly to a Float32Array[52] in LAM_BLENDSHAPES order.
4308
+ *
4309
+ * Includes micro-jitter (0.4% amplitude simplex noise on all channels) to
4310
+ * break uncanny stillness on undriven channels.
4311
+ *
4312
+ * @param delta - Time since last frame in seconds
4313
+ * @param input - Per-frame input
4314
+ * @param out - Pre-allocated Float32Array(52) to write into
4315
+ */
4316
+ updateToArray(delta: number, input: LifeLayerInput, out: Float32Array): void;
4246
4317
  /**
4247
4318
  * Reset all internal state to initial values.
4248
4319
  */
4249
4320
  reset(): void;
4321
+ /**
4322
+ * Sample next blink interval.
4323
+ * Uses log-normal distribution (PMC3565584) when using default config,
4324
+ * or uniform random when custom blinkIntervalRange is provided.
4325
+ */
4326
+ private nextBlinkInterval;
4250
4327
  private updateBlinks;
4251
4328
  private getBlinkValues;
4252
4329
  private getEyeMicroMotion;
4330
+ /**
4331
+ * Get active gaze parameters — uses state-dependent params when
4332
+ * conversational state is provided, otherwise falls back to config ranges.
4333
+ */
4334
+ private getActiveGazeParams;
4253
4335
  private updateGazeBreaks;
4254
4336
  private updateBrowNoise;
4255
4337
  }
4256
4338
 
4339
+ /**
4340
+ * FACS (Facial Action Coding System) to ARKit Blendshape Mapping
4341
+ *
4342
+ * Two static lookup tables that decompose emotions into FACS Action Units,
4343
+ * then map AUs to ARKit blendshapes. Based on Ekman's FACS research.
4344
+ *
4345
+ * @category Face
4346
+ */
4347
+
4348
+ /**
4349
+ * A single FACS Action Unit activation within an emotion
4350
+ */
4351
+ interface AUActivation {
4352
+ /** FACS Action Unit identifier (e.g. 'AU6', 'AU12') */
4353
+ au: string;
4354
+ /** Activation intensity 0-1 */
4355
+ intensity: number;
4356
+ /** Facial region: upper (brows/eyes/cheeks) or lower (mouth/jaw) */
4357
+ region: 'upper' | 'lower';
4358
+ }
4359
+ /**
4360
+ * Table 1: Emotion → FACS Action Units
4361
+ *
4362
+ * Maps each of the 10 SDK emotion channels to their FACS AU combinations
4363
+ * with intensity and upper/lower face region tags.
4364
+ *
4365
+ * Sources:
4366
+ * - Ekman & Friesen (1978) FACS Manual
4367
+ * - Ekman (2003) Emotions Revealed
4368
+ * - Lucey et al. (2010) Extended Cohn-Kanade dataset
4369
+ */
4370
+ declare const EMOTION_TO_AU: Record<EmotionName, AUActivation[]>;
4371
+ /**
4372
+ * Table 2: FACS Action Unit → ARKit Blendshapes
4373
+ *
4374
+ * Maps each AU to one or more ARKit blendshape channels with weight.
4375
+ *
4376
+ * Sources:
4377
+ * - Apple ARKit face tracking documentation
4378
+ * - Melinda Ozel's ARKit-to-FACS cheat sheet
4379
+ */
4380
+ declare const AU_TO_ARKIT: Record<string, {
4381
+ blendshape: string;
4382
+ weight: number;
4383
+ }[]>;
4384
+ /**
4385
+ * All AU identifiers referenced by EMOTION_TO_AU (for validation)
4386
+ */
4387
+ declare const ALL_AUS: string[];
4388
+
4389
+ /**
4390
+ * EmotionResolver — Resolves EmotionWeights → split upper/lower face Float32Array[52]
4391
+ *
4392
+ * Uses FACS decomposition (EMOTION_TO_AU → AU_TO_ARKIT) to produce
4393
+ * anatomically correct blendshape contributions, split by facial region
4394
+ * for the FaceCompositor's modulation strategy:
4395
+ * - Upper face: additive overlay (independent of speech)
4396
+ * - Lower face: modulates speech output
4397
+ *
4398
+ * @category Face
4399
+ */
4400
+
4401
+ /**
4402
+ * Resolved emotion split into upper and lower face contributions.
4403
+ *
4404
+ * WARNING: Buffers are owned by EmotionResolver and are overwritten
4405
+ * on the next resolve() call. Copy if you need to retain values.
4406
+ */
4407
+ interface ResolvedEmotion {
4408
+ /** 52 channels — only upper face non-zero. Valid until next resolve() call. */
4409
+ upper: Float32Array;
4410
+ /** 52 channels — only lower face non-zero. Valid until next resolve() call. */
4411
+ lower: Float32Array;
4412
+ }
4413
+ /**
4414
+ * Resolves EmotionWeights into upper/lower face blendshape arrays
4415
+ * using FACS Action Unit decomposition.
4416
+ */
4417
+ declare class EmotionResolver {
4418
+ private readonly upperBuffer;
4419
+ private readonly lowerBuffer;
4420
+ /**
4421
+ * Resolve emotion weights to upper/lower face blendshape contributions.
4422
+ *
4423
+ * @param weights - Emotion channel weights from EmotionController
4424
+ * @param intensity - Global intensity multiplier (0-2). Default: 1.0
4425
+ * @returns Upper and lower face blendshape arrays (52 channels each)
4426
+ */
4427
+ resolve(weights: EmotionWeights, intensity?: number): ResolvedEmotion;
4428
+ }
4429
+
4430
+ /**
4431
+ * FaceCompositor — 5-stage signal processing chain for facial animation
4432
+ *
4433
+ * Composes A2E lip sync, emotion modulation, procedural life, and character
4434
+ * profile into a single Float32Array[52] per frame.
4435
+ *
4436
+ * ```
4437
+ * BASE (A2E) → EMOTION MODULATION → PROCEDURAL LIFE → CHARACTER PROFILE → OUTPUT [0,1]
4438
+ * ```
4439
+ *
4440
+ * Replaces manual blendshape merging in consumer code with a single `compose()` call.
4441
+ *
4442
+ * @category Face
4443
+ */
4444
+
4445
+ /**
4446
+ * Output of FaceCompositor.compose()
4447
+ *
4448
+ * WARNING: When using the internal output buffer (no `target` param),
4449
+ * `blendshapes` is a shared reference that is overwritten on the next
4450
+ * compose() call. Copy with `new Float32Array(output.blendshapes)` if
4451
+ * you need to retain values across frames.
4452
+ */
4453
+ interface FaceCompositorOutput {
4454
+ /**
4455
+ * 52 ARKit blendshape values, clamped [0,1].
4456
+ *
4457
+ * This buffer is reused across calls when no `target` parameter is
4458
+ * provided to compose(). Valid until the next compose() call.
4459
+ */
4460
+ blendshapes: Float32Array;
4461
+ /** Head rotation deltas in radians (from ProceduralLifeLayer) */
4462
+ headDelta: {
4463
+ yaw: number;
4464
+ pitch: number;
4465
+ };
4466
+ }
4467
+ /**
4468
+ * Per-blendshape character profile (multiplier + offset)
4469
+ *
4470
+ * Superset of ExpressionProfile — gives per-channel control instead of per-group.
4471
+ */
4472
+ interface CharacterProfile {
4473
+ /** Per-blendshape multiplier (default: all 1.0) */
4474
+ multiplier?: Partial<Record<string, number>>;
4475
+ /** Per-blendshape offset (default: all 0.0) */
4476
+ offset?: Partial<Record<string, number>>;
4477
+ }
4478
+ /**
4479
+ * Configuration for FaceCompositor
4480
+ */
4481
+ interface FaceCompositorConfig {
4482
+ /** ProceduralLifeLayer instance (compositor creates default if omitted) */
4483
+ lifeLayer?: ProceduralLifeLayer;
4484
+ /** Character profile: per-BS multiplier + offset */
4485
+ profile?: CharacterProfile;
4486
+ /** Emotion smoothing factor per frame (0-1). Default: 0.12 */
4487
+ emotionSmoothing?: number;
4488
+ }
4489
+ /**
4490
+ * Per-frame input to the compositor
4491
+ */
4492
+ interface FaceCompositorInput extends LifeLayerInput {
4493
+ /** Delta time in seconds */
4494
+ deltaTime: number;
4495
+ /** Current emotion weights (from EmotionController.emotion or manual) */
4496
+ emotion?: EmotionWeights;
4497
+ /** Emotion intensity multiplier (0-2). Default: 1.0 */
4498
+ emotionIntensity?: number;
4499
+ }
4500
+ /**
4501
+ * FaceCompositor — 5-stage facial animation signal chain.
4502
+ *
4503
+ * @example
4504
+ * ```typescript
4505
+ * import { FaceCompositor, createA2E } from '@omote/core';
4506
+ *
4507
+ * const compositor = new FaceCompositor();
4508
+ *
4509
+ * // In animation loop:
4510
+ * const output = compositor.compose(a2eFrame, {
4511
+ * deltaTime: 0.016,
4512
+ * emotion: { joy: 0.8 },
4513
+ * isSpeaking: true,
4514
+ * audioEnergy: 0.5,
4515
+ * });
4516
+ *
4517
+ * // Apply output.blendshapes[0..51] to avatar morphTargetInfluences
4518
+ * ```
4519
+ */
4520
+ declare class FaceCompositor {
4521
+ private readonly emotionResolver;
4522
+ private readonly lifeLayer;
4523
+ private readonly emotionSmoothing;
4524
+ private readonly outputBuffer;
4525
+ private readonly smoothedUpper;
4526
+ private readonly smoothedLower;
4527
+ private readonly lifeBuffer;
4528
+ private readonly multiplier;
4529
+ private readonly offset;
4530
+ private stickyEmotion;
4531
+ constructor(config?: FaceCompositorConfig);
4532
+ /**
4533
+ * Compose a single output frame from the 5-stage signal chain.
4534
+ *
4535
+ * @param base - A2E raw output (Float32Array[52], LAM_BLENDSHAPES order)
4536
+ * @param input - Per-frame input (deltaTime, emotion, life layer params)
4537
+ * @param target - Optional pre-allocated output buffer (avoids per-frame allocation).
4538
+ * When omitted, an internal buffer is used (valid until next compose() call).
4539
+ * @returns Blendshapes (Float32Array[52] clamped [0,1]) and head rotation deltas
4540
+ */
4541
+ compose(base: Float32Array, input: FaceCompositorInput, target?: Float32Array): FaceCompositorOutput;
4542
+ /**
4543
+ * Set sticky emotion (used when input.emotion is not provided).
4544
+ */
4545
+ setEmotion(weights: EmotionWeights): void;
4546
+ /**
4547
+ * Update character profile at runtime.
4548
+ */
4549
+ setProfile(profile: CharacterProfile): void;
4550
+ /**
4551
+ * Reset all smoothing state and life layer.
4552
+ */
4553
+ reset(): void;
4554
+ /** Expand partial profile maps into dense Float32Arrays */
4555
+ private applyProfileArrays;
4556
+ }
4557
+
4257
4558
  /**
4258
4559
  * MicLipSync - Microphone → VAD → A2E → blendshapes
4259
4560
  *
@@ -4539,4 +4840,4 @@ declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
4539
4840
  private clearSilenceTimer;
4540
4841
  }
4541
4842
 
4542
- export { type A2EBackend, type A2EModelInfo, A2EOrchestrator, type A2EOrchestratorConfig, A2EProcessor, type A2EProcessorConfig, type A2EProgressEvent, type A2EResult, ARKIT_BLENDSHAPES, type ActiveSpan, type AnimationClip, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type CreateA2EConfig, type CreateSenseVoiceConfig, DEFAULT_ANIMATION_CONFIG, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, type EmotionWeights, EmphasisDetector, EventEmitter, type ExpressionProfile, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LoadingProgress, MODEL_LOAD_TIME_BUCKETS, type MetricData, MetricNames, MicLipSync, type MicLipSyncConfig, type MicLipSyncEvents, type MicLipSyncFrame, type MicLipSyncState, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, PlaybackPipeline, type PlaybackPipelineConfig, type PlaybackPipelineEvents, type PlaybackState, ProceduralLifeLayer, type QuotaInfo, type ResponseHandler, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, 
type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TranscriptResult, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, VoicePipeline, type VoicePipelineConfig, type VoicePipelineEvents, type VoicePipelineState, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2ArkitCpuUnifiedAdapter, Wav2ArkitCpuWorker, type Wav2ArkitCpuWorkerConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, applyProfile, blendEmotions, calculatePeak, calculateRMS, configureCacheLimit, configureTelemetry, createA2E, createEmotionVector, createSenseVoice, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getRecommendedBackend, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, preloadModels, resolveBackend, shouldEnableWasmProxy, shouldUseCpuA2E, shouldUseNativeASR, shouldUseServerA2E, supportsVADWorker };
4843
+ export { type A2EBackend, type A2EModelInfo, A2EOrchestrator, type A2EOrchestratorConfig, A2EProcessor, type A2EProcessorConfig, type A2EProgressEvent, type A2EResult, ALL_AUS, ARKIT_BLENDSHAPES, type AUActivation, AU_TO_ARKIT, type ActiveSpan, type AnimationClip, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, type CharacterProfile, ConsoleExporter, type ConversationalState, type CreateA2EConfig, type CreateSenseVoiceConfig, DEFAULT_ANIMATION_CONFIG, DEFAULT_MODEL_URLS, EMOTION_NAMES, EMOTION_TO_AU, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionResolver, type EmotionWeights, EmphasisDetector, EventEmitter, type ExpressionProfile, FaceCompositor, type FaceCompositorConfig, type FaceCompositorInput, type FaceCompositorOutput, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, HF_CDN_URLS, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LoadingProgress, MODEL_LOAD_TIME_BUCKETS, type MetricData, MetricNames, MicLipSync, type MicLipSyncConfig, type MicLipSyncEvents, type MicLipSyncFrame, type MicLipSyncState, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, type ModelUrlKey, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, 
PlaybackPipeline, type PlaybackPipelineConfig, type PlaybackPipelineEvents, type PlaybackState, ProceduralLifeLayer, type QuotaInfo, type ResolvedEmotion, type ResponseHandler, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TranscriptResult, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, VoicePipeline, type VoicePipelineConfig, type VoicePipelineEvents, type VoicePipelineState, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2ArkitCpuUnifiedAdapter, Wav2ArkitCpuWorker, type Wav2ArkitCpuWorkerConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, applyProfile, blendEmotions, calculatePeak, calculateRMS, configureCacheLimit, configureModelUrls, configureTelemetry, createA2E, createEmotionVector, createSenseVoice, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getRecommendedBackend, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, preloadModels, resetModelUrls, resolveBackend, shouldEnableWasmProxy, shouldUseCpuA2E, shouldUseNativeASR, shouldUseServerA2E, supportsVADWorker };