@omote/core 0.6.4 → 0.6.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -2350,20 +2350,15 @@ declare class Wav2ArkitCpuInference implements A2EBackend {
2350
2350
  /**
2351
2351
  * Factory function for A2E with automatic GPU/CPU model selection
2352
2352
  *
2353
- * Provides a unified API that automatically selects the optimal model:
2354
- * - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (404MB, WASM)
2355
- * - Chrome/Firefox/Edge: Uses Wav2Vec2Inference (192MB fp16, WebGPU)
2356
- * - Fallback: Gracefully falls back to CPU model if GPU model fails to load
2357
- *
2358
- * Why two separate models?
2359
- * Wav2Vec2 (LAM) cannot run on Safari/iOS for two reasons:
2360
- * 1. Its dual-head transformer graph needs ~750-950MB peak during ORT session
2361
- * creation (graph optimization), exceeding iOS WebKit's ~1-1.5GB tab limit.
2362
- * 2. It ships as a single 192MB .onnx file (fp16) that must load into JS heap
2363
- * before ORT can consume it. iOS WebKit OOMs on this allocation.
2364
- * wav2arkit_cpu solves both: external data format (1.86MB graph + 402MB weights)
2365
- * lets ORT load only the tiny graph, then stream weights via URL pass-through
2366
- * directly into WASM memory. JS heap stays at ~2MB.
2353
+ * Provides a unified API that always tries Wav2Vec2 (LAM fp16) first:
2354
+ * - All platforms: Tries Wav2Vec2Inference (192MB fp16, external data format)
2355
+ * - Fallback: Gracefully falls back to wav2arkit_cpu if GPU model fails to load
2356
+ *
2357
+ * The fp16 external data format (385KB graph + 192MB weights) enables iOS support:
2358
+ * - URL pass-through: ORT streams weights directly into WASM memory (~2MB JS heap)
2359
+ * - Basic graph optimization: avoids ~750-950MB peak from 'all' optimization
2360
+ * - If iOS OOMs during session creation, A2EWithFallback catches it and loads
2361
+ * wav2arkit_cpu (1.86MB graph + 402MB weights) as a safe fallback.
2367
2362
  *
2368
2363
  * @category Inference
2369
2364
  *
@@ -4404,12 +4399,15 @@ declare const ALL_AUS: string[];
4404
4399
  */
4405
4400
 
4406
4401
  /**
4407
- * Resolved emotion split into upper and lower face contributions
4402
+ * Resolved emotion split into upper and lower face contributions.
4403
+ *
4404
+ * WARNING: Buffers are owned by EmotionResolver and are overwritten
4405
+ * on the next resolve() call. Copy if you need to retain values.
4408
4406
  */
4409
4407
  interface ResolvedEmotion {
4410
- /** 52 channels — only upper face (brows, eyes, cheeks, nose) non-zero */
4408
+ /** 52 channels — only upper face non-zero. Valid until next resolve() call. */
4411
4409
  upper: Float32Array;
4412
- /** 52 channels — only lower face (mouth, jaw) non-zero */
4410
+ /** 52 channels — only lower face non-zero. Valid until next resolve() call. */
4413
4411
  lower: Float32Array;
4414
4412
  }
4415
4413
  /**
@@ -4444,6 +4442,28 @@ declare class EmotionResolver {
4444
4442
  * @category Face
4445
4443
  */
4446
4444
 
4445
+ /**
4446
+ * Output of FaceCompositor.compose()
4447
+ *
4448
+ * WARNING: When using the internal output buffer (no `target` param),
4449
+ * `blendshapes` is a shared reference that is overwritten on the next
4450
+ * compose() call. Copy with `new Float32Array(output.blendshapes)` if
4451
+ * you need to retain values across frames.
4452
+ */
4453
+ interface FaceCompositorOutput {
4454
+ /**
4455
+ * 52 ARKit blendshape values, clamped [0,1].
4456
+ *
4457
+ * This buffer is reused across calls when no `target` parameter is
4458
+ * provided to compose(). Valid until the next compose() call.
4459
+ */
4460
+ blendshapes: Float32Array;
4461
+ /** Head rotation deltas in radians (from ProceduralLifeLayer) */
4462
+ headDelta: {
4463
+ yaw: number;
4464
+ pitch: number;
4465
+ };
4466
+ }
4447
4467
  /**
4448
4468
  * Per-blendshape character profile (multiplier + offset)
4449
4469
  *
@@ -4494,13 +4514,14 @@ interface FaceCompositorInput extends LifeLayerInput {
4494
4514
  * audioEnergy: 0.5,
4495
4515
  * });
4496
4516
  *
4497
- * // Apply output[0..51] to avatar morphTargetInfluences
4517
+ * // Apply output.blendshapes[0..51] to avatar morphTargetInfluences
4498
4518
  * ```
4499
4519
  */
4500
4520
  declare class FaceCompositor {
4501
4521
  private readonly emotionResolver;
4502
4522
  private readonly lifeLayer;
4503
4523
  private readonly emotionSmoothing;
4524
+ private readonly outputBuffer;
4504
4525
  private readonly smoothedUpper;
4505
4526
  private readonly smoothedLower;
4506
4527
  private readonly lifeBuffer;
@@ -4513,9 +4534,11 @@ declare class FaceCompositor {
4513
4534
  *
4514
4535
  * @param base - A2E raw output (Float32Array[52], LAM_BLENDSHAPES order)
4515
4536
  * @param input - Per-frame input (deltaTime, emotion, life layer params)
4516
- * @returns Float32Array[52] with all values clamped to [0, 1]
4537
+ * @param target - Optional pre-allocated output buffer (avoids per-frame allocation).
4538
+ * When omitted, an internal buffer is used (valid until next compose() call).
4539
+ * @returns Blendshapes (Float32Array[52] clamped [0,1]) and head rotation deltas
4517
4540
  */
4518
- compose(base: Float32Array, input: FaceCompositorInput): Float32Array;
4541
+ compose(base: Float32Array, input: FaceCompositorInput, target?: Float32Array): FaceCompositorOutput;
4519
4542
  /**
4520
4543
  * Set sticky emotion (used when input.emotion is not provided).
4521
4544
  */
@@ -4817,4 +4840,4 @@ declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
4817
4840
  private clearSilenceTimer;
4818
4841
  }
4819
4842
 
4820
- export { type A2EBackend, type A2EModelInfo, A2EOrchestrator, type A2EOrchestratorConfig, A2EProcessor, type A2EProcessorConfig, type A2EProgressEvent, type A2EResult, ALL_AUS, ARKIT_BLENDSHAPES, type AUActivation, AU_TO_ARKIT, type ActiveSpan, type AnimationClip, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, type CharacterProfile, ConsoleExporter, type ConversationalState, type CreateA2EConfig, type CreateSenseVoiceConfig, DEFAULT_ANIMATION_CONFIG, DEFAULT_MODEL_URLS, EMOTION_NAMES, EMOTION_TO_AU, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionResolver, type EmotionWeights, EmphasisDetector, EventEmitter, type ExpressionProfile, FaceCompositor, type FaceCompositorConfig, type FaceCompositorInput, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, HF_CDN_URLS, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LoadingProgress, MODEL_LOAD_TIME_BUCKETS, type MetricData, MetricNames, MicLipSync, type MicLipSyncConfig, type MicLipSyncEvents, type MicLipSyncFrame, type MicLipSyncState, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, type ModelUrlKey, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, PlaybackPipeline, type PlaybackPipelineConfig, type PlaybackPipelineEvents, type PlaybackState, ProceduralLifeLayer, type QuotaInfo, type ResolvedEmotion, type ResponseHandler, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TranscriptResult, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, VoicePipeline, type VoicePipelineConfig, type VoicePipelineEvents, type VoicePipelineState, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2ArkitCpuUnifiedAdapter, Wav2ArkitCpuWorker, type Wav2ArkitCpuWorkerConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, applyProfile, blendEmotions, calculatePeak, calculateRMS, configureCacheLimit, configureModelUrls, configureTelemetry, createA2E, createEmotionVector, createSenseVoice, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getRecommendedBackend, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, preloadModels, resetModelUrls, resolveBackend, shouldEnableWasmProxy, shouldUseCpuA2E, shouldUseNativeASR, shouldUseServerA2E, supportsVADWorker };
4843
+ export { type A2EBackend, type A2EModelInfo, A2EOrchestrator, type A2EOrchestratorConfig, A2EProcessor, type A2EProcessorConfig, type A2EProgressEvent, type A2EResult, ALL_AUS, ARKIT_BLENDSHAPES, type AUActivation, AU_TO_ARKIT, type ActiveSpan, type AnimationClip, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, type CharacterProfile, ConsoleExporter, type ConversationalState, type CreateA2EConfig, type CreateSenseVoiceConfig, DEFAULT_ANIMATION_CONFIG, DEFAULT_MODEL_URLS, EMOTION_NAMES, EMOTION_TO_AU, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionResolver, type EmotionWeights, EmphasisDetector, EventEmitter, type ExpressionProfile, FaceCompositor, type FaceCompositorConfig, type FaceCompositorInput, type FaceCompositorOutput, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, HF_CDN_URLS, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LoadingProgress, MODEL_LOAD_TIME_BUCKETS, type MetricData, MetricNames, MicLipSync, type MicLipSyncConfig, type MicLipSyncEvents, type MicLipSyncFrame, type MicLipSyncState, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, type ModelUrlKey, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, PlaybackPipeline, type PlaybackPipelineConfig, type PlaybackPipelineEvents, type PlaybackState, ProceduralLifeLayer, type QuotaInfo, type ResolvedEmotion, type ResponseHandler, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TranscriptResult, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, VoicePipeline, type VoicePipelineConfig, type VoicePipelineEvents, type VoicePipelineState, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2ArkitCpuUnifiedAdapter, Wav2ArkitCpuWorker, type Wav2ArkitCpuWorkerConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, applyProfile, blendEmotions, calculatePeak, calculateRMS, configureCacheLimit, configureModelUrls, configureTelemetry, createA2E, createEmotionVector, createSenseVoice, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getRecommendedBackend, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, preloadModels, resetModelUrls, resolveBackend, shouldEnableWasmProxy, shouldUseCpuA2E, shouldUseNativeASR, shouldUseServerA2E, supportsVADWorker };
package/dist/index.d.ts CHANGED
@@ -2350,20 +2350,15 @@ declare class Wav2ArkitCpuInference implements A2EBackend {
2350
2350
  /**
2351
2351
  * Factory function for A2E with automatic GPU/CPU model selection
2352
2352
  *
2353
- * Provides a unified API that automatically selects the optimal model:
2354
- * - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (404MB, WASM)
2355
- * - Chrome/Firefox/Edge: Uses Wav2Vec2Inference (192MB fp16, WebGPU)
2356
- * - Fallback: Gracefully falls back to CPU model if GPU model fails to load
2357
- *
2358
- * Why two separate models?
2359
- * Wav2Vec2 (LAM) cannot run on Safari/iOS for two reasons:
2360
- * 1. Its dual-head transformer graph needs ~750-950MB peak during ORT session
2361
- * creation (graph optimization), exceeding iOS WebKit's ~1-1.5GB tab limit.
2362
- * 2. It ships as a single 192MB .onnx file (fp16) that must load into JS heap
2363
- * before ORT can consume it. iOS WebKit OOMs on this allocation.
2364
- * wav2arkit_cpu solves both: external data format (1.86MB graph + 402MB weights)
2365
- * lets ORT load only the tiny graph, then stream weights via URL pass-through
2366
- * directly into WASM memory. JS heap stays at ~2MB.
2353
+ * Provides a unified API that always tries Wav2Vec2 (LAM fp16) first:
2354
+ * - All platforms: Tries Wav2Vec2Inference (192MB fp16, external data format)
2355
+ * - Fallback: Gracefully falls back to wav2arkit_cpu if GPU model fails to load
2356
+ *
2357
+ * The fp16 external data format (385KB graph + 192MB weights) enables iOS support:
2358
+ * - URL pass-through: ORT streams weights directly into WASM memory (~2MB JS heap)
2359
+ * - Basic graph optimization: avoids ~750-950MB peak from 'all' optimization
2360
+ * - If iOS OOMs during session creation, A2EWithFallback catches it and loads
2361
+ * wav2arkit_cpu (1.86MB graph + 402MB weights) as a safe fallback.
2367
2362
  *
2368
2363
  * @category Inference
2369
2364
  *
@@ -4404,12 +4399,15 @@ declare const ALL_AUS: string[];
4404
4399
  */
4405
4400
 
4406
4401
  /**
4407
- * Resolved emotion split into upper and lower face contributions
4402
+ * Resolved emotion split into upper and lower face contributions.
4403
+ *
4404
+ * WARNING: Buffers are owned by EmotionResolver and are overwritten
4405
+ * on the next resolve() call. Copy if you need to retain values.
4408
4406
  */
4409
4407
  interface ResolvedEmotion {
4410
- /** 52 channels — only upper face (brows, eyes, cheeks, nose) non-zero */
4408
+ /** 52 channels — only upper face non-zero. Valid until next resolve() call. */
4411
4409
  upper: Float32Array;
4412
- /** 52 channels — only lower face (mouth, jaw) non-zero */
4410
+ /** 52 channels — only lower face non-zero. Valid until next resolve() call. */
4413
4411
  lower: Float32Array;
4414
4412
  }
4415
4413
  /**
@@ -4444,6 +4442,28 @@ declare class EmotionResolver {
4444
4442
  * @category Face
4445
4443
  */
4446
4444
 
4445
+ /**
4446
+ * Output of FaceCompositor.compose()
4447
+ *
4448
+ * WARNING: When using the internal output buffer (no `target` param),
4449
+ * `blendshapes` is a shared reference that is overwritten on the next
4450
+ * compose() call. Copy with `new Float32Array(output.blendshapes)` if
4451
+ * you need to retain values across frames.
4452
+ */
4453
+ interface FaceCompositorOutput {
4454
+ /**
4455
+ * 52 ARKit blendshape values, clamped [0,1].
4456
+ *
4457
+ * This buffer is reused across calls when no `target` parameter is
4458
+ * provided to compose(). Valid until the next compose() call.
4459
+ */
4460
+ blendshapes: Float32Array;
4461
+ /** Head rotation deltas in radians (from ProceduralLifeLayer) */
4462
+ headDelta: {
4463
+ yaw: number;
4464
+ pitch: number;
4465
+ };
4466
+ }
4447
4467
  /**
4448
4468
  * Per-blendshape character profile (multiplier + offset)
4449
4469
  *
@@ -4494,13 +4514,14 @@ interface FaceCompositorInput extends LifeLayerInput {
4494
4514
  * audioEnergy: 0.5,
4495
4515
  * });
4496
4516
  *
4497
- * // Apply output[0..51] to avatar morphTargetInfluences
4517
+ * // Apply output.blendshapes[0..51] to avatar morphTargetInfluences
4498
4518
  * ```
4499
4519
  */
4500
4520
  declare class FaceCompositor {
4501
4521
  private readonly emotionResolver;
4502
4522
  private readonly lifeLayer;
4503
4523
  private readonly emotionSmoothing;
4524
+ private readonly outputBuffer;
4504
4525
  private readonly smoothedUpper;
4505
4526
  private readonly smoothedLower;
4506
4527
  private readonly lifeBuffer;
@@ -4513,9 +4534,11 @@ declare class FaceCompositor {
4513
4534
  *
4514
4535
  * @param base - A2E raw output (Float32Array[52], LAM_BLENDSHAPES order)
4515
4536
  * @param input - Per-frame input (deltaTime, emotion, life layer params)
4516
- * @returns Float32Array[52] with all values clamped to [0, 1]
4537
+ * @param target - Optional pre-allocated output buffer (avoids per-frame allocation).
4538
+ * When omitted, an internal buffer is used (valid until next compose() call).
4539
+ * @returns Blendshapes (Float32Array[52] clamped [0,1]) and head rotation deltas
4517
4540
  */
4518
- compose(base: Float32Array, input: FaceCompositorInput): Float32Array;
4541
+ compose(base: Float32Array, input: FaceCompositorInput, target?: Float32Array): FaceCompositorOutput;
4519
4542
  /**
4520
4543
  * Set sticky emotion (used when input.emotion is not provided).
4521
4544
  */
@@ -4817,4 +4840,4 @@ declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
4817
4840
  private clearSilenceTimer;
4818
4841
  }
4819
4842
 
4820
- export { type A2EBackend, type A2EModelInfo, A2EOrchestrator, type A2EOrchestratorConfig, A2EProcessor, type A2EProcessorConfig, type A2EProgressEvent, type A2EResult, ALL_AUS, ARKIT_BLENDSHAPES, type AUActivation, AU_TO_ARKIT, type ActiveSpan, type AnimationClip, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, type CharacterProfile, ConsoleExporter, type ConversationalState, type CreateA2EConfig, type CreateSenseVoiceConfig, DEFAULT_ANIMATION_CONFIG, DEFAULT_MODEL_URLS, EMOTION_NAMES, EMOTION_TO_AU, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionResolver, type EmotionWeights, EmphasisDetector, EventEmitter, type ExpressionProfile, FaceCompositor, type FaceCompositorConfig, type FaceCompositorInput, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, HF_CDN_URLS, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LoadingProgress, MODEL_LOAD_TIME_BUCKETS, type MetricData, MetricNames, MicLipSync, type MicLipSyncConfig, type MicLipSyncEvents, type MicLipSyncFrame, type MicLipSyncState, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, type ModelUrlKey, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, PlaybackPipeline, type PlaybackPipelineConfig, type PlaybackPipelineEvents, type PlaybackState, ProceduralLifeLayer, type QuotaInfo, type ResolvedEmotion, type ResponseHandler, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TranscriptResult, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, VoicePipeline, type VoicePipelineConfig, type VoicePipelineEvents, type VoicePipelineState, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2ArkitCpuUnifiedAdapter, Wav2ArkitCpuWorker, type Wav2ArkitCpuWorkerConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, applyProfile, blendEmotions, calculatePeak, calculateRMS, configureCacheLimit, configureModelUrls, configureTelemetry, createA2E, createEmotionVector, createSenseVoice, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getRecommendedBackend, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, preloadModels, resetModelUrls, resolveBackend, shouldEnableWasmProxy, shouldUseCpuA2E, shouldUseNativeASR, shouldUseServerA2E, supportsVADWorker };
4843
+ export { type A2EBackend, type A2EModelInfo, A2EOrchestrator, type A2EOrchestratorConfig, A2EProcessor, type A2EProcessorConfig, type A2EProgressEvent, type A2EResult, ALL_AUS, ARKIT_BLENDSHAPES, type AUActivation, AU_TO_ARKIT, type ActiveSpan, type AnimationClip, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, type CharacterProfile, ConsoleExporter, type ConversationalState, type CreateA2EConfig, type CreateSenseVoiceConfig, DEFAULT_ANIMATION_CONFIG, DEFAULT_MODEL_URLS, EMOTION_NAMES, EMOTION_TO_AU, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionResolver, type EmotionWeights, EmphasisDetector, EventEmitter, type ExpressionProfile, FaceCompositor, type FaceCompositorConfig, type FaceCompositorInput, type FaceCompositorOutput, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, HF_CDN_URLS, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LoadingProgress, MODEL_LOAD_TIME_BUCKETS, type MetricData, MetricNames, MicLipSync, type MicLipSyncConfig, type MicLipSyncEvents, type MicLipSyncFrame, type MicLipSyncState, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, type ModelUrlKey, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, PlaybackPipeline, type PlaybackPipelineConfig, type PlaybackPipelineEvents, type PlaybackState, ProceduralLifeLayer, type QuotaInfo, type ResolvedEmotion, type ResponseHandler, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TranscriptResult, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, VoicePipeline, type VoicePipelineConfig, type VoicePipelineEvents, type VoicePipelineState, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2ArkitCpuUnifiedAdapter, Wav2ArkitCpuWorker, type Wav2ArkitCpuWorkerConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, applyProfile, blendEmotions, calculatePeak, calculateRMS, configureCacheLimit, configureModelUrls, configureTelemetry, createA2E, createEmotionVector, createSenseVoice, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getRecommendedBackend, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, preloadModels, resetModelUrls, resolveBackend, shouldEnableWasmProxy, shouldUseCpuA2E, shouldUseNativeASR, shouldUseServerA2E, supportsVADWorker };
package/dist/index.js CHANGED
@@ -2613,7 +2613,7 @@ async function getOnnxRuntimeForPreference(preference = "auto") {
2613
2613
  const ort = await getOnnxRuntime(backend);
2614
2614
  return { ort, backend };
2615
2615
  }
2616
- function getSessionOptions(backend) {
2616
+ function getSessionOptions(backend, config) {
2617
2617
  if (backend === "webgpu") {
2618
2618
  return {
2619
2619
  executionProviders: [
@@ -2629,7 +2629,7 @@ function getSessionOptions(backend) {
2629
2629
  if (isIOS()) {
2630
2630
  return {
2631
2631
  executionProviders: ["wasm"],
2632
- graphOptimizationLevel: "basic",
2632
+ graphOptimizationLevel: config?.iosDisableOptimization ? "disabled" : "basic",
2633
2633
  enableCpuMemArena: false,
2634
2634
  enableMemPattern: false
2635
2635
  };
@@ -2896,7 +2896,7 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
2896
2896
  logger3.info("ONNX Runtime loaded", { backend: this._backend });
2897
2897
  const modelUrl = this.config.modelUrl;
2898
2898
  const dataUrl = this.config.externalDataUrl !== false ? typeof this.config.externalDataUrl === "string" ? this.config.externalDataUrl : `${modelUrl}.data` : null;
2899
- const sessionOptions = getSessionOptions(this._backend);
2899
+ const sessionOptions = getSessionOptions(this._backend, { iosDisableOptimization: true });
2900
2900
  let isCached = false;
2901
2901
  if (isIOS()) {
2902
2902
  logger3.info("iOS: passing model URLs directly to ORT (low-memory path)", {
@@ -7739,10 +7739,11 @@ function createA2E(config = {}) {
7739
7739
  useCpu = false;
7740
7740
  logger12.info("Forcing GPU A2E model (Wav2Vec2)");
7741
7741
  } else {
7742
- useCpu = shouldUseCpuA2E();
7743
- logger12.info("Auto-detected A2E model", {
7744
- useCpu,
7745
- isSafari: isSafari()
7742
+ useCpu = false;
7743
+ logger12.info("Auto-detected A2E model: trying GPU first (fp16 external data)", {
7744
+ isSafari: isSafari(),
7745
+ isIOS: isIOS(),
7746
+ fallbackOnError
7746
7747
  });
7747
7748
  }
7748
7749
  if (useCpu) {
@@ -7804,6 +7805,7 @@ var A2EWithFallback = class {
7804
7805
  }
7805
7806
  }
7806
7807
  async fallbackToCpu(reason) {
7808
+ console.error("[A2EWithFallback] GPU\u2192CPU FALLBACK TRIGGERED. Reason:", reason);
7807
7809
  logger12.warn("GPU model load failed, falling back to CPU model", { reason });
7808
7810
  try {
7809
7811
  await this.implementation.dispose();
@@ -10672,10 +10674,7 @@ var EmotionResolver = class {
10672
10674
  if (upper[i] > 1) upper[i] = 1;
10673
10675
  if (lower[i] > 1) lower[i] = 1;
10674
10676
  }
10675
- return {
10676
- upper: new Float32Array(upper),
10677
- lower: new Float32Array(lower)
10678
- };
10677
+ return { upper, lower };
10679
10678
  }
10680
10679
  };
10681
10680
 
@@ -10698,6 +10697,7 @@ var FaceCompositor = class {
10698
10697
  constructor(config) {
10699
10698
  this.emotionResolver = new EmotionResolver();
10700
10699
  // Pre-allocated buffers
10700
+ this.outputBuffer = new Float32Array(52);
10701
10701
  this.smoothedUpper = new Float32Array(52);
10702
10702
  this.smoothedLower = new Float32Array(52);
10703
10703
  this.lifeBuffer = new Float32Array(52);
@@ -10715,10 +10715,12 @@ var FaceCompositor = class {
10715
10715
  *
10716
10716
  * @param base - A2E raw output (Float32Array[52], LAM_BLENDSHAPES order)
10717
10717
  * @param input - Per-frame input (deltaTime, emotion, life layer params)
10718
- * @returns Float32Array[52] with all values clamped to [0, 1]
10718
+ * @param target - Optional pre-allocated output buffer (avoids per-frame allocation).
10719
+ * When omitted, an internal buffer is used (valid until next compose() call).
10720
+ * @returns Blendshapes (Float32Array[52] clamped [0,1]) and head rotation deltas
10719
10721
  */
10720
- compose(base, input) {
10721
- const out = new Float32Array(52);
10722
+ compose(base, input, target) {
10723
+ const out = target ?? this.outputBuffer;
10722
10724
  out.set(base);
10723
10725
  const emotion = input.emotion ?? this.stickyEmotion;
10724
10726
  if (emotion) {
@@ -10740,7 +10742,14 @@ var FaceCompositor = class {
10740
10742
  out[i] *= 1 + this.smoothedLower[i] * bilabialSuppress;
10741
10743
  }
10742
10744
  }
10743
- this.lifeLayer.updateToArray(input.deltaTime, input, this.lifeBuffer);
10745
+ const lifeResult = this.lifeLayer.update(input.deltaTime, input);
10746
+ this.lifeBuffer.fill(0);
10747
+ for (const [name, value] of Object.entries(lifeResult.blendshapes)) {
10748
+ const idx = BS_INDEX2.get(name);
10749
+ if (idx !== void 0) {
10750
+ this.lifeBuffer[idx] = value;
10751
+ }
10752
+ }
10744
10753
  for (let i = 0; i < 52; i++) {
10745
10754
  if (IS_EYE_CHANNEL[i]) {
10746
10755
  out[i] = this.lifeBuffer[i];
@@ -10755,7 +10764,7 @@ var FaceCompositor = class {
10755
10764
  if (out[i] < 0) out[i] = 0;
10756
10765
  else if (out[i] > 1) out[i] = 1;
10757
10766
  }
10758
- return out;
10767
+ return { blendshapes: out, headDelta: lifeResult.headDelta };
10759
10768
  }
10760
10769
  /**
10761
10770
  * Set sticky emotion (used when input.emotion is not provided).
@@ -11042,6 +11051,7 @@ var VoicePipeline = class extends EventEmitter {
11042
11051
  new Promise((r) => setTimeout(() => r("timeout"), timeoutMs))
11043
11052
  ]);
11044
11053
  if (lamLoadResult === "timeout") {
11054
+ console.error(`[VoicePipeline] LAM TIMEOUT after ${timeoutMs}ms \u2014 forcing CPU fallback`);
11045
11055
  logger19.warn(`LAM GPU load timed out after ${timeoutMs}ms, falling back to CPU`);
11046
11056
  await lam.dispose();
11047
11057
  lam = createA2E({