@omote/core 0.6.4 → 0.6.6
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +44 -21
- package/dist/index.d.ts +44 -21
- package/dist/index.js +26 -16
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +26 -16
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.d.mts
CHANGED
|
@@ -2350,20 +2350,15 @@ declare class Wav2ArkitCpuInference implements A2EBackend {
|
|
|
2350
2350
|
/**
|
|
2351
2351
|
* Factory function for A2E with automatic GPU/CPU model selection
|
|
2352
2352
|
*
|
|
2353
|
-
* Provides a unified API that
|
|
2354
|
-
* -
|
|
2355
|
-
* -
|
|
2356
|
-
*
|
|
2357
|
-
*
|
|
2358
|
-
*
|
|
2359
|
-
*
|
|
2360
|
-
*
|
|
2361
|
-
*
|
|
2362
|
-
* 2. It ships as a single 192MB .onnx file (fp16) that must load into JS heap
|
|
2363
|
-
* before ORT can consume it. iOS WebKit OOMs on this allocation.
|
|
2364
|
-
* wav2arkit_cpu solves both: external data format (1.86MB graph + 402MB weights)
|
|
2365
|
-
* lets ORT load only the tiny graph, then stream weights via URL pass-through
|
|
2366
|
-
* directly into WASM memory. JS heap stays at ~2MB.
|
|
2353
|
+
* Provides a unified API that always tries Wav2Vec2 (LAM fp16) first:
|
|
2354
|
+
* - All platforms: Tries Wav2Vec2Inference (192MB fp16, external data format)
|
|
2355
|
+
* - Fallback: Gracefully falls back to wav2arkit_cpu if GPU model fails to load
|
|
2356
|
+
*
|
|
2357
|
+
* The fp16 external data format (385KB graph + 192MB weights) enables iOS support:
|
|
2358
|
+
* - URL pass-through: ORT streams weights directly into WASM memory (~2MB JS heap)
|
|
2359
|
+
* - Basic graph optimization: avoids ~750-950MB peak from 'all' optimization
|
|
2360
|
+
* - If iOS OOMs during session creation, A2EWithFallback catches it and loads
|
|
2361
|
+
* wav2arkit_cpu (1.86MB graph + 402MB weights) as a safe fallback.
|
|
2367
2362
|
*
|
|
2368
2363
|
* @category Inference
|
|
2369
2364
|
*
|
|
@@ -4404,12 +4399,15 @@ declare const ALL_AUS: string[];
|
|
|
4404
4399
|
*/
|
|
4405
4400
|
|
|
4406
4401
|
/**
|
|
4407
|
-
* Resolved emotion split into upper and lower face contributions
|
|
4402
|
+
* Resolved emotion split into upper and lower face contributions.
|
|
4403
|
+
*
|
|
4404
|
+
* WARNING: Buffers are owned by EmotionResolver and are overwritten
|
|
4405
|
+
* on the next resolve() call. Copy if you need to retain values.
|
|
4408
4406
|
*/
|
|
4409
4407
|
interface ResolvedEmotion {
|
|
4410
|
-
/** 52 channels — only upper face
|
|
4408
|
+
/** 52 channels — only upper face non-zero. Valid until next resolve() call. */
|
|
4411
4409
|
upper: Float32Array;
|
|
4412
|
-
/** 52 channels — only lower face
|
|
4410
|
+
/** 52 channels — only lower face non-zero. Valid until next resolve() call. */
|
|
4413
4411
|
lower: Float32Array;
|
|
4414
4412
|
}
|
|
4415
4413
|
/**
|
|
@@ -4444,6 +4442,28 @@ declare class EmotionResolver {
|
|
|
4444
4442
|
* @category Face
|
|
4445
4443
|
*/
|
|
4446
4444
|
|
|
4445
|
+
/**
|
|
4446
|
+
* Output of FaceCompositor.compose()
|
|
4447
|
+
*
|
|
4448
|
+
* WARNING: When using the internal output buffer (no `target` param),
|
|
4449
|
+
* `blendshapes` is a shared reference that is overwritten on the next
|
|
4450
|
+
* compose() call. Copy with `new Float32Array(output.blendshapes)` if
|
|
4451
|
+
* you need to retain values across frames.
|
|
4452
|
+
*/
|
|
4453
|
+
interface FaceCompositorOutput {
|
|
4454
|
+
/**
|
|
4455
|
+
* 52 ARKit blendshape values, clamped [0,1].
|
|
4456
|
+
*
|
|
4457
|
+
* This buffer is reused across calls when no `target` parameter is
|
|
4458
|
+
* provided to compose(). Valid until the next compose() call.
|
|
4459
|
+
*/
|
|
4460
|
+
blendshapes: Float32Array;
|
|
4461
|
+
/** Head rotation deltas in radians (from ProceduralLifeLayer) */
|
|
4462
|
+
headDelta: {
|
|
4463
|
+
yaw: number;
|
|
4464
|
+
pitch: number;
|
|
4465
|
+
};
|
|
4466
|
+
}
|
|
4447
4467
|
/**
|
|
4448
4468
|
* Per-blendshape character profile (multiplier + offset)
|
|
4449
4469
|
*
|
|
@@ -4494,13 +4514,14 @@ interface FaceCompositorInput extends LifeLayerInput {
|
|
|
4494
4514
|
* audioEnergy: 0.5,
|
|
4495
4515
|
* });
|
|
4496
4516
|
*
|
|
4497
|
-
* // Apply output[0..51] to avatar morphTargetInfluences
|
|
4517
|
+
* // Apply output.blendshapes[0..51] to avatar morphTargetInfluences
|
|
4498
4518
|
* ```
|
|
4499
4519
|
*/
|
|
4500
4520
|
declare class FaceCompositor {
|
|
4501
4521
|
private readonly emotionResolver;
|
|
4502
4522
|
private readonly lifeLayer;
|
|
4503
4523
|
private readonly emotionSmoothing;
|
|
4524
|
+
private readonly outputBuffer;
|
|
4504
4525
|
private readonly smoothedUpper;
|
|
4505
4526
|
private readonly smoothedLower;
|
|
4506
4527
|
private readonly lifeBuffer;
|
|
@@ -4513,9 +4534,11 @@ declare class FaceCompositor {
|
|
|
4513
4534
|
*
|
|
4514
4535
|
* @param base - A2E raw output (Float32Array[52], LAM_BLENDSHAPES order)
|
|
4515
4536
|
* @param input - Per-frame input (deltaTime, emotion, life layer params)
|
|
4516
|
-
* @
|
|
4537
|
+
* @param target - Optional pre-allocated output buffer (avoids per-frame allocation).
|
|
4538
|
+
* When omitted, an internal buffer is used (valid until next compose() call).
|
|
4539
|
+
* @returns Blendshapes (Float32Array[52] clamped [0,1]) and head rotation deltas
|
|
4517
4540
|
*/
|
|
4518
|
-
compose(base: Float32Array, input: FaceCompositorInput):
|
|
4541
|
+
compose(base: Float32Array, input: FaceCompositorInput, target?: Float32Array): FaceCompositorOutput;
|
|
4519
4542
|
/**
|
|
4520
4543
|
* Set sticky emotion (used when input.emotion is not provided).
|
|
4521
4544
|
*/
|
|
@@ -4817,4 +4840,4 @@ declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
|
|
|
4817
4840
|
private clearSilenceTimer;
|
|
4818
4841
|
}
|
|
4819
4842
|
|
|
4820
|
-
export { type A2EBackend, type A2EModelInfo, A2EOrchestrator, type A2EOrchestratorConfig, A2EProcessor, type A2EProcessorConfig, type A2EProgressEvent, type A2EResult, ALL_AUS, ARKIT_BLENDSHAPES, type AUActivation, AU_TO_ARKIT, type ActiveSpan, type AnimationClip, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, type CharacterProfile, ConsoleExporter, type ConversationalState, type CreateA2EConfig, type CreateSenseVoiceConfig, DEFAULT_ANIMATION_CONFIG, DEFAULT_MODEL_URLS, EMOTION_NAMES, EMOTION_TO_AU, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionResolver, type EmotionWeights, EmphasisDetector, EventEmitter, type ExpressionProfile, FaceCompositor, type FaceCompositorConfig, type FaceCompositorInput, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, HF_CDN_URLS, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LoadingProgress, MODEL_LOAD_TIME_BUCKETS, type MetricData, MetricNames, MicLipSync, type MicLipSyncConfig, type MicLipSyncEvents, type MicLipSyncFrame, type MicLipSyncState, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, type ModelUrlKey, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, PlaybackPipeline, type 
PlaybackPipelineConfig, type PlaybackPipelineEvents, type PlaybackState, ProceduralLifeLayer, type QuotaInfo, type ResolvedEmotion, type ResponseHandler, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TranscriptResult, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, VoicePipeline, type VoicePipelineConfig, type VoicePipelineEvents, type VoicePipelineState, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2ArkitCpuUnifiedAdapter, Wav2ArkitCpuWorker, type Wav2ArkitCpuWorkerConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, applyProfile, blendEmotions, calculatePeak, calculateRMS, configureCacheLimit, configureModelUrls, configureTelemetry, createA2E, createEmotionVector, createSenseVoice, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getRecommendedBackend, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, preloadModels, resetModelUrls, resolveBackend, shouldEnableWasmProxy, shouldUseCpuA2E, shouldUseNativeASR, shouldUseServerA2E, supportsVADWorker };
|
|
4843
|
+
export { type A2EBackend, type A2EModelInfo, A2EOrchestrator, type A2EOrchestratorConfig, A2EProcessor, type A2EProcessorConfig, type A2EProgressEvent, type A2EResult, ALL_AUS, ARKIT_BLENDSHAPES, type AUActivation, AU_TO_ARKIT, type ActiveSpan, type AnimationClip, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, type CharacterProfile, ConsoleExporter, type ConversationalState, type CreateA2EConfig, type CreateSenseVoiceConfig, DEFAULT_ANIMATION_CONFIG, DEFAULT_MODEL_URLS, EMOTION_NAMES, EMOTION_TO_AU, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionResolver, type EmotionWeights, EmphasisDetector, EventEmitter, type ExpressionProfile, FaceCompositor, type FaceCompositorConfig, type FaceCompositorInput, type FaceCompositorOutput, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, HF_CDN_URLS, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LoadingProgress, MODEL_LOAD_TIME_BUCKETS, type MetricData, MetricNames, MicLipSync, type MicLipSyncConfig, type MicLipSyncEvents, type MicLipSyncFrame, type MicLipSyncState, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, type ModelUrlKey, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, 
PlaybackPipeline, type PlaybackPipelineConfig, type PlaybackPipelineEvents, type PlaybackState, ProceduralLifeLayer, type QuotaInfo, type ResolvedEmotion, type ResponseHandler, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TranscriptResult, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, VoicePipeline, type VoicePipelineConfig, type VoicePipelineEvents, type VoicePipelineState, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2ArkitCpuUnifiedAdapter, Wav2ArkitCpuWorker, type Wav2ArkitCpuWorkerConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, applyProfile, blendEmotions, calculatePeak, calculateRMS, configureCacheLimit, configureModelUrls, configureTelemetry, createA2E, createEmotionVector, createSenseVoice, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getRecommendedBackend, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, preloadModels, resetModelUrls, resolveBackend, shouldEnableWasmProxy, shouldUseCpuA2E, shouldUseNativeASR, shouldUseServerA2E, supportsVADWorker };
|
package/dist/index.d.ts
CHANGED
|
@@ -2350,20 +2350,15 @@ declare class Wav2ArkitCpuInference implements A2EBackend {
|
|
|
2350
2350
|
/**
|
|
2351
2351
|
* Factory function for A2E with automatic GPU/CPU model selection
|
|
2352
2352
|
*
|
|
2353
|
-
* Provides a unified API that
|
|
2354
|
-
* -
|
|
2355
|
-
* -
|
|
2356
|
-
*
|
|
2357
|
-
*
|
|
2358
|
-
*
|
|
2359
|
-
*
|
|
2360
|
-
*
|
|
2361
|
-
*
|
|
2362
|
-
* 2. It ships as a single 192MB .onnx file (fp16) that must load into JS heap
|
|
2363
|
-
* before ORT can consume it. iOS WebKit OOMs on this allocation.
|
|
2364
|
-
* wav2arkit_cpu solves both: external data format (1.86MB graph + 402MB weights)
|
|
2365
|
-
* lets ORT load only the tiny graph, then stream weights via URL pass-through
|
|
2366
|
-
* directly into WASM memory. JS heap stays at ~2MB.
|
|
2353
|
+
* Provides a unified API that always tries Wav2Vec2 (LAM fp16) first:
|
|
2354
|
+
* - All platforms: Tries Wav2Vec2Inference (192MB fp16, external data format)
|
|
2355
|
+
* - Fallback: Gracefully falls back to wav2arkit_cpu if GPU model fails to load
|
|
2356
|
+
*
|
|
2357
|
+
* The fp16 external data format (385KB graph + 192MB weights) enables iOS support:
|
|
2358
|
+
* - URL pass-through: ORT streams weights directly into WASM memory (~2MB JS heap)
|
|
2359
|
+
* - Basic graph optimization: avoids ~750-950MB peak from 'all' optimization
|
|
2360
|
+
* - If iOS OOMs during session creation, A2EWithFallback catches it and loads
|
|
2361
|
+
* wav2arkit_cpu (1.86MB graph + 402MB weights) as a safe fallback.
|
|
2367
2362
|
*
|
|
2368
2363
|
* @category Inference
|
|
2369
2364
|
*
|
|
@@ -4404,12 +4399,15 @@ declare const ALL_AUS: string[];
|
|
|
4404
4399
|
*/
|
|
4405
4400
|
|
|
4406
4401
|
/**
|
|
4407
|
-
* Resolved emotion split into upper and lower face contributions
|
|
4402
|
+
* Resolved emotion split into upper and lower face contributions.
|
|
4403
|
+
*
|
|
4404
|
+
* WARNING: Buffers are owned by EmotionResolver and are overwritten
|
|
4405
|
+
* on the next resolve() call. Copy if you need to retain values.
|
|
4408
4406
|
*/
|
|
4409
4407
|
interface ResolvedEmotion {
|
|
4410
|
-
/** 52 channels — only upper face
|
|
4408
|
+
/** 52 channels — only upper face non-zero. Valid until next resolve() call. */
|
|
4411
4409
|
upper: Float32Array;
|
|
4412
|
-
/** 52 channels — only lower face
|
|
4410
|
+
/** 52 channels — only lower face non-zero. Valid until next resolve() call. */
|
|
4413
4411
|
lower: Float32Array;
|
|
4414
4412
|
}
|
|
4415
4413
|
/**
|
|
@@ -4444,6 +4442,28 @@ declare class EmotionResolver {
|
|
|
4444
4442
|
* @category Face
|
|
4445
4443
|
*/
|
|
4446
4444
|
|
|
4445
|
+
/**
|
|
4446
|
+
* Output of FaceCompositor.compose()
|
|
4447
|
+
*
|
|
4448
|
+
* WARNING: When using the internal output buffer (no `target` param),
|
|
4449
|
+
* `blendshapes` is a shared reference that is overwritten on the next
|
|
4450
|
+
* compose() call. Copy with `new Float32Array(output.blendshapes)` if
|
|
4451
|
+
* you need to retain values across frames.
|
|
4452
|
+
*/
|
|
4453
|
+
interface FaceCompositorOutput {
|
|
4454
|
+
/**
|
|
4455
|
+
* 52 ARKit blendshape values, clamped [0,1].
|
|
4456
|
+
*
|
|
4457
|
+
* This buffer is reused across calls when no `target` parameter is
|
|
4458
|
+
* provided to compose(). Valid until the next compose() call.
|
|
4459
|
+
*/
|
|
4460
|
+
blendshapes: Float32Array;
|
|
4461
|
+
/** Head rotation deltas in radians (from ProceduralLifeLayer) */
|
|
4462
|
+
headDelta: {
|
|
4463
|
+
yaw: number;
|
|
4464
|
+
pitch: number;
|
|
4465
|
+
};
|
|
4466
|
+
}
|
|
4447
4467
|
/**
|
|
4448
4468
|
* Per-blendshape character profile (multiplier + offset)
|
|
4449
4469
|
*
|
|
@@ -4494,13 +4514,14 @@ interface FaceCompositorInput extends LifeLayerInput {
|
|
|
4494
4514
|
* audioEnergy: 0.5,
|
|
4495
4515
|
* });
|
|
4496
4516
|
*
|
|
4497
|
-
* // Apply output[0..51] to avatar morphTargetInfluences
|
|
4517
|
+
* // Apply output.blendshapes[0..51] to avatar morphTargetInfluences
|
|
4498
4518
|
* ```
|
|
4499
4519
|
*/
|
|
4500
4520
|
declare class FaceCompositor {
|
|
4501
4521
|
private readonly emotionResolver;
|
|
4502
4522
|
private readonly lifeLayer;
|
|
4503
4523
|
private readonly emotionSmoothing;
|
|
4524
|
+
private readonly outputBuffer;
|
|
4504
4525
|
private readonly smoothedUpper;
|
|
4505
4526
|
private readonly smoothedLower;
|
|
4506
4527
|
private readonly lifeBuffer;
|
|
@@ -4513,9 +4534,11 @@ declare class FaceCompositor {
|
|
|
4513
4534
|
*
|
|
4514
4535
|
* @param base - A2E raw output (Float32Array[52], LAM_BLENDSHAPES order)
|
|
4515
4536
|
* @param input - Per-frame input (deltaTime, emotion, life layer params)
|
|
4516
|
-
* @
|
|
4537
|
+
* @param target - Optional pre-allocated output buffer (avoids per-frame allocation).
|
|
4538
|
+
* When omitted, an internal buffer is used (valid until next compose() call).
|
|
4539
|
+
* @returns Blendshapes (Float32Array[52] clamped [0,1]) and head rotation deltas
|
|
4517
4540
|
*/
|
|
4518
|
-
compose(base: Float32Array, input: FaceCompositorInput):
|
|
4541
|
+
compose(base: Float32Array, input: FaceCompositorInput, target?: Float32Array): FaceCompositorOutput;
|
|
4519
4542
|
/**
|
|
4520
4543
|
* Set sticky emotion (used when input.emotion is not provided).
|
|
4521
4544
|
*/
|
|
@@ -4817,4 +4840,4 @@ declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
|
|
|
4817
4840
|
private clearSilenceTimer;
|
|
4818
4841
|
}
|
|
4819
4842
|
|
|
4820
|
-
export { type A2EBackend, type A2EModelInfo, A2EOrchestrator, type A2EOrchestratorConfig, A2EProcessor, type A2EProcessorConfig, type A2EProgressEvent, type A2EResult, ALL_AUS, ARKIT_BLENDSHAPES, type AUActivation, AU_TO_ARKIT, type ActiveSpan, type AnimationClip, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, type CharacterProfile, ConsoleExporter, type ConversationalState, type CreateA2EConfig, type CreateSenseVoiceConfig, DEFAULT_ANIMATION_CONFIG, DEFAULT_MODEL_URLS, EMOTION_NAMES, EMOTION_TO_AU, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionResolver, type EmotionWeights, EmphasisDetector, EventEmitter, type ExpressionProfile, FaceCompositor, type FaceCompositorConfig, type FaceCompositorInput, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, HF_CDN_URLS, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LoadingProgress, MODEL_LOAD_TIME_BUCKETS, type MetricData, MetricNames, MicLipSync, type MicLipSyncConfig, type MicLipSyncEvents, type MicLipSyncFrame, type MicLipSyncState, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, type ModelUrlKey, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, PlaybackPipeline, type 
PlaybackPipelineConfig, type PlaybackPipelineEvents, type PlaybackState, ProceduralLifeLayer, type QuotaInfo, type ResolvedEmotion, type ResponseHandler, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TranscriptResult, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, VoicePipeline, type VoicePipelineConfig, type VoicePipelineEvents, type VoicePipelineState, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2ArkitCpuUnifiedAdapter, Wav2ArkitCpuWorker, type Wav2ArkitCpuWorkerConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, applyProfile, blendEmotions, calculatePeak, calculateRMS, configureCacheLimit, configureModelUrls, configureTelemetry, createA2E, createEmotionVector, createSenseVoice, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getRecommendedBackend, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, preloadModels, resetModelUrls, resolveBackend, shouldEnableWasmProxy, shouldUseCpuA2E, shouldUseNativeASR, shouldUseServerA2E, supportsVADWorker };
|
|
4843
|
+
export { type A2EBackend, type A2EModelInfo, A2EOrchestrator, type A2EOrchestratorConfig, A2EProcessor, type A2EProcessorConfig, type A2EProgressEvent, type A2EResult, ALL_AUS, ARKIT_BLENDSHAPES, type AUActivation, AU_TO_ARKIT, type ActiveSpan, type AnimationClip, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, type CharacterProfile, ConsoleExporter, type ConversationalState, type CreateA2EConfig, type CreateSenseVoiceConfig, DEFAULT_ANIMATION_CONFIG, DEFAULT_MODEL_URLS, EMOTION_NAMES, EMOTION_TO_AU, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionResolver, type EmotionWeights, EmphasisDetector, EventEmitter, type ExpressionProfile, FaceCompositor, type FaceCompositorConfig, type FaceCompositorInput, type FaceCompositorOutput, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, HF_CDN_URLS, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LoadingProgress, MODEL_LOAD_TIME_BUCKETS, type MetricData, MetricNames, MicLipSync, type MicLipSyncConfig, type MicLipSyncEvents, type MicLipSyncFrame, type MicLipSyncState, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, type ModelUrlKey, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, 
PlaybackPipeline, type PlaybackPipelineConfig, type PlaybackPipelineEvents, type PlaybackState, ProceduralLifeLayer, type QuotaInfo, type ResolvedEmotion, type ResponseHandler, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TranscriptResult, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, VoicePipeline, type VoicePipelineConfig, type VoicePipelineEvents, type VoicePipelineState, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2ArkitCpuUnifiedAdapter, Wav2ArkitCpuWorker, type Wav2ArkitCpuWorkerConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, applyProfile, blendEmotions, calculatePeak, calculateRMS, configureCacheLimit, configureModelUrls, configureTelemetry, createA2E, createEmotionVector, createSenseVoice, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getRecommendedBackend, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, preloadModels, resetModelUrls, resolveBackend, shouldEnableWasmProxy, shouldUseCpuA2E, shouldUseNativeASR, shouldUseServerA2E, supportsVADWorker };
|
package/dist/index.js
CHANGED
|
@@ -2613,7 +2613,7 @@ async function getOnnxRuntimeForPreference(preference = "auto") {
|
|
|
2613
2613
|
const ort = await getOnnxRuntime(backend);
|
|
2614
2614
|
return { ort, backend };
|
|
2615
2615
|
}
|
|
2616
|
-
function getSessionOptions(backend) {
|
|
2616
|
+
function getSessionOptions(backend, config) {
|
|
2617
2617
|
if (backend === "webgpu") {
|
|
2618
2618
|
return {
|
|
2619
2619
|
executionProviders: [
|
|
@@ -2629,7 +2629,7 @@ function getSessionOptions(backend) {
|
|
|
2629
2629
|
if (isIOS()) {
|
|
2630
2630
|
return {
|
|
2631
2631
|
executionProviders: ["wasm"],
|
|
2632
|
-
graphOptimizationLevel: "basic",
|
|
2632
|
+
graphOptimizationLevel: config?.iosDisableOptimization ? "disabled" : "basic",
|
|
2633
2633
|
enableCpuMemArena: false,
|
|
2634
2634
|
enableMemPattern: false
|
|
2635
2635
|
};
|
|
@@ -2896,7 +2896,7 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
2896
2896
|
logger3.info("ONNX Runtime loaded", { backend: this._backend });
|
|
2897
2897
|
const modelUrl = this.config.modelUrl;
|
|
2898
2898
|
const dataUrl = this.config.externalDataUrl !== false ? typeof this.config.externalDataUrl === "string" ? this.config.externalDataUrl : `${modelUrl}.data` : null;
|
|
2899
|
-
const sessionOptions = getSessionOptions(this._backend);
|
|
2899
|
+
const sessionOptions = getSessionOptions(this._backend, { iosDisableOptimization: true });
|
|
2900
2900
|
let isCached = false;
|
|
2901
2901
|
if (isIOS()) {
|
|
2902
2902
|
logger3.info("iOS: passing model URLs directly to ORT (low-memory path)", {
|
|
@@ -7739,10 +7739,11 @@ function createA2E(config = {}) {
|
|
|
7739
7739
|
useCpu = false;
|
|
7740
7740
|
logger12.info("Forcing GPU A2E model (Wav2Vec2)");
|
|
7741
7741
|
} else {
|
|
7742
|
-
useCpu =
|
|
7743
|
-
logger12.info("Auto-detected A2E model", {
|
|
7744
|
-
|
|
7745
|
-
|
|
7742
|
+
useCpu = false;
|
|
7743
|
+
logger12.info("Auto-detected A2E model: trying GPU first (fp16 external data)", {
|
|
7744
|
+
isSafari: isSafari(),
|
|
7745
|
+
isIOS: isIOS(),
|
|
7746
|
+
fallbackOnError
|
|
7746
7747
|
});
|
|
7747
7748
|
}
|
|
7748
7749
|
if (useCpu) {
|
|
@@ -7804,6 +7805,7 @@ var A2EWithFallback = class {
|
|
|
7804
7805
|
}
|
|
7805
7806
|
}
|
|
7806
7807
|
async fallbackToCpu(reason) {
|
|
7808
|
+
console.error("[A2EWithFallback] GPU\u2192CPU FALLBACK TRIGGERED. Reason:", reason);
|
|
7807
7809
|
logger12.warn("GPU model load failed, falling back to CPU model", { reason });
|
|
7808
7810
|
try {
|
|
7809
7811
|
await this.implementation.dispose();
|
|
@@ -10672,10 +10674,7 @@ var EmotionResolver = class {
|
|
|
10672
10674
|
if (upper[i] > 1) upper[i] = 1;
|
|
10673
10675
|
if (lower[i] > 1) lower[i] = 1;
|
|
10674
10676
|
}
|
|
10675
|
-
return {
|
|
10676
|
-
upper: new Float32Array(upper),
|
|
10677
|
-
lower: new Float32Array(lower)
|
|
10678
|
-
};
|
|
10677
|
+
return { upper, lower };
|
|
10679
10678
|
}
|
|
10680
10679
|
};
|
|
10681
10680
|
|
|
@@ -10698,6 +10697,7 @@ var FaceCompositor = class {
|
|
|
10698
10697
|
constructor(config) {
|
|
10699
10698
|
this.emotionResolver = new EmotionResolver();
|
|
10700
10699
|
// Pre-allocated buffers
|
|
10700
|
+
this.outputBuffer = new Float32Array(52);
|
|
10701
10701
|
this.smoothedUpper = new Float32Array(52);
|
|
10702
10702
|
this.smoothedLower = new Float32Array(52);
|
|
10703
10703
|
this.lifeBuffer = new Float32Array(52);
|
|
@@ -10715,10 +10715,12 @@ var FaceCompositor = class {
|
|
|
10715
10715
|
*
|
|
10716
10716
|
* @param base - A2E raw output (Float32Array[52], LAM_BLENDSHAPES order)
|
|
10717
10717
|
* @param input - Per-frame input (deltaTime, emotion, life layer params)
|
|
10718
|
-
* @
|
|
10718
|
+
* @param target - Optional pre-allocated output buffer (avoids per-frame allocation).
|
|
10719
|
+
* When omitted, an internal buffer is used (valid until next compose() call).
|
|
10720
|
+
* @returns Blendshapes (Float32Array[52] clamped [0,1]) and head rotation deltas
|
|
10719
10721
|
*/
|
|
10720
|
-
compose(base, input) {
|
|
10721
|
-
const out =
|
|
10722
|
+
compose(base, input, target) {
|
|
10723
|
+
const out = target ?? this.outputBuffer;
|
|
10722
10724
|
out.set(base);
|
|
10723
10725
|
const emotion = input.emotion ?? this.stickyEmotion;
|
|
10724
10726
|
if (emotion) {
|
|
@@ -10740,7 +10742,14 @@ var FaceCompositor = class {
|
|
|
10740
10742
|
out[i] *= 1 + this.smoothedLower[i] * bilabialSuppress;
|
|
10741
10743
|
}
|
|
10742
10744
|
}
|
|
10743
|
-
this.lifeLayer.
|
|
10745
|
+
const lifeResult = this.lifeLayer.update(input.deltaTime, input);
|
|
10746
|
+
this.lifeBuffer.fill(0);
|
|
10747
|
+
for (const [name, value] of Object.entries(lifeResult.blendshapes)) {
|
|
10748
|
+
const idx = BS_INDEX2.get(name);
|
|
10749
|
+
if (idx !== void 0) {
|
|
10750
|
+
this.lifeBuffer[idx] = value;
|
|
10751
|
+
}
|
|
10752
|
+
}
|
|
10744
10753
|
for (let i = 0; i < 52; i++) {
|
|
10745
10754
|
if (IS_EYE_CHANNEL[i]) {
|
|
10746
10755
|
out[i] = this.lifeBuffer[i];
|
|
@@ -10755,7 +10764,7 @@ var FaceCompositor = class {
|
|
|
10755
10764
|
if (out[i] < 0) out[i] = 0;
|
|
10756
10765
|
else if (out[i] > 1) out[i] = 1;
|
|
10757
10766
|
}
|
|
10758
|
-
return out;
|
|
10767
|
+
return { blendshapes: out, headDelta: lifeResult.headDelta };
|
|
10759
10768
|
}
|
|
10760
10769
|
/**
|
|
10761
10770
|
* Set sticky emotion (used when input.emotion is not provided).
|
|
@@ -11042,6 +11051,7 @@ var VoicePipeline = class extends EventEmitter {
|
|
|
11042
11051
|
new Promise((r) => setTimeout(() => r("timeout"), timeoutMs))
|
|
11043
11052
|
]);
|
|
11044
11053
|
if (lamLoadResult === "timeout") {
|
|
11054
|
+
console.error(`[VoicePipeline] LAM TIMEOUT after ${timeoutMs}ms \u2014 forcing CPU fallback`);
|
|
11045
11055
|
logger19.warn(`LAM GPU load timed out after ${timeoutMs}ms, falling back to CPU`);
|
|
11046
11056
|
await lam.dispose();
|
|
11047
11057
|
lam = createA2E({
|