@omote/core 0.7.1 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -10
- package/dist/Logger-BeUI6jG7.d.mts +145 -0
- package/dist/Logger-BeUI6jG7.d.ts +145 -0
- package/dist/chunk-3NDJA3I4.mjs +853 -0
- package/dist/chunk-3NDJA3I4.mjs.map +1 -0
- package/dist/chunk-J5LAM7VW.mjs +44 -0
- package/dist/chunk-J5LAM7VW.mjs.map +1 -0
- package/dist/chunk-MXKJOF4I.mjs +38 -0
- package/dist/chunk-MXKJOF4I.mjs.map +1 -0
- package/dist/events/index.d.mts +2 -87
- package/dist/events/index.d.ts +2 -87
- package/dist/events/index.js +8 -2
- package/dist/events/index.js.map +1 -1
- package/dist/events/index.mjs +1 -1
- package/dist/index.d.mts +1258 -1225
- package/dist/index.d.ts +1258 -1225
- package/dist/index.js +10289 -10006
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +8946 -9225
- package/dist/index.mjs.map +1 -1
- package/dist/logging/index.d.mts +2 -2
- package/dist/logging/index.d.ts +2 -2
- package/dist/logging/index.js +11 -0
- package/dist/logging/index.js.map +1 -1
- package/dist/logging/index.mjs +1 -1
- package/package.json +3 -2
package/dist/index.d.mts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { EventEmitter, OmoteEvents } from './events/index.mjs';
|
|
2
2
|
export { AnimationEvent, BackendEvent, EmotionEvent, GazeEvent, STTFinalEvent, STTPartialEvent, SessionStateEvent, TTSEndEvent, TTSMarkEvent, TTSStartEvent, VisemeEvent } from './events/index.mjs';
|
|
3
|
-
export { D as DEFAULT_LOGGING_CONFIG, I as ILogger, a as LOG_LEVEL_PRIORITY, b as LogEntry, L as LogFormatter, c as LogLevel, d as LogSink, e as LoggingConfig, g as configureLogging, h as createLogger, i as getLoggingConfig, n as noopLogger, r as resetLoggingConfig, s as setLogLevel, k as setLoggingEnabled } from './Logger-
|
|
3
|
+
export { D as DEFAULT_LOGGING_CONFIG, I as ILogger, a as LOG_LEVEL_PRIORITY, b as LogEntry, L as LogFormatter, c as LogLevel, d as LogSink, e as LoggingConfig, g as configureLogging, h as createLogger, i as getLoggingConfig, n as noopLogger, r as resetLoggingConfig, s as setLogLevel, k as setLoggingEnabled } from './Logger-BeUI6jG7.mjs';
|
|
4
4
|
export { ARKitToFLAMEMapping, ApiError, AudioChunkEvent, AvatarFormat, Character, CharacterAvatar, CharacterMemory, CharacterPersonality, CharacterSpec, CharacterVoice, CreateCharacterRequest, CreateCharacterResponse, CreateLAMJobRequest, CreateLAMJobResponse, CreateSessionRequest, CreateSessionResponse, GSplatConfig, LAMJob, LAMJobStatus, PROTOCOL_VERSION, PaginatedResponse, PlatformSession, ErrorEvent as ProtocolErrorEvent, ProtocolEvent, ResponseChunkEvent, ResponseEndEvent, ResponseStartEvent, SessionMessage, SessionStatus, isProtocolEvent } from '@omote/types';
|
|
5
5
|
|
|
6
6
|
/**
|
|
@@ -171,6 +171,8 @@ declare class AudioScheduler {
|
|
|
171
171
|
private scheduledSources;
|
|
172
172
|
private isPlaying;
|
|
173
173
|
constructor(options?: AudioSchedulerOptions);
|
|
174
|
+
/** Configured sample rate (default: 16000). */
|
|
175
|
+
get sampleRate(): number;
|
|
174
176
|
/**
|
|
175
177
|
* Initialize AudioContext with specified sample rate
|
|
176
178
|
*
|
|
@@ -429,19 +431,6 @@ declare function shouldEnableWasmProxy(): boolean;
|
|
|
429
431
|
* @returns true if running in Safari on any platform
|
|
430
432
|
*/
|
|
431
433
|
declare function isSafari(): boolean;
|
|
432
|
-
/**
|
|
433
|
-
* Recommend using CPU-optimized A2E model (wav2arkit_cpu)
|
|
434
|
-
*
|
|
435
|
-
* All iOS browsers use WebKit and have tight memory limits — the 192MB fp16
|
|
436
|
-
* LAM model causes silent crashes. wav2arkit_cpu uses URL pass-through
|
|
437
|
-
* (ORT fetches the 402MB weights directly into WASM, no JS heap copy).
|
|
438
|
-
*
|
|
439
|
-
* macOS Safari also needs this due to ONNX Runtime JSEP/ASYNCIFY bugs
|
|
440
|
-
* that crash WebKit's JIT compiler.
|
|
441
|
-
*
|
|
442
|
-
* @returns true if iOS (any browser) or Safari (any platform)
|
|
443
|
-
*/
|
|
444
|
-
declare function shouldUseCpuA2E(): boolean;
|
|
445
434
|
/**
|
|
446
435
|
* Check if Web Speech API is available in the browser
|
|
447
436
|
*
|
|
@@ -479,9 +468,8 @@ declare function shouldUseServerA2E(): boolean;
|
|
|
479
468
|
/**
|
|
480
469
|
* Common interface for audio-to-expression (A2E) inference backends
|
|
481
470
|
*
|
|
482
|
-
*
|
|
483
|
-
*
|
|
484
|
-
* work with either model transparently.
|
|
471
|
+
* Implemented by A2EInference and A2EUnifiedAdapter, allowing PlaybackPipeline
|
|
472
|
+
* and A2EProcessor to work with either implementation transparently.
|
|
485
473
|
*
|
|
486
474
|
* @category Inference
|
|
487
475
|
*/
|
|
@@ -510,15 +498,22 @@ interface A2EResult {
|
|
|
510
498
|
inferenceTimeMs: number;
|
|
511
499
|
}
|
|
512
500
|
/**
|
|
513
|
-
* Common interface for A2E (audio-to-expression) inference engines
|
|
501
|
+
* Common interface for A2E (audio-to-expression) inference engines.
|
|
502
|
+
*
|
|
503
|
+
* A2E is the SDK term for audio-to-expression inference. The underlying model
|
|
504
|
+
* is called **LAM** (Large Animation Model). "A2E" and "LAM" refer to the same
|
|
505
|
+
* pipeline — A2E is the interface abstraction, LAM is the model.
|
|
514
506
|
*
|
|
515
507
|
* Implemented by:
|
|
516
|
-
* -
|
|
517
|
-
* -
|
|
508
|
+
* - {@link A2EInference} (WebGPU/WASM, 192MB fp16)
|
|
509
|
+
* - A2EUnifiedAdapter (shared unified worker)
|
|
510
|
+
*
|
|
511
|
+
* @see {@link A2EInference} for direct usage
|
|
512
|
+
* @see {@link createA2E} for the recommended factory API
|
|
518
513
|
*/
|
|
519
514
|
interface A2EBackend {
|
|
520
|
-
/** Model identifier
|
|
521
|
-
readonly modelId: '
|
|
515
|
+
/** Model identifier */
|
|
516
|
+
readonly modelId: 'a2e';
|
|
522
517
|
/** Current backend type (webgpu, wasm, or null if not loaded) */
|
|
523
518
|
readonly backend: RuntimeBackend | null;
|
|
524
519
|
/** Whether the model is loaded and ready for inference */
|
|
@@ -590,7 +585,7 @@ declare const BLENDSHAPE_TO_GROUP: Map<string, BlendshapeGroup>;
|
|
|
590
585
|
* 2. Otherwise, use the group scaler (default 1.0)
|
|
591
586
|
* 3. Clamp result to [0, 1]
|
|
592
587
|
*/
|
|
593
|
-
declare function applyProfile(raw: Float32Array, profile: ExpressionProfile): Float32Array;
|
|
588
|
+
declare function applyProfile(raw: Float32Array, profile: ExpressionProfile, out?: Float32Array): Float32Array;
|
|
594
589
|
|
|
595
590
|
/**
|
|
596
591
|
* PlaybackPipeline - Audio playback + A2E lip sync with ExpressionProfile scaling
|
|
@@ -616,7 +611,7 @@ interface PlaybackPipelineConfig {
|
|
|
616
611
|
audioDelayMs?: number;
|
|
617
612
|
/** A2E inference chunk size in samples (default: 16000) */
|
|
618
613
|
chunkSize?: number;
|
|
619
|
-
/** Identity/style index for
|
|
614
|
+
/** Identity/style index for A2E model (default: 0) */
|
|
620
615
|
identityIndex?: number;
|
|
621
616
|
/** Per-character expression weight scaling */
|
|
622
617
|
profile?: ExpressionProfile;
|
|
@@ -637,6 +632,8 @@ interface FullFaceFrame {
|
|
|
637
632
|
rawBlendshapes: Float32Array;
|
|
638
633
|
/** AudioContext timestamp for this frame */
|
|
639
634
|
timestamp: number;
|
|
635
|
+
/** Emotion label for this frame (from SenseVoice, text heuristics, or LLM tags) */
|
|
636
|
+
emotion?: string;
|
|
640
637
|
}
|
|
641
638
|
interface PlaybackPipelineEvents {
|
|
642
639
|
/** New frame ready for display (scaled by ExpressionProfile) */
|
|
@@ -655,10 +652,6 @@ interface PlaybackPipelineEvents {
|
|
|
655
652
|
'error': Error;
|
|
656
653
|
/** State changed */
|
|
657
654
|
'state': PlaybackState;
|
|
658
|
-
'full_frame_ready': FullFaceFrame;
|
|
659
|
-
'lam_frame_ready': Float32Array;
|
|
660
|
-
'playback_complete': void;
|
|
661
|
-
'playback_start': number;
|
|
662
655
|
[key: string]: unknown;
|
|
663
656
|
}
|
|
664
657
|
declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
|
|
@@ -676,6 +669,7 @@ declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
|
|
|
676
669
|
private staleWarningEmitted;
|
|
677
670
|
private readonly staleThresholdMs;
|
|
678
671
|
private frameLoopCount;
|
|
672
|
+
private sessionStartTime;
|
|
679
673
|
private profile;
|
|
680
674
|
private readonly neutralTransitionEnabled;
|
|
681
675
|
private readonly neutralTransitionMs;
|
|
@@ -684,6 +678,8 @@ declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
|
|
|
684
678
|
private neutralAnimationId;
|
|
685
679
|
private _currentFrame;
|
|
686
680
|
private _currentRawFrame;
|
|
681
|
+
private _emotion;
|
|
682
|
+
private readonly _profileBuffer;
|
|
687
683
|
/** Current pipeline state */
|
|
688
684
|
get state(): PlaybackState;
|
|
689
685
|
/** Current scaled blendshapes (updated in-place for perf) */
|
|
@@ -695,6 +691,8 @@ declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
|
|
|
695
691
|
initialize(): Promise<void>;
|
|
696
692
|
/** Update ExpressionProfile at runtime */
|
|
697
693
|
setProfile(profile: ExpressionProfile): void;
|
|
694
|
+
/** Set the emotion label to include in emitted frames */
|
|
695
|
+
setEmotion(emotion: string | null): void;
|
|
698
696
|
/**
|
|
699
697
|
* Start a new playback session.
|
|
700
698
|
* Idempotent — calling during playback resets cleanly without emitting
|
|
@@ -733,201 +731,6 @@ declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
|
|
|
733
731
|
private setState;
|
|
734
732
|
}
|
|
735
733
|
|
|
736
|
-
/**
|
|
737
|
-
* FullFacePipeline - A2E expression pipeline with ExpressionProfile weight scaling
|
|
738
|
-
*
|
|
739
|
-
* Orchestrates full-face animation by:
|
|
740
|
-
* 1. Scheduling audio for playback immediately (audio-first, never waits for A2E)
|
|
741
|
-
* 2. Running A2E inference in background (fire-and-forget via A2EProcessor)
|
|
742
|
-
* 3. Applying per-character ExpressionProfile scaling to raw A2E output
|
|
743
|
-
*
|
|
744
|
-
* The A2E model outputs all 52 ARKit blendshapes from audio — brows, eyes, cheeks,
|
|
745
|
-
* mouth, jaw, everything. ExpressionProfile allows per-character weight scaling
|
|
746
|
-
* by group (eyes, brows, jaw, mouth, cheeks, nose, tongue) with per-blendshape overrides.
|
|
747
|
-
*
|
|
748
|
-
* @deprecated Use {@link PlaybackPipeline} from `@omote/core` instead. PlaybackPipeline
|
|
749
|
-
* is a superset with sync mode (`feedBuffer`), state tracking, and opt-in neutral transition.
|
|
750
|
-
* FullFacePipeline will continue to work but is no longer actively developed.
|
|
751
|
-
*
|
|
752
|
-
* @category Audio
|
|
753
|
-
*
|
|
754
|
-
* @example Basic usage
|
|
755
|
-
* ```typescript
|
|
756
|
-
* import { FullFacePipeline } from '@omote/core';
|
|
757
|
-
*
|
|
758
|
-
* const pipeline = new FullFacePipeline({
|
|
759
|
-
* lam,
|
|
760
|
-
* profile: { mouth: 1.2, brows: 0.8 },
|
|
761
|
-
* });
|
|
762
|
-
* await pipeline.initialize();
|
|
763
|
-
*
|
|
764
|
-
* pipeline.on('full_frame_ready', (frame) => {
|
|
765
|
-
* applyToAvatar(frame.blendshapes);
|
|
766
|
-
* });
|
|
767
|
-
*
|
|
768
|
-
* pipeline.start();
|
|
769
|
-
* await pipeline.onAudioChunk(audioData);
|
|
770
|
-
* ```
|
|
771
|
-
*/
|
|
772
|
-
|
|
773
|
-
/**
|
|
774
|
-
* Configuration for FullFacePipeline
|
|
775
|
-
*/
|
|
776
|
-
interface FullFacePipelineOptions {
|
|
777
|
-
/** Sample rate in Hz (default: 16000) */
|
|
778
|
-
sampleRate?: number;
|
|
779
|
-
/** Target chunk duration in ms for coalescing (default: 200) */
|
|
780
|
-
chunkTargetMs?: number;
|
|
781
|
-
/**
|
|
782
|
-
* Audio playback delay in ms before first audio plays.
|
|
783
|
-
* Gives A2E inference time to pre-compute blendshapes before audio
|
|
784
|
-
* starts, preventing frame drops/desync. Must be ≥ chunkSize
|
|
785
|
-
* accumulation time + inference latency.
|
|
786
|
-
*
|
|
787
|
-
* Default: auto-calculated from chunkSize and backend type.
|
|
788
|
-
*/
|
|
789
|
-
audioDelayMs?: number;
|
|
790
|
-
/**
|
|
791
|
-
* A2E inference chunk size in samples.
|
|
792
|
-
* Controls how many samples accumulate before each inference call.
|
|
793
|
-
* Smaller = lower latency (less delay before first frame), more overhead.
|
|
794
|
-
* Larger = higher latency, less overhead.
|
|
795
|
-
*
|
|
796
|
-
* Default: 16000 (1s) — the model's native window size.
|
|
797
|
-
* Smaller chunks get zero-padded, causing near-zero blendshape output.
|
|
798
|
-
*/
|
|
799
|
-
chunkSize?: number;
|
|
800
|
-
/** A2E inference engine */
|
|
801
|
-
lam: A2EBackend;
|
|
802
|
-
/**
|
|
803
|
-
* Identity/style index for the A2E model (default: 0).
|
|
804
|
-
*
|
|
805
|
-
* The LAM model uses a 12-class one-hot identity vector as style conditioning.
|
|
806
|
-
* Different indices produce different expression intensity across face regions.
|
|
807
|
-
* Only affects Wav2Vec2Inference (GPU). Wav2ArkitCpuInference has identity 11 baked in.
|
|
808
|
-
*/
|
|
809
|
-
identityIndex?: number;
|
|
810
|
-
/** Per-character expression weight scaling */
|
|
811
|
-
profile?: ExpressionProfile;
|
|
812
|
-
/**
|
|
813
|
-
* Time in ms with no new inference frames before logging a stale warning.
|
|
814
|
-
*
|
|
815
|
-
* Must be larger than the inter-batch gap (chunkSize/sampleRate + inference time).
|
|
816
|
-
* Default: 2000
|
|
817
|
-
*/
|
|
818
|
-
staleThresholdMs?: number;
|
|
819
|
-
}
|
|
820
|
-
/**
|
|
821
|
-
* Events emitted by FullFacePipeline
|
|
822
|
-
*/
|
|
823
|
-
interface FullFacePipelineEvents {
|
|
824
|
-
/** New merged frame ready for display */
|
|
825
|
-
full_frame_ready: FullFaceFrame;
|
|
826
|
-
/** Raw LAM frame ready (for debugging/monitoring) */
|
|
827
|
-
lam_frame_ready: Float32Array;
|
|
828
|
-
/** Playback has completed */
|
|
829
|
-
playback_complete: void;
|
|
830
|
-
/** First frame ready, playback starting */
|
|
831
|
-
playback_start: number;
|
|
832
|
-
/** Error occurred */
|
|
833
|
-
error: Error;
|
|
834
|
-
/** Index signature for EventEmitter compatibility */
|
|
835
|
-
[key: string]: unknown;
|
|
836
|
-
}
|
|
837
|
-
/**
|
|
838
|
-
* FullFacePipeline - A2E animation pipeline with ExpressionProfile scaling
|
|
839
|
-
*
|
|
840
|
-
* Audio-first design matching SyncedAudioPipeline:
|
|
841
|
-
* - Audio is scheduled immediately (never waits for A2E)
|
|
842
|
-
* - A2E runs in background (fire-and-forget via A2EProcessor)
|
|
843
|
-
* - ExpressionProfile scales raw A2E output per-character
|
|
844
|
-
*/
|
|
845
|
-
declare class FullFacePipeline extends EventEmitter<FullFacePipelineEvents> {
|
|
846
|
-
private readonly options;
|
|
847
|
-
private scheduler;
|
|
848
|
-
private coalescer;
|
|
849
|
-
private processor;
|
|
850
|
-
private playbackStarted;
|
|
851
|
-
private monitorInterval;
|
|
852
|
-
private frameAnimationId;
|
|
853
|
-
private lastNewFrameTime;
|
|
854
|
-
private lastKnownLamFrame;
|
|
855
|
-
private staleWarningEmitted;
|
|
856
|
-
private readonly staleThresholdMs;
|
|
857
|
-
private frameLoopCount;
|
|
858
|
-
private profile;
|
|
859
|
-
constructor(options: FullFacePipelineOptions);
|
|
860
|
-
/**
|
|
861
|
-
* Initialize the pipeline
|
|
862
|
-
*/
|
|
863
|
-
initialize(): Promise<void>;
|
|
864
|
-
/**
|
|
865
|
-
* Update the ExpressionProfile at runtime (e.g., character switch).
|
|
866
|
-
*/
|
|
867
|
-
setProfile(profile: ExpressionProfile): void;
|
|
868
|
-
/**
|
|
869
|
-
* Apply ExpressionProfile scaling to raw A2E blendshapes.
|
|
870
|
-
*
|
|
871
|
-
* Delegates to the standalone applyProfile() utility from expressionProfile.ts.
|
|
872
|
-
*/
|
|
873
|
-
applyProfile(raw: Float32Array): Float32Array;
|
|
874
|
-
/**
|
|
875
|
-
* Start a new playback session
|
|
876
|
-
*
|
|
877
|
-
* Resets all state and prepares for incoming audio chunks.
|
|
878
|
-
* Audio will be scheduled immediately as chunks arrive (no buffering).
|
|
879
|
-
*/
|
|
880
|
-
start(): void;
|
|
881
|
-
/**
|
|
882
|
-
* Receive audio chunk from network
|
|
883
|
-
*
|
|
884
|
-
* Audio-first design: schedules audio immediately, A2E runs in background.
|
|
885
|
-
* This prevents A2E inference (50-300ms) from blocking audio scheduling.
|
|
886
|
-
*
|
|
887
|
-
* @param chunk - Uint8Array containing Int16 PCM audio
|
|
888
|
-
*/
|
|
889
|
-
onAudioChunk(chunk: Uint8Array): Promise<void>;
|
|
890
|
-
/**
|
|
891
|
-
* Start frame animation loop
|
|
892
|
-
*
|
|
893
|
-
* Polls A2EProcessor at render rate (60fps) for the latest inference frame
|
|
894
|
-
* matching the current AudioContext time. Between inference batches (~30fps
|
|
895
|
-
* bursts), getFrameForTime() holds the last frame.
|
|
896
|
-
*/
|
|
897
|
-
private startFrameLoop;
|
|
898
|
-
/**
|
|
899
|
-
* End of audio stream
|
|
900
|
-
*/
|
|
901
|
-
end(): Promise<void>;
|
|
902
|
-
/**
|
|
903
|
-
* Stop playback immediately with smooth fade-out
|
|
904
|
-
*/
|
|
905
|
-
stop(fadeOutMs?: number): Promise<void>;
|
|
906
|
-
/**
|
|
907
|
-
* Start monitoring for playback completion
|
|
908
|
-
*/
|
|
909
|
-
private startMonitoring;
|
|
910
|
-
/**
|
|
911
|
-
* Stop monitoring
|
|
912
|
-
*/
|
|
913
|
-
private stopMonitoring;
|
|
914
|
-
/**
|
|
915
|
-
* Get current pipeline state (for debugging/monitoring)
|
|
916
|
-
*/
|
|
917
|
-
getState(): {
|
|
918
|
-
playbackStarted: boolean;
|
|
919
|
-
coalescerFill: number;
|
|
920
|
-
processorFill: number;
|
|
921
|
-
queuedFrames: number;
|
|
922
|
-
currentTime: number;
|
|
923
|
-
playbackEndTime: number;
|
|
924
|
-
};
|
|
925
|
-
/**
|
|
926
|
-
* Cleanup resources
|
|
927
|
-
*/
|
|
928
|
-
dispose(): void;
|
|
929
|
-
}
|
|
930
|
-
|
|
931
734
|
/**
|
|
932
735
|
* TTSBackend — Streaming text-to-speech backend interface.
|
|
933
736
|
*
|
|
@@ -1007,7 +810,7 @@ interface TTSPlaybackConfig {
|
|
|
1007
810
|
profile?: ExpressionProfile;
|
|
1008
811
|
/** Prefetch next sentence while current plays. Default: true */
|
|
1009
812
|
prefetch?: boolean;
|
|
1010
|
-
/** Identity/style index for
|
|
813
|
+
/** Identity/style index for A2E model (default: 0) */
|
|
1011
814
|
identityIndex?: number;
|
|
1012
815
|
/** Audio playback delay in ms */
|
|
1013
816
|
audioDelayMs?: number;
|
|
@@ -1027,6 +830,8 @@ interface TTSPlaybackEvents {
|
|
|
1027
830
|
};
|
|
1028
831
|
/** Playback completed */
|
|
1029
832
|
'playback:complete': void;
|
|
833
|
+
/** Playback stopped (user-initiated) */
|
|
834
|
+
'playback:stop': void;
|
|
1030
835
|
/** Error */
|
|
1031
836
|
'error': Error;
|
|
1032
837
|
[key: string]: unknown;
|
|
@@ -1056,87 +861,6 @@ declare class TTSPlayback extends EventEmitter<TTSPlaybackEvents> {
|
|
|
1056
861
|
private speakSequential;
|
|
1057
862
|
}
|
|
1058
863
|
|
|
1059
|
-
/**
|
|
1060
|
-
* Interruption Handler
|
|
1061
|
-
*
|
|
1062
|
-
* VAD-based barge-in detection for AI conversations:
|
|
1063
|
-
* - Monitors VAD probability for user speech
|
|
1064
|
-
* - Detects when user interrupts AI response
|
|
1065
|
-
* - Triggers interruption callbacks
|
|
1066
|
-
*/
|
|
1067
|
-
|
|
1068
|
-
interface InterruptionEvents {
|
|
1069
|
-
[key: string]: unknown;
|
|
1070
|
-
'speech.detected': {
|
|
1071
|
-
rms: number;
|
|
1072
|
-
};
|
|
1073
|
-
'speech.ended': {
|
|
1074
|
-
durationMs: number;
|
|
1075
|
-
};
|
|
1076
|
-
'interruption.triggered': {
|
|
1077
|
-
rms: number;
|
|
1078
|
-
durationMs: number;
|
|
1079
|
-
};
|
|
1080
|
-
}
|
|
1081
|
-
/**
|
|
1082
|
-
* Interruption handler configuration
|
|
1083
|
-
*
|
|
1084
|
-
* Industry standards applied:
|
|
1085
|
-
* - vadThreshold: 0.5 (Silero VAD default)
|
|
1086
|
-
* - minSpeechDurationMs: 200ms (Google/Amazon barge-in standard)
|
|
1087
|
-
* - silenceTimeoutMs: 500ms (OpenAI Realtime API standard)
|
|
1088
|
-
*/
|
|
1089
|
-
interface InterruptionConfig {
|
|
1090
|
-
/** VAD probability threshold for speech detection (default: 0.5, Silero standard) */
|
|
1091
|
-
vadThreshold?: number;
|
|
1092
|
-
/** Minimum speech duration to trigger interruption (default: 200ms, Google/Amazon standard) */
|
|
1093
|
-
minSpeechDurationMs?: number;
|
|
1094
|
-
/** Silence duration to end speech (default: 500ms, OpenAI standard) */
|
|
1095
|
-
silenceTimeoutMs?: number;
|
|
1096
|
-
/** Enable interruption detection (default: true) */
|
|
1097
|
-
enabled?: boolean;
|
|
1098
|
-
}
|
|
1099
|
-
declare class InterruptionHandler extends EventEmitter<InterruptionEvents> {
|
|
1100
|
-
private config;
|
|
1101
|
-
private isSpeaking;
|
|
1102
|
-
private speechStartTime;
|
|
1103
|
-
private lastSpeechTime;
|
|
1104
|
-
private silenceTimer;
|
|
1105
|
-
private aiIsSpeaking;
|
|
1106
|
-
private interruptionTriggeredThisSession;
|
|
1107
|
-
constructor(config?: InterruptionConfig);
|
|
1108
|
-
/**
|
|
1109
|
-
* Process raw audio energy for interruption detection (no VAD required).
|
|
1110
|
-
* Used during speaking state when the unified worker is busy with TTS.
|
|
1111
|
-
* Echo-cancelled mic input means energy above threshold = user speech.
|
|
1112
|
-
*
|
|
1113
|
-
* @param rms - RMS energy of audio chunk (0-1)
|
|
1114
|
-
* @param energyThreshold - Minimum energy to consider speech (default: 0.02)
|
|
1115
|
-
*/
|
|
1116
|
-
processAudioEnergy(rms: number, energyThreshold?: number): void;
|
|
1117
|
-
/**
|
|
1118
|
-
* Process VAD result for interruption detection
|
|
1119
|
-
* @param vadProbability - Speech probability from VAD (0-1)
|
|
1120
|
-
* @param audioEnergy - Optional RMS energy for logging (default: 0)
|
|
1121
|
-
*/
|
|
1122
|
-
processVADResult(vadProbability: number, audioEnergy?: number): void;
|
|
1123
|
-
/** Notify that AI started/stopped speaking */
|
|
1124
|
-
setAISpeaking(speaking: boolean): void;
|
|
1125
|
-
/** Enable/disable interruption detection */
|
|
1126
|
-
setEnabled(enabled: boolean): void;
|
|
1127
|
-
/** Update configuration */
|
|
1128
|
-
updateConfig(config: Partial<InterruptionConfig>): void;
|
|
1129
|
-
/** Reset state */
|
|
1130
|
-
reset(): void;
|
|
1131
|
-
/** Get current state */
|
|
1132
|
-
getState(): {
|
|
1133
|
-
isSpeaking: boolean;
|
|
1134
|
-
speechDurationMs: number;
|
|
1135
|
-
};
|
|
1136
|
-
private onSpeechDetected;
|
|
1137
|
-
private onSilenceDetected;
|
|
1138
|
-
}
|
|
1139
|
-
|
|
1140
864
|
/**
|
|
1141
865
|
* Lazy ONNX Runtime loader with conditional WebGPU/WASM bundle loading
|
|
1142
866
|
*
|
|
@@ -1261,159 +985,49 @@ declare class SenseVoiceInference {
|
|
|
1261
985
|
}
|
|
1262
986
|
|
|
1263
987
|
/**
|
|
1264
|
-
*
|
|
988
|
+
* Silero VAD (Voice Activity Detection) inference
|
|
1265
989
|
*
|
|
1266
|
-
*
|
|
1267
|
-
*
|
|
1268
|
-
* avoid separate file deployment.
|
|
990
|
+
* Neural network-based VAD running in browser via ONNX Runtime Web.
|
|
991
|
+
* Much more accurate than RMS-based energy detection.
|
|
1269
992
|
*
|
|
1270
|
-
*
|
|
1271
|
-
* -
|
|
1272
|
-
* -
|
|
1273
|
-
* - Audio copied (not transferred) to retain main thread access
|
|
1274
|
-
* - ONNX Runtime loaded from CDN in worker (no bundler complications)
|
|
1275
|
-
* - iOS: model URL passed as string to ORT (avoids 239MB JS heap allocation)
|
|
993
|
+
* Uses lazy loading to conditionally load WebGPU or WASM-only bundle:
|
|
994
|
+
* - iOS: Loads WASM-only bundle (WebGPU crashes due to Safari bugs)
|
|
995
|
+
* - Android/Desktop: Loads WebGPU bundle (with WASM fallback)
|
|
1276
996
|
*
|
|
1277
997
|
* @category Inference
|
|
1278
998
|
*
|
|
1279
999
|
* @example Basic usage
|
|
1280
1000
|
* ```typescript
|
|
1281
|
-
* import {
|
|
1001
|
+
* import { SileroVADInference } from '@omote/core';
|
|
1282
1002
|
*
|
|
1283
|
-
* const
|
|
1284
|
-
* modelUrl: '/models/
|
|
1285
|
-
* tokensUrl: '/models/sensevoice/tokens.txt',
|
|
1003
|
+
* const vad = new SileroVADInference({
|
|
1004
|
+
* modelUrl: '/models/silero-vad.onnx'
|
|
1286
1005
|
* });
|
|
1287
|
-
* await
|
|
1006
|
+
* await vad.load();
|
|
1288
1007
|
*
|
|
1289
|
-
*
|
|
1290
|
-
*
|
|
1291
|
-
*
|
|
1292
|
-
*
|
|
1008
|
+
* // Process 32ms chunks (512 samples at 16kHz)
|
|
1009
|
+
* const probability = await vad.process(audioChunk);
|
|
1010
|
+
* if (probability > 0.5) {
|
|
1011
|
+
* console.log('Speech detected!');
|
|
1012
|
+
* }
|
|
1013
|
+
* ```
|
|
1014
|
+
*
|
|
1015
|
+
* @example Streaming with state management
|
|
1016
|
+
* ```typescript
|
|
1017
|
+
* // State is automatically maintained between process() calls
|
|
1018
|
+
* // Call reset() when starting a new audio stream
|
|
1019
|
+
* vad.reset();
|
|
1020
|
+
*
|
|
1021
|
+
* for (const chunk of audioChunks) {
|
|
1022
|
+
* const prob = await vad.process(chunk);
|
|
1023
|
+
* // prob is speech probability [0, 1]
|
|
1024
|
+
* }
|
|
1293
1025
|
* ```
|
|
1294
1026
|
*/
|
|
1295
1027
|
|
|
1028
|
+
type VADBackend = BackendPreference;
|
|
1296
1029
|
/**
|
|
1297
|
-
* Configuration for
|
|
1298
|
-
*/
|
|
1299
|
-
interface SenseVoiceWorkerConfig {
|
|
1300
|
-
/** Path or URL to model.int8.onnx (239MB) */
|
|
1301
|
-
modelUrl: string;
|
|
1302
|
-
/** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
|
|
1303
|
-
tokensUrl?: string;
|
|
1304
|
-
/** Language hint (default: 'auto' for auto-detection) */
|
|
1305
|
-
language?: SenseVoiceLanguage;
|
|
1306
|
-
/** Text normalization: 'with_itn' applies inverse text normalization (default: 'with_itn') */
|
|
1307
|
-
textNorm?: 'with_itn' | 'without_itn';
|
|
1308
|
-
}
|
|
1309
|
-
/**
|
|
1310
|
-
* SenseVoice ASR Worker - Speech Recognition in a Web Worker
|
|
1311
|
-
*
|
|
1312
|
-
* Runs SenseVoice inference off the main thread to prevent UI blocking.
|
|
1313
|
-
* All preprocessing (fbank, LFR, CMVN) and CTC decoding run in the worker.
|
|
1314
|
-
*
|
|
1315
|
-
* @see SenseVoiceInference for main-thread version
|
|
1316
|
-
*/
|
|
1317
|
-
declare class SenseVoiceWorker {
|
|
1318
|
-
private worker;
|
|
1319
|
-
private config;
|
|
1320
|
-
private isLoading;
|
|
1321
|
-
private _isLoaded;
|
|
1322
|
-
private inferenceQueue;
|
|
1323
|
-
private poisoned;
|
|
1324
|
-
private pendingResolvers;
|
|
1325
|
-
private languageId;
|
|
1326
|
-
private textNormId;
|
|
1327
|
-
constructor(config: SenseVoiceWorkerConfig);
|
|
1328
|
-
get isLoaded(): boolean;
|
|
1329
|
-
/**
|
|
1330
|
-
* Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
|
|
1331
|
-
*/
|
|
1332
|
-
get backend(): 'wasm' | null;
|
|
1333
|
-
/**
|
|
1334
|
-
* Create the worker from inline script
|
|
1335
|
-
*/
|
|
1336
|
-
private createWorker;
|
|
1337
|
-
/**
|
|
1338
|
-
* Handle messages from worker
|
|
1339
|
-
*/
|
|
1340
|
-
private handleWorkerMessage;
|
|
1341
|
-
/**
|
|
1342
|
-
* Send message to worker and wait for response
|
|
1343
|
-
*/
|
|
1344
|
-
private sendMessage;
|
|
1345
|
-
/**
|
|
1346
|
-
* Load the ONNX model in the worker
|
|
1347
|
-
*
|
|
1348
|
-
* @param onProgress - Optional progress callback. Fires once at 100% when load completes
|
|
1349
|
-
* (the worker downloads and loads the model internally, so granular progress is not available).
|
|
1350
|
-
*/
|
|
1351
|
-
load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
|
|
1352
|
-
/**
|
|
1353
|
-
* Transcribe audio samples to text
|
|
1354
|
-
*
|
|
1355
|
-
* @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
|
|
1356
|
-
* @returns Transcription result with text, emotion, language, and event
|
|
1357
|
-
*/
|
|
1358
|
-
transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
|
|
1359
|
-
/**
|
|
1360
|
-
* Queue inference to serialize worker calls
|
|
1361
|
-
*/
|
|
1362
|
-
private queueInference;
|
|
1363
|
-
/**
|
|
1364
|
-
* Dispose of the worker and free resources
|
|
1365
|
-
*/
|
|
1366
|
-
dispose(): Promise<void>;
|
|
1367
|
-
/**
|
|
1368
|
-
* Check if Web Workers are supported
|
|
1369
|
-
*/
|
|
1370
|
-
static isSupported(): boolean;
|
|
1371
|
-
}
|
|
1372
|
-
|
|
1373
|
-
/**
|
|
1374
|
-
* Silero VAD (Voice Activity Detection) inference
|
|
1375
|
-
*
|
|
1376
|
-
* Neural network-based VAD running in browser via ONNX Runtime Web.
|
|
1377
|
-
* Much more accurate than RMS-based energy detection.
|
|
1378
|
-
*
|
|
1379
|
-
* Uses lazy loading to conditionally load WebGPU or WASM-only bundle:
|
|
1380
|
-
* - iOS: Loads WASM-only bundle (WebGPU crashes due to Safari bugs)
|
|
1381
|
-
* - Android/Desktop: Loads WebGPU bundle (with WASM fallback)
|
|
1382
|
-
*
|
|
1383
|
-
* @category Inference
|
|
1384
|
-
*
|
|
1385
|
-
* @example Basic usage
|
|
1386
|
-
* ```typescript
|
|
1387
|
-
* import { SileroVADInference } from '@omote/core';
|
|
1388
|
-
*
|
|
1389
|
-
* const vad = new SileroVADInference({
|
|
1390
|
-
* modelUrl: '/models/silero-vad.onnx'
|
|
1391
|
-
* });
|
|
1392
|
-
* await vad.load();
|
|
1393
|
-
*
|
|
1394
|
-
* // Process 32ms chunks (512 samples at 16kHz)
|
|
1395
|
-
* const probability = await vad.process(audioChunk);
|
|
1396
|
-
* if (probability > 0.5) {
|
|
1397
|
-
* console.log('Speech detected!');
|
|
1398
|
-
* }
|
|
1399
|
-
* ```
|
|
1400
|
-
*
|
|
1401
|
-
* @example Streaming with state management
|
|
1402
|
-
* ```typescript
|
|
1403
|
-
* // State is automatically maintained between process() calls
|
|
1404
|
-
* // Call reset() when starting a new audio stream
|
|
1405
|
-
* vad.reset();
|
|
1406
|
-
*
|
|
1407
|
-
* for (const chunk of audioChunks) {
|
|
1408
|
-
* const prob = await vad.process(chunk);
|
|
1409
|
-
* // prob is speech probability [0, 1]
|
|
1410
|
-
* }
|
|
1411
|
-
* ```
|
|
1412
|
-
*/
|
|
1413
|
-
|
|
1414
|
-
type VADBackend = BackendPreference;
|
|
1415
|
-
/**
|
|
1416
|
-
* Configuration for Silero VAD
|
|
1030
|
+
* Configuration for Silero VAD
|
|
1417
1031
|
*/
|
|
1418
1032
|
interface SileroVADConfig {
|
|
1419
1033
|
/** Path or URL to the ONNX model */
|
|
@@ -1705,15 +1319,16 @@ declare class SileroVADWorker {
|
|
|
1705
1319
|
}
|
|
1706
1320
|
|
|
1707
1321
|
/**
|
|
1708
|
-
* Unified Inference Worker — single Web Worker hosting all
|
|
1322
|
+
* Unified Inference Worker — single Web Worker hosting all ONNX models
|
|
1323
|
+
*
|
|
1324
|
+
* Runs all model loading and inference off the main thread, preventing
|
|
1325
|
+
* InferenceSession.create() from blocking the renderer (5-30s).
|
|
1709
1326
|
*
|
|
1710
|
-
*
|
|
1711
|
-
*
|
|
1712
|
-
* limit, forcing main-thread fallback which blocks the render loop.
|
|
1327
|
+
* Uses WebGPU when available (Chrome/Edge 113+), falls back to WASM.
|
|
1328
|
+
* On iOS, uses a single WASM instance to stay within the ~1-1.5GB tab limit.
|
|
1713
1329
|
*
|
|
1714
|
-
* This worker hosts SenseVoice +
|
|
1715
|
-
* ORT
|
|
1716
|
-
* off-main-thread. Works on iOS because there's only one ORT instance.
|
|
1330
|
+
* This worker hosts SenseVoice + A2E + Silero VAD + Kokoro TTS in a single
|
|
1331
|
+
* ORT instance. Same total model memory, but inference runs off-main-thread.
|
|
1717
1332
|
*
|
|
1718
1333
|
* Consumer usage:
|
|
1719
1334
|
* ```typescript
|
|
@@ -1721,7 +1336,7 @@ declare class SileroVADWorker {
|
|
|
1721
1336
|
* await worker.init();
|
|
1722
1337
|
*
|
|
1723
1338
|
* const asr = createSenseVoice({ modelUrl: '...', unifiedWorker: worker });
|
|
1724
|
-
* const lam = createA2E({
|
|
1339
|
+
* const lam = createA2E({ modelUrl: '...', unifiedWorker: worker });
|
|
1725
1340
|
* const vad = createSileroVAD({ modelUrl: '...', unifiedWorker: worker });
|
|
1726
1341
|
* ```
|
|
1727
1342
|
*
|
|
@@ -1731,10 +1346,11 @@ declare class SileroVADWorker {
|
|
|
1731
1346
|
/** Health state of the unified worker */
|
|
1732
1347
|
type WorkerHealthState = 'healthy' | 'unhealthy' | 'recovering';
|
|
1733
1348
|
/**
|
|
1734
|
-
* Unified Inference Worker — single Web Worker for all
|
|
1349
|
+
* Unified Inference Worker — single Web Worker for all ONNX models
|
|
1735
1350
|
*
|
|
1736
|
-
* Hosts SenseVoice,
|
|
1737
|
-
*
|
|
1351
|
+
* Hosts SenseVoice, A2E (LAM), Kokoro TTS, and Silero VAD in one ORT instance.
|
|
1352
|
+
* Uses WebGPU on Chrome/Edge 113+, falls back to WASM on Safari/iOS/Firefox.
|
|
1353
|
+
* All model loading and inference runs off the main thread.
|
|
1738
1354
|
*/
|
|
1739
1355
|
declare class UnifiedInferenceWorker {
|
|
1740
1356
|
private worker;
|
|
@@ -1744,6 +1360,7 @@ declare class UnifiedInferenceWorker {
|
|
|
1744
1360
|
private consecutiveFailures;
|
|
1745
1361
|
private _generation;
|
|
1746
1362
|
private recovering;
|
|
1363
|
+
private _workerBackend;
|
|
1747
1364
|
/**
|
|
1748
1365
|
* Initialize the worker (load ORT WASM from CDN)
|
|
1749
1366
|
*/
|
|
@@ -1756,17 +1373,6 @@ declare class UnifiedInferenceWorker {
|
|
|
1756
1373
|
}): Promise<SenseVoiceModelInfo>;
|
|
1757
1374
|
transcribe(audio: Float32Array): Promise<SenseVoiceResult>;
|
|
1758
1375
|
disposeSenseVoice(): Promise<void>;
|
|
1759
|
-
loadA2E(config: {
|
|
1760
|
-
modelUrl: string;
|
|
1761
|
-
externalDataUrl: string | null;
|
|
1762
|
-
}): Promise<A2EModelInfo>;
|
|
1763
|
-
inferA2E(audio: Float32Array): Promise<{
|
|
1764
|
-
blendshapes: Float32Array;
|
|
1765
|
-
numFrames: number;
|
|
1766
|
-
numBlendshapes: number;
|
|
1767
|
-
inferenceTimeMs: number;
|
|
1768
|
-
}>;
|
|
1769
|
-
disposeA2E(): Promise<void>;
|
|
1770
1376
|
loadLAM(config: {
|
|
1771
1377
|
modelUrl: string;
|
|
1772
1378
|
externalDataUrl: string | null;
|
|
@@ -1807,6 +1413,8 @@ declare class UnifiedInferenceWorker {
|
|
|
1807
1413
|
get health(): WorkerHealthState;
|
|
1808
1414
|
/** Generation counter — increments on worker recovery. Adapters compare to detect stale sessions. */
|
|
1809
1415
|
get workerGeneration(): number;
|
|
1416
|
+
/** The ORT backend the worker is using ('webgpu' on Chrome/Edge, 'wasm' on Safari/iOS/Firefox) */
|
|
1417
|
+
get backend(): 'wasm' | 'webgpu';
|
|
1810
1418
|
/** Check if Web Workers are supported */
|
|
1811
1419
|
static isSupported(): boolean;
|
|
1812
1420
|
private assertReady;
|
|
@@ -1852,575 +1460,1003 @@ interface InferenceFactoryConfig {
|
|
|
1852
1460
|
}
|
|
1853
1461
|
|
|
1854
1462
|
/**
|
|
1855
|
-
* Factory function for
|
|
1463
|
+
* Factory function for A2E inference
|
|
1856
1464
|
*
|
|
1857
|
-
*
|
|
1858
|
-
*
|
|
1859
|
-
* - Worker unsupported: Uses SenseVoiceInference (main thread)
|
|
1465
|
+
* Creates an A2EBackend instance with zero-config defaults (HuggingFace CDN).
|
|
1466
|
+
* Supports unified worker mode for iOS off-main-thread inference.
|
|
1860
1467
|
*
|
|
1861
1468
|
* @category Inference
|
|
1862
1469
|
*
|
|
1863
|
-
* @example Auto-detect (recommended)
|
|
1470
|
+
* @example Auto-detect (recommended, zero-config)
|
|
1864
1471
|
* ```typescript
|
|
1865
|
-
* import {
|
|
1866
|
-
*
|
|
1867
|
-
* const asr = createSenseVoice({
|
|
1868
|
-
* modelUrl: '/models/sensevoice/model.int8.onnx',
|
|
1869
|
-
* });
|
|
1870
|
-
* await asr.load();
|
|
1871
|
-
* const { text, emotion } = await asr.transcribe(audioSamples);
|
|
1872
|
-
* ```
|
|
1472
|
+
* import { createA2E } from '@omote/core';
|
|
1873
1473
|
*
|
|
1874
|
-
*
|
|
1875
|
-
*
|
|
1876
|
-
* const
|
|
1877
|
-
* modelUrl: '/models/sensevoice/model.int8.onnx',
|
|
1878
|
-
* useWorker: true,
|
|
1879
|
-
* });
|
|
1474
|
+
* const a2e = createA2E(); // uses HF CDN defaults (192MB fp16)
|
|
1475
|
+
* await a2e.load();
|
|
1476
|
+
* const { blendshapes } = await a2e.infer(audioSamples);
|
|
1880
1477
|
* ```
|
|
1881
1478
|
*
|
|
1882
|
-
* @example
|
|
1479
|
+
* @example Custom model URL
|
|
1883
1480
|
* ```typescript
|
|
1884
|
-
* const
|
|
1885
|
-
* modelUrl: '/models/sensevoice/model.int8.onnx',
|
|
1886
|
-
* useWorker: false,
|
|
1887
|
-
* });
|
|
1481
|
+
* const a2e = createA2E({ modelUrl: '/models/lam.onnx' });
|
|
1888
1482
|
* ```
|
|
1889
1483
|
*/
|
|
1890
1484
|
|
|
1891
1485
|
/**
|
|
1892
|
-
*
|
|
1486
|
+
* Configuration for the A2E factory
|
|
1893
1487
|
*/
|
|
1894
|
-
interface
|
|
1895
|
-
/**
|
|
1896
|
-
|
|
1897
|
-
/** Current backend type ('wasm' for worker, full backend for main thread, null if not loaded) */
|
|
1898
|
-
readonly backend: 'wasm' | 'webgpu' | null;
|
|
1899
|
-
/**
|
|
1900
|
-
* Load the ONNX model
|
|
1901
|
-
* @param onProgress - Optional progress callback (fires once at 100% for worker)
|
|
1902
|
-
* @returns Model loading information
|
|
1903
|
-
*/
|
|
1904
|
-
load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
|
|
1905
|
-
/**
|
|
1906
|
-
* Transcribe audio samples to text
|
|
1907
|
-
* @param audioSamples - Float32Array of audio samples at 16kHz
|
|
1908
|
-
* @returns Transcription result
|
|
1909
|
-
*/
|
|
1910
|
-
transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
|
|
1488
|
+
interface CreateA2EConfig extends InferenceFactoryConfig {
|
|
1489
|
+
/** URL for the ONNX model. Default: HuggingFace CDN */
|
|
1490
|
+
modelUrl?: string;
|
|
1911
1491
|
/**
|
|
1912
|
-
*
|
|
1492
|
+
* URL for external model data file (.onnx.data weights).
|
|
1493
|
+
* Default: `${modelUrl}.data`
|
|
1494
|
+
*
|
|
1495
|
+
* Set to `false` to skip external data loading (single-file models only).
|
|
1913
1496
|
*/
|
|
1914
|
-
|
|
1915
|
-
|
|
1916
|
-
|
|
1917
|
-
|
|
1918
|
-
|
|
1919
|
-
interface CreateSenseVoiceConfig extends InferenceFactoryConfig {
|
|
1920
|
-
/** Path or URL to model.int8.onnx (239MB). Default: HuggingFace CDN */
|
|
1921
|
-
modelUrl?: string;
|
|
1922
|
-
/** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
|
|
1923
|
-
tokensUrl?: string;
|
|
1924
|
-
/** Language hint (default: 'auto') */
|
|
1925
|
-
language?: SenseVoiceLanguage;
|
|
1926
|
-
/** Text normalization (default: 'with_itn') */
|
|
1927
|
-
textNorm?: 'with_itn' | 'without_itn';
|
|
1497
|
+
externalDataUrl?: string | false;
|
|
1498
|
+
/** Backend preference (default: 'auto') */
|
|
1499
|
+
backend?: BackendPreference;
|
|
1500
|
+
/** Number of identity classes (default: 12) */
|
|
1501
|
+
numIdentityClasses?: number;
|
|
1928
1502
|
}
|
|
1929
1503
|
/**
|
|
1930
|
-
* Create
|
|
1504
|
+
* Create an A2E instance
|
|
1931
1505
|
*
|
|
1932
1506
|
* @param config - Factory configuration
|
|
1933
|
-
* @returns
|
|
1507
|
+
* @returns An A2EBackend instance
|
|
1934
1508
|
*/
|
|
1935
|
-
declare function
|
|
1509
|
+
declare function createA2E(config?: CreateA2EConfig): A2EBackend;
|
|
1936
1510
|
|
|
1937
1511
|
/**
|
|
1938
|
-
* Shared
|
|
1939
|
-
*
|
|
1940
|
-
* Contains LAM_BLENDSHAPES (canonical ordering), symmetrization, and
|
|
1941
|
-
* index remapping used by both Wav2Vec2Inference and Wav2ArkitCpuInference.
|
|
1512
|
+
* Shared types for orchestration layer
|
|
1942
1513
|
*
|
|
1943
|
-
*
|
|
1944
|
-
|
|
1514
|
+
* @category Orchestration
|
|
1515
|
+
*/
|
|
1516
|
+
|
|
1517
|
+
/**
|
|
1518
|
+
* Generic frame source -- any object that emits 'frame' events with blendshapes.
|
|
1945
1519
|
*
|
|
1946
|
-
*
|
|
1520
|
+
* Implemented by PlaybackPipeline, MicLipSync, VoicePipeline, and any custom source.
|
|
1521
|
+
* Used by OmoteAvatar (all renderer adapters) to receive animation frames.
|
|
1947
1522
|
*/
|
|
1523
|
+
interface FrameSource {
|
|
1524
|
+
on(event: 'frame', callback: (frame: {
|
|
1525
|
+
blendshapes: Float32Array;
|
|
1526
|
+
emotion?: string;
|
|
1527
|
+
}) => void): void;
|
|
1528
|
+
off?(event: 'frame', callback: (...args: any[]) => void): void;
|
|
1529
|
+
}
|
|
1530
|
+
type VoicePipelineState = 'idle' | 'loading' | 'ready' | 'listening' | 'thinking' | 'speaking' | 'error';
|
|
1531
|
+
interface LoadingProgress {
|
|
1532
|
+
currentModel: string;
|
|
1533
|
+
progress: number;
|
|
1534
|
+
totalModels: number;
|
|
1535
|
+
modelsLoaded: number;
|
|
1536
|
+
}
|
|
1537
|
+
interface TranscriptResult {
|
|
1538
|
+
text: string;
|
|
1539
|
+
emotion?: string;
|
|
1540
|
+
language?: string;
|
|
1541
|
+
event?: string;
|
|
1542
|
+
isFinal: boolean;
|
|
1543
|
+
inferenceTimeMs?: number;
|
|
1544
|
+
}
|
|
1948
1545
|
/**
|
|
1949
|
-
*
|
|
1950
|
-
*
|
|
1546
|
+
* Consumer's response handler. VoicePipeline calls this with transcribed text.
|
|
1547
|
+
* Consumer must stream audio back for playback + lip sync.
|
|
1951
1548
|
*/
|
|
1952
|
-
|
|
1953
|
-
|
|
1954
|
-
|
|
1549
|
+
interface ResponseHandler {
|
|
1550
|
+
(params: {
|
|
1551
|
+
text: string;
|
|
1552
|
+
emotion?: string;
|
|
1553
|
+
event?: string;
|
|
1554
|
+
/** Set avatar emotion during response streaming (e.g., from LLM emotion_update messages) */
|
|
1555
|
+
setEmotion?: (emotion: string) => void;
|
|
1556
|
+
/** Stream audio chunks to pipeline for playback + lip sync */
|
|
1557
|
+
send: (chunk: Uint8Array) => Promise<void>;
|
|
1558
|
+
/** Call when all audio has been sent */
|
|
1559
|
+
done: () => Promise<void>;
|
|
1560
|
+
/** Aborted on interruption or stop() */
|
|
1561
|
+
signal: AbortSignal;
|
|
1562
|
+
/** Session ID for backend correlation */
|
|
1563
|
+
sessionId: string;
|
|
1564
|
+
}): Promise<void>;
|
|
1565
|
+
}
|
|
1566
|
+
|
|
1955
1567
|
/**
|
|
1956
|
-
*
|
|
1568
|
+
* TTSSpeaker — Shared helper for OmoteAvatar TTS integration.
|
|
1957
1569
|
*
|
|
1958
|
-
*
|
|
1959
|
-
*
|
|
1960
|
-
* transitions.
|
|
1570
|
+
* Encapsulates createA2E + TTSPlayback lifecycle so that renderer adapters
|
|
1571
|
+
* (Three.js, Babylon.js) and the R3F hook can delegate with ~15 lines each.
|
|
1961
1572
|
*
|
|
1962
|
-
* @
|
|
1963
|
-
* @param target - Target blendshape weights
|
|
1964
|
-
* @param factor - Interpolation factor (0 = no change, 1 = snap to target). Default: 0.3
|
|
1965
|
-
* @returns Interpolated weights as number[]
|
|
1573
|
+
* @category Audio
|
|
1966
1574
|
*/
|
|
1967
|
-
|
|
1575
|
+
|
|
1576
|
+
interface TTSSpeakerConfig {
|
|
1577
|
+
/** Per-character expression weight scaling */
|
|
1578
|
+
profile?: ExpressionProfile;
|
|
1579
|
+
/** Identity/style index for A2E model (default: 0) */
|
|
1580
|
+
identityIndex?: number;
|
|
1581
|
+
/** Audio playback delay in ms */
|
|
1582
|
+
audioDelayMs?: number;
|
|
1583
|
+
/** Enable neutral transition on playback complete */
|
|
1584
|
+
neutralTransitionEnabled?: boolean;
|
|
1585
|
+
/** Duration of neutral fade-out in ms */
|
|
1586
|
+
neutralTransitionMs?: number;
|
|
1587
|
+
/** Pre-built A2E backend (skip internal createA2E). */
|
|
1588
|
+
lam?: A2EBackend;
|
|
1589
|
+
/** LAM model config (only when lam not provided) */
|
|
1590
|
+
models?: CreateA2EConfig;
|
|
1591
|
+
/** Shared unified worker (recommended for iOS) */
|
|
1592
|
+
unifiedWorker?: UnifiedInferenceWorker;
|
|
1593
|
+
}
|
|
1594
|
+
declare class TTSSpeaker {
|
|
1595
|
+
private ttsPlayback;
|
|
1596
|
+
private tts;
|
|
1597
|
+
private ownedLam;
|
|
1598
|
+
private ownedWorker;
|
|
1599
|
+
private currentAbort;
|
|
1600
|
+
private _isSpeaking;
|
|
1601
|
+
private _audioOnly;
|
|
1602
|
+
private scheduler;
|
|
1603
|
+
/** Whether the speaker is currently playing audio. */
|
|
1604
|
+
get isSpeaking(): boolean;
|
|
1605
|
+
/** Whether this speaker is in audio-only mode (no lip sync). */
|
|
1606
|
+
get audioOnly(): boolean;
|
|
1607
|
+
/** The internal TTSPlayback (implements FrameSource). Null until connect() or in audio-only mode. */
|
|
1608
|
+
get frameSource(): FrameSource | null;
|
|
1609
|
+
/**
|
|
1610
|
+
* Connect a TTS backend.
|
|
1611
|
+
*
|
|
1612
|
+
* When config includes `lam`, `unifiedWorker`, or `models`, the full lip sync
|
|
1613
|
+
* pipeline is created (LAM + TTSPlayback + PlaybackPipeline).
|
|
1614
|
+
*
|
|
1615
|
+
* When config is omitted or has none of those, audio-only mode is used:
|
|
1616
|
+
* TTS → AudioScheduler (speakers only, no blendshapes, no LAM download).
|
|
1617
|
+
*
|
|
1618
|
+
* @param tts - TTS backend to use for speech synthesis
|
|
1619
|
+
* @param config - Optional configuration for A2E, expression profile, etc.
|
|
1620
|
+
*/
|
|
1621
|
+
connect(tts: TTSBackend, config?: TTSSpeakerConfig): Promise<void>;
|
|
1622
|
+
/**
|
|
1623
|
+
* Synthesize and play text with lip sync.
|
|
1624
|
+
* Auto-aborts previous speak if still in progress.
|
|
1625
|
+
*
|
|
1626
|
+
* @param text - Text to synthesize and play
|
|
1627
|
+
* @param options - Optional voice override and abort signal
|
|
1628
|
+
*/
|
|
1629
|
+
speak(text: string, options?: {
|
|
1630
|
+
signal?: AbortSignal;
|
|
1631
|
+
voice?: string;
|
|
1632
|
+
}): Promise<void>;
|
|
1633
|
+
/** Audio-only speak: TTS → resample → AudioScheduler (no blendshapes). */
|
|
1634
|
+
private speakAudioOnly;
|
|
1635
|
+
/** Poll scheduler until all audio has played. */
|
|
1636
|
+
private waitForSchedulerComplete;
|
|
1637
|
+
/**
|
|
1638
|
+
* Stream text token-by-token with automatic sentence buffering.
|
|
1639
|
+
* Designed for LLM token-by-token output. Sentences are detected at
|
|
1640
|
+
* boundary characters (.!?\n) with a minimum length threshold, then
|
|
1641
|
+
* synthesized and played with lip sync.
|
|
1642
|
+
*
|
|
1643
|
+
* Auto-aborts previous speak/streamText if still in progress.
|
|
1644
|
+
*
|
|
1645
|
+
* @param options - Optional voice override and abort signal
|
|
1646
|
+
* @returns Sink with push() and end() methods
|
|
1647
|
+
*/
|
|
1648
|
+
streamText(options: {
|
|
1649
|
+
signal?: AbortSignal;
|
|
1650
|
+
voice?: string;
|
|
1651
|
+
}): Promise<{
|
|
1652
|
+
push: (token: string) => void;
|
|
1653
|
+
end: () => Promise<void>;
|
|
1654
|
+
}>;
|
|
1655
|
+
/** streamText in audio-only mode: TTS → AudioScheduler (no blendshapes). */
|
|
1656
|
+
private streamTextAudioOnly;
|
|
1657
|
+
/** Abort current speak if any. */
|
|
1658
|
+
stop(): void;
|
|
1659
|
+
/** Clean teardown of all owned resources. */
|
|
1660
|
+
dispose(): Promise<void>;
|
|
1661
|
+
}
|
|
1968
1662
|
|
|
1969
1663
|
/**
|
|
1970
|
-
*
|
|
1664
|
+
* createTTSPlayer — Zero-config TTS player for audio-only playback.
|
|
1971
1665
|
*
|
|
1972
|
-
*
|
|
1973
|
-
*
|
|
1666
|
+
* Speaks text through speakers without an avatar. No LAM download, no lip sync.
|
|
1667
|
+
*
|
|
1668
|
+
* @example
|
|
1669
|
+
* ```typescript
|
|
1670
|
+
* import { createTTSPlayer } from '@omote/core';
|
|
1671
|
+
*
|
|
1672
|
+
* const player = createTTSPlayer();
|
|
1673
|
+
* await player.load();
|
|
1674
|
+
* await player.speak("Hello world!");
|
|
1675
|
+
*
|
|
1676
|
+
* // Streaming:
|
|
1677
|
+
* const stream = await player.streamText({});
|
|
1678
|
+
* stream.push("Hello ");
|
|
1679
|
+
* stream.push("world!");
|
|
1680
|
+
* await stream.end();
|
|
1681
|
+
* ```
|
|
1682
|
+
*
|
|
1683
|
+
* @category Audio
|
|
1684
|
+
*/
|
|
1685
|
+
|
|
1686
|
+
interface CreateTTSPlayerConfig {
|
|
1687
|
+
/** Voice to use (default: 'af_heart') */
|
|
1688
|
+
voice?: string;
|
|
1689
|
+
/** Model URL override */
|
|
1690
|
+
modelUrl?: string;
|
|
1691
|
+
/** Voice data base URL override */
|
|
1692
|
+
voiceBaseUrl?: string;
|
|
1693
|
+
}
|
|
1694
|
+
/**
|
|
1695
|
+
* Zero-config TTS player. Speak text through speakers without an avatar.
|
|
1696
|
+
*
|
|
1697
|
+
* Uses Kokoro TTS (82M q8, ~92MB) with automatic worker selection.
|
|
1698
|
+
* No LAM model is downloaded — audio plays directly through AudioScheduler.
|
|
1699
|
+
*/
|
|
1700
|
+
declare function createTTSPlayer(config?: CreateTTSPlayerConfig): TTSPlayer;
|
|
1701
|
+
/**
|
|
1702
|
+
* Thin wrapper: TTSSpeaker in audio-only mode + delegated load().
|
|
1703
|
+
*/
|
|
1704
|
+
declare class TTSPlayer extends TTSSpeaker {
|
|
1705
|
+
private backend;
|
|
1706
|
+
constructor(tts: TTSBackend);
|
|
1707
|
+
/** Load TTS model and connect in audio-only mode. */
|
|
1708
|
+
load(): Promise<void>;
|
|
1709
|
+
/** Whether the TTS model is loaded and ready. */
|
|
1710
|
+
get isLoaded(): boolean;
|
|
1711
|
+
}
|
|
1712
|
+
|
|
1713
|
+
/**
|
|
1714
|
+
* Factory function for SenseVoice ASR with automatic Worker vs main thread selection
|
|
1715
|
+
*
|
|
1716
|
+
* Provides a unified API that automatically selects the optimal implementation:
|
|
1717
|
+
* - Worker supported: Uses SenseVoiceWorker (off-main-thread inference)
|
|
1718
|
+
* - Worker unsupported: Uses SenseVoiceInference (main thread)
|
|
1974
1719
|
*
|
|
1975
1720
|
* @category Inference
|
|
1976
1721
|
*
|
|
1977
|
-
* @example
|
|
1722
|
+
* @example Auto-detect (recommended)
|
|
1978
1723
|
* ```typescript
|
|
1979
|
-
* import {
|
|
1724
|
+
* import { createSenseVoice } from '@omote/core';
|
|
1980
1725
|
*
|
|
1981
|
-
* const
|
|
1982
|
-
*
|
|
1726
|
+
* const asr = createSenseVoice({
|
|
1727
|
+
* modelUrl: '/models/sensevoice/model.int8.onnx',
|
|
1728
|
+
* });
|
|
1729
|
+
* await asr.load();
|
|
1730
|
+
* const { text, emotion } = await asr.transcribe(audioSamples);
|
|
1731
|
+
* ```
|
|
1983
1732
|
*
|
|
1984
|
-
*
|
|
1985
|
-
*
|
|
1986
|
-
*
|
|
1733
|
+
* @example Force worker
|
|
1734
|
+
* ```typescript
|
|
1735
|
+
* const asr = createSenseVoice({
|
|
1736
|
+
* modelUrl: '/models/sensevoice/model.int8.onnx',
|
|
1737
|
+
* useWorker: true,
|
|
1738
|
+
* });
|
|
1739
|
+
* ```
|
|
1740
|
+
*
|
|
1741
|
+
* @example Force main thread
|
|
1742
|
+
* ```typescript
|
|
1743
|
+
* const asr = createSenseVoice({
|
|
1744
|
+
* modelUrl: '/models/sensevoice/model.int8.onnx',
|
|
1745
|
+
* useWorker: false,
|
|
1746
|
+
* });
|
|
1987
1747
|
* ```
|
|
1988
1748
|
*/
|
|
1989
1749
|
|
|
1990
|
-
|
|
1991
|
-
interface
|
|
1992
|
-
|
|
1993
|
-
|
|
1750
|
+
/**
|
|
1751
|
+
* Common interface for both SenseVoiceInference and SenseVoiceWorker
|
|
1752
|
+
*/
|
|
1753
|
+
interface SenseVoiceBackend {
|
|
1754
|
+
/** Whether the model is loaded and ready for inference */
|
|
1755
|
+
readonly isLoaded: boolean;
|
|
1756
|
+
/** Current backend type ('wasm' for worker, full backend for main thread, null if not loaded) */
|
|
1757
|
+
readonly backend: 'wasm' | 'webgpu' | null;
|
|
1994
1758
|
/**
|
|
1995
|
-
*
|
|
1996
|
-
*
|
|
1997
|
-
*
|
|
1998
|
-
* Set to `false` to skip external data loading (single-file models only).
|
|
1759
|
+
* Load the ONNX model
|
|
1760
|
+
* @param onProgress - Optional progress callback (fires once at 100% for worker)
|
|
1761
|
+
* @returns Model loading information
|
|
1999
1762
|
*/
|
|
2000
|
-
|
|
2001
|
-
/** Preferred backend (auto will try WebGPU first, fallback to WASM) */
|
|
2002
|
-
backend?: InferenceBackend;
|
|
2003
|
-
/** Number of identity classes (default: 12 for streaming model) */
|
|
2004
|
-
numIdentityClasses?: number;
|
|
1763
|
+
load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
|
|
2005
1764
|
/**
|
|
2006
|
-
*
|
|
2007
|
-
*
|
|
2008
|
-
*
|
|
1765
|
+
* Transcribe audio samples to text
|
|
1766
|
+
* @param audioSamples - Float32Array of audio samples at 16kHz
|
|
1767
|
+
* @returns Transcription result
|
|
2009
1768
|
*/
|
|
2010
|
-
|
|
1769
|
+
transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
|
|
1770
|
+
/**
|
|
1771
|
+
* Dispose of the model and free resources
|
|
1772
|
+
*/
|
|
1773
|
+
dispose(): Promise<void>;
|
|
2011
1774
|
}
|
|
2012
|
-
|
|
2013
|
-
|
|
2014
|
-
|
|
2015
|
-
|
|
2016
|
-
|
|
1775
|
+
/**
|
|
1776
|
+
* Configuration for the SenseVoice factory
|
|
1777
|
+
*/
|
|
1778
|
+
interface CreateSenseVoiceConfig extends InferenceFactoryConfig {
|
|
1779
|
+
/** Path or URL to model.int8.onnx (239MB). Default: HuggingFace CDN */
|
|
1780
|
+
modelUrl?: string;
|
|
1781
|
+
/** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
|
|
1782
|
+
tokensUrl?: string;
|
|
1783
|
+
/** Language hint (default: 'auto') */
|
|
1784
|
+
language?: SenseVoiceLanguage;
|
|
1785
|
+
/** Text normalization (default: 'with_itn') */
|
|
1786
|
+
textNorm?: 'with_itn' | 'without_itn';
|
|
2017
1787
|
}
|
|
1788
|
+
/**
|
|
1789
|
+
* Create a SenseVoice ASR instance with automatic implementation selection
|
|
1790
|
+
*
|
|
1791
|
+
* @param config - Factory configuration
|
|
1792
|
+
* @returns A SenseVoiceBackend instance (either Worker or main thread)
|
|
1793
|
+
*/
|
|
1794
|
+
declare function createSenseVoice(config?: CreateSenseVoiceConfig): SenseVoiceBackend;
|
|
2018
1795
|
|
|
2019
1796
|
/**
|
|
2020
|
-
*
|
|
2021
|
-
*
|
|
1797
|
+
* Factory function for Silero VAD with automatic Worker vs main thread selection
|
|
1798
|
+
*
|
|
1799
|
+
* Provides a unified API that automatically selects the optimal implementation:
|
|
1800
|
+
* - Desktop browsers: Uses SileroVADWorker (off-main-thread inference)
|
|
1801
|
+
* - Mobile devices: Uses SileroVADInference (main thread, avoids memory overhead)
|
|
1802
|
+
* - Fallback: Gracefully falls back to main thread if Worker fails
|
|
1803
|
+
*
|
|
1804
|
+
* @category Inference
|
|
1805
|
+
*
|
|
1806
|
+
* @example Basic usage (auto-detect)
|
|
1807
|
+
* ```typescript
|
|
1808
|
+
* import { createSileroVAD } from '@omote/core';
|
|
1809
|
+
*
|
|
1810
|
+
* const vad = createSileroVAD({
|
|
1811
|
+
* modelUrl: '/models/silero-vad.onnx',
|
|
1812
|
+
* threshold: 0.5,
|
|
1813
|
+
* });
|
|
1814
|
+
*
|
|
1815
|
+
* await vad.load();
|
|
1816
|
+
* const result = await vad.process(audioChunk);
|
|
1817
|
+
* if (result.isSpeech) {
|
|
1818
|
+
* console.log('Speech detected!', result.probability);
|
|
1819
|
+
* }
|
|
1820
|
+
* ```
|
|
1821
|
+
*
|
|
1822
|
+
* @example Force worker usage
|
|
1823
|
+
* ```typescript
|
|
1824
|
+
* const vad = createSileroVAD({
|
|
1825
|
+
* modelUrl: '/models/silero-vad.onnx',
|
|
1826
|
+
* useWorker: true, // Force Worker even on mobile
|
|
1827
|
+
* });
|
|
1828
|
+
* ```
|
|
1829
|
+
*
|
|
1830
|
+
* @example Force main thread
|
|
1831
|
+
* ```typescript
|
|
1832
|
+
* const vad = createSileroVAD({
|
|
1833
|
+
* modelUrl: '/models/silero-vad.onnx',
|
|
1834
|
+
* useWorker: false, // Force main thread
|
|
1835
|
+
* });
|
|
1836
|
+
* ```
|
|
2022
1837
|
*/
|
|
2023
|
-
|
|
2024
|
-
|
|
2025
|
-
|
|
2026
|
-
|
|
2027
|
-
|
|
2028
|
-
|
|
2029
|
-
|
|
2030
|
-
|
|
1838
|
+
|
|
1839
|
+
/**
|
|
1840
|
+
* Common interface for both SileroVADInference and SileroVADWorker
|
|
1841
|
+
*
|
|
1842
|
+
* This interface defines the shared API that both implementations provide,
|
|
1843
|
+
* allowing consumers to use either interchangeably.
|
|
1844
|
+
*/
|
|
1845
|
+
interface SileroVADBackend {
|
|
1846
|
+
/** Current backend type (webgpu, wasm, or null if not loaded) */
|
|
1847
|
+
readonly backend: RuntimeBackend | null;
|
|
1848
|
+
/** Whether the model is loaded and ready for inference */
|
|
1849
|
+
readonly isLoaded: boolean;
|
|
1850
|
+
/** Audio sample rate (8000 or 16000 Hz) */
|
|
1851
|
+
readonly sampleRate: number;
|
|
1852
|
+
/** Speech detection threshold (0-1) */
|
|
1853
|
+
readonly threshold: number;
|
|
1854
|
+
/**
|
|
1855
|
+
* Load the ONNX model
|
|
1856
|
+
* @returns Model loading information
|
|
1857
|
+
*/
|
|
1858
|
+
load(): Promise<VADModelInfo | VADWorkerModelInfo>;
|
|
1859
|
+
/**
|
|
1860
|
+
* Process a single audio chunk
|
|
1861
|
+
* @param audioChunk - Float32Array of exactly chunkSize samples
|
|
1862
|
+
* @returns VAD result with speech probability
|
|
1863
|
+
*/
|
|
1864
|
+
process(audioChunk: Float32Array): Promise<VADResult>;
|
|
1865
|
+
/**
|
|
1866
|
+
* Reset state for new audio stream
|
|
1867
|
+
*/
|
|
1868
|
+
reset(): void | Promise<void>;
|
|
1869
|
+
/**
|
|
1870
|
+
* Dispose of the model and free resources
|
|
1871
|
+
*/
|
|
1872
|
+
dispose(): Promise<void>;
|
|
1873
|
+
/**
|
|
1874
|
+
* Get required chunk size in samples
|
|
1875
|
+
*/
|
|
1876
|
+
getChunkSize(): number;
|
|
1877
|
+
/**
|
|
1878
|
+
* Get chunk duration in milliseconds
|
|
1879
|
+
*/
|
|
1880
|
+
getChunkDurationMs(): number;
|
|
2031
1881
|
}
|
|
2032
|
-
|
|
2033
|
-
|
|
2034
|
-
|
|
2035
|
-
|
|
2036
|
-
|
|
2037
|
-
|
|
2038
|
-
|
|
2039
|
-
|
|
2040
|
-
readonly chunkSize: number;
|
|
2041
|
-
private inferenceQueue;
|
|
2042
|
-
private poisoned;
|
|
2043
|
-
private static readonly INFERENCE_TIMEOUT_MS;
|
|
2044
|
-
constructor(config: Wav2Vec2InferenceConfig);
|
|
1882
|
+
/**
|
|
1883
|
+
* Configuration for the Silero VAD factory
|
|
1884
|
+
*
|
|
1885
|
+
* Extends SileroVADConfig with worker-specific options.
|
|
1886
|
+
*/
|
|
1887
|
+
interface SileroVADFactoryConfig extends Omit<SileroVADConfig, 'modelUrl'>, InferenceFactoryConfig {
|
|
1888
|
+
/** Path or URL to the ONNX model. Default: HuggingFace CDN */
|
|
1889
|
+
modelUrl?: string;
|
|
2045
1890
|
/**
|
|
2046
|
-
*
|
|
2047
|
-
*
|
|
1891
|
+
* Fallback to main thread on worker errors.
|
|
1892
|
+
*
|
|
1893
|
+
* When true (default), if the Worker fails to load or encounters an error,
|
|
1894
|
+
* the factory will automatically create a main thread instance instead.
|
|
1895
|
+
*
|
|
1896
|
+
* When false, worker errors will propagate as exceptions.
|
|
1897
|
+
*
|
|
1898
|
+
* Default: true
|
|
2048
1899
|
*/
|
|
2049
|
-
|
|
2050
|
-
|
|
2051
|
-
|
|
2052
|
-
|
|
2053
|
-
|
|
1900
|
+
fallbackOnError?: boolean;
|
|
1901
|
+
}
|
|
1902
|
+
/**
|
|
1903
|
+
* Check if the current environment supports VAD Web Workers
|
|
1904
|
+
*
|
|
1905
|
+
* Requirements:
|
|
1906
|
+
* - Worker constructor must exist
|
|
1907
|
+
* - Blob URL support (for inline worker script)
|
|
1908
|
+
*
|
|
1909
|
+
* @returns true if VAD Worker is supported
|
|
1910
|
+
*/
|
|
1911
|
+
declare function supportsVADWorker(): boolean;
|
|
1912
|
+
/**
|
|
1913
|
+
* Create a Silero VAD instance with automatic implementation selection
|
|
1914
|
+
*
|
|
1915
|
+
* This factory function automatically selects between:
|
|
1916
|
+
* - **SileroVADWorker**: Off-main-thread inference (better for desktop)
|
|
1917
|
+
* - **SileroVADInference**: Main thread inference (better for mobile)
|
|
1918
|
+
*
|
|
1919
|
+
* The selection is based on:
|
|
1920
|
+
* 1. Explicit `useWorker` config (if provided)
|
|
1921
|
+
* 2. Platform detection (mobile vs desktop)
|
|
1922
|
+
* 3. Worker API availability
|
|
1923
|
+
*
|
|
1924
|
+
* Both implementations share the same interface (SileroVADBackend),
|
|
1925
|
+
* so consumers can use either interchangeably.
|
|
1926
|
+
*
|
|
1927
|
+
* @param config - Factory configuration
|
|
1928
|
+
* @returns A SileroVAD instance (either Worker or main thread)
|
|
1929
|
+
*
|
|
1930
|
+
* @example
|
|
1931
|
+
* ```typescript
|
|
1932
|
+
* // Auto-detect (recommended)
|
|
1933
|
+
* const vad = createSileroVAD({ modelUrl: '/models/silero-vad.onnx' });
|
|
1934
|
+
*
|
|
1935
|
+
* // Force Worker
|
|
1936
|
+
* const vadWorker = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: true });
|
|
1937
|
+
*
|
|
1938
|
+
* // Force main thread
|
|
1939
|
+
* const vadMain = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: false });
|
|
1940
|
+
* ```
|
|
1941
|
+
*/
|
|
1942
|
+
declare function createSileroVAD(config?: SileroVADFactoryConfig): SileroVADBackend;
|
|
1943
|
+
|
|
1944
|
+
/**
|
|
1945
|
+
* SpeechListener — Standalone listening primitive.
|
|
1946
|
+
*
|
|
1947
|
+
* Composes: MicrophoneCapture → SileroVAD → SenseVoice ASR → transcript events.
|
|
1948
|
+
* Extracted from VoicePipeline's listening half so it can be used independently.
|
|
1949
|
+
*
|
|
1950
|
+
* Does NOT handle TTS, LAM, or response routing — those belong to TTSSpeaker
|
|
1951
|
+
* and VoicePipeline respectively.
|
|
1952
|
+
*
|
|
1953
|
+
* @category Audio
|
|
1954
|
+
*/
|
|
1955
|
+
|
|
1956
|
+
interface SpeechListenerConfig {
|
|
1957
|
+
/** Pre-built backends — skip internal factory creation. */
|
|
1958
|
+
backends?: {
|
|
1959
|
+
asr: SenseVoiceBackend;
|
|
1960
|
+
vad: SileroVADBackend;
|
|
1961
|
+
};
|
|
1962
|
+
/** External unified worker (reuse across pipelines). */
|
|
1963
|
+
unifiedWorker?: UnifiedInferenceWorker;
|
|
1964
|
+
/** URLs and options for model loading (when backends not provided). */
|
|
1965
|
+
models?: {
|
|
1966
|
+
senseVoice: {
|
|
1967
|
+
modelUrl: string;
|
|
1968
|
+
tokensUrl?: string;
|
|
1969
|
+
language?: string;
|
|
1970
|
+
};
|
|
1971
|
+
vad: {
|
|
1972
|
+
modelUrl: string;
|
|
1973
|
+
threshold?: number;
|
|
1974
|
+
preSpeechBufferChunks?: number;
|
|
1975
|
+
};
|
|
1976
|
+
};
|
|
1977
|
+
/** Base silence timeout in ms (default: 500) */
|
|
1978
|
+
silenceTimeoutMs?: number;
|
|
1979
|
+
/** Extended silence timeout for long utterances (default: 700) */
|
|
1980
|
+
silenceTimeoutExtendedMs?: number;
|
|
1981
|
+
/** Enable adaptive timeout based on speech duration (default: true) */
|
|
1982
|
+
adaptiveTimeout?: boolean;
|
|
1983
|
+
/** Minimum audio duration in seconds (default: 0.3) */
|
|
1984
|
+
minAudioDurationSec?: number;
|
|
1985
|
+
/** Minimum audio energy (default: 0.02) */
|
|
1986
|
+
minAudioEnergy?: number;
|
|
1987
|
+
/** Enable audio normalization for quiet audio (default: true) */
|
|
1988
|
+
normalizeAudio?: boolean;
|
|
1989
|
+
/** Progressive transcription interval — desktop (default: 500ms) */
|
|
1990
|
+
progressiveIntervalMs?: number;
|
|
1991
|
+
/** Progressive transcription interval — iOS (default: 800ms) */
|
|
1992
|
+
progressiveIntervalIosMs?: number;
|
|
1993
|
+
/** Coverage threshold to use progressive result (default: 0.8) */
|
|
1994
|
+
progressiveCoverageThreshold?: number;
|
|
1995
|
+
/** Minimum samples before progressive transcription starts (default: 8000) */
|
|
1996
|
+
progressiveMinSamples?: number;
|
|
1997
|
+
/** Timeout for individual transcribe() calls (default: 10000ms) */
|
|
1998
|
+
transcriptionTimeoutMs?: number;
|
|
1999
|
+
}
|
|
2000
|
+
type SpeechListenerState = 'idle' | 'loading' | 'ready' | 'listening' | 'processing' | 'paused';
|
|
2001
|
+
interface SpeechListenerEvents {
|
|
2002
|
+
'state': SpeechListenerState;
|
|
2003
|
+
'loading:progress': LoadingProgress;
|
|
2004
|
+
'transcript': TranscriptResult;
|
|
2005
|
+
'speech:start': void;
|
|
2006
|
+
'speech:end': {
|
|
2007
|
+
durationMs: number;
|
|
2008
|
+
};
|
|
2009
|
+
'audio:level': {
|
|
2010
|
+
rms: number;
|
|
2011
|
+
peak: number;
|
|
2012
|
+
};
|
|
2013
|
+
'audio:chunk': Float32Array;
|
|
2014
|
+
'error': Error;
|
|
2015
|
+
[key: string]: unknown;
|
|
2016
|
+
}
|
|
2017
|
+
declare class SpeechListener extends EventEmitter<SpeechListenerEvents> {
|
|
2018
|
+
private readonly config;
|
|
2019
|
+
private _state;
|
|
2020
|
+
private epoch;
|
|
2021
|
+
private asr;
|
|
2022
|
+
private vad;
|
|
2023
|
+
private ownedWorker;
|
|
2024
|
+
private mic;
|
|
2025
|
+
private omoteEvents;
|
|
2026
|
+
private _unsubChunk;
|
|
2027
|
+
private _unsubLevel;
|
|
2028
|
+
private static readonly MAX_AUDIO_BUFFER_SAMPLES;
|
|
2029
|
+
private audioBuffer;
|
|
2030
|
+
private audioBufferSamples;
|
|
2031
|
+
private speechStartTime;
|
|
2032
|
+
private silenceTimer;
|
|
2033
|
+
private isSpeechActive;
|
|
2034
|
+
private progressiveTimer;
|
|
2035
|
+
private progressivePromise;
|
|
2036
|
+
private lastProgressiveResult;
|
|
2037
|
+
private lastProgressiveSamples;
|
|
2038
|
+
private asrErrorCount;
|
|
2039
|
+
/** Current listener state */
|
|
2040
|
+
get state(): SpeechListenerState;
|
|
2041
|
+
constructor(config?: SpeechListenerConfig);
|
|
2054
2042
|
/**
|
|
2055
|
-
* Load
|
|
2043
|
+
* Load ASR + VAD models. Only loads speech recognition models,
|
|
2044
|
+
* NOT TTS or LAM (those belong to TTSSpeaker).
|
|
2056
2045
|
*/
|
|
2057
|
-
|
|
2046
|
+
loadModels(): Promise<void>;
|
|
2047
|
+
/** Start listening — activates mic + VAD. */
|
|
2048
|
+
start(): Promise<void>;
|
|
2049
|
+
/** Stop listening — deactivates mic, clears buffers. */
|
|
2050
|
+
stop(): void;
|
|
2051
|
+
/** Pause VAD/ASR but keep mic active for audio:chunk events (for interruption detection). */
|
|
2052
|
+
pause(): void;
|
|
2053
|
+
/** Resume VAD/ASR from paused state. */
|
|
2054
|
+
resume(): void;
|
|
2055
|
+
/** Dispose all resources. */
|
|
2056
|
+
dispose(): Promise<void>;
|
|
2057
|
+
private processAudioChunk;
|
|
2058
|
+
private getSilenceTimeout;
|
|
2059
|
+
private onSilenceDetected;
|
|
2060
|
+
private processEndOfSpeech;
|
|
2061
|
+
private startProgressiveTranscription;
|
|
2062
|
+
private stopProgressiveTranscription;
|
|
2063
|
+
private transcribeWithTimeout;
|
|
2064
|
+
private normalizeAudio;
|
|
2065
|
+
private setState;
|
|
2066
|
+
private emitProgress;
|
|
2067
|
+
private clearSilenceTimer;
|
|
2068
|
+
}
|
|
2069
|
+
|
|
2070
|
+
/**
|
|
2071
|
+
* Interruption Handler
|
|
2072
|
+
*
|
|
2073
|
+
* VAD-based barge-in detection for AI conversations:
|
|
2074
|
+
* - Monitors VAD probability for user speech
|
|
2075
|
+
* - Detects when user interrupts AI response
|
|
2076
|
+
* - Triggers interruption callbacks
|
|
2077
|
+
*/
|
|
2078
|
+
|
|
2079
|
+
interface InterruptionEvents {
|
|
2080
|
+
[key: string]: unknown;
|
|
2081
|
+
'speech.detected': {
|
|
2082
|
+
rms: number;
|
|
2083
|
+
};
|
|
2084
|
+
'speech.ended': {
|
|
2085
|
+
durationMs: number;
|
|
2086
|
+
};
|
|
2087
|
+
'interruption.triggered': {
|
|
2088
|
+
rms: number;
|
|
2089
|
+
durationMs: number;
|
|
2090
|
+
};
|
|
2091
|
+
}
|
|
2092
|
+
/**
|
|
2093
|
+
* Interruption handler configuration
|
|
2094
|
+
*
|
|
2095
|
+
* Industry standards applied:
|
|
2096
|
+
* - vadThreshold: 0.5 (Silero VAD default)
|
|
2097
|
+
* - minSpeechDurationMs: 200ms (Google/Amazon barge-in standard)
|
|
2098
|
+
* - silenceTimeoutMs: 500ms (OpenAI Realtime API standard)
|
|
2099
|
+
*/
|
|
2100
|
+
interface InterruptionConfig {
|
|
2101
|
+
/** VAD probability threshold for speech detection (default: 0.5, Silero standard) */
|
|
2102
|
+
vadThreshold?: number;
|
|
2103
|
+
/** Minimum speech duration to trigger interruption (default: 200ms, Google/Amazon standard) */
|
|
2104
|
+
minSpeechDurationMs?: number;
|
|
2105
|
+
/** Silence duration to end speech (default: 500ms, OpenAI standard) */
|
|
2106
|
+
silenceTimeoutMs?: number;
|
|
2107
|
+
/** Enable interruption detection (default: true) */
|
|
2108
|
+
enabled?: boolean;
|
|
2109
|
+
}
|
|
2110
|
+
declare class InterruptionHandler extends EventEmitter<InterruptionEvents> {
|
|
2111
|
+
private config;
|
|
2112
|
+
private isSpeaking;
|
|
2113
|
+
private speechStartTime;
|
|
2114
|
+
private lastSpeechTime;
|
|
2115
|
+
private silenceTimer;
|
|
2116
|
+
private aiIsSpeaking;
|
|
2117
|
+
private interruptionTriggeredThisSession;
|
|
2118
|
+
constructor(config?: InterruptionConfig);
|
|
2058
2119
|
/**
|
|
2059
|
-
*
|
|
2060
|
-
*
|
|
2061
|
-
*
|
|
2120
|
+
* Process raw audio energy for interruption detection (no VAD required).
|
|
2121
|
+
* Used during speaking state when the unified worker is busy with TTS.
|
|
2122
|
+
* Echo-cancelled mic input means energy above threshold = user speech.
|
|
2062
2123
|
*
|
|
2063
|
-
*
|
|
2064
|
-
|
|
2065
|
-
infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
|
|
2066
|
-
/**
|
|
2067
|
-
* Queue inference to serialize ONNX session calls
|
|
2068
|
-
*/
|
|
2069
|
-
private queueInference;
|
|
2070
|
-
/**
|
|
2071
|
-
* Get blendshape value by name for a specific frame
|
|
2124
|
+
* @param rms - RMS energy of audio chunk (0-1)
|
|
2125
|
+
* @param energyThreshold - Minimum energy to consider speech (default: 0.02)
|
|
2072
2126
|
*/
|
|
2073
|
-
|
|
2127
|
+
processAudioEnergy(rms: number, energyThreshold?: number): void;
|
|
2074
2128
|
/**
|
|
2075
|
-
*
|
|
2129
|
+
* Process VAD result for interruption detection
|
|
2130
|
+
* @param vadProbability - Speech probability from VAD (0-1)
|
|
2131
|
+
* @param audioEnergy - Optional RMS energy for logging (default: 0)
|
|
2076
2132
|
*/
|
|
2077
|
-
|
|
2133
|
+
processVADResult(vadProbability: number, audioEnergy?: number): void;
|
|
2134
|
+
/** Notify that AI started/stopped speaking */
|
|
2135
|
+
setAISpeaking(speaking: boolean): void;
|
|
2136
|
+
/** Enable/disable interruption detection */
|
|
2137
|
+
setEnabled(enabled: boolean): void;
|
|
2138
|
+
/** Update configuration */
|
|
2139
|
+
updateConfig(config: Partial<InterruptionConfig>): void;
|
|
2140
|
+
/** Reset state */
|
|
2141
|
+
reset(): void;
|
|
2142
|
+
/** Get current state */
|
|
2143
|
+
getState(): {
|
|
2144
|
+
isSpeaking: boolean;
|
|
2145
|
+
speechDurationMs: number;
|
|
2146
|
+
};
|
|
2147
|
+
private onSpeechDetected;
|
|
2148
|
+
private onSilenceDetected;
|
|
2078
2149
|
}
|
|
2079
2150
|
|
|
2080
2151
|
/**
|
|
2081
|
-
*
|
|
2152
|
+
* SenseVoice ASR Web Worker implementation
|
|
2082
2153
|
*
|
|
2083
|
-
*
|
|
2084
|
-
*
|
|
2085
|
-
*
|
|
2086
|
-
* at startup to point any or all models at your own CDN.
|
|
2154
|
+
* Runs SenseVoice speech recognition in a dedicated Web Worker to prevent
|
|
2155
|
+
* main thread blocking. Uses inline worker script (Blob URL pattern) to
|
|
2156
|
+
* avoid separate file deployment.
|
|
2087
2157
|
*
|
|
2088
|
-
*
|
|
2158
|
+
* Key design decisions:
|
|
2159
|
+
* - WASM backend only (WebGPU doesn't work in Workers)
|
|
2160
|
+
* - All preprocessing (fbank, LFR, CMVN) and CTC decoding inlined in worker
|
|
2161
|
+
* - Audio copied (not transferred) to retain main thread access
|
|
2162
|
+
* - ONNX Runtime loaded from CDN in worker (no bundler complications)
|
|
2163
|
+
* - iOS: model URL passed as string to ORT (avoids 239MB JS heap allocation)
|
|
2089
2164
|
*
|
|
2090
|
-
* @
|
|
2091
|
-
* ```typescript
|
|
2092
|
-
* import { createA2E } from '@omote/core';
|
|
2093
|
-
* const a2e = createA2E(); // fetches from HuggingFace CDN
|
|
2094
|
-
* ```
|
|
2165
|
+
* @category Inference
|
|
2095
2166
|
*
|
|
2096
|
-
* @example
|
|
2167
|
+
* @example Basic usage
|
|
2097
2168
|
* ```typescript
|
|
2098
|
-
* import {
|
|
2099
|
-
*
|
|
2100
|
-
* configureModelUrls({
|
|
2101
|
-
* lam: 'https://cdn.example.com/models/model_fp16.onnx',
|
|
2102
|
-
* senseVoice: 'https://cdn.example.com/models/sensevoice.int8.onnx',
|
|
2103
|
-
* // omitted keys keep HuggingFace defaults
|
|
2104
|
-
* });
|
|
2105
|
-
*
|
|
2106
|
-
* const a2e = createA2E(); // now fetches from your CDN
|
|
2107
|
-
* ```
|
|
2108
|
-
*/
|
|
2109
|
-
/** Model URL keys that can be configured */
|
|
2110
|
-
type ModelUrlKey = 'lam' | 'lamIos' | 'wav2arkitCpu' | 'senseVoice' | 'sileroVad' | 'kokoroTTS' | 'kokoroVoices';
|
|
2111
|
-
/**
|
|
2112
|
-
* Resolved model URLs — user overrides take priority, HuggingFace CDN is fallback.
|
|
2113
|
-
*
|
|
2114
|
-
* All SDK factories (`createA2E`, `createSenseVoice`, `createSileroVAD`) and
|
|
2115
|
-
* orchestrators (`VoicePipeline`) read from this object. Call
|
|
2116
|
-
* {@link configureModelUrls} before constructing any pipelines to point
|
|
2117
|
-
* models at your own CDN.
|
|
2118
|
-
*/
|
|
2119
|
-
declare const DEFAULT_MODEL_URLS: Readonly<Record<ModelUrlKey, string>>;
|
|
2120
|
-
/**
|
|
2121
|
-
* Configure custom model URLs. Overrides persist for the lifetime of the page.
|
|
2122
|
-
* Omitted keys keep their HuggingFace CDN defaults.
|
|
2123
|
-
*
|
|
2124
|
-
* Call this **once** at app startup, before constructing any pipelines.
|
|
2169
|
+
* import { SenseVoiceWorker } from '@omote/core';
|
|
2125
2170
|
*
|
|
2126
|
-
*
|
|
2127
|
-
*
|
|
2128
|
-
*
|
|
2129
|
-
* lam: 'https://cdn.example.com/models/model_fp16.onnx',
|
|
2130
|
-
* wav2arkitCpu: 'https://cdn.example.com/models/wav2arkit_cpu.onnx',
|
|
2131
|
-
* senseVoice: 'https://cdn.example.com/models/sensevoice.int8.onnx',
|
|
2132
|
-
* sileroVad: 'https://cdn.example.com/models/silero-vad.onnx',
|
|
2171
|
+
* const asr = new SenseVoiceWorker({
|
|
2172
|
+
* modelUrl: '/models/sensevoice/model.int8.onnx',
|
|
2173
|
+
* tokensUrl: '/models/sensevoice/tokens.txt',
|
|
2133
2174
|
* });
|
|
2134
|
-
*
|
|
2175
|
+
* await asr.load();
|
|
2135
2176
|
*
|
|
2136
|
-
*
|
|
2137
|
-
*
|
|
2138
|
-
*
|
|
2139
|
-
*
|
|
2140
|
-
* });
|
|
2177
|
+
* const { text, emotion, language } = await asr.transcribe(audioSamples);
|
|
2178
|
+
* console.log(text); // "Hello world"
|
|
2179
|
+
* console.log(emotion); // "NEUTRAL"
|
|
2180
|
+
* console.log(language); // "en"
|
|
2141
2181
|
* ```
|
|
2142
2182
|
*/
|
|
2143
|
-
|
|
2144
|
-
/**
|
|
2145
|
-
* Reset all model URL overrides back to HuggingFace CDN defaults.
|
|
2146
|
-
* Mainly useful for testing.
|
|
2147
|
-
*/
|
|
2148
|
-
declare function resetModelUrls(): void;
|
|
2183
|
+
|
|
2149
2184
|
/**
|
|
2150
|
-
*
|
|
2151
|
-
* Useful for documentation or fallback logic.
|
|
2185
|
+
* Configuration for SenseVoice Worker
|
|
2152
2186
|
*/
|
|
2153
|
-
|
|
2154
|
-
|
|
2187
|
+
interface SenseVoiceWorkerConfig {
|
|
2188
|
+
/** Path or URL to model.int8.onnx (239MB) */
|
|
2189
|
+
modelUrl: string;
|
|
2190
|
+
/** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
|
|
2191
|
+
tokensUrl?: string;
|
|
2192
|
+
/** Language hint (default: 'auto' for auto-detection) */
|
|
2193
|
+
language?: SenseVoiceLanguage;
|
|
2194
|
+
/** Text normalization: 'with_itn' applies inverse text normalization (default: 'with_itn') */
|
|
2195
|
+
textNorm?: 'with_itn' | 'without_itn';
|
|
2196
|
+
}
|
|
2155
2197
|
/**
|
|
2156
|
-
*
|
|
2157
|
-
*
|
|
2158
|
-
* A Safari/iOS-compatible alternative to Wav2Vec2Inference (192MB fp16) designed
|
|
2159
|
-
* for platforms where WebGPU crashes due to ONNX Runtime JSEP bugs.
|
|
2160
|
-
*
|
|
2161
|
-
* The model uses ONNX external data format:
|
|
2162
|
-
* - wav2arkit_cpu.onnx (1.86MB graph structure)
|
|
2163
|
-
* - wav2arkit_cpu.onnx.data (402MB weights)
|
|
2164
|
-
* Both files are fetched and cached automatically.
|
|
2165
|
-
*
|
|
2166
|
-
* Key differences from Wav2Vec2Inference:
|
|
2167
|
-
* - WASM-only backend (CPU-optimized, single-threaded, no WebGPU)
|
|
2168
|
-
* - No identity input (baked to identity 11)
|
|
2169
|
-
* - No ASR output (lip sync only)
|
|
2170
|
-
* - Dynamic input length (not fixed to 16000 samples)
|
|
2171
|
-
* - Different native blendshape ordering (remapped to LAM_BLENDSHAPES)
|
|
2172
|
-
*
|
|
2173
|
-
* @category Inference
|
|
2174
|
-
*
|
|
2175
|
-
* @example
|
|
2176
|
-
* ```typescript
|
|
2177
|
-
* import { Wav2ArkitCpuInference } from '@omote/core';
|
|
2198
|
+
* SenseVoice ASR Worker - Speech Recognition in a Web Worker
|
|
2178
2199
|
*
|
|
2179
|
-
*
|
|
2180
|
-
*
|
|
2181
|
-
* });
|
|
2182
|
-
* await lam.load();
|
|
2200
|
+
* Runs SenseVoice inference off the main thread to prevent UI blocking.
|
|
2201
|
+
* All preprocessing (fbank, LFR, CMVN) and CTC decoding run in the worker.
|
|
2183
2202
|
*
|
|
2184
|
-
*
|
|
2185
|
-
* // blendshapes: Float32Array[] in LAM_BLENDSHAPES order, 30fps
|
|
2186
|
-
* ```
|
|
2203
|
+
* @see SenseVoiceInference for main-thread version
|
|
2187
2204
|
*/
|
|
2188
|
-
|
|
2189
|
-
|
|
2190
|
-
/** Path or URL to the wav2arkit_cpu ONNX model (.onnx graph file) */
|
|
2191
|
-
modelUrl: string;
|
|
2192
|
-
/**
|
|
2193
|
-
* Path or URL to external model data file (.onnx.data weights).
|
|
2194
|
-
* Default: `${modelUrl}.data` (e.g., /models/wav2arkit_cpu.onnx.data)
|
|
2195
|
-
*
|
|
2196
|
-
* Set to `false` to skip external data loading (single-file models only).
|
|
2197
|
-
*/
|
|
2198
|
-
externalDataUrl?: string | false;
|
|
2199
|
-
/** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
|
|
2200
|
-
backend?: BackendPreference;
|
|
2201
|
-
}
|
|
2202
|
-
declare class Wav2ArkitCpuInference implements A2EBackend {
|
|
2203
|
-
readonly modelId: "wav2arkit_cpu";
|
|
2204
|
-
readonly chunkSize: number;
|
|
2205
|
-
private session;
|
|
2206
|
-
private ort;
|
|
2205
|
+
declare class SenseVoiceWorker {
|
|
2206
|
+
private worker;
|
|
2207
2207
|
private config;
|
|
2208
|
-
private _backend;
|
|
2209
2208
|
private isLoading;
|
|
2209
|
+
private _isLoaded;
|
|
2210
2210
|
private inferenceQueue;
|
|
2211
2211
|
private poisoned;
|
|
2212
|
-
private
|
|
2213
|
-
|
|
2214
|
-
|
|
2212
|
+
private pendingResolvers;
|
|
2213
|
+
private languageId;
|
|
2214
|
+
private textNormId;
|
|
2215
|
+
constructor(config: SenseVoiceWorkerConfig);
|
|
2215
2216
|
get isLoaded(): boolean;
|
|
2216
2217
|
/**
|
|
2217
|
-
*
|
|
2218
|
+
* Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
|
|
2218
2219
|
*/
|
|
2219
|
-
|
|
2220
|
+
get backend(): 'wasm' | null;
|
|
2220
2221
|
/**
|
|
2221
|
-
*
|
|
2222
|
+
* Create the worker from inline script
|
|
2223
|
+
*/
|
|
2224
|
+
private createWorker;
|
|
2225
|
+
/**
|
|
2226
|
+
* Handle messages from worker
|
|
2227
|
+
*/
|
|
2228
|
+
private handleWorkerMessage;
|
|
2229
|
+
/**
|
|
2230
|
+
* Send message to worker and wait for response
|
|
2231
|
+
*/
|
|
2232
|
+
private sendMessage;
|
|
2233
|
+
/**
|
|
2234
|
+
* Load the ONNX model in the worker
|
|
2222
2235
|
*
|
|
2223
|
-
*
|
|
2224
|
-
*
|
|
2236
|
+
* @param onProgress - Optional progress callback. Fires once at 100% when load completes
|
|
2237
|
+
* (the worker downloads and loads the model internally, so granular progress is not available).
|
|
2238
|
+
*/
|
|
2239
|
+
load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
|
|
2240
|
+
/**
|
|
2241
|
+
* Transcribe audio samples to text
|
|
2225
2242
|
*
|
|
2226
|
-
* @param audioSamples
|
|
2227
|
-
* @
|
|
2243
|
+
* @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
|
|
2244
|
+
* @returns Transcription result with text, emotion, language, and event
|
|
2228
2245
|
*/
|
|
2229
|
-
|
|
2246
|
+
transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
|
|
2230
2247
|
/**
|
|
2231
|
-
* Queue inference to serialize
|
|
2248
|
+
* Queue inference to serialize worker calls
|
|
2232
2249
|
*/
|
|
2233
2250
|
private queueInference;
|
|
2234
2251
|
/**
|
|
2235
|
-
* Dispose of the
|
|
2252
|
+
* Dispose of the worker and free resources
|
|
2236
2253
|
*/
|
|
2237
2254
|
dispose(): Promise<void>;
|
|
2255
|
+
/**
|
|
2256
|
+
* Check if Web Workers are supported
|
|
2257
|
+
*/
|
|
2258
|
+
static isSupported(): boolean;
|
|
2238
2259
|
}
|
|
2239
2260
|
|
|
2240
2261
|
/**
|
|
2241
|
-
*
|
|
2262
|
+
* Shared blendshape constants and utilities for lip sync inference
|
|
2242
2263
|
*
|
|
2243
|
-
*
|
|
2244
|
-
*
|
|
2264
|
+
* Contains ARKIT_BLENDSHAPES (canonical 52-blendshape ordering), symmetrization,
|
|
2265
|
+
* and interpolation utilities used by A2EInference and all renderer adapters.
|
|
2245
2266
|
*
|
|
2246
|
-
*
|
|
2247
|
-
*
|
|
2248
|
-
*
|
|
2249
|
-
*
|
|
2250
|
-
|
|
2251
|
-
|
|
2267
|
+
* This module is the single source of truth for blendshape ordering to
|
|
2268
|
+
* avoid circular dependencies between inference classes.
|
|
2269
|
+
*
|
|
2270
|
+
* @category Inference
|
|
2271
|
+
*/
|
|
2272
|
+
/**
|
|
2273
|
+
* ARKit blendshape names in alphabetical order (52 total)
|
|
2274
|
+
* This is the canonical ordering used by all A2E models in the SDK.
|
|
2275
|
+
*/
|
|
2276
|
+
declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
|
|
2277
|
+
/** @deprecated Use ARKIT_BLENDSHAPES instead */
|
|
2278
|
+
declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
|
|
2279
|
+
/**
|
|
2280
|
+
* Linearly interpolate between two blendshape weight arrays.
|
|
2281
|
+
*
|
|
2282
|
+
* Pure math utility with zero renderer dependency — used by all renderer
|
|
2283
|
+
* adapters (@omote/three, @omote/babylon, @omote/r3f) for smooth frame
|
|
2284
|
+
* transitions.
|
|
2285
|
+
*
|
|
2286
|
+
* @param current - Current blendshape weights
|
|
2287
|
+
* @param target - Target blendshape weights
|
|
2288
|
+
* @param factor - Interpolation factor (0 = no change, 1 = snap to target). Default: 0.3
|
|
2289
|
+
* @returns Interpolated weights as number[]
|
|
2290
|
+
*/
|
|
2291
|
+
declare function lerpBlendshapes(current: Float32Array | number[], target: Float32Array | number[], factor?: number): number[];
|
|
2292
|
+
|
|
2293
|
+
/**
|
|
2294
|
+
* A2E inference engine for Audio-to-Expression (LAM model)
|
|
2295
|
+
*
|
|
2296
|
+
* Runs entirely in the browser using WebGPU or WASM.
|
|
2297
|
+
* Takes raw 16kHz audio and outputs 52 ARKit blendshapes for lip sync.
|
|
2298
|
+
* Uses the LAM (Large Animation Model) — see {@link A2EBackend} for the interface.
|
|
2252
2299
|
*
|
|
2300
|
+
* @see {@link createA2E} for the recommended zero-config factory
|
|
2301
|
+
* @see {@link A2EBackend} for the common interface
|
|
2253
2302
|
* @category Inference
|
|
2254
2303
|
*
|
|
2255
|
-
* @example
|
|
2304
|
+
* @example Basic usage
|
|
2256
2305
|
* ```typescript
|
|
2257
|
-
* import {
|
|
2306
|
+
* import { A2EInference } from '@omote/core';
|
|
2258
2307
|
*
|
|
2259
|
-
* const
|
|
2260
|
-
*
|
|
2261
|
-
* });
|
|
2262
|
-
* await lam.load();
|
|
2308
|
+
* const a2e = new A2EInference({ modelUrl: '/models/lam.onnx' });
|
|
2309
|
+
* await a2e.load();
|
|
2263
2310
|
*
|
|
2264
|
-
*
|
|
2265
|
-
*
|
|
2311
|
+
* // Process 1 second of audio (16kHz = 16000 samples)
|
|
2312
|
+
* const result = await a2e.infer(audioSamples);
|
|
2313
|
+
* console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
|
|
2266
2314
|
* ```
|
|
2267
2315
|
*/
|
|
2268
2316
|
|
|
2269
|
-
|
|
2270
|
-
|
|
2271
|
-
*/
|
|
2272
|
-
interface Wav2ArkitCpuWorkerConfig {
|
|
2273
|
-
/** Path or URL to the wav2arkit_cpu ONNX model (.onnx graph file) */
|
|
2317
|
+
interface A2EInferenceConfig {
|
|
2318
|
+
/** Path or URL to the ONNX model */
|
|
2274
2319
|
modelUrl: string;
|
|
2275
2320
|
/**
|
|
2276
2321
|
* Path or URL to external model data file (.onnx.data weights).
|
|
2277
|
-
* Default: `${modelUrl}.data` (e.g., /models/
|
|
2322
|
+
* Default: `${modelUrl}.data` (e.g., /models/model.onnx.data)
|
|
2278
2323
|
*
|
|
2279
2324
|
* Set to `false` to skip external data loading (single-file models only).
|
|
2280
2325
|
*/
|
|
2281
2326
|
externalDataUrl?: string | false;
|
|
2327
|
+
/** Preferred backend (auto will try WebGPU first, fallback to WASM) */
|
|
2328
|
+
backend?: BackendPreference;
|
|
2329
|
+
/** Number of identity classes (default: 12 for streaming model) */
|
|
2330
|
+
numIdentityClasses?: number;
|
|
2331
|
+
/**
|
|
2332
|
+
* Number of audio samples per inference chunk (default: 16000).
|
|
2333
|
+
* Model supports variable chunk sizes. Smaller chunks = lower latency,
|
|
2334
|
+
* more inference overhead. 8000 (500ms) is recommended for real-time lip sync.
|
|
2335
|
+
*/
|
|
2336
|
+
chunkSize?: number;
|
|
2282
2337
|
}
|
|
2283
|
-
|
|
2284
|
-
|
|
2285
|
-
|
|
2286
|
-
|
|
2287
|
-
|
|
2288
|
-
*
|
|
2289
|
-
* @see Wav2ArkitCpuInference for main-thread version
|
|
2290
|
-
*/
|
|
2291
|
-
declare class Wav2ArkitCpuWorker implements A2EBackend {
|
|
2292
|
-
readonly modelId: "wav2arkit_cpu";
|
|
2293
|
-
readonly chunkSize: number;
|
|
2294
|
-
private worker;
|
|
2338
|
+
|
|
2339
|
+
declare class A2EInference implements A2EBackend {
|
|
2340
|
+
readonly modelId: "a2e";
|
|
2341
|
+
private session;
|
|
2342
|
+
private ort;
|
|
2295
2343
|
private config;
|
|
2344
|
+
private _backend;
|
|
2296
2345
|
private isLoading;
|
|
2297
|
-
private
|
|
2346
|
+
private numIdentityClasses;
|
|
2347
|
+
readonly chunkSize: number;
|
|
2298
2348
|
private inferenceQueue;
|
|
2299
2349
|
private poisoned;
|
|
2300
|
-
private
|
|
2301
|
-
constructor(config:
|
|
2302
|
-
get isLoaded(): boolean;
|
|
2303
|
-
/**
|
|
2304
|
-
* Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
|
|
2305
|
-
*/
|
|
2306
|
-
get backend(): 'wasm' | null;
|
|
2307
|
-
/**
|
|
2308
|
-
* Create the worker from inline script
|
|
2309
|
-
*/
|
|
2310
|
-
private createWorker;
|
|
2311
|
-
/**
|
|
2312
|
-
* Handle messages from worker
|
|
2313
|
-
*/
|
|
2314
|
-
private handleWorkerMessage;
|
|
2350
|
+
private static readonly INFERENCE_TIMEOUT_MS;
|
|
2351
|
+
constructor(config: A2EInferenceConfig);
|
|
2315
2352
|
/**
|
|
2316
|
-
*
|
|
2353
|
+
* Check if WebGPU is available and working
|
|
2354
|
+
* (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
|
|
2317
2355
|
*/
|
|
2318
|
-
|
|
2356
|
+
static isWebGPUAvailable: typeof isWebGPUAvailable;
|
|
2357
|
+
get backend(): 'webgpu' | 'wasm' | null;
|
|
2358
|
+
get isLoaded(): boolean;
|
|
2359
|
+
/** True if inference timed out and the session is permanently unusable */
|
|
2360
|
+
get isSessionPoisoned(): boolean;
|
|
2319
2361
|
/**
|
|
2320
|
-
* Load the ONNX model
|
|
2362
|
+
* Load the ONNX model
|
|
2321
2363
|
*/
|
|
2322
2364
|
load(): Promise<A2EModelInfo>;
|
|
2323
2365
|
/**
|
|
2324
2366
|
* Run inference on raw audio
|
|
2325
|
-
*
|
|
2326
|
-
* Accepts variable-length audio (not fixed to 16000 samples).
|
|
2327
|
-
* Output frames = ceil(30 * numSamples / 16000).
|
|
2328
|
-
*
|
|
2329
2367
|
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
2330
|
-
* @param
|
|
2368
|
+
* @param identityIndex - Optional identity index (0-11, default 0 = neutral)
|
|
2369
|
+
*
|
|
2370
|
+
* Audio will be zero-padded or truncated to chunkSize samples.
|
|
2331
2371
|
*/
|
|
2332
|
-
infer(audioSamples: Float32Array,
|
|
2372
|
+
infer(audioSamples: Float32Array, identityIndex?: number): Promise<A2EResult>;
|
|
2333
2373
|
/**
|
|
2334
|
-
* Queue inference to serialize
|
|
2374
|
+
* Queue inference to serialize ONNX session calls
|
|
2335
2375
|
*/
|
|
2336
2376
|
private queueInference;
|
|
2337
2377
|
/**
|
|
2338
|
-
*
|
|
2378
|
+
* Get blendshape value by name for a specific frame
|
|
2339
2379
|
*/
|
|
2340
|
-
|
|
2380
|
+
getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
|
|
2341
2381
|
/**
|
|
2342
|
-
*
|
|
2382
|
+
* Dispose of the model and free resources
|
|
2343
2383
|
*/
|
|
2344
|
-
|
|
2384
|
+
dispose(): Promise<void>;
|
|
2345
2385
|
}
|
|
2346
2386
|
|
|
2347
2387
|
/**
|
|
2348
|
-
*
|
|
2388
|
+
* Default and user-configurable model URLs for all ONNX models
|
|
2349
2389
|
*
|
|
2350
|
-
*
|
|
2390
|
+
* Out of the box, models are served from HuggingFace CDN (`/resolve/main/`
|
|
2391
|
+
* endpoint with `Access-Control-Allow-Origin: *`). For production apps that
|
|
2392
|
+
* need faster or more reliable delivery, call {@link configureModelUrls} once
|
|
2393
|
+
* at startup to point any or all models at your own CDN.
|
|
2351
2394
|
*
|
|
2352
|
-
*
|
|
2353
|
-
* Wav2Vec2 (WebGPU, 192MB fp16) → wav2arkit_cpu fallback
|
|
2395
|
+
* @category Inference
|
|
2354
2396
|
*
|
|
2355
|
-
*
|
|
2356
|
-
*
|
|
2397
|
+
* @example Use HuggingFace defaults (zero-config)
|
|
2398
|
+
* ```typescript
|
|
2399
|
+
* import { createA2E } from '@omote/core';
|
|
2400
|
+
* const a2e = createA2E(); // fetches from HuggingFace CDN
|
|
2401
|
+
* ```
|
|
2357
2402
|
*
|
|
2358
|
-
*
|
|
2359
|
-
*
|
|
2360
|
-
*
|
|
2361
|
-
* graph parsing/optimization, fitting within iOS's ~1-1.5GB tab limit.
|
|
2403
|
+
* @example Self-host on your own CDN
|
|
2404
|
+
* ```typescript
|
|
2405
|
+
* import { configureModelUrls, createA2E } from '@omote/core';
|
|
2362
2406
|
*
|
|
2363
|
-
*
|
|
2364
|
-
*
|
|
2365
|
-
*
|
|
2366
|
-
*
|
|
2407
|
+
* configureModelUrls({
|
|
2408
|
+
* lam: 'https://cdn.example.com/models/model_fp16.onnx',
|
|
2409
|
+
* senseVoice: 'https://cdn.example.com/models/sensevoice.int8.onnx',
|
|
2410
|
+
* // omitted keys keep HuggingFace defaults
|
|
2411
|
+
* });
|
|
2367
2412
|
*
|
|
2368
|
-
*
|
|
2413
|
+
* const a2e = createA2E(); // now fetches from your CDN
|
|
2414
|
+
* ```
|
|
2415
|
+
*/
|
|
2416
|
+
/** Model URL keys that can be configured */
|
|
2417
|
+
type ModelUrlKey = 'lam' | 'senseVoice' | 'sileroVad' | 'kokoroTTS' | 'kokoroVoices';
|
|
2418
|
+
/**
|
|
2419
|
+
* Resolved model URLs — user overrides take priority, HuggingFace CDN is fallback.
|
|
2369
2420
|
*
|
|
2370
|
-
*
|
|
2371
|
-
*
|
|
2372
|
-
*
|
|
2421
|
+
* All SDK factories (`createA2E`, `createSenseVoice`, `createSileroVAD`) and
|
|
2422
|
+
* orchestrators (`VoicePipeline`) read from this object. Call
|
|
2423
|
+
* {@link configureModelUrls} before constructing any pipelines to point
|
|
2424
|
+
* models at your own CDN.
|
|
2425
|
+
*/
|
|
2426
|
+
declare const DEFAULT_MODEL_URLS: Readonly<Record<ModelUrlKey, string>>;
|
|
2427
|
+
/**
|
|
2428
|
+
* Configure custom model URLs. Overrides persist for the lifetime of the page.
|
|
2429
|
+
* Omitted keys keep their HuggingFace CDN defaults.
|
|
2373
2430
|
*
|
|
2374
|
-
*
|
|
2375
|
-
*
|
|
2376
|
-
*
|
|
2431
|
+
* Call this **once** at app startup, before constructing any pipelines.
|
|
2432
|
+
*
|
|
2433
|
+
* @example Self-host all models
|
|
2434
|
+
* ```typescript
|
|
2435
|
+
* configureModelUrls({
|
|
2436
|
+
* lam: 'https://cdn.example.com/models/lam.onnx',
|
|
2437
|
+
* senseVoice: 'https://cdn.example.com/models/sensevoice.int8.onnx',
|
|
2438
|
+
* sileroVad: 'https://cdn.example.com/models/silero-vad.onnx',
|
|
2439
|
+
* });
|
|
2377
2440
|
* ```
|
|
2378
2441
|
*
|
|
2379
|
-
* @example
|
|
2442
|
+
* @example Override only one model
|
|
2380
2443
|
* ```typescript
|
|
2381
|
-
*
|
|
2444
|
+
* configureModelUrls({
|
|
2445
|
+
* lam: '/models/model_fp16.onnx', // self-hosted, same origin
|
|
2446
|
+
* });
|
|
2382
2447
|
* ```
|
|
2383
2448
|
*/
|
|
2384
|
-
|
|
2449
|
+
declare function configureModelUrls(urls: Partial<Record<ModelUrlKey, string>>): void;
|
|
2385
2450
|
/**
|
|
2386
|
-
*
|
|
2451
|
+
* Reset all model URL overrides back to HuggingFace CDN defaults.
|
|
2452
|
+
* Mainly useful for testing.
|
|
2387
2453
|
*/
|
|
2388
|
-
|
|
2389
|
-
/** URL for the GPU model (Wav2Vec2, used on Chrome/Firefox/Edge). Default: HuggingFace CDN */
|
|
2390
|
-
gpuModelUrl?: string;
|
|
2391
|
-
/**
|
|
2392
|
-
* URL for GPU model external data file (.onnx.data weights).
|
|
2393
|
-
* Default: `${gpuModelUrl}.data`
|
|
2394
|
-
*
|
|
2395
|
-
* Set to `false` to skip external data loading (single-file models only).
|
|
2396
|
-
*/
|
|
2397
|
-
gpuExternalDataUrl?: string | false;
|
|
2398
|
-
/** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS). Default: HuggingFace CDN */
|
|
2399
|
-
cpuModelUrl?: string;
|
|
2400
|
-
/**
|
|
2401
|
-
* Model selection mode:
|
|
2402
|
-
* - 'auto': Safari/iOS -> CPU, everything else -> GPU (default)
|
|
2403
|
-
* - 'gpu': Force GPU model (Wav2Vec2Inference)
|
|
2404
|
-
* - 'cpu': Force CPU model (Wav2ArkitCpuInference)
|
|
2405
|
-
*/
|
|
2406
|
-
mode?: 'auto' | 'gpu' | 'cpu';
|
|
2407
|
-
/** Backend preference for GPU model (default: 'auto') */
|
|
2408
|
-
gpuBackend?: BackendPreference;
|
|
2409
|
-
/** Number of identity classes for GPU model (default: 12) */
|
|
2410
|
-
numIdentityClasses?: number;
|
|
2411
|
-
/**
|
|
2412
|
-
* Fall back to CPU model if GPU model fails to load (default: true)
|
|
2413
|
-
* Only applies when mode is 'auto' or 'gpu'
|
|
2414
|
-
*/
|
|
2415
|
-
fallbackOnError?: boolean;
|
|
2416
|
-
}
|
|
2454
|
+
declare function resetModelUrls(): void;
|
|
2417
2455
|
/**
|
|
2418
|
-
*
|
|
2419
|
-
*
|
|
2420
|
-
* @param config - Factory configuration
|
|
2421
|
-
* @returns An A2EBackend instance (either GPU or CPU model)
|
|
2456
|
+
* Get the immutable HuggingFace CDN URLs (ignoring any overrides).
|
|
2457
|
+
* Useful for documentation or fallback logic.
|
|
2422
2458
|
*/
|
|
2423
|
-
declare
|
|
2459
|
+
declare const HF_CDN_URLS: Readonly<Record<ModelUrlKey, string>>;
|
|
2424
2460
|
|
|
2425
2461
|
/**
|
|
2426
2462
|
* A2EProcessor — Engine-agnostic audio-to-expression processor
|
|
@@ -2471,9 +2507,6 @@ interface A2EProcessorConfig {
|
|
|
2471
2507
|
* The LAM model uses a one-hot identity vector (12 classes, indices 0-11) as
|
|
2472
2508
|
* style conditioning alongside audio features. Different indices produce
|
|
2473
2509
|
* different expression intensity across face regions (brows, eyes, cheeks).
|
|
2474
|
-
*
|
|
2475
|
-
* Only affects Wav2Vec2Inference (GPU model). Wav2ArkitCpuInference has
|
|
2476
|
-
* identity 11 baked into the model weights.
|
|
2477
2510
|
*/
|
|
2478
2511
|
identityIndex?: number;
|
|
2479
2512
|
/** Callback fired with each blendshape frame (push mode) */
|
|
@@ -2482,6 +2515,7 @@ interface A2EProcessorConfig {
|
|
|
2482
2515
|
onError?: (error: Error) => void;
|
|
2483
2516
|
}
|
|
2484
2517
|
declare class A2EProcessor {
|
|
2518
|
+
private static readonly MAX_PENDING_CHUNKS;
|
|
2485
2519
|
private readonly backend;
|
|
2486
2520
|
private readonly sampleRate;
|
|
2487
2521
|
private readonly chunkSize;
|
|
@@ -2497,6 +2531,8 @@ declare class A2EProcessor {
|
|
|
2497
2531
|
private _latestFrame;
|
|
2498
2532
|
private dripInterval;
|
|
2499
2533
|
private lastPulledFrame;
|
|
2534
|
+
private lastDequeuedTime;
|
|
2535
|
+
private decayBuffer;
|
|
2500
2536
|
private inferenceRunning;
|
|
2501
2537
|
private pendingChunks;
|
|
2502
2538
|
private getFrameCallCount;
|
|
@@ -2633,161 +2669,13 @@ declare class BlendshapeSmoother {
|
|
|
2633
2669
|
* The springs will smoothly close the mouth / relax the face over
|
|
2634
2670
|
* the halflife period rather than freezing.
|
|
2635
2671
|
*/
|
|
2636
|
-
decayToNeutral(): void;
|
|
2637
|
-
/**
|
|
2638
|
-
* Reset all state (values, velocities, targets).
|
|
2639
|
-
* Call when starting a new playback session.
|
|
2640
|
-
*/
|
|
2641
|
-
reset(): void;
|
|
2642
|
-
}
|
|
2643
|
-
|
|
2644
|
-
/**
|
|
2645
|
-
* Factory function for Silero VAD with automatic Worker vs main thread selection
|
|
2646
|
-
*
|
|
2647
|
-
* Provides a unified API that automatically selects the optimal implementation:
|
|
2648
|
-
* - Desktop browsers: Uses SileroVADWorker (off-main-thread inference)
|
|
2649
|
-
* - Mobile devices: Uses SileroVADInference (main thread, avoids memory overhead)
|
|
2650
|
-
* - Fallback: Gracefully falls back to main thread if Worker fails
|
|
2651
|
-
*
|
|
2652
|
-
* @category Inference
|
|
2653
|
-
*
|
|
2654
|
-
* @example Basic usage (auto-detect)
|
|
2655
|
-
* ```typescript
|
|
2656
|
-
* import { createSileroVAD } from '@omote/core';
|
|
2657
|
-
*
|
|
2658
|
-
* const vad = createSileroVAD({
|
|
2659
|
-
* modelUrl: '/models/silero-vad.onnx',
|
|
2660
|
-
* threshold: 0.5,
|
|
2661
|
-
* });
|
|
2662
|
-
*
|
|
2663
|
-
* await vad.load();
|
|
2664
|
-
* const result = await vad.process(audioChunk);
|
|
2665
|
-
* if (result.isSpeech) {
|
|
2666
|
-
* console.log('Speech detected!', result.probability);
|
|
2667
|
-
* }
|
|
2668
|
-
* ```
|
|
2669
|
-
*
|
|
2670
|
-
* @example Force worker usage
|
|
2671
|
-
* ```typescript
|
|
2672
|
-
* const vad = createSileroVAD({
|
|
2673
|
-
* modelUrl: '/models/silero-vad.onnx',
|
|
2674
|
-
* useWorker: true, // Force Worker even on mobile
|
|
2675
|
-
* });
|
|
2676
|
-
* ```
|
|
2677
|
-
*
|
|
2678
|
-
* @example Force main thread
|
|
2679
|
-
* ```typescript
|
|
2680
|
-
* const vad = createSileroVAD({
|
|
2681
|
-
* modelUrl: '/models/silero-vad.onnx',
|
|
2682
|
-
* useWorker: false, // Force main thread
|
|
2683
|
-
* });
|
|
2684
|
-
* ```
|
|
2685
|
-
*/
|
|
2686
|
-
|
|
2687
|
-
/**
|
|
2688
|
-
* Common interface for both SileroVADInference and SileroVADWorker
|
|
2689
|
-
*
|
|
2690
|
-
* This interface defines the shared API that both implementations provide,
|
|
2691
|
-
* allowing consumers to use either interchangeably.
|
|
2692
|
-
*/
|
|
2693
|
-
interface SileroVADBackend {
|
|
2694
|
-
/** Current backend type (webgpu, wasm, or null if not loaded) */
|
|
2695
|
-
readonly backend: RuntimeBackend | null;
|
|
2696
|
-
/** Whether the model is loaded and ready for inference */
|
|
2697
|
-
readonly isLoaded: boolean;
|
|
2698
|
-
/** Audio sample rate (8000 or 16000 Hz) */
|
|
2699
|
-
readonly sampleRate: number;
|
|
2700
|
-
/** Speech detection threshold (0-1) */
|
|
2701
|
-
readonly threshold: number;
|
|
2702
|
-
/**
|
|
2703
|
-
* Load the ONNX model
|
|
2704
|
-
* @returns Model loading information
|
|
2705
|
-
*/
|
|
2706
|
-
load(): Promise<VADModelInfo | VADWorkerModelInfo>;
|
|
2707
|
-
/**
|
|
2708
|
-
* Process a single audio chunk
|
|
2709
|
-
* @param audioChunk - Float32Array of exactly chunkSize samples
|
|
2710
|
-
* @returns VAD result with speech probability
|
|
2711
|
-
*/
|
|
2712
|
-
process(audioChunk: Float32Array): Promise<VADResult>;
|
|
2713
|
-
/**
|
|
2714
|
-
* Reset state for new audio stream
|
|
2715
|
-
*/
|
|
2716
|
-
reset(): void | Promise<void>;
|
|
2717
|
-
/**
|
|
2718
|
-
* Dispose of the model and free resources
|
|
2719
|
-
*/
|
|
2720
|
-
dispose(): Promise<void>;
|
|
2721
|
-
/**
|
|
2722
|
-
* Get required chunk size in samples
|
|
2723
|
-
*/
|
|
2724
|
-
getChunkSize(): number;
|
|
2725
|
-
/**
|
|
2726
|
-
* Get chunk duration in milliseconds
|
|
2727
|
-
*/
|
|
2728
|
-
getChunkDurationMs(): number;
|
|
2729
|
-
}
|
|
2730
|
-
/**
|
|
2731
|
-
* Configuration for the Silero VAD factory
|
|
2732
|
-
*
|
|
2733
|
-
* Extends SileroVADConfig with worker-specific options.
|
|
2734
|
-
*/
|
|
2735
|
-
interface SileroVADFactoryConfig extends Omit<SileroVADConfig, 'modelUrl'>, InferenceFactoryConfig {
|
|
2736
|
-
/** Path or URL to the ONNX model. Default: HuggingFace CDN */
|
|
2737
|
-
modelUrl?: string;
|
|
2738
|
-
/**
|
|
2739
|
-
* Fallback to main thread on worker errors.
|
|
2740
|
-
*
|
|
2741
|
-
* When true (default), if the Worker fails to load or encounters an error,
|
|
2742
|
-
* the factory will automatically create a main thread instance instead.
|
|
2743
|
-
*
|
|
2744
|
-
* When false, worker errors will propagate as exceptions.
|
|
2745
|
-
*
|
|
2746
|
-
* Default: true
|
|
2747
|
-
*/
|
|
2748
|
-
fallbackOnError?: boolean;
|
|
2672
|
+
decayToNeutral(): void;
|
|
2673
|
+
/**
|
|
2674
|
+
* Reset all state (values, velocities, targets).
|
|
2675
|
+
* Call when starting a new playback session.
|
|
2676
|
+
*/
|
|
2677
|
+
reset(): void;
|
|
2749
2678
|
}
|
|
2750
|
-
/**
|
|
2751
|
-
* Check if the current environment supports VAD Web Workers
|
|
2752
|
-
*
|
|
2753
|
-
* Requirements:
|
|
2754
|
-
* - Worker constructor must exist
|
|
2755
|
-
* - Blob URL support (for inline worker script)
|
|
2756
|
-
*
|
|
2757
|
-
* @returns true if VAD Worker is supported
|
|
2758
|
-
*/
|
|
2759
|
-
declare function supportsVADWorker(): boolean;
|
|
2760
|
-
/**
|
|
2761
|
-
* Create a Silero VAD instance with automatic implementation selection
|
|
2762
|
-
*
|
|
2763
|
-
* This factory function automatically selects between:
|
|
2764
|
-
* - **SileroVADWorker**: Off-main-thread inference (better for desktop)
|
|
2765
|
-
* - **SileroVADInference**: Main thread inference (better for mobile)
|
|
2766
|
-
*
|
|
2767
|
-
* The selection is based on:
|
|
2768
|
-
* 1. Explicit `useWorker` config (if provided)
|
|
2769
|
-
* 2. Platform detection (mobile vs desktop)
|
|
2770
|
-
* 3. Worker API availability
|
|
2771
|
-
*
|
|
2772
|
-
* Both implementations share the same interface (SileroVADBackend),
|
|
2773
|
-
* so consumers can use either interchangeably.
|
|
2774
|
-
*
|
|
2775
|
-
* @param config - Factory configuration
|
|
2776
|
-
* @returns A SileroVAD instance (either Worker or main thread)
|
|
2777
|
-
*
|
|
2778
|
-
* @example
|
|
2779
|
-
* ```typescript
|
|
2780
|
-
* // Auto-detect (recommended)
|
|
2781
|
-
* const vad = createSileroVAD({ modelUrl: '/models/silero-vad.onnx' });
|
|
2782
|
-
*
|
|
2783
|
-
* // Force Worker
|
|
2784
|
-
* const vadWorker = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: true });
|
|
2785
|
-
*
|
|
2786
|
-
* // Force main thread
|
|
2787
|
-
* const vadMain = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: false });
|
|
2788
|
-
* ```
|
|
2789
|
-
*/
|
|
2790
|
-
declare function createSileroVAD(config?: SileroVADFactoryConfig): SileroVADBackend;
|
|
2791
2679
|
|
|
2792
2680
|
/**
|
|
2793
2681
|
* SenseVoice adapter backed by UnifiedInferenceWorker
|
|
@@ -2814,44 +2702,21 @@ declare class SenseVoiceUnifiedAdapter implements SenseVoiceBackend {
|
|
|
2814
2702
|
}
|
|
2815
2703
|
|
|
2816
2704
|
/**
|
|
2817
|
-
*
|
|
2818
|
-
*
|
|
2819
|
-
* Implements A2EBackend, delegating all inference to the shared worker.
|
|
2820
|
-
*/
|
|
2821
|
-
|
|
2822
|
-
declare class Wav2ArkitCpuUnifiedAdapter implements A2EBackend {
|
|
2823
|
-
readonly modelId: "wav2arkit_cpu";
|
|
2824
|
-
readonly chunkSize: number;
|
|
2825
|
-
private worker;
|
|
2826
|
-
private config;
|
|
2827
|
-
private _isLoaded;
|
|
2828
|
-
private loadedGeneration;
|
|
2829
|
-
/** Per-adapter inference queue — ensures sequential state updates. */
|
|
2830
|
-
private inferenceQueue;
|
|
2831
|
-
constructor(worker: UnifiedInferenceWorker, config: Wav2ArkitCpuWorkerConfig);
|
|
2832
|
-
get isLoaded(): boolean;
|
|
2833
|
-
get backend(): RuntimeBackend | null;
|
|
2834
|
-
load(): Promise<A2EModelInfo>;
|
|
2835
|
-
infer(audioSamples: Float32Array, _identityIndex?: number): Promise<A2EResult>;
|
|
2836
|
-
dispose(): Promise<void>;
|
|
2837
|
-
private assertLoaded;
|
|
2838
|
-
}
|
|
2839
|
-
|
|
2840
|
-
/**
|
|
2841
|
-
* Wav2Vec2 (LAM) adapter backed by UnifiedInferenceWorker
|
|
2705
|
+
* A2E adapter backed by UnifiedInferenceWorker
|
|
2842
2706
|
*
|
|
2843
2707
|
* Implements A2EBackend, delegating all inference to the shared worker.
|
|
2844
|
-
* Used on iOS to run
|
|
2708
|
+
* Used on iOS to run A2E inference off the main thread via the unified worker.
|
|
2845
2709
|
*/
|
|
2846
2710
|
|
|
2847
|
-
declare class
|
|
2848
|
-
readonly modelId: "
|
|
2711
|
+
declare class A2EUnifiedAdapter implements A2EBackend {
|
|
2712
|
+
readonly modelId: "a2e";
|
|
2849
2713
|
readonly chunkSize: number;
|
|
2850
2714
|
private worker;
|
|
2851
2715
|
private modelUrl;
|
|
2852
2716
|
private externalDataUrl;
|
|
2853
2717
|
private numIdentityClasses;
|
|
2854
2718
|
private _isLoaded;
|
|
2719
|
+
private _backend;
|
|
2855
2720
|
private loadedGeneration;
|
|
2856
2721
|
/** Per-adapter inference queue — ensures sequential state updates. */
|
|
2857
2722
|
private inferenceQueue;
|
|
@@ -2944,6 +2809,11 @@ interface SynthesizeOptions {
|
|
|
2944
2809
|
/** Speed multiplier (overrides config speed) */
|
|
2945
2810
|
speed?: number;
|
|
2946
2811
|
}
|
|
2812
|
+
/**
|
|
2813
|
+
* Validate TTS input parameters at API boundaries.
|
|
2814
|
+
* Returns trimmed text on success, throws on invalid input.
|
|
2815
|
+
*/
|
|
2816
|
+
declare function validateTTSInput(text: unknown, voiceName: string, speed: number, availableVoices?: string[]): string;
|
|
2947
2817
|
declare class KokoroTTSInference implements TTSBackend {
|
|
2948
2818
|
private readonly config;
|
|
2949
2819
|
private readonly modelUrl;
|
|
@@ -3077,113 +2947,6 @@ declare class SileroVADUnifiedAdapter implements SileroVADBackend {
|
|
|
3077
2947
|
private assertLoaded;
|
|
3078
2948
|
}
|
|
3079
2949
|
|
|
3080
|
-
/**
|
|
3081
|
-
* Renderer-agnostic A2E (audio-to-expression) orchestrator
|
|
3082
|
-
*
|
|
3083
|
-
* Manages the mic capture + A2E inference loop independently of any
|
|
3084
|
-
* 3D renderer. Adapter packages (@omote/three, @omote/babylon) wrap this
|
|
3085
|
-
* thinly and pipe `latestWeights` into their renderer-specific blendshape
|
|
3086
|
-
* controllers.
|
|
3087
|
-
*
|
|
3088
|
-
* Internally delegates all buffer accumulation, inference, and frame
|
|
3089
|
-
* drip-feeding to {@link A2EProcessor}. This class only handles mic capture
|
|
3090
|
-
* (getUserMedia, ScriptProcessorNode, resampling).
|
|
3091
|
-
*
|
|
3092
|
-
* @deprecated Use {@link MicLipSync} from `@omote/core` instead. MicLipSync provides
|
|
3093
|
-
* the same mic → A2E composition with proper MicrophoneCapture integration, VAD support,
|
|
3094
|
-
* ExpressionProfile scaling, and pause/resume. This class will be removed in a future version.
|
|
3095
|
-
*
|
|
3096
|
-
* @category Inference
|
|
3097
|
-
*/
|
|
3098
|
-
|
|
3099
|
-
/**
|
|
3100
|
-
* Progress event emitted during model download / compile
|
|
3101
|
-
*/
|
|
3102
|
-
interface A2EProgressEvent {
|
|
3103
|
-
phase: 'download' | 'compile';
|
|
3104
|
-
progress: number;
|
|
3105
|
-
}
|
|
3106
|
-
/**
|
|
3107
|
-
* Configuration for the A2EOrchestrator
|
|
3108
|
-
*/
|
|
3109
|
-
interface A2EOrchestratorConfig {
|
|
3110
|
-
/** URL for the GPU model (Wav2Vec2, Chrome/Firefox/Edge) */
|
|
3111
|
-
gpuModelUrl: string;
|
|
3112
|
-
/** URL for GPU model external data file */
|
|
3113
|
-
gpuExternalDataUrl?: string | false;
|
|
3114
|
-
/** URL for the CPU model (wav2arkit_cpu, Safari/iOS) */
|
|
3115
|
-
cpuModelUrl?: string;
|
|
3116
|
-
/** Sample rate for mic capture (default: 16000) */
|
|
3117
|
-
sampleRate?: number;
|
|
3118
|
-
/** Chunk size in samples for mic capture (default: 16000 = 1s at 16kHz) */
|
|
3119
|
-
chunkSize?: number;
|
|
3120
|
-
/** Callback fired with new blendshape weights after each inference */
|
|
3121
|
-
onFrame?: (weights: Float32Array) => void;
|
|
3122
|
-
/** Callback fired during model loading progress */
|
|
3123
|
-
onProgress?: (event: A2EProgressEvent) => void;
|
|
3124
|
-
/** Callback fired on error */
|
|
3125
|
-
onError?: (error: Error) => void;
|
|
3126
|
-
/** Callback fired when model is loaded and ready */
|
|
3127
|
-
onReady?: () => void;
|
|
3128
|
-
/** Additional createA2E config options */
|
|
3129
|
-
a2eConfig?: Partial<CreateA2EConfig>;
|
|
3130
|
-
}
|
|
3131
|
-
/**
|
|
3132
|
-
* Renderer-agnostic A2E orchestrator.
|
|
3133
|
-
*
|
|
3134
|
-
* Manages mic capture + delegates inference to {@link A2EProcessor}.
|
|
3135
|
-
* Adapters read `latestWeights` each frame to apply to their meshes.
|
|
3136
|
-
*
|
|
3137
|
-
* @example Quick start (used by @omote/three and @omote/babylon adapters)
|
|
3138
|
-
* ```typescript
|
|
3139
|
-
* const orchestrator = new A2EOrchestrator({
|
|
3140
|
-
* gpuModelUrl: '/models/wav2vec2.onnx',
|
|
3141
|
-
* cpuModelUrl: '/models/wav2arkit_cpu.onnx',
|
|
3142
|
-
* onFrame: (weights) => controller.update(weights),
|
|
3143
|
-
* });
|
|
3144
|
-
* await orchestrator.load();
|
|
3145
|
-
* await orchestrator.start();
|
|
3146
|
-
* ```
|
|
3147
|
-
*/
|
|
3148
|
-
declare class A2EOrchestrator {
|
|
3149
|
-
private config;
|
|
3150
|
-
private a2e;
|
|
3151
|
-
private processor;
|
|
3152
|
-
private stream;
|
|
3153
|
-
private audioContext;
|
|
3154
|
-
private scriptProcessor;
|
|
3155
|
-
private nativeSampleRate;
|
|
3156
|
-
private _isReady;
|
|
3157
|
-
private _isStreaming;
|
|
3158
|
-
private _backend;
|
|
3159
|
-
private disposed;
|
|
3160
|
-
constructor(config: A2EOrchestratorConfig);
|
|
3161
|
-
/** Latest blendshape weights from inference (null if none yet) */
|
|
3162
|
-
get latestWeights(): Float32Array | null;
|
|
3163
|
-
/** Whether the model is loaded and ready for inference */
|
|
3164
|
-
get isReady(): boolean;
|
|
3165
|
-
/** Whether mic is active and inference loop is running */
|
|
3166
|
-
get isStreaming(): boolean;
|
|
3167
|
-
/** Current backend type (webgpu, wasm, or null) */
|
|
3168
|
-
get backend(): string | null;
|
|
3169
|
-
/**
|
|
3170
|
-
* Load the A2E model and create the processor
|
|
3171
|
-
*/
|
|
3172
|
-
load(): Promise<void>;
|
|
3173
|
-
/**
|
|
3174
|
-
* Start mic capture and inference loop
|
|
3175
|
-
*/
|
|
3176
|
-
start(): Promise<void>;
|
|
3177
|
-
/**
|
|
3178
|
-
* Stop mic capture and inference loop
|
|
3179
|
-
*/
|
|
3180
|
-
stop(): void;
|
|
3181
|
-
/**
|
|
3182
|
-
* Dispose of all resources
|
|
3183
|
-
*/
|
|
3184
|
-
dispose(): Promise<void>;
|
|
3185
|
-
}
|
|
3186
|
-
|
|
3187
2950
|
/**
|
|
3188
2951
|
* Safari Web Speech API wrapper for iOS speech recognition
|
|
3189
2952
|
*
|
|
@@ -3562,6 +3325,31 @@ type KokoroVoiceName = keyof typeof KOKORO_VOICES;
|
|
|
3562
3325
|
*/
|
|
3563
3326
|
declare function listVoices(): string[];
|
|
3564
3327
|
|
|
3328
|
+
/**
|
|
3329
|
+
* ORT CDN configuration
|
|
3330
|
+
*
|
|
3331
|
+
* Allows consumers to override the CDN base URL used for loading
|
|
3332
|
+
* ONNX Runtime WASM/WebGPU binaries. By default, ORT loads from
|
|
3333
|
+
* its bundled CDN path. Use {@link configureOrtCdn} to point at
|
|
3334
|
+
* a self-hosted or enterprise CDN.
|
|
3335
|
+
*
|
|
3336
|
+
* @category Inference
|
|
3337
|
+
*/
|
|
3338
|
+
/**
|
|
3339
|
+
* Override the CDN base URL for ONNX Runtime WASM/WebGPU binaries.
|
|
3340
|
+
*
|
|
3341
|
+
* Must be an HTTPS URL or a relative path (starts with `/` or `./`).
|
|
3342
|
+
* Call this once at app startup, before loading any models.
|
|
3343
|
+
*
|
|
3344
|
+
* @param cdnPath - HTTPS URL or relative path to ORT binaries directory
|
|
3345
|
+
* @throws If cdnPath is not HTTPS or a relative path
|
|
3346
|
+
*/
|
|
3347
|
+
declare function configureOrtCdn(cdnPath: string): void;
|
|
3348
|
+
/**
|
|
3349
|
+
* Get the current ORT CDN base URL override, or null if using defaults.
|
|
3350
|
+
*/
|
|
3351
|
+
declare function getOrtCdnBase(): string | null;
|
|
3352
|
+
|
|
3565
3353
|
/**
|
|
3566
3354
|
* Emotion - Helper for creating emotion vectors for avatar animation
|
|
3567
3355
|
*
|
|
@@ -3601,6 +3389,8 @@ type EmotionName = typeof EMOTION_NAMES[number];
|
|
|
3601
3389
|
type EmotionWeights = Partial<Record<EmotionName, number>>;
|
|
3602
3390
|
/** Total emotion vector size */
|
|
3603
3391
|
declare const EMOTION_VECTOR_SIZE = 26;
|
|
3392
|
+
/** Number of explicit emotion channels */
|
|
3393
|
+
declare const EXPLICIT_EMOTION_COUNT = 10;
|
|
3604
3394
|
/**
|
|
3605
3395
|
* Create an emotion vector from named weights
|
|
3606
3396
|
*
|
|
@@ -4099,7 +3889,48 @@ declare const MetricNames: {
|
|
|
4099
3889
|
readonly CACHE_HITS: "omote.cache.hits";
|
|
4100
3890
|
/** Counter: Cache misses */
|
|
4101
3891
|
readonly CACHE_MISSES: "omote.cache.misses";
|
|
3892
|
+
/** Histogram: VoicePipeline turn latency (speech end → transcript ready, excludes playback) */
|
|
3893
|
+
readonly VOICE_TURN_LATENCY: "omote.voice.turn.latency";
|
|
3894
|
+
/** Histogram: ASR transcription latency in ms */
|
|
3895
|
+
readonly VOICE_TRANSCRIPTION_LATENCY: "omote.voice.transcription.latency";
|
|
3896
|
+
/** Histogram: Response handler latency in ms */
|
|
3897
|
+
readonly VOICE_RESPONSE_LATENCY: "omote.voice.response.latency";
|
|
3898
|
+
/** Counter: Total transcriptions */
|
|
3899
|
+
readonly VOICE_TRANSCRIPTIONS: "omote.voice.transcriptions";
|
|
3900
|
+
/** Counter: Total interruptions */
|
|
3901
|
+
readonly VOICE_INTERRUPTIONS: "omote.voice.interruptions";
|
|
3902
|
+
/** Histogram: PlaybackPipeline session duration in ms */
|
|
3903
|
+
readonly PLAYBACK_SESSION_DURATION: "omote.playback.session.duration";
|
|
3904
|
+
/** Histogram: Audio chunk processing latency in ms */
|
|
3905
|
+
readonly PLAYBACK_CHUNK_LATENCY: "omote.playback.chunk.latency";
|
|
3906
|
+
/** Histogram: TTSSpeaker.connect() latency in ms */
|
|
3907
|
+
readonly TTS_CONNECT_LATENCY: "omote.tts.connect.latency";
|
|
3908
|
+
/** Histogram: TTSSpeaker.speak() latency in ms */
|
|
3909
|
+
readonly TTS_SPEAK_LATENCY: "omote.tts.speak.latency";
|
|
3910
|
+
/** Counter: TTSSpeaker.stop() aborted speak calls */
|
|
3911
|
+
readonly TTS_SPEAK_ABORTED: "omote.tts.speak.aborted";
|
|
3912
|
+
/** Counter: MicLipSync sessions started */
|
|
3913
|
+
readonly MIC_SESSIONS: "omote.mic.sessions";
|
|
3914
|
+
/** Histogram: CharacterController.update() latency in µs */
|
|
3915
|
+
readonly AVATAR_FRAME_LATENCY: "omote.avatar.frame.latency_us";
|
|
3916
|
+
/** Histogram: FaceCompositor.compose() latency in µs */
|
|
3917
|
+
readonly COMPOSITOR_COMPOSE_LATENCY: "omote.compositor.compose.latency_us";
|
|
3918
|
+
/** Counter: Frames exceeding budget threshold */
|
|
3919
|
+
readonly AVATAR_FRAME_DROPS: "omote.avatar.frame.drops";
|
|
3920
|
+
};
|
|
3921
|
+
/**
|
|
3922
|
+
* Centralized error type taxonomy for structured error reporting.
|
|
3923
|
+
*/
|
|
3924
|
+
declare const ErrorTypes: {
|
|
3925
|
+
readonly INFERENCE: "inference_error";
|
|
3926
|
+
readonly NETWORK: "network_error";
|
|
3927
|
+
readonly TIMEOUT: "timeout";
|
|
3928
|
+
readonly USER: "user_error";
|
|
3929
|
+
readonly RUNTIME: "runtime_error";
|
|
3930
|
+
readonly MEDIA: "media_error";
|
|
3931
|
+
readonly MODEL: "model_error";
|
|
4102
3932
|
};
|
|
3933
|
+
type ErrorType = typeof ErrorTypes[keyof typeof ErrorTypes];
|
|
4103
3934
|
/**
|
|
4104
3935
|
* Histogram buckets for inference latency (ms)
|
|
4105
3936
|
*/
|
|
@@ -4177,6 +4008,7 @@ declare class OmoteTelemetry {
|
|
|
4177
4008
|
private exporter;
|
|
4178
4009
|
private activeTraceId;
|
|
4179
4010
|
private metricsIntervalId;
|
|
4011
|
+
private spanStack;
|
|
4180
4012
|
private counters;
|
|
4181
4013
|
private histograms;
|
|
4182
4014
|
constructor(config: TelemetryConfig);
|
|
@@ -4274,6 +4106,14 @@ declare class OmoteTelemetry {
|
|
|
4274
4106
|
* Get current configuration
|
|
4275
4107
|
*/
|
|
4276
4108
|
getConfig(): TelemetryConfig;
|
|
4109
|
+
/**
|
|
4110
|
+
* Get the active span context for log-to-span correlation.
|
|
4111
|
+
* Returns the most recent (top of stack) active span, or null if none.
|
|
4112
|
+
*/
|
|
4113
|
+
getActiveContext(): {
|
|
4114
|
+
traceId: string;
|
|
4115
|
+
spanId: string;
|
|
4116
|
+
} | null;
|
|
4277
4117
|
}
|
|
4278
4118
|
|
|
4279
4119
|
/**
|
|
@@ -4886,6 +4726,7 @@ declare class ProceduralLifeLayer {
|
|
|
4886
4726
|
private noiseTime;
|
|
4887
4727
|
private previousEnergy;
|
|
4888
4728
|
private emphasisLevel;
|
|
4729
|
+
private readonly _outputBlendshapes;
|
|
4889
4730
|
constructor(config?: LifeLayerConfig);
|
|
4890
4731
|
/**
|
|
4891
4732
|
* Update the life layer and produce output for this frame.
|
|
@@ -4928,6 +4769,113 @@ declare class ProceduralLifeLayer {
|
|
|
4928
4769
|
private updateBrowNoise;
|
|
4929
4770
|
}
|
|
4930
4771
|
|
|
4772
|
+
/**
|
|
4773
|
+
* Body Animation — Renderer-agnostic interfaces and utilities.
|
|
4774
|
+
*
|
|
4775
|
+
* Defines the contract for body animation controllers that each renderer
|
|
4776
|
+
* adapter (@omote/three, @omote/babylon, @omote/r3f) implements natively.
|
|
4777
|
+
*
|
|
4778
|
+
* Also provides the shared bone filtering logic used during animation
|
|
4779
|
+
* retargeting — stripping head/neck/eye tracks so body animations don't
|
|
4780
|
+
* conflict with the face pipeline (FaceCompositor, gaze, ProceduralLifeLayer).
|
|
4781
|
+
*
|
|
4782
|
+
* @module animation
|
|
4783
|
+
*/
|
|
4784
|
+
/**
|
|
4785
|
+
* Renderer-agnostic animation controller interface.
|
|
4786
|
+
*
|
|
4787
|
+
* Each renderer adapter implements this against its native animation system:
|
|
4788
|
+
* - @omote/three → THREE.AnimationMixer + AnimationAction
|
|
4789
|
+
* - @omote/babylon → Babylon.js AnimationGroup
|
|
4790
|
+
* - @omote/r3f → React hook wrapping the Three.js implementation
|
|
4791
|
+
*
|
|
4792
|
+
* Python/Node ports implement this against their own runtimes.
|
|
4793
|
+
*/
|
|
4794
|
+
interface AnimationController {
|
|
4795
|
+
/** Play an animation by id. */
|
|
4796
|
+
play(id: string, options?: {
|
|
4797
|
+
fadeInDuration?: number;
|
|
4798
|
+
}): void;
|
|
4799
|
+
/** Stop all playing animations. */
|
|
4800
|
+
stop(fadeOutDuration?: number): void;
|
|
4801
|
+
/** Crossfade from current animation to target. */
|
|
4802
|
+
crossfadeTo(id: string, duration?: number): void;
|
|
4803
|
+
/** Check if a specific animation is currently playing. */
|
|
4804
|
+
isPlaying(id: string): boolean;
|
|
4805
|
+
/** Check if an animation with this id is loaded. */
|
|
4806
|
+
hasAnimation(id: string): boolean;
|
|
4807
|
+
/** List of loaded animation ids. */
|
|
4808
|
+
readonly availableAnimations: string[];
|
|
4809
|
+
}
|
|
4810
|
+
/**
|
|
4811
|
+
* Describes an external animation asset to load and configure.
|
|
4812
|
+
* Renderer-agnostic — loaders are adapter-specific.
|
|
4813
|
+
*/
|
|
4814
|
+
interface AnimationSource {
|
|
4815
|
+
/** Unique identifier for this animation. */
|
|
4816
|
+
id: string;
|
|
4817
|
+
/** URL to the animation file (FBX, GLB, etc.). */
|
|
4818
|
+
url: string;
|
|
4819
|
+
/** Clip name within the file (if it contains multiple clips). */
|
|
4820
|
+
clipName?: string;
|
|
4821
|
+
/** Playback options. */
|
|
4822
|
+
options?: AnimationSourceOptions;
|
|
4823
|
+
}
|
|
4824
|
+
interface AnimationSourceOptions {
|
|
4825
|
+
loop?: boolean;
|
|
4826
|
+
timeScale?: number;
|
|
4827
|
+
fadeInDuration?: number;
|
|
4828
|
+
fadeOutDuration?: number;
|
|
4829
|
+
clampWhenFinished?: boolean;
|
|
4830
|
+
}
|
|
4831
|
+
/**
|
|
4832
|
+
* Configuration for filtering bone tracks from body animations.
|
|
4833
|
+
*
|
|
4834
|
+
* The face pipeline (FaceCompositor, gaze tracking, ProceduralLifeLayer) owns
|
|
4835
|
+
* certain bones (head, neck, eyes). Body animations must strip these tracks
|
|
4836
|
+
* to prevent conflicts.
|
|
4837
|
+
*/
|
|
4838
|
+
interface BoneFilterConfig {
|
|
4839
|
+
/** Bone names owned by the face pipeline (e.g., ['Head', 'Neck', 'LeftEye', 'RightEye']). */
|
|
4840
|
+
proceduralBones: string[];
|
|
4841
|
+
/** Whether to strip .position tracks (keep only quaternion/rotation). */
|
|
4842
|
+
filterPositionTracks: boolean;
|
|
4843
|
+
/** Whether to strip morphTargetInfluences tracks. */
|
|
4844
|
+
filterMorphTargets: boolean;
|
|
4845
|
+
}
|
|
4846
|
+
/** Mixamo bone name prefix (stripped during retargeting). */
|
|
4847
|
+
declare const MIXAMO_PREFIX = "mixamorig";
|
|
4848
|
+
/**
|
|
4849
|
+
* Bones that need position tracks preserved during retargeting.
|
|
4850
|
+
* Stripping finger/hand position tracks causes fingers to splay to bind pose.
|
|
4851
|
+
*/
|
|
4852
|
+
declare const PRESERVE_POSITION_BONES: Set<string>;
|
|
4853
|
+
/** Default bone filter for RPM/Mixamo avatars. */
|
|
4854
|
+
declare const DEFAULT_BONE_FILTER: BoneFilterConfig;
|
|
4855
|
+
/**
|
|
4856
|
+
* A generic animation track descriptor. Renderers map their native track
|
|
4857
|
+
* objects to this shape for filtering, then map back.
|
|
4858
|
+
*/
|
|
4859
|
+
interface TrackDescriptor {
|
|
4860
|
+
/** Full track name, e.g. "mixamorigHips.quaternion" or "Head.position". */
|
|
4861
|
+
name: string;
|
|
4862
|
+
}
|
|
4863
|
+
/**
|
|
4864
|
+
* Filter animation tracks according to a BoneFilterConfig.
|
|
4865
|
+
*
|
|
4866
|
+
* This is the renderer-agnostic core of `retargetClip`. Renderer adapters
|
|
4867
|
+
* call this with their native track names and use the result to decide
|
|
4868
|
+
* which tracks to keep.
|
|
4869
|
+
*
|
|
4870
|
+
* @returns true if the track should be KEPT (not filtered out).
|
|
4871
|
+
*/
|
|
4872
|
+
declare function shouldKeepTrack(trackName: string, config: BoneFilterConfig): boolean;
|
|
4873
|
+
/**
|
|
4874
|
+
* Strip Mixamo prefix from a track name.
|
|
4875
|
+
* "mixamorigHips.quaternion" → "Hips.quaternion"
|
|
4876
|
+
*/
|
|
4877
|
+
declare function stripMixamoPrefix(trackName: string): string;
|
|
4878
|
+
|
|
4931
4879
|
/**
|
|
4932
4880
|
* FACS (Facial Action Coding System) to ARKit Blendshape Mapping
|
|
4933
4881
|
*
|
|
@@ -5147,6 +5095,41 @@ declare class FaceCompositor {
|
|
|
5147
5095
|
private applyProfileArrays;
|
|
5148
5096
|
}
|
|
5149
5097
|
|
|
5098
|
+
/**
|
|
5099
|
+
* TextEmotionAnalyzer — Lightweight keyword heuristic for mapping AI response
|
|
5100
|
+
* text to an emotion label.
|
|
5101
|
+
*
|
|
5102
|
+
* Returns null if no strong signal is detected (keeps current emotion).
|
|
5103
|
+
*
|
|
5104
|
+
* @category Face
|
|
5105
|
+
*/
|
|
5106
|
+
/**
|
|
5107
|
+
* Analyze AI response text for emotional content.
|
|
5108
|
+
*
|
|
5109
|
+
* @param text - The AI response text to analyze
|
|
5110
|
+
* @returns An emotion label string, or null if no strong signal detected
|
|
5111
|
+
*/
|
|
5112
|
+
declare function analyzeTextEmotion(text: string): string | null;
|
|
5113
|
+
|
|
5114
|
+
/**
|
|
5115
|
+
* EmotionTagParser — Strips `[tag]` emotion annotations from LLM response text.
|
|
5116
|
+
*
|
|
5117
|
+
* LLMs can self-annotate responses with emotion tags like `[excited]` or `[sad]`.
|
|
5118
|
+
* This parser extracts the first valid tag and returns clean display text.
|
|
5119
|
+
*
|
|
5120
|
+
* @category Face
|
|
5121
|
+
*/
|
|
5122
|
+
/**
|
|
5123
|
+
* Parse emotion tags from LLM response text.
|
|
5124
|
+
*
|
|
5125
|
+
* @param text - Raw LLM response text, possibly containing `[emotion]` tags
|
|
5126
|
+
* @returns Object with clean display text and extracted emotion label (or null)
|
|
5127
|
+
*/
|
|
5128
|
+
declare function parseEmotionTags(text: string): {
|
|
5129
|
+
cleanText: string;
|
|
5130
|
+
emotion: string | null;
|
|
5131
|
+
};
|
|
5132
|
+
|
|
5150
5133
|
/**
|
|
5151
5134
|
* CharacterController — Renderer-agnostic avatar composition loop
|
|
5152
5135
|
*
|
|
@@ -5230,6 +5213,9 @@ declare class CharacterController {
|
|
|
5230
5213
|
private readonly gazeYawInfluence;
|
|
5231
5214
|
private readonly gazePitchInfluence;
|
|
5232
5215
|
private readonly gazeSmoothing;
|
|
5216
|
+
private readonly frameTimes;
|
|
5217
|
+
private frameTimeIdx;
|
|
5218
|
+
private frameTimeFill;
|
|
5233
5219
|
private readonly zeroBase;
|
|
5234
5220
|
private readonly outputBuffer;
|
|
5235
5221
|
private readonly compositorInput;
|
|
@@ -5249,6 +5235,17 @@ declare class CharacterController {
|
|
|
5249
5235
|
setProfile(profile: CharacterProfile): void;
|
|
5250
5236
|
/** Access underlying FaceCompositor for advanced use. */
|
|
5251
5237
|
get compositor(): FaceCompositor;
|
|
5238
|
+
/**
|
|
5239
|
+
* Get a snapshot of frame budget performance (rolling 2-second window).
|
|
5240
|
+
* Useful for runtime diagnostics / dev overlays.
|
|
5241
|
+
*/
|
|
5242
|
+
getPerformanceSnapshot(): {
|
|
5243
|
+
avgFrameUs: number;
|
|
5244
|
+
maxFrameUs: number;
|
|
5245
|
+
p95FrameUs: number;
|
|
5246
|
+
droppedFrames: number;
|
|
5247
|
+
totalFrames: number;
|
|
5248
|
+
};
|
|
5252
5249
|
/** Reset all state (smoothing, life layer, emotions). */
|
|
5253
5250
|
reset(): void;
|
|
5254
5251
|
dispose(): void;
|
|
@@ -5285,7 +5282,7 @@ interface MicLipSyncConfig {
|
|
|
5285
5282
|
micChunkSize?: number;
|
|
5286
5283
|
/** Per-character expression weight scaling */
|
|
5287
5284
|
profile?: ExpressionProfile;
|
|
5288
|
-
/** Identity/style index for
|
|
5285
|
+
/** Identity/style index for A2E model (default: 0) */
|
|
5289
5286
|
identityIndex?: number;
|
|
5290
5287
|
}
|
|
5291
5288
|
interface MicLipSyncFrame {
|
|
@@ -5324,9 +5321,10 @@ declare class MicLipSync extends EventEmitter<MicLipSyncEvents> {
|
|
|
5324
5321
|
private _state;
|
|
5325
5322
|
private _isSpeaking;
|
|
5326
5323
|
private _currentFrame;
|
|
5327
|
-
private _currentRawFrame;
|
|
5328
5324
|
private profile;
|
|
5329
5325
|
private _firstFrameEmitted;
|
|
5326
|
+
private readonly _profileBuffer;
|
|
5327
|
+
private vadQueue;
|
|
5330
5328
|
private speechStartTime;
|
|
5331
5329
|
private vadChunkSize;
|
|
5332
5330
|
private vadBuffer;
|
|
@@ -5356,47 +5354,6 @@ declare class MicLipSync extends EventEmitter<MicLipSyncEvents> {
|
|
|
5356
5354
|
private setState;
|
|
5357
5355
|
}
|
|
5358
5356
|
|
|
5359
|
-
/**
|
|
5360
|
-
* Shared types for orchestration layer
|
|
5361
|
-
*
|
|
5362
|
-
* @category Orchestration
|
|
5363
|
-
*/
|
|
5364
|
-
|
|
5365
|
-
type VoicePipelineState = 'idle' | 'loading' | 'ready' | 'listening' | 'thinking' | 'speaking' | 'error';
|
|
5366
|
-
interface LoadingProgress {
|
|
5367
|
-
currentModel: string;
|
|
5368
|
-
progress: number;
|
|
5369
|
-
totalModels: number;
|
|
5370
|
-
modelsLoaded: number;
|
|
5371
|
-
}
|
|
5372
|
-
interface TranscriptResult {
|
|
5373
|
-
text: string;
|
|
5374
|
-
emotion?: string;
|
|
5375
|
-
language?: string;
|
|
5376
|
-
event?: string;
|
|
5377
|
-
isFinal: boolean;
|
|
5378
|
-
inferenceTimeMs?: number;
|
|
5379
|
-
}
|
|
5380
|
-
/**
|
|
5381
|
-
* Consumer's response handler. VoicePipeline calls this with transcribed text.
|
|
5382
|
-
* Consumer must stream audio back for playback + lip sync.
|
|
5383
|
-
*/
|
|
5384
|
-
interface ResponseHandler {
|
|
5385
|
-
(params: {
|
|
5386
|
-
text: string;
|
|
5387
|
-
emotion?: string;
|
|
5388
|
-
event?: string;
|
|
5389
|
-
/** Stream audio chunks to pipeline for playback + lip sync */
|
|
5390
|
-
send: (chunk: Uint8Array) => Promise<void>;
|
|
5391
|
-
/** Call when all audio has been sent */
|
|
5392
|
-
done: () => Promise<void>;
|
|
5393
|
-
/** Aborted on interruption or stop() */
|
|
5394
|
-
signal: AbortSignal;
|
|
5395
|
-
/** Session ID for backend correlation */
|
|
5396
|
-
sessionId: string;
|
|
5397
|
-
}): Promise<void>;
|
|
5398
|
-
}
|
|
5399
|
-
|
|
5400
5357
|
/**
|
|
5401
5358
|
* VoicePipeline - Full conversational agent loop
|
|
5402
5359
|
*
|
|
@@ -5429,10 +5386,9 @@ interface VoicePipelineBaseConfig {
|
|
|
5429
5386
|
language?: string;
|
|
5430
5387
|
};
|
|
5431
5388
|
lam: {
|
|
5432
|
-
|
|
5433
|
-
|
|
5434
|
-
|
|
5435
|
-
mode?: 'auto' | 'gpu' | 'cpu';
|
|
5389
|
+
modelUrl: string;
|
|
5390
|
+
externalDataUrl?: string | false;
|
|
5391
|
+
backend?: 'auto' | 'webgpu' | 'wasm';
|
|
5436
5392
|
};
|
|
5437
5393
|
vad: {
|
|
5438
5394
|
modelUrl: string;
|
|
@@ -5442,10 +5398,8 @@ interface VoicePipelineBaseConfig {
|
|
|
5442
5398
|
};
|
|
5443
5399
|
/** Per-character expression weight scaling */
|
|
5444
5400
|
profile?: ExpressionProfile;
|
|
5445
|
-
/** Identity/style index for
|
|
5401
|
+
/** Identity/style index for A2E model (default: 0) */
|
|
5446
5402
|
identityIndex?: number;
|
|
5447
|
-
/** LAM load timeout in ms — CPU fallback on timeout (default: 30000) */
|
|
5448
|
-
lamLoadTimeoutMs?: number;
|
|
5449
5403
|
/** Base silence timeout in ms (default: 500) */
|
|
5450
5404
|
silenceTimeoutMs?: number;
|
|
5451
5405
|
/** Extended silence timeout for long utterances (default: 700) */
|
|
@@ -5514,13 +5468,7 @@ interface VoicePipelineLocalConfig extends VoicePipelineBaseConfig {
|
|
|
5514
5468
|
/** Optional text transform (e.g., LLM call). Receives transcript, returns response text. */
|
|
5515
5469
|
onTranscript?: (text: string) => string | Promise<string>;
|
|
5516
5470
|
}
|
|
5517
|
-
|
|
5518
|
-
interface VoicePipelineLegacyConfig extends VoicePipelineBaseConfig {
|
|
5519
|
-
mode?: undefined;
|
|
5520
|
-
/** Consumer's response handler */
|
|
5521
|
-
onResponse: ResponseHandler;
|
|
5522
|
-
}
|
|
5523
|
-
type VoicePipelineConfig = VoicePipelineCloudConfig | VoicePipelineLocalConfig | VoicePipelineLegacyConfig;
|
|
5471
|
+
type VoicePipelineConfig = VoicePipelineCloudConfig | VoicePipelineLocalConfig;
|
|
5524
5472
|
interface VoicePipelineEvents {
|
|
5525
5473
|
'state': VoicePipelineState;
|
|
5526
5474
|
'loading:progress': LoadingProgress;
|
|
@@ -5558,6 +5506,7 @@ declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
|
|
|
5558
5506
|
private interruption;
|
|
5559
5507
|
private omoteEvents;
|
|
5560
5508
|
private mic;
|
|
5509
|
+
private static readonly MAX_AUDIO_BUFFER_SAMPLES;
|
|
5561
5510
|
private audioBuffer;
|
|
5562
5511
|
private audioBufferSamples;
|
|
5563
5512
|
private speechStartTime;
|
|
@@ -5569,6 +5518,8 @@ declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
|
|
|
5569
5518
|
private lastProgressiveSamples;
|
|
5570
5519
|
private asrErrorCount;
|
|
5571
5520
|
private responseAbortController;
|
|
5521
|
+
private _unsubChunk;
|
|
5522
|
+
private _unsubLevel;
|
|
5572
5523
|
private _currentFrame;
|
|
5573
5524
|
/** Current pipeline state */
|
|
5574
5525
|
get state(): VoicePipelineState;
|
|
@@ -5586,7 +5537,7 @@ declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
|
|
|
5586
5537
|
*/
|
|
5587
5538
|
private loadFromBackends;
|
|
5588
5539
|
/**
|
|
5589
|
-
* Load from factories (original path).
|
|
5540
|
+
* Load from factories (original path). Loads SenseVoice, LAM, and VAD in parallel.
|
|
5590
5541
|
*/
|
|
5591
5542
|
private loadFromFactories;
|
|
5592
5543
|
start(): Promise<void>;
|
|
@@ -5612,4 +5563,86 @@ declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
|
|
|
5612
5563
|
private clearSilenceTimer;
|
|
5613
5564
|
}
|
|
5614
5565
|
|
|
5615
|
-
|
|
5566
|
+
/**
|
|
5567
|
+
* VoiceOrchestrator — Shared voice wiring for OmoteAvatar adapters.
|
|
5568
|
+
*
|
|
5569
|
+
* Composes TTSSpeaker (local mode) or PlaybackPipeline (cloud mode) with
|
|
5570
|
+
* SpeechListener and InterruptionHandler. Supports both local TTS and
|
|
5571
|
+
* cloud TTS via discriminated union config.
|
|
5572
|
+
*
|
|
5573
|
+
* Extracted from the ~70 identical lines duplicated across three/babylon/r3f
|
|
5574
|
+
* adapters into a single reusable class.
|
|
5575
|
+
*
|
|
5576
|
+
* @category Orchestration
|
|
5577
|
+
*/
|
|
5578
|
+
|
|
5579
|
+
interface VoiceOrchestratorBaseConfig {
|
|
5580
|
+
listener?: SpeechListenerConfig;
|
|
5581
|
+
interruptionEnabled?: boolean;
|
|
5582
|
+
profile?: ExpressionProfile;
|
|
5583
|
+
}
|
|
5584
|
+
interface VoiceOrchestratorLocalConfig extends VoiceOrchestratorBaseConfig {
|
|
5585
|
+
mode?: 'local';
|
|
5586
|
+
tts: TTSBackend;
|
|
5587
|
+
speaker?: TTSSpeakerConfig;
|
|
5588
|
+
onTranscript: (text: string, emotion?: string) => string | Promise<string> | AsyncGenerator<string>;
|
|
5589
|
+
}
|
|
5590
|
+
interface VoiceOrchestratorCloudConfig extends VoiceOrchestratorBaseConfig {
|
|
5591
|
+
mode: 'cloud';
|
|
5592
|
+
onResponse: ResponseHandler;
|
|
5593
|
+
lam?: {
|
|
5594
|
+
modelUrl?: string;
|
|
5595
|
+
externalDataUrl?: string | false;
|
|
5596
|
+
};
|
|
5597
|
+
}
|
|
5598
|
+
type VoiceOrchestratorConfig = VoiceOrchestratorLocalConfig | VoiceOrchestratorCloudConfig;
|
|
5599
|
+
interface VoiceOrchestratorEvents {
|
|
5600
|
+
'state': ConversationalState;
|
|
5601
|
+
'transcript': TranscriptResult;
|
|
5602
|
+
[key: string]: unknown;
|
|
5603
|
+
}
|
|
5604
|
+
declare class VoiceOrchestrator extends EventEmitter<VoiceOrchestratorEvents> {
|
|
5605
|
+
private speechListener;
|
|
5606
|
+
private interruption;
|
|
5607
|
+
private ttsSpeaker;
|
|
5608
|
+
private playbackPipeline;
|
|
5609
|
+
private ownedLam;
|
|
5610
|
+
private transcriptUnsub;
|
|
5611
|
+
private audioChunkUnsub;
|
|
5612
|
+
private connectEpoch;
|
|
5613
|
+
private responseAbortController;
|
|
5614
|
+
private _state;
|
|
5615
|
+
private _isSpeaking;
|
|
5616
|
+
private _frameSource;
|
|
5617
|
+
private _mode;
|
|
5618
|
+
private _sessionId;
|
|
5619
|
+
get state(): ConversationalState;
|
|
5620
|
+
get isSpeaking(): boolean;
|
|
5621
|
+
get frameSource(): FrameSource | null;
|
|
5622
|
+
/** Access the internal SpeechListener. */
|
|
5623
|
+
get listener(): SpeechListener | null;
|
|
5624
|
+
/** Access the internal TTSSpeaker (local mode only). */
|
|
5625
|
+
get speaker(): TTSSpeaker | null;
|
|
5626
|
+
connect(config: VoiceOrchestratorConfig): Promise<void>;
|
|
5627
|
+
disconnect(): Promise<void>;
|
|
5628
|
+
startListening(): Promise<void>;
|
|
5629
|
+
stopListening(): void;
|
|
5630
|
+
speak(text: string, options?: {
|
|
5631
|
+
signal?: AbortSignal;
|
|
5632
|
+
voice?: string;
|
|
5633
|
+
}): Promise<void>;
|
|
5634
|
+
streamText(options?: {
|
|
5635
|
+
signal?: AbortSignal;
|
|
5636
|
+
voice?: string;
|
|
5637
|
+
}): Promise<{
|
|
5638
|
+
push: (token: string) => void;
|
|
5639
|
+
end: () => Promise<void>;
|
|
5640
|
+
}>;
|
|
5641
|
+
stopSpeaking(): void;
|
|
5642
|
+
private wireLocalTranscript;
|
|
5643
|
+
private wireCloudTranscript;
|
|
5644
|
+
private handleInterruption;
|
|
5645
|
+
private setState;
|
|
5646
|
+
}
|
|
5647
|
+
|
|
5648
|
+
export { type A2EBackend, A2EInference, type A2EInferenceConfig, type A2EModelInfo, A2EProcessor, type A2EProcessorConfig, type A2EResult, A2EUnifiedAdapter, ALL_AUS, ARKIT_BLENDSHAPES, type AUActivation, AU_TO_ARKIT, type ActiveSpan, type AnimationClip, type AnimationController, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationSource, type AnimationSourceOptions, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, type BoneFilterConfig, type CacheConfig, type CacheSpanAttributes, CharacterController, type CharacterControllerConfig, type CharacterProfile, type CharacterUpdateInput, type CharacterUpdateOutput, ConsoleExporter, type ConversationalState, type CreateA2EConfig, type CreateKokoroTTSConfig, type CreateSenseVoiceConfig, type CreateTTSPlayerConfig, DEFAULT_ANIMATION_CONFIG, DEFAULT_BONE_FILTER, DEFAULT_MODEL_URLS, EMOTION_NAMES, EMOTION_TO_AU, EMOTION_VECTOR_SIZE, EXPLICIT_EMOTION_COUNT, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionResolver, type EmotionWeights, EmphasisDetector, type ErrorType, ErrorTypes, EventEmitter, type ExpressionProfile, FaceCompositor, type FaceCompositorConfig, type FaceCompositorInput, type FaceCompositorOutput, type FetchWithCacheOptions, type FrameSource, type FullFaceFrame, HF_CDN_URLS, INFERENCE_LATENCY_BUCKETS, type InferenceFactoryConfig, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, KOKORO_VOICES, type KokoroStreamChunk, type KokoroTTSConfig, KokoroTTSInference, type KokoroTTSModelInfo, type KokoroTTSResult, KokoroTTSUnifiedAdapter, KokoroTTSWorker, type KokoroVoiceName, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LoadingProgress, MIXAMO_PREFIX, MODEL_LOAD_TIME_BUCKETS, type MetricData, MetricNames, MicLipSync, type MicLipSyncConfig, type MicLipSyncEvents, type MicLipSyncFrame, type MicLipSyncState, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, type ModelUrlKey, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, PRESERVE_POSITION_BONES, PlaybackPipeline, type PlaybackPipelineConfig, type PlaybackPipelineEvents, type PlaybackState, ProceduralLifeLayer, type Quat, type QuotaInfo, type ResolvedEmotion, type ResponseHandler, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, SpeechListener, type SpeechListenerConfig, type SpeechListenerEvents, type SpeechListenerState, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type SynthesizeOptions, type TTSBackend, type TTSChunk, TTSPlayback, type TTSPlaybackConfig, type TTSPlaybackEvents, TTSPlayer, TTSSpeaker, type TTSSpeakerConfig, type TTSStreamOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TrackDescriptor, type TranscriptResult, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type Vec3, VoiceOrchestrator, type VoiceOrchestratorCloudConfig, type VoiceOrchestratorConfig, type VoiceOrchestratorEvents, type VoiceOrchestratorLocalConfig, VoicePipeline, type VoicePipelineCloudConfig, type VoicePipelineConfig, type VoicePipelineEvents, type VoicePipelineLocalConfig, type VoicePipelineState, A2EInference as Wav2Vec2Inference, type WorkerHealthState, analyzeTextEmotion, applyProfile, blendEmotions, calculatePeak, calculateRMS, configureCacheLimit, configureModelUrls, configureOrtCdn, configureTelemetry, createA2E, createEmotionVector, createKokoroTTS, createSenseVoice, createSileroVAD, createTTSPlayer, fetchWithCache, float32ToPcm16, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getOrtCdnBase, getRecommendedBackend, getTelemetry, hasWebGPUApi, int16ToFloat32, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, listVoices as listKokoroVoices, parseEmotionTags, pcm16ToFloat32, preloadModels, resampleLinear, resetModelUrls, resolveBackend, resolveEmotion, shouldEnableWasmProxy, shouldKeepTrack, shouldUseNativeASR, shouldUseServerA2E, stripMixamoPrefix, supportsVADWorker, ttsToPlaybackFormat, validateTTSInput };
|