@omote/core 0.6.6 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +28 -25
- package/dist/Logger-BeUI6jG7.d.mts +145 -0
- package/dist/Logger-BeUI6jG7.d.ts +145 -0
- package/dist/{Logger-I_k4sGhM.d.mts → Logger-DSoGAYJu.d.mts} +1 -1
- package/dist/{Logger-I_k4sGhM.d.ts → Logger-DSoGAYJu.d.ts} +1 -1
- package/dist/chunk-3NDJA3I4.mjs +853 -0
- package/dist/chunk-3NDJA3I4.mjs.map +1 -0
- package/dist/chunk-J5LAM7VW.mjs +44 -0
- package/dist/chunk-J5LAM7VW.mjs.map +1 -0
- package/dist/chunk-MXKJOF4I.mjs +38 -0
- package/dist/chunk-MXKJOF4I.mjs.map +1 -0
- package/dist/events/index.d.mts +2 -87
- package/dist/events/index.d.ts +2 -87
- package/dist/events/index.js +8 -2
- package/dist/events/index.js.map +1 -1
- package/dist/events/index.mjs +1 -1
- package/dist/index.d.mts +2040 -1235
- package/dist/index.d.ts +2040 -1235
- package/dist/index.js +10650 -7809
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +9319 -7040
- package/dist/index.mjs.map +1 -1
- package/dist/logging/index.d.mts +2 -2
- package/dist/logging/index.d.ts +2 -2
- package/dist/logging/index.js +11 -0
- package/dist/logging/index.js.map +1 -1
- package/dist/logging/index.mjs +1 -1
- package/package.json +3 -1
package/dist/index.d.mts
CHANGED
|
@@ -1,8 +1,60 @@
|
|
|
1
1
|
import { EventEmitter, OmoteEvents } from './events/index.mjs';
|
|
2
2
|
export { AnimationEvent, BackendEvent, EmotionEvent, GazeEvent, STTFinalEvent, STTPartialEvent, SessionStateEvent, TTSEndEvent, TTSMarkEvent, TTSStartEvent, VisemeEvent } from './events/index.mjs';
|
|
3
|
-
export { D as DEFAULT_LOGGING_CONFIG, I as ILogger,
|
|
3
|
+
export { D as DEFAULT_LOGGING_CONFIG, I as ILogger, a as LOG_LEVEL_PRIORITY, b as LogEntry, L as LogFormatter, c as LogLevel, d as LogSink, e as LoggingConfig, g as configureLogging, h as createLogger, i as getLoggingConfig, n as noopLogger, r as resetLoggingConfig, s as setLogLevel, k as setLoggingEnabled } from './Logger-BeUI6jG7.mjs';
|
|
4
4
|
export { ARKitToFLAMEMapping, ApiError, AudioChunkEvent, AvatarFormat, Character, CharacterAvatar, CharacterMemory, CharacterPersonality, CharacterSpec, CharacterVoice, CreateCharacterRequest, CreateCharacterResponse, CreateLAMJobRequest, CreateLAMJobResponse, CreateSessionRequest, CreateSessionResponse, GSplatConfig, LAMJob, LAMJobStatus, PROTOCOL_VERSION, PaginatedResponse, PlatformSession, ErrorEvent as ProtocolErrorEvent, ProtocolEvent, ResponseChunkEvent, ResponseEndEvent, ResponseStartEvent, SessionMessage, SessionStatus, isProtocolEvent } from '@omote/types';
|
|
5
5
|
|
|
6
|
+
/**
|
|
7
|
+
* Audio format conversion utilities
|
|
8
|
+
*
|
|
9
|
+
* Bridges the gap between TTS engines (Float32 at various sample rates)
|
|
10
|
+
* and playback pipelines (Uint8Array PCM16 at 16kHz).
|
|
11
|
+
*
|
|
12
|
+
* @module audio/audioConvert
|
|
13
|
+
*/
|
|
14
|
+
/**
|
|
15
|
+
* Convert Float32 [-1,1] samples to PCM16 Uint8Array (little-endian).
|
|
16
|
+
*
|
|
17
|
+
* @param samples - Float32Array of normalized audio samples
|
|
18
|
+
* @returns Uint8Array of PCM16 bytes (2 bytes per sample, little-endian)
|
|
19
|
+
*/
|
|
20
|
+
declare function float32ToPcm16(samples: Float32Array): Uint8Array;
|
|
21
|
+
/**
|
|
22
|
+
* Linear interpolation resampler.
|
|
23
|
+
* Good enough for speech (no sinc filtering needed).
|
|
24
|
+
*
|
|
25
|
+
* @param samples - Input audio samples
|
|
26
|
+
* @param fromRate - Source sample rate (e.g., 24000)
|
|
27
|
+
* @param toRate - Target sample rate (e.g., 16000)
|
|
28
|
+
* @returns Resampled Float32Array
|
|
29
|
+
*/
|
|
30
|
+
declare function resampleLinear(samples: Float32Array, fromRate: number, toRate: number): Float32Array;
|
|
31
|
+
/**
|
|
32
|
+
* Convenience: resample + encode in one call.
|
|
33
|
+
* Converts TTS output (Float32 at TTS rate) to pipeline format (PCM16 Uint8Array at 16kHz).
|
|
34
|
+
*
|
|
35
|
+
* @param audio - Float32Array from TTS engine
|
|
36
|
+
* @param sourceRate - TTS engine's output sample rate (default: 24000)
|
|
37
|
+
* @param targetRate - Pipeline's expected sample rate (default: 16000)
|
|
38
|
+
* @returns Uint8Array PCM16 at target rate
|
|
39
|
+
*/
|
|
40
|
+
declare function ttsToPlaybackFormat(audio: Float32Array, sourceRate?: number, targetRate?: number): Uint8Array;
|
|
41
|
+
|
|
42
|
+
/**
|
|
43
|
+
* Shared audio utility functions
|
|
44
|
+
*
|
|
45
|
+
* @module audio
|
|
46
|
+
*/
|
|
47
|
+
/**
|
|
48
|
+
* Safely convert an ArrayBuffer of PCM16 bytes to Float32 samples.
|
|
49
|
+
* Handles odd-length buffers by truncating to the nearest even byte boundary.
|
|
50
|
+
*/
|
|
51
|
+
declare function pcm16ToFloat32(buffer: ArrayBuffer): Float32Array;
|
|
52
|
+
/**
|
|
53
|
+
* Convert Int16Array samples to Float32Array.
|
|
54
|
+
* Each sample is divided by 32768 to normalize to [-1, 1] range.
|
|
55
|
+
*/
|
|
56
|
+
declare function int16ToFloat32(int16: Int16Array): Float32Array;
|
|
57
|
+
|
|
6
58
|
/**
|
|
7
59
|
* Microphone capture - renderer-agnostic audio input
|
|
8
60
|
*
|
|
@@ -119,6 +171,8 @@ declare class AudioScheduler {
|
|
|
119
171
|
private scheduledSources;
|
|
120
172
|
private isPlaying;
|
|
121
173
|
constructor(options?: AudioSchedulerOptions);
|
|
174
|
+
/** Configured sample rate (default: 16000). */
|
|
175
|
+
get sampleRate(): number;
|
|
122
176
|
/**
|
|
123
177
|
* Initialize AudioContext with specified sample rate
|
|
124
178
|
*
|
|
@@ -377,19 +431,6 @@ declare function shouldEnableWasmProxy(): boolean;
|
|
|
377
431
|
* @returns true if running in Safari on any platform
|
|
378
432
|
*/
|
|
379
433
|
declare function isSafari(): boolean;
|
|
380
|
-
/**
|
|
381
|
-
* Recommend using CPU-optimized A2E model (wav2arkit_cpu)
|
|
382
|
-
*
|
|
383
|
-
* All iOS browsers use WebKit and have tight memory limits — the 192MB fp16
|
|
384
|
-
* LAM model causes silent crashes. wav2arkit_cpu uses URL pass-through
|
|
385
|
-
* (ORT fetches the 402MB weights directly into WASM, no JS heap copy).
|
|
386
|
-
*
|
|
387
|
-
* macOS Safari also needs this due to ONNX Runtime JSEP/ASYNCIFY bugs
|
|
388
|
-
* that crash WebKit's JIT compiler.
|
|
389
|
-
*
|
|
390
|
-
* @returns true if iOS (any browser) or Safari (any platform)
|
|
391
|
-
*/
|
|
392
|
-
declare function shouldUseCpuA2E(): boolean;
|
|
393
434
|
/**
|
|
394
435
|
* Check if Web Speech API is available in the browser
|
|
395
436
|
*
|
|
@@ -427,9 +468,8 @@ declare function shouldUseServerA2E(): boolean;
|
|
|
427
468
|
/**
|
|
428
469
|
* Common interface for audio-to-expression (A2E) inference backends
|
|
429
470
|
*
|
|
430
|
-
*
|
|
431
|
-
*
|
|
432
|
-
* work with either model transparently.
|
|
471
|
+
* Implemented by A2EInference and A2EUnifiedAdapter, allowing PlaybackPipeline
|
|
472
|
+
* and A2EProcessor to work with either implementation transparently.
|
|
433
473
|
*
|
|
434
474
|
* @category Inference
|
|
435
475
|
*/
|
|
@@ -458,15 +498,22 @@ interface A2EResult {
|
|
|
458
498
|
inferenceTimeMs: number;
|
|
459
499
|
}
|
|
460
500
|
/**
|
|
461
|
-
* Common interface for A2E (audio-to-expression) inference engines
|
|
501
|
+
* Common interface for A2E (audio-to-expression) inference engines.
|
|
502
|
+
*
|
|
503
|
+
* A2E is the SDK term for audio-to-expression inference. The underlying model
|
|
504
|
+
* is called **LAM** (Large Animation Model). "A2E" and "LAM" refer to the same
|
|
505
|
+
* pipeline — A2E is the interface abstraction, LAM is the model.
|
|
462
506
|
*
|
|
463
507
|
* Implemented by:
|
|
464
|
-
* -
|
|
465
|
-
* -
|
|
508
|
+
* - {@link A2EInference} (WebGPU/WASM, 192MB fp16)
|
|
509
|
+
* - A2EUnifiedAdapter (shared unified worker)
|
|
510
|
+
*
|
|
511
|
+
* @see {@link A2EInference} for direct usage
|
|
512
|
+
* @see {@link createA2E} for the recommended factory API
|
|
466
513
|
*/
|
|
467
514
|
interface A2EBackend {
|
|
468
|
-
/** Model identifier
|
|
469
|
-
readonly modelId: '
|
|
515
|
+
/** Model identifier */
|
|
516
|
+
readonly modelId: 'a2e';
|
|
470
517
|
/** Current backend type (webgpu, wasm, or null if not loaded) */
|
|
471
518
|
readonly backend: RuntimeBackend | null;
|
|
472
519
|
/** Whether the model is loaded and ready for inference */
|
|
@@ -538,213 +585,7 @@ declare const BLENDSHAPE_TO_GROUP: Map<string, BlendshapeGroup>;
|
|
|
538
585
|
* 2. Otherwise, use the group scaler (default 1.0)
|
|
539
586
|
* 3. Clamp result to [0, 1]
|
|
540
587
|
*/
|
|
541
|
-
declare function applyProfile(raw: Float32Array, profile: ExpressionProfile): Float32Array;
|
|
542
|
-
|
|
543
|
-
/**
|
|
544
|
-
* FullFacePipeline - A2E expression pipeline with ExpressionProfile weight scaling
|
|
545
|
-
*
|
|
546
|
-
* Orchestrates full-face animation by:
|
|
547
|
-
* 1. Scheduling audio for playback immediately (audio-first, never waits for A2E)
|
|
548
|
-
* 2. Running A2E inference in background (fire-and-forget via A2EProcessor)
|
|
549
|
-
* 3. Applying per-character ExpressionProfile scaling to raw A2E output
|
|
550
|
-
*
|
|
551
|
-
* The A2E model outputs all 52 ARKit blendshapes from audio — brows, eyes, cheeks,
|
|
552
|
-
* mouth, jaw, everything. ExpressionProfile allows per-character weight scaling
|
|
553
|
-
* by group (eyes, brows, jaw, mouth, cheeks, nose, tongue) with per-blendshape overrides.
|
|
554
|
-
*
|
|
555
|
-
* @deprecated Use {@link PlaybackPipeline} from `@omote/core` instead. PlaybackPipeline
|
|
556
|
-
* is a superset with sync mode (`feedBuffer`), state tracking, and opt-in neutral transition.
|
|
557
|
-
* FullFacePipeline will continue to work but is no longer actively developed.
|
|
558
|
-
*
|
|
559
|
-
* @category Audio
|
|
560
|
-
*
|
|
561
|
-
* @example Basic usage
|
|
562
|
-
* ```typescript
|
|
563
|
-
* import { FullFacePipeline } from '@omote/core';
|
|
564
|
-
*
|
|
565
|
-
* const pipeline = new FullFacePipeline({
|
|
566
|
-
* lam,
|
|
567
|
-
* profile: { mouth: 1.2, brows: 0.8 },
|
|
568
|
-
* });
|
|
569
|
-
* await pipeline.initialize();
|
|
570
|
-
*
|
|
571
|
-
* pipeline.on('full_frame_ready', (frame) => {
|
|
572
|
-
* applyToAvatar(frame.blendshapes);
|
|
573
|
-
* });
|
|
574
|
-
*
|
|
575
|
-
* pipeline.start();
|
|
576
|
-
* await pipeline.onAudioChunk(audioData);
|
|
577
|
-
* ```
|
|
578
|
-
*/
|
|
579
|
-
|
|
580
|
-
/**
|
|
581
|
-
* Configuration for FullFacePipeline
|
|
582
|
-
*/
|
|
583
|
-
interface FullFacePipelineOptions {
|
|
584
|
-
/** Sample rate in Hz (default: 16000) */
|
|
585
|
-
sampleRate?: number;
|
|
586
|
-
/** Target chunk duration in ms for coalescing (default: 200) */
|
|
587
|
-
chunkTargetMs?: number;
|
|
588
|
-
/**
|
|
589
|
-
* Audio playback delay in ms before first audio plays.
|
|
590
|
-
* Gives A2E inference time to pre-compute blendshapes before audio
|
|
591
|
-
* starts, preventing frame drops/desync. Must be ≥ chunkSize
|
|
592
|
-
* accumulation time + inference latency.
|
|
593
|
-
*
|
|
594
|
-
* Default: auto-calculated from chunkSize and backend type.
|
|
595
|
-
*/
|
|
596
|
-
audioDelayMs?: number;
|
|
597
|
-
/**
|
|
598
|
-
* A2E inference chunk size in samples.
|
|
599
|
-
* Controls how many samples accumulate before each inference call.
|
|
600
|
-
* Smaller = lower latency (less delay before first frame), more overhead.
|
|
601
|
-
* Larger = higher latency, less overhead.
|
|
602
|
-
*
|
|
603
|
-
* Default: 16000 (1s) — the model's native window size.
|
|
604
|
-
* Smaller chunks get zero-padded, causing near-zero blendshape output.
|
|
605
|
-
*/
|
|
606
|
-
chunkSize?: number;
|
|
607
|
-
/** A2E inference engine */
|
|
608
|
-
lam: A2EBackend;
|
|
609
|
-
/**
|
|
610
|
-
* Identity/style index for the A2E model (default: 0).
|
|
611
|
-
*
|
|
612
|
-
* The LAM model uses a 12-class one-hot identity vector as style conditioning.
|
|
613
|
-
* Different indices produce different expression intensity across face regions.
|
|
614
|
-
* Only affects Wav2Vec2Inference (GPU). Wav2ArkitCpuInference has identity 11 baked in.
|
|
615
|
-
*/
|
|
616
|
-
identityIndex?: number;
|
|
617
|
-
/** Per-character expression weight scaling */
|
|
618
|
-
profile?: ExpressionProfile;
|
|
619
|
-
/**
|
|
620
|
-
* Time in ms with no new inference frames before logging a stale warning.
|
|
621
|
-
*
|
|
622
|
-
* Must be larger than the inter-batch gap (chunkSize/sampleRate + inference time).
|
|
623
|
-
* Default: 2000
|
|
624
|
-
*/
|
|
625
|
-
staleThresholdMs?: number;
|
|
626
|
-
}
|
|
627
|
-
/**
|
|
628
|
-
* Full face frame with scaled blendshapes
|
|
629
|
-
*/
|
|
630
|
-
interface FullFaceFrame$1 {
|
|
631
|
-
/** Scaled 52 ARKit blendshapes (ExpressionProfile applied) */
|
|
632
|
-
blendshapes: Float32Array;
|
|
633
|
-
/** Raw A2E output (52 blendshapes, before profile scaling) */
|
|
634
|
-
rawBlendshapes: Float32Array;
|
|
635
|
-
/** AudioContext timestamp for this frame */
|
|
636
|
-
timestamp: number;
|
|
637
|
-
}
|
|
638
|
-
/**
|
|
639
|
-
* Events emitted by FullFacePipeline
|
|
640
|
-
*/
|
|
641
|
-
interface FullFacePipelineEvents {
|
|
642
|
-
/** New merged frame ready for display */
|
|
643
|
-
full_frame_ready: FullFaceFrame$1;
|
|
644
|
-
/** Raw LAM frame ready (for debugging/monitoring) */
|
|
645
|
-
lam_frame_ready: Float32Array;
|
|
646
|
-
/** Playback has completed */
|
|
647
|
-
playback_complete: void;
|
|
648
|
-
/** First frame ready, playback starting */
|
|
649
|
-
playback_start: number;
|
|
650
|
-
/** Error occurred */
|
|
651
|
-
error: Error;
|
|
652
|
-
/** Index signature for EventEmitter compatibility */
|
|
653
|
-
[key: string]: unknown;
|
|
654
|
-
}
|
|
655
|
-
/**
|
|
656
|
-
* FullFacePipeline - A2E animation pipeline with ExpressionProfile scaling
|
|
657
|
-
*
|
|
658
|
-
* Audio-first design matching SyncedAudioPipeline:
|
|
659
|
-
* - Audio is scheduled immediately (never waits for A2E)
|
|
660
|
-
* - A2E runs in background (fire-and-forget via A2EProcessor)
|
|
661
|
-
* - ExpressionProfile scales raw A2E output per-character
|
|
662
|
-
*/
|
|
663
|
-
declare class FullFacePipeline extends EventEmitter<FullFacePipelineEvents> {
|
|
664
|
-
private readonly options;
|
|
665
|
-
private scheduler;
|
|
666
|
-
private coalescer;
|
|
667
|
-
private processor;
|
|
668
|
-
private playbackStarted;
|
|
669
|
-
private monitorInterval;
|
|
670
|
-
private frameAnimationId;
|
|
671
|
-
private lastNewFrameTime;
|
|
672
|
-
private lastKnownLamFrame;
|
|
673
|
-
private staleWarningEmitted;
|
|
674
|
-
private readonly staleThresholdMs;
|
|
675
|
-
private frameLoopCount;
|
|
676
|
-
private profile;
|
|
677
|
-
constructor(options: FullFacePipelineOptions);
|
|
678
|
-
/**
|
|
679
|
-
* Initialize the pipeline
|
|
680
|
-
*/
|
|
681
|
-
initialize(): Promise<void>;
|
|
682
|
-
/**
|
|
683
|
-
* Update the ExpressionProfile at runtime (e.g., character switch).
|
|
684
|
-
*/
|
|
685
|
-
setProfile(profile: ExpressionProfile): void;
|
|
686
|
-
/**
|
|
687
|
-
* Apply ExpressionProfile scaling to raw A2E blendshapes.
|
|
688
|
-
*
|
|
689
|
-
* Delegates to the standalone applyProfile() utility from expressionProfile.ts.
|
|
690
|
-
*/
|
|
691
|
-
applyProfile(raw: Float32Array): Float32Array;
|
|
692
|
-
/**
|
|
693
|
-
* Start a new playback session
|
|
694
|
-
*
|
|
695
|
-
* Resets all state and prepares for incoming audio chunks.
|
|
696
|
-
* Audio will be scheduled immediately as chunks arrive (no buffering).
|
|
697
|
-
*/
|
|
698
|
-
start(): void;
|
|
699
|
-
/**
|
|
700
|
-
* Receive audio chunk from network
|
|
701
|
-
*
|
|
702
|
-
* Audio-first design: schedules audio immediately, A2E runs in background.
|
|
703
|
-
* This prevents A2E inference (50-300ms) from blocking audio scheduling.
|
|
704
|
-
*
|
|
705
|
-
* @param chunk - Uint8Array containing Int16 PCM audio
|
|
706
|
-
*/
|
|
707
|
-
onAudioChunk(chunk: Uint8Array): Promise<void>;
|
|
708
|
-
/**
|
|
709
|
-
* Start frame animation loop
|
|
710
|
-
*
|
|
711
|
-
* Polls A2EProcessor at render rate (60fps) for the latest inference frame
|
|
712
|
-
* matching the current AudioContext time. Between inference batches (~30fps
|
|
713
|
-
* bursts), getFrameForTime() holds the last frame.
|
|
714
|
-
*/
|
|
715
|
-
private startFrameLoop;
|
|
716
|
-
/**
|
|
717
|
-
* End of audio stream
|
|
718
|
-
*/
|
|
719
|
-
end(): Promise<void>;
|
|
720
|
-
/**
|
|
721
|
-
* Stop playback immediately with smooth fade-out
|
|
722
|
-
*/
|
|
723
|
-
stop(fadeOutMs?: number): Promise<void>;
|
|
724
|
-
/**
|
|
725
|
-
* Start monitoring for playback completion
|
|
726
|
-
*/
|
|
727
|
-
private startMonitoring;
|
|
728
|
-
/**
|
|
729
|
-
* Stop monitoring
|
|
730
|
-
*/
|
|
731
|
-
private stopMonitoring;
|
|
732
|
-
/**
|
|
733
|
-
* Get current pipeline state (for debugging/monitoring)
|
|
734
|
-
*/
|
|
735
|
-
getState(): {
|
|
736
|
-
playbackStarted: boolean;
|
|
737
|
-
coalescerFill: number;
|
|
738
|
-
processorFill: number;
|
|
739
|
-
queuedFrames: number;
|
|
740
|
-
currentTime: number;
|
|
741
|
-
playbackEndTime: number;
|
|
742
|
-
};
|
|
743
|
-
/**
|
|
744
|
-
* Cleanup resources
|
|
745
|
-
*/
|
|
746
|
-
dispose(): void;
|
|
747
|
-
}
|
|
588
|
+
declare function applyProfile(raw: Float32Array, profile: ExpressionProfile, out?: Float32Array): Float32Array;
|
|
748
589
|
|
|
749
590
|
/**
|
|
750
591
|
* PlaybackPipeline - Audio playback + A2E lip sync with ExpressionProfile scaling
|
|
@@ -770,7 +611,7 @@ interface PlaybackPipelineConfig {
|
|
|
770
611
|
audioDelayMs?: number;
|
|
771
612
|
/** A2E inference chunk size in samples (default: 16000) */
|
|
772
613
|
chunkSize?: number;
|
|
773
|
-
/** Identity/style index for
|
|
614
|
+
/** Identity/style index for A2E model (default: 0) */
|
|
774
615
|
identityIndex?: number;
|
|
775
616
|
/** Per-character expression weight scaling */
|
|
776
617
|
profile?: ExpressionProfile;
|
|
@@ -791,6 +632,8 @@ interface FullFaceFrame {
|
|
|
791
632
|
rawBlendshapes: Float32Array;
|
|
792
633
|
/** AudioContext timestamp for this frame */
|
|
793
634
|
timestamp: number;
|
|
635
|
+
/** Emotion label for this frame (from SenseVoice, text heuristics, or LLM tags) */
|
|
636
|
+
emotion?: string;
|
|
794
637
|
}
|
|
795
638
|
interface PlaybackPipelineEvents {
|
|
796
639
|
/** New frame ready for display (scaled by ExpressionProfile) */
|
|
@@ -809,10 +652,6 @@ interface PlaybackPipelineEvents {
|
|
|
809
652
|
'error': Error;
|
|
810
653
|
/** State changed */
|
|
811
654
|
'state': PlaybackState;
|
|
812
|
-
'full_frame_ready': FullFaceFrame;
|
|
813
|
-
'lam_frame_ready': Float32Array;
|
|
814
|
-
'playback_complete': void;
|
|
815
|
-
'playback_start': number;
|
|
816
655
|
[key: string]: unknown;
|
|
817
656
|
}
|
|
818
657
|
declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
|
|
@@ -830,6 +669,7 @@ declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
|
|
|
830
669
|
private staleWarningEmitted;
|
|
831
670
|
private readonly staleThresholdMs;
|
|
832
671
|
private frameLoopCount;
|
|
672
|
+
private sessionStartTime;
|
|
833
673
|
private profile;
|
|
834
674
|
private readonly neutralTransitionEnabled;
|
|
835
675
|
private readonly neutralTransitionMs;
|
|
@@ -838,6 +678,8 @@ declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
|
|
|
838
678
|
private neutralAnimationId;
|
|
839
679
|
private _currentFrame;
|
|
840
680
|
private _currentRawFrame;
|
|
681
|
+
private _emotion;
|
|
682
|
+
private readonly _profileBuffer;
|
|
841
683
|
/** Current pipeline state */
|
|
842
684
|
get state(): PlaybackState;
|
|
843
685
|
/** Current scaled blendshapes (updated in-place for perf) */
|
|
@@ -849,6 +691,8 @@ declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
|
|
|
849
691
|
initialize(): Promise<void>;
|
|
850
692
|
/** Update ExpressionProfile at runtime */
|
|
851
693
|
setProfile(profile: ExpressionProfile): void;
|
|
694
|
+
/** Set the emotion label to include in emitted frames */
|
|
695
|
+
setEmotion(emotion: string | null): void;
|
|
852
696
|
/**
|
|
853
697
|
* Start a new playback session.
|
|
854
698
|
* Idempotent — calling during playback resets cleanly without emitting
|
|
@@ -888,103 +732,161 @@ declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
|
|
|
888
732
|
}
|
|
889
733
|
|
|
890
734
|
/**
|
|
891
|
-
*
|
|
735
|
+
* TTSBackend — Streaming text-to-speech backend interface.
|
|
892
736
|
*
|
|
893
|
-
*
|
|
894
|
-
*
|
|
895
|
-
* - Detects when user interrupts AI response
|
|
896
|
-
* - Triggers interruption callbacks
|
|
897
|
-
*/
|
|
898
|
-
|
|
899
|
-
interface InterruptionEvents {
|
|
900
|
-
[key: string]: unknown;
|
|
901
|
-
'speech.detected': {
|
|
902
|
-
rms: number;
|
|
903
|
-
};
|
|
904
|
-
'speech.ended': {
|
|
905
|
-
durationMs: number;
|
|
906
|
-
};
|
|
907
|
-
'interruption.triggered': {
|
|
908
|
-
rms: number;
|
|
909
|
-
durationMs: number;
|
|
910
|
-
};
|
|
911
|
-
}
|
|
912
|
-
/**
|
|
913
|
-
* Interruption handler configuration
|
|
737
|
+
* Any TTS engine (Kokoro, ElevenLabs, etc.) can implement this contract
|
|
738
|
+
* to integrate with TTSPlayback and VoicePipeline.
|
|
914
739
|
*
|
|
915
|
-
*
|
|
916
|
-
* - vadThreshold: 0.5 (Silero VAD default)
|
|
917
|
-
* - minSpeechDurationMs: 200ms (Google/Amazon barge-in standard)
|
|
918
|
-
* - silenceTimeoutMs: 500ms (OpenAI Realtime API standard)
|
|
740
|
+
* @category Inference
|
|
919
741
|
*/
|
|
920
|
-
interface InterruptionConfig {
|
|
921
|
-
/** VAD probability threshold for speech detection (default: 0.5, Silero standard) */
|
|
922
|
-
vadThreshold?: number;
|
|
923
|
-
/** Minimum speech duration to trigger interruption (default: 200ms, Google/Amazon standard) */
|
|
924
|
-
minSpeechDurationMs?: number;
|
|
925
|
-
/** Silence duration to end speech (default: 500ms, OpenAI standard) */
|
|
926
|
-
silenceTimeoutMs?: number;
|
|
927
|
-
/** Enable interruption detection (default: true) */
|
|
928
|
-
enabled?: boolean;
|
|
929
|
-
}
|
|
930
|
-
declare class InterruptionHandler extends EventEmitter<InterruptionEvents> {
|
|
931
|
-
private config;
|
|
932
|
-
private isSpeaking;
|
|
933
|
-
private speechStartTime;
|
|
934
|
-
private lastSpeechTime;
|
|
935
|
-
private silenceTimer;
|
|
936
|
-
private aiIsSpeaking;
|
|
937
|
-
private interruptionTriggeredThisSession;
|
|
938
|
-
constructor(config?: InterruptionConfig);
|
|
939
|
-
/**
|
|
940
|
-
* Process VAD result for interruption detection
|
|
941
|
-
* @param vadProbability - Speech probability from VAD (0-1)
|
|
942
|
-
* @param audioEnergy - Optional RMS energy for logging (default: 0)
|
|
943
|
-
*/
|
|
944
|
-
processVADResult(vadProbability: number, audioEnergy?: number): void;
|
|
945
|
-
/** Notify that AI started/stopped speaking */
|
|
946
|
-
setAISpeaking(speaking: boolean): void;
|
|
947
|
-
/** Enable/disable interruption detection */
|
|
948
|
-
setEnabled(enabled: boolean): void;
|
|
949
|
-
/** Update configuration */
|
|
950
|
-
updateConfig(config: Partial<InterruptionConfig>): void;
|
|
951
|
-
/** Reset state */
|
|
952
|
-
reset(): void;
|
|
953
|
-
/** Get current state */
|
|
954
|
-
getState(): {
|
|
955
|
-
isSpeaking: boolean;
|
|
956
|
-
speechDurationMs: number;
|
|
957
|
-
};
|
|
958
|
-
private onSpeechDetected;
|
|
959
|
-
private onSilenceDetected;
|
|
960
|
-
}
|
|
961
|
-
|
|
962
742
|
/**
|
|
963
|
-
*
|
|
964
|
-
*
|
|
965
|
-
* This module provides a way to dynamically load the appropriate ONNX Runtime bundle
|
|
966
|
-
* based on the platform's capabilities. This is critical for iOS support because:
|
|
743
|
+
* Streaming TTS backend interface.
|
|
967
744
|
*
|
|
968
|
-
*
|
|
969
|
-
*
|
|
970
|
-
*
|
|
745
|
+
* Implementations must provide:
|
|
746
|
+
* - `stream()` for sentence-by-sentence audio generation
|
|
747
|
+
* - `sampleRate` for format conversion
|
|
748
|
+
* - `load()` for model initialization
|
|
971
749
|
*
|
|
972
|
-
*
|
|
750
|
+
* @example
|
|
973
751
|
* ```typescript
|
|
974
|
-
* const
|
|
975
|
-
*
|
|
976
|
-
* ```
|
|
752
|
+
* const kokoro: TTSBackend = new KokoroTTSInference({ defaultVoice: 'af_heart' });
|
|
753
|
+
* await kokoro.load();
|
|
977
754
|
*
|
|
978
|
-
*
|
|
755
|
+
* for await (const chunk of kokoro.stream("Hello world!", { voice: 'af_heart' })) {
|
|
756
|
+
* // chunk.audio is Float32Array at kokoro.sampleRate
|
|
757
|
+
* }
|
|
758
|
+
* ```
|
|
979
759
|
*/
|
|
760
|
+
interface TTSBackend {
|
|
761
|
+
/** Stream audio chunks for given text. Each chunk: Float32Array at engine's native rate. */
|
|
762
|
+
stream(text: string, options?: TTSStreamOptions): AsyncGenerator<TTSChunk>;
|
|
763
|
+
/** Engine's native output sample rate (e.g., 24000 for Kokoro). */
|
|
764
|
+
readonly sampleRate: number;
|
|
765
|
+
/** Load model if not already loaded. */
|
|
766
|
+
load(): Promise<unknown>;
|
|
767
|
+
/** Whether model is loaded and ready. */
|
|
768
|
+
readonly isLoaded: boolean;
|
|
769
|
+
/** Release resources. */
|
|
770
|
+
dispose(): Promise<void>;
|
|
771
|
+
}
|
|
772
|
+
/**
|
|
773
|
+
* Options for TTSBackend.stream()
|
|
774
|
+
*/
|
|
775
|
+
interface TTSStreamOptions {
|
|
776
|
+
/** Abort signal for cancellation */
|
|
777
|
+
signal?: AbortSignal;
|
|
778
|
+
/** Voice override per-call */
|
|
779
|
+
voice?: string;
|
|
780
|
+
/** Speed multiplier override per-call */
|
|
781
|
+
speed?: number;
|
|
782
|
+
}
|
|
783
|
+
/**
|
|
784
|
+
* A single chunk of TTS audio output
|
|
785
|
+
*/
|
|
786
|
+
interface TTSChunk {
|
|
787
|
+
/** Audio samples at engine's native sample rate */
|
|
788
|
+
audio: Float32Array;
|
|
789
|
+
/** Duration in seconds */
|
|
790
|
+
duration: number;
|
|
791
|
+
/** Sentence/segment text that produced this audio */
|
|
792
|
+
text?: string;
|
|
793
|
+
}
|
|
980
794
|
|
|
981
795
|
/**
|
|
982
|
-
*
|
|
796
|
+
* TTSPlayback — Composes TTSBackend + PlaybackPipeline for text → lip sync.
|
|
983
797
|
*
|
|
984
|
-
*
|
|
985
|
-
*
|
|
798
|
+
* Handles format conversion (Float32 @ TTS rate → PCM16 @ 16kHz)
|
|
799
|
+
* and sentence prefetch for gapless playback.
|
|
986
800
|
*
|
|
987
|
-
* @
|
|
801
|
+
* @category Audio
|
|
802
|
+
*/
|
|
803
|
+
|
|
804
|
+
interface TTSPlaybackConfig {
|
|
805
|
+
/** TTS backend (e.g., KokoroTTSInference) */
|
|
806
|
+
tts: TTSBackend;
|
|
807
|
+
/** A2E inference backend (from createA2E) */
|
|
808
|
+
lam: A2EBackend;
|
|
809
|
+
/** Per-character expression weight scaling */
|
|
810
|
+
profile?: ExpressionProfile;
|
|
811
|
+
/** Prefetch next sentence while current plays. Default: true */
|
|
812
|
+
prefetch?: boolean;
|
|
813
|
+
/** Identity/style index for A2E model (default: 0) */
|
|
814
|
+
identityIndex?: number;
|
|
815
|
+
/** Audio playback delay in ms */
|
|
816
|
+
audioDelayMs?: number;
|
|
817
|
+
/** Enable neutral transition on playback complete */
|
|
818
|
+
neutralTransitionEnabled?: boolean;
|
|
819
|
+
/** Duration of neutral fade-out in ms */
|
|
820
|
+
neutralTransitionMs?: number;
|
|
821
|
+
}
|
|
822
|
+
interface TTSPlaybackEvents {
|
|
823
|
+
/** New frame ready for display */
|
|
824
|
+
'frame': FullFaceFrame;
|
|
825
|
+
/** Raw A2E frame */
|
|
826
|
+
'frame:raw': Float32Array;
|
|
827
|
+
/** Playback started */
|
|
828
|
+
'playback:start': {
|
|
829
|
+
time: number;
|
|
830
|
+
};
|
|
831
|
+
/** Playback completed */
|
|
832
|
+
'playback:complete': void;
|
|
833
|
+
/** Playback stopped (user-initiated) */
|
|
834
|
+
'playback:stop': void;
|
|
835
|
+
/** Error */
|
|
836
|
+
'error': Error;
|
|
837
|
+
[key: string]: unknown;
|
|
838
|
+
}
|
|
839
|
+
declare class TTSPlayback extends EventEmitter<TTSPlaybackEvents> {
|
|
840
|
+
private readonly config;
|
|
841
|
+
private _pipeline;
|
|
842
|
+
private initialized;
|
|
843
|
+
constructor(config: TTSPlaybackConfig);
|
|
844
|
+
/** Access underlying PlaybackPipeline for event subscriptions. */
|
|
845
|
+
get pipeline(): PlaybackPipeline | null;
|
|
846
|
+
/** Load TTS model + initialize PlaybackPipeline. */
|
|
847
|
+
initialize(): Promise<void>;
|
|
848
|
+
/**
|
|
849
|
+
* Synthesize text and play with lip sync.
|
|
850
|
+
* Streams sentences with prefetch for minimal gaps.
|
|
851
|
+
*
|
|
852
|
+
* @returns Resolves when playback completes
|
|
853
|
+
*/
|
|
854
|
+
speak(text: string, options?: {
|
|
855
|
+
signal?: AbortSignal;
|
|
856
|
+
voice?: string;
|
|
857
|
+
}): Promise<void>;
|
|
858
|
+
/** Dispose of all resources. */
|
|
859
|
+
dispose(): Promise<void>;
|
|
860
|
+
private speakWithPrefetch;
|
|
861
|
+
private speakSequential;
|
|
862
|
+
}
|
|
863
|
+
|
|
864
|
+
/**
|
|
865
|
+
* Lazy ONNX Runtime loader with conditional WebGPU/WASM bundle loading
|
|
866
|
+
*
|
|
867
|
+
* This module provides a way to dynamically load the appropriate ONNX Runtime bundle
|
|
868
|
+
* based on the platform's capabilities. This is critical for iOS support because:
|
|
869
|
+
*
|
|
870
|
+
* 1. iOS Safari has WebGPU API but ONNX Runtime's WebGPU backend crashes
|
|
871
|
+
* 2. Loading the WebGPU bundle wastes bandwidth and can cause issues
|
|
872
|
+
* 3. WASM-only bundle is smaller and more reliable on iOS
|
|
873
|
+
*
|
|
874
|
+
* Usage:
|
|
875
|
+
* ```typescript
|
|
876
|
+
* const ort = await getOnnxRuntime('wasm'); // Load WASM-only bundle
|
|
877
|
+
* const ort = await getOnnxRuntime('webgpu'); // Load WebGPU bundle (includes WASM)
|
|
878
|
+
* ```
|
|
879
|
+
*
|
|
880
|
+
* @module inference/onnxLoader
|
|
881
|
+
*/
|
|
882
|
+
|
|
883
|
+
/**
|
|
884
|
+
* Check if WebGPU is available and likely to work
|
|
885
|
+
*
|
|
886
|
+
* This is more thorough than just checking navigator.gpu exists.
|
|
887
|
+
* It actually requests an adapter to verify the GPU is accessible.
|
|
888
|
+
*
|
|
889
|
+
* @returns true if WebGPU is available and working
|
|
988
890
|
*/
|
|
989
891
|
declare function isWebGPUAvailable(): Promise<boolean>;
|
|
990
892
|
|
|
@@ -1082,116 +984,6 @@ declare class SenseVoiceInference {
|
|
|
1082
984
|
dispose(): Promise<void>;
|
|
1083
985
|
}
|
|
1084
986
|
|
|
1085
|
-
/**
|
|
1086
|
-
* SenseVoice ASR Web Worker implementation
|
|
1087
|
-
*
|
|
1088
|
-
* Runs SenseVoice speech recognition in a dedicated Web Worker to prevent
|
|
1089
|
-
* main thread blocking. Uses inline worker script (Blob URL pattern) to
|
|
1090
|
-
* avoid separate file deployment.
|
|
1091
|
-
*
|
|
1092
|
-
* Key design decisions:
|
|
1093
|
-
* - WASM backend only (WebGPU doesn't work in Workers)
|
|
1094
|
-
* - All preprocessing (fbank, LFR, CMVN) and CTC decoding inlined in worker
|
|
1095
|
-
* - Audio copied (not transferred) to retain main thread access
|
|
1096
|
-
* - ONNX Runtime loaded from CDN in worker (no bundler complications)
|
|
1097
|
-
* - iOS: model URL passed as string to ORT (avoids 239MB JS heap allocation)
|
|
1098
|
-
*
|
|
1099
|
-
* @category Inference
|
|
1100
|
-
*
|
|
1101
|
-
* @example Basic usage
|
|
1102
|
-
* ```typescript
|
|
1103
|
-
* import { SenseVoiceWorker } from '@omote/core';
|
|
1104
|
-
*
|
|
1105
|
-
* const asr = new SenseVoiceWorker({
|
|
1106
|
-
* modelUrl: '/models/sensevoice/model.int8.onnx',
|
|
1107
|
-
* tokensUrl: '/models/sensevoice/tokens.txt',
|
|
1108
|
-
* });
|
|
1109
|
-
* await asr.load();
|
|
1110
|
-
*
|
|
1111
|
-
* const { text, emotion, language } = await asr.transcribe(audioSamples);
|
|
1112
|
-
* console.log(text); // "Hello world"
|
|
1113
|
-
* console.log(emotion); // "NEUTRAL"
|
|
1114
|
-
* console.log(language); // "en"
|
|
1115
|
-
* ```
|
|
1116
|
-
*/
|
|
1117
|
-
|
|
1118
|
-
/**
|
|
1119
|
-
* Configuration for SenseVoice Worker
|
|
1120
|
-
*/
|
|
1121
|
-
interface SenseVoiceWorkerConfig {
|
|
1122
|
-
/** Path or URL to model.int8.onnx (239MB) */
|
|
1123
|
-
modelUrl: string;
|
|
1124
|
-
/** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
|
|
1125
|
-
tokensUrl?: string;
|
|
1126
|
-
/** Language hint (default: 'auto' for auto-detection) */
|
|
1127
|
-
language?: SenseVoiceLanguage;
|
|
1128
|
-
/** Text normalization: 'with_itn' applies inverse text normalization (default: 'with_itn') */
|
|
1129
|
-
textNorm?: 'with_itn' | 'without_itn';
|
|
1130
|
-
}
|
|
1131
|
-
/**
|
|
1132
|
-
* SenseVoice ASR Worker - Speech Recognition in a Web Worker
|
|
1133
|
-
*
|
|
1134
|
-
* Runs SenseVoice inference off the main thread to prevent UI blocking.
|
|
1135
|
-
* All preprocessing (fbank, LFR, CMVN) and CTC decoding run in the worker.
|
|
1136
|
-
*
|
|
1137
|
-
* @see SenseVoiceInference for main-thread version
|
|
1138
|
-
*/
|
|
1139
|
-
declare class SenseVoiceWorker {
|
|
1140
|
-
private worker;
|
|
1141
|
-
private config;
|
|
1142
|
-
private isLoading;
|
|
1143
|
-
private _isLoaded;
|
|
1144
|
-
private inferenceQueue;
|
|
1145
|
-
private poisoned;
|
|
1146
|
-
private pendingResolvers;
|
|
1147
|
-
private languageId;
|
|
1148
|
-
private textNormId;
|
|
1149
|
-
constructor(config: SenseVoiceWorkerConfig);
|
|
1150
|
-
get isLoaded(): boolean;
|
|
1151
|
-
/**
|
|
1152
|
-
* Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
|
|
1153
|
-
*/
|
|
1154
|
-
get backend(): 'wasm' | null;
|
|
1155
|
-
/**
|
|
1156
|
-
* Create the worker from inline script
|
|
1157
|
-
*/
|
|
1158
|
-
private createWorker;
|
|
1159
|
-
/**
|
|
1160
|
-
* Handle messages from worker
|
|
1161
|
-
*/
|
|
1162
|
-
private handleWorkerMessage;
|
|
1163
|
-
/**
|
|
1164
|
-
* Send message to worker and wait for response
|
|
1165
|
-
*/
|
|
1166
|
-
private sendMessage;
|
|
1167
|
-
/**
|
|
1168
|
-
* Load the ONNX model in the worker
|
|
1169
|
-
*
|
|
1170
|
-
* @param onProgress - Optional progress callback. Fires once at 100% when load completes
|
|
1171
|
-
* (the worker downloads and loads the model internally, so granular progress is not available).
|
|
1172
|
-
*/
|
|
1173
|
-
load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
|
|
1174
|
-
/**
|
|
1175
|
-
* Transcribe audio samples to text
|
|
1176
|
-
*
|
|
1177
|
-
* @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
|
|
1178
|
-
* @returns Transcription result with text, emotion, language, and event
|
|
1179
|
-
*/
|
|
1180
|
-
transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
|
|
1181
|
-
/**
|
|
1182
|
-
* Queue inference to serialize worker calls
|
|
1183
|
-
*/
|
|
1184
|
-
private queueInference;
|
|
1185
|
-
/**
|
|
1186
|
-
* Dispose of the worker and free resources
|
|
1187
|
-
*/
|
|
1188
|
-
dispose(): Promise<void>;
|
|
1189
|
-
/**
|
|
1190
|
-
* Check if Web Workers are supported
|
|
1191
|
-
*/
|
|
1192
|
-
static isSupported(): boolean;
|
|
1193
|
-
}
|
|
1194
|
-
|
|
1195
987
|
/**
|
|
1196
988
|
* Silero VAD (Voice Activity Detection) inference
|
|
1197
989
|
*
|
|
@@ -1459,6 +1251,7 @@ declare class SileroVADWorker {
|
|
|
1459
1251
|
private config;
|
|
1460
1252
|
private isLoading;
|
|
1461
1253
|
private _isLoaded;
|
|
1254
|
+
private poisoned;
|
|
1462
1255
|
private state;
|
|
1463
1256
|
private context;
|
|
1464
1257
|
private readonly chunkSize;
|
|
@@ -1526,426 +1319,395 @@ declare class SileroVADWorker {
|
|
|
1526
1319
|
}
|
|
1527
1320
|
|
|
1528
1321
|
/**
|
|
1529
|
-
*
|
|
1530
|
-
*
|
|
1531
|
-
* Provides a unified API that automatically selects the optimal implementation:
|
|
1532
|
-
* - Desktop browsers: Uses SileroVADWorker (off-main-thread inference)
|
|
1533
|
-
* - Mobile devices: Uses SileroVADInference (main thread, avoids memory overhead)
|
|
1534
|
-
* - Fallback: Gracefully falls back to main thread if Worker fails
|
|
1322
|
+
* Unified Inference Worker — single Web Worker hosting all ONNX models
|
|
1535
1323
|
*
|
|
1536
|
-
*
|
|
1537
|
-
*
|
|
1538
|
-
* @example Basic usage (auto-detect)
|
|
1539
|
-
* ```typescript
|
|
1540
|
-
* import { createSileroVAD } from '@omote/core';
|
|
1324
|
+
* Runs all model loading and inference off the main thread, preventing
|
|
1325
|
+
* InferenceSession.create() from blocking the renderer (5-30s).
|
|
1541
1326
|
*
|
|
1542
|
-
*
|
|
1543
|
-
*
|
|
1544
|
-
* threshold: 0.5,
|
|
1545
|
-
* });
|
|
1327
|
+
* Uses WebGPU when available (Chrome/Edge 113+), falls back to WASM.
|
|
1328
|
+
* On iOS, uses a single WASM instance to stay within the ~1-1.5GB tab limit.
|
|
1546
1329
|
*
|
|
1547
|
-
*
|
|
1548
|
-
*
|
|
1549
|
-
* if (result.isSpeech) {
|
|
1550
|
-
* console.log('Speech detected!', result.probability);
|
|
1551
|
-
* }
|
|
1552
|
-
* ```
|
|
1330
|
+
* This worker hosts SenseVoice + A2E + Silero VAD + Kokoro TTS in a single
|
|
1331
|
+
* ORT instance. Same total model memory, but inference runs off-main-thread.
|
|
1553
1332
|
*
|
|
1554
|
-
*
|
|
1333
|
+
* Consumer usage:
|
|
1555
1334
|
* ```typescript
|
|
1556
|
-
* const
|
|
1557
|
-
*
|
|
1558
|
-
* useWorker: true, // Force Worker even on mobile
|
|
1559
|
-
* });
|
|
1560
|
-
* ```
|
|
1335
|
+
* const worker = new UnifiedInferenceWorker();
|
|
1336
|
+
* await worker.init();
|
|
1561
1337
|
*
|
|
1562
|
-
*
|
|
1563
|
-
*
|
|
1564
|
-
* const vad = createSileroVAD({
|
|
1565
|
-
* modelUrl: '/models/silero-vad.onnx',
|
|
1566
|
-
* useWorker: false, // Force main thread
|
|
1567
|
-
* });
|
|
1338
|
+
* const asr = createSenseVoice({ modelUrl: '...', unifiedWorker: worker });
|
|
1339
|
+
* const lam = createA2E({ modelUrl: '...', unifiedWorker: worker });
|
|
1340
|
+
* const vad = createSileroVAD({ modelUrl: '...', unifiedWorker: worker });
|
|
1568
1341
|
* ```
|
|
1342
|
+
*
|
|
1343
|
+
* @category Inference
|
|
1569
1344
|
*/
|
|
1570
1345
|
|
|
1346
|
+
/** Health state of the unified worker */
|
|
1347
|
+
type WorkerHealthState = 'healthy' | 'unhealthy' | 'recovering';
|
|
1571
1348
|
/**
|
|
1572
|
-
*
|
|
1349
|
+
* Unified Inference Worker — single Web Worker for all ONNX models
|
|
1573
1350
|
*
|
|
1574
|
-
*
|
|
1575
|
-
*
|
|
1351
|
+
* Hosts SenseVoice, A2E (LAM), Kokoro TTS, and Silero VAD in one ORT instance.
|
|
1352
|
+
* Uses WebGPU on Chrome/Edge 113+, falls back to WASM on Safari/iOS/Firefox.
|
|
1353
|
+
* All model loading and inference runs off the main thread.
|
|
1576
1354
|
*/
|
|
1577
|
-
|
|
1578
|
-
|
|
1579
|
-
|
|
1580
|
-
|
|
1581
|
-
|
|
1582
|
-
|
|
1583
|
-
|
|
1584
|
-
|
|
1585
|
-
|
|
1586
|
-
/**
|
|
1587
|
-
* Load the ONNX model
|
|
1588
|
-
* @returns Model loading information
|
|
1589
|
-
*/
|
|
1590
|
-
load(): Promise<VADModelInfo | VADWorkerModelInfo>;
|
|
1591
|
-
/**
|
|
1592
|
-
* Process a single audio chunk
|
|
1593
|
-
* @param audioChunk - Float32Array of exactly chunkSize samples
|
|
1594
|
-
* @returns VAD result with speech probability
|
|
1595
|
-
*/
|
|
1596
|
-
process(audioChunk: Float32Array): Promise<VADResult>;
|
|
1597
|
-
/**
|
|
1598
|
-
* Reset state for new audio stream
|
|
1599
|
-
*/
|
|
1600
|
-
reset(): void | Promise<void>;
|
|
1601
|
-
/**
|
|
1602
|
-
* Dispose of the model and free resources
|
|
1603
|
-
*/
|
|
1604
|
-
dispose(): Promise<void>;
|
|
1605
|
-
/**
|
|
1606
|
-
* Get required chunk size in samples
|
|
1607
|
-
*/
|
|
1608
|
-
getChunkSize(): number;
|
|
1355
|
+
declare class UnifiedInferenceWorker {
|
|
1356
|
+
private worker;
|
|
1357
|
+
private pendingRequests;
|
|
1358
|
+
private initialized;
|
|
1359
|
+
private healthState;
|
|
1360
|
+
private consecutiveFailures;
|
|
1361
|
+
private _generation;
|
|
1362
|
+
private recovering;
|
|
1363
|
+
private _workerBackend;
|
|
1609
1364
|
/**
|
|
1610
|
-
*
|
|
1365
|
+
* Initialize the worker (load ORT WASM from CDN)
|
|
1611
1366
|
*/
|
|
1612
|
-
|
|
1613
|
-
|
|
1614
|
-
|
|
1615
|
-
|
|
1616
|
-
|
|
1617
|
-
|
|
1618
|
-
|
|
1619
|
-
|
|
1620
|
-
|
|
1621
|
-
|
|
1367
|
+
init(): Promise<void>;
|
|
1368
|
+
loadSenseVoice(config: {
|
|
1369
|
+
modelUrl: string;
|
|
1370
|
+
tokensUrl: string;
|
|
1371
|
+
language: number;
|
|
1372
|
+
textNorm: number;
|
|
1373
|
+
}): Promise<SenseVoiceModelInfo>;
|
|
1374
|
+
transcribe(audio: Float32Array): Promise<SenseVoiceResult>;
|
|
1375
|
+
disposeSenseVoice(): Promise<void>;
|
|
1376
|
+
loadLAM(config: {
|
|
1377
|
+
modelUrl: string;
|
|
1378
|
+
externalDataUrl: string | null;
|
|
1379
|
+
numIdentityClasses?: number;
|
|
1380
|
+
}): Promise<A2EModelInfo>;
|
|
1381
|
+
inferLAM(audio: Float32Array, identityIndex?: number): Promise<{
|
|
1382
|
+
blendshapes: Float32Array;
|
|
1383
|
+
numFrames: number;
|
|
1384
|
+
numBlendshapes: number;
|
|
1385
|
+
inferenceTimeMs: number;
|
|
1386
|
+
}>;
|
|
1387
|
+
disposeLAM(): Promise<void>;
|
|
1388
|
+
loadKokoro(config: {
|
|
1389
|
+
modelUrl: string;
|
|
1390
|
+
}): Promise<{
|
|
1391
|
+
loadTimeMs: number;
|
|
1392
|
+
}>;
|
|
1393
|
+
inferKokoro(tokens: number[], style: Float32Array, speed: number): Promise<{
|
|
1394
|
+
audio: Float32Array;
|
|
1395
|
+
inferenceTimeMs: number;
|
|
1396
|
+
}>;
|
|
1397
|
+
disposeKokoro(): Promise<void>;
|
|
1398
|
+
loadVAD(config: {
|
|
1399
|
+
modelUrl: string;
|
|
1400
|
+
sampleRate: number;
|
|
1401
|
+
}): Promise<VADWorkerModelInfo>;
|
|
1402
|
+
processVAD(audio: Float32Array, state: Float32Array, context: Float32Array): Promise<{
|
|
1403
|
+
probability: number;
|
|
1404
|
+
state: Float32Array;
|
|
1405
|
+
inferenceTimeMs: number;
|
|
1406
|
+
}>;
|
|
1407
|
+
resetVAD(): Promise<Float32Array>;
|
|
1408
|
+
disposeVAD(): Promise<void>;
|
|
1409
|
+
dispose(): Promise<void>;
|
|
1410
|
+
/** Check if the worker is initialized and healthy */
|
|
1411
|
+
get isReady(): boolean;
|
|
1412
|
+
/** Current health state of the worker */
|
|
1413
|
+
get health(): WorkerHealthState;
|
|
1414
|
+
/** Generation counter — increments on worker recovery. Adapters compare to detect stale sessions. */
|
|
1415
|
+
get workerGeneration(): number;
|
|
1416
|
+
/** The ORT backend the worker is using ('webgpu' on Chrome/Edge, 'wasm' on Safari/iOS/Firefox) */
|
|
1417
|
+
get backend(): 'wasm' | 'webgpu';
|
|
1418
|
+
/** Check if Web Workers are supported */
|
|
1419
|
+
static isSupported(): boolean;
|
|
1420
|
+
private assertReady;
|
|
1421
|
+
private createWorker;
|
|
1422
|
+
private handleWorkerMessage;
|
|
1423
|
+
private sendMessage;
|
|
1622
1424
|
/**
|
|
1623
|
-
*
|
|
1624
|
-
*
|
|
1625
|
-
* Auto-detection behavior:
|
|
1626
|
-
* - Desktop: Uses Worker (better responsiveness, off-main-thread)
|
|
1627
|
-
* - Mobile: Uses main thread (avoids 5MB memory overhead)
|
|
1628
|
-
*
|
|
1629
|
-
* You can override this to:
|
|
1630
|
-
* - `true`: Force Worker even on mobile (if you have memory headroom)
|
|
1631
|
-
* - `false`: Force main thread even on desktop (for debugging)
|
|
1632
|
-
*
|
|
1633
|
-
* Default: undefined (auto-detect)
|
|
1425
|
+
* Ping the worker to check if it's alive. If ping succeeds, worker was just
|
|
1426
|
+
* busy with long inference. If ping fails, worker is truly stuck — recover.
|
|
1634
1427
|
*/
|
|
1635
|
-
|
|
1428
|
+
private runHealthCheck;
|
|
1636
1429
|
/**
|
|
1637
|
-
*
|
|
1638
|
-
*
|
|
1639
|
-
* When true (default), if the Worker fails to load or encounters an error,
|
|
1640
|
-
* the factory will automatically create a main thread instance instead.
|
|
1641
|
-
*
|
|
1642
|
-
* When false, worker errors will propagate as exceptions.
|
|
1643
|
-
*
|
|
1644
|
-
* Default: true
|
|
1430
|
+
* Terminate the stuck worker, create a new one, and re-initialize ORT.
|
|
1431
|
+
* Model sessions are lost — adapters must reload via generation check.
|
|
1645
1432
|
*/
|
|
1646
|
-
|
|
1433
|
+
private recoverWorker;
|
|
1434
|
+
private rejectAllPending;
|
|
1435
|
+
private cleanup;
|
|
1436
|
+
}
|
|
1437
|
+
|
|
1438
|
+
/**
|
|
1439
|
+
* Shared base config for all inference factory functions.
|
|
1440
|
+
*
|
|
1441
|
+
* @category Inference
|
|
1442
|
+
*/
|
|
1443
|
+
|
|
1444
|
+
/** Base config shared across all inference factory functions */
|
|
1445
|
+
interface InferenceFactoryConfig {
|
|
1446
|
+
/**
|
|
1447
|
+
* Worker mode:
|
|
1448
|
+
* - 'auto' (default): Use Worker if supported, else main thread
|
|
1449
|
+
* - true: Force Worker (throws if unsupported)
|
|
1450
|
+
* - false: Force main thread
|
|
1451
|
+
*/
|
|
1452
|
+
useWorker?: boolean | 'auto';
|
|
1647
1453
|
/**
|
|
1648
1454
|
* Unified inference worker instance.
|
|
1649
|
-
* When provided,
|
|
1455
|
+
* When provided, routes inference through the shared worker,
|
|
1456
|
+
* keeping all inference off the main thread.
|
|
1650
1457
|
* Takes precedence over useWorker setting.
|
|
1651
1458
|
*/
|
|
1652
1459
|
unifiedWorker?: UnifiedInferenceWorker;
|
|
1653
1460
|
}
|
|
1654
|
-
/**
|
|
1655
|
-
* Check if the current environment supports VAD Web Workers
|
|
1656
|
-
*
|
|
1657
|
-
* Requirements:
|
|
1658
|
-
* - Worker constructor must exist
|
|
1659
|
-
* - Blob URL support (for inline worker script)
|
|
1660
|
-
*
|
|
1661
|
-
* @returns true if VAD Worker is supported
|
|
1662
|
-
*/
|
|
1663
|
-
declare function supportsVADWorker(): boolean;
|
|
1664
|
-
/**
|
|
1665
|
-
* Create a Silero VAD instance with automatic implementation selection
|
|
1666
|
-
*
|
|
1667
|
-
* This factory function automatically selects between:
|
|
1668
|
-
* - **SileroVADWorker**: Off-main-thread inference (better for desktop)
|
|
1669
|
-
* - **SileroVADInference**: Main thread inference (better for mobile)
|
|
1670
|
-
*
|
|
1671
|
-
* The selection is based on:
|
|
1672
|
-
* 1. Explicit `useWorker` config (if provided)
|
|
1673
|
-
* 2. Platform detection (mobile vs desktop)
|
|
1674
|
-
* 3. Worker API availability
|
|
1675
|
-
*
|
|
1676
|
-
* Both implementations share the same interface (SileroVADBackend),
|
|
1677
|
-
* so consumers can use either interchangeably.
|
|
1678
|
-
*
|
|
1679
|
-
* @param config - Factory configuration
|
|
1680
|
-
* @returns A SileroVAD instance (either Worker or main thread)
|
|
1681
|
-
*
|
|
1682
|
-
* @example
|
|
1683
|
-
* ```typescript
|
|
1684
|
-
* // Auto-detect (recommended)
|
|
1685
|
-
* const vad = createSileroVAD({ modelUrl: '/models/silero-vad.onnx' });
|
|
1686
|
-
*
|
|
1687
|
-
* // Force Worker
|
|
1688
|
-
* const vadWorker = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: true });
|
|
1689
|
-
*
|
|
1690
|
-
* // Force main thread
|
|
1691
|
-
* const vadMain = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: false });
|
|
1692
|
-
* ```
|
|
1693
|
-
*/
|
|
1694
|
-
declare function createSileroVAD(config?: SileroVADFactoryConfig): SileroVADBackend;
|
|
1695
1461
|
|
|
1696
1462
|
/**
|
|
1697
|
-
*
|
|
1698
|
-
*
|
|
1699
|
-
* Runs wav2arkit_cpu inference in a dedicated Web Worker to prevent main thread blocking.
|
|
1700
|
-
* Uses inline worker script (Blob URL pattern) to avoid separate file deployment.
|
|
1463
|
+
* Factory function for A2E inference
|
|
1701
1464
|
*
|
|
1702
|
-
*
|
|
1703
|
-
*
|
|
1704
|
-
* - Audio copied (not transferred) to retain main thread access
|
|
1705
|
-
* - ONNX Runtime loaded from CDN in worker (no bundler complications)
|
|
1706
|
-
* - Blendshape symmetrization inlined in worker (no module imports)
|
|
1707
|
-
* - iOS: passes model URLs as strings directly to ORT (avoids 400MB+ JS heap)
|
|
1465
|
+
* Creates an A2EBackend instance with zero-config defaults (HuggingFace CDN).
|
|
1466
|
+
* Supports unified worker mode for iOS off-main-thread inference.
|
|
1708
1467
|
*
|
|
1709
1468
|
* @category Inference
|
|
1710
1469
|
*
|
|
1711
|
-
* @example
|
|
1470
|
+
* @example Auto-detect (recommended, zero-config)
|
|
1712
1471
|
* ```typescript
|
|
1713
|
-
* import {
|
|
1472
|
+
* import { createA2E } from '@omote/core';
|
|
1714
1473
|
*
|
|
1715
|
-
* const
|
|
1716
|
-
*
|
|
1717
|
-
* });
|
|
1718
|
-
*
|
|
1474
|
+
* const a2e = createA2E(); // uses HF CDN defaults (192MB fp16)
|
|
1475
|
+
* await a2e.load();
|
|
1476
|
+
* const { blendshapes } = await a2e.infer(audioSamples);
|
|
1477
|
+
* ```
|
|
1719
1478
|
*
|
|
1720
|
-
*
|
|
1721
|
-
*
|
|
1479
|
+
* @example Custom model URL
|
|
1480
|
+
* ```typescript
|
|
1481
|
+
* const a2e = createA2E({ modelUrl: '/models/lam.onnx' });
|
|
1722
1482
|
* ```
|
|
1723
1483
|
*/
|
|
1724
1484
|
|
|
1725
1485
|
/**
|
|
1726
|
-
* Configuration for
|
|
1486
|
+
* Configuration for the A2E factory
|
|
1727
1487
|
*/
|
|
1728
|
-
interface
|
|
1729
|
-
/**
|
|
1730
|
-
modelUrl
|
|
1488
|
+
interface CreateA2EConfig extends InferenceFactoryConfig {
|
|
1489
|
+
/** URL for the ONNX model. Default: HuggingFace CDN */
|
|
1490
|
+
modelUrl?: string;
|
|
1731
1491
|
/**
|
|
1732
|
-
*
|
|
1733
|
-
* Default: `${modelUrl}.data`
|
|
1492
|
+
* URL for external model data file (.onnx.data weights).
|
|
1493
|
+
* Default: `${modelUrl}.data`
|
|
1734
1494
|
*
|
|
1735
1495
|
* Set to `false` to skip external data loading (single-file models only).
|
|
1736
1496
|
*/
|
|
1737
1497
|
externalDataUrl?: string | false;
|
|
1498
|
+
/** Backend preference (default: 'auto') */
|
|
1499
|
+
backend?: BackendPreference;
|
|
1500
|
+
/** Number of identity classes (default: 12) */
|
|
1501
|
+
numIdentityClasses?: number;
|
|
1738
1502
|
}
|
|
1739
1503
|
/**
|
|
1740
|
-
*
|
|
1504
|
+
* Create an A2E instance
|
|
1741
1505
|
*
|
|
1742
|
-
*
|
|
1743
|
-
*
|
|
1506
|
+
* @param config - Factory configuration
|
|
1507
|
+
* @returns An A2EBackend instance
|
|
1508
|
+
*/
|
|
1509
|
+
declare function createA2E(config?: CreateA2EConfig): A2EBackend;
|
|
1510
|
+
|
|
1511
|
+
/**
|
|
1512
|
+
* Shared types for orchestration layer
|
|
1744
1513
|
*
|
|
1745
|
-
* @
|
|
1514
|
+
* @category Orchestration
|
|
1746
1515
|
*/
|
|
1747
|
-
|
|
1748
|
-
|
|
1749
|
-
|
|
1750
|
-
|
|
1751
|
-
|
|
1752
|
-
|
|
1753
|
-
|
|
1754
|
-
|
|
1755
|
-
|
|
1756
|
-
|
|
1757
|
-
|
|
1758
|
-
|
|
1759
|
-
|
|
1760
|
-
|
|
1761
|
-
|
|
1762
|
-
|
|
1763
|
-
|
|
1764
|
-
|
|
1765
|
-
|
|
1766
|
-
|
|
1767
|
-
|
|
1768
|
-
|
|
1769
|
-
|
|
1770
|
-
|
|
1771
|
-
|
|
1772
|
-
|
|
1773
|
-
|
|
1774
|
-
|
|
1775
|
-
|
|
1776
|
-
|
|
1777
|
-
|
|
1778
|
-
|
|
1516
|
+
|
|
1517
|
+
/**
|
|
1518
|
+
* Generic frame source -- any object that emits 'frame' events with blendshapes.
|
|
1519
|
+
*
|
|
1520
|
+
* Implemented by PlaybackPipeline, MicLipSync, VoicePipeline, and any custom source.
|
|
1521
|
+
* Used by OmoteAvatar (all renderer adapters) to receive animation frames.
|
|
1522
|
+
*/
|
|
1523
|
+
interface FrameSource {
|
|
1524
|
+
on(event: 'frame', callback: (frame: {
|
|
1525
|
+
blendshapes: Float32Array;
|
|
1526
|
+
emotion?: string;
|
|
1527
|
+
}) => void): void;
|
|
1528
|
+
off?(event: 'frame', callback: (...args: any[]) => void): void;
|
|
1529
|
+
}
|
|
1530
|
+
type VoicePipelineState = 'idle' | 'loading' | 'ready' | 'listening' | 'thinking' | 'speaking' | 'error';
|
|
1531
|
+
interface LoadingProgress {
|
|
1532
|
+
currentModel: string;
|
|
1533
|
+
progress: number;
|
|
1534
|
+
totalModels: number;
|
|
1535
|
+
modelsLoaded: number;
|
|
1536
|
+
}
|
|
1537
|
+
interface TranscriptResult {
|
|
1538
|
+
text: string;
|
|
1539
|
+
emotion?: string;
|
|
1540
|
+
language?: string;
|
|
1541
|
+
event?: string;
|
|
1542
|
+
isFinal: boolean;
|
|
1543
|
+
inferenceTimeMs?: number;
|
|
1544
|
+
}
|
|
1545
|
+
/**
|
|
1546
|
+
* Consumer's response handler. VoicePipeline calls this with transcribed text.
|
|
1547
|
+
* Consumer must stream audio back for playback + lip sync.
|
|
1548
|
+
*/
|
|
1549
|
+
interface ResponseHandler {
|
|
1550
|
+
(params: {
|
|
1551
|
+
text: string;
|
|
1552
|
+
emotion?: string;
|
|
1553
|
+
event?: string;
|
|
1554
|
+
/** Set avatar emotion during response streaming (e.g., from LLM emotion_update messages) */
|
|
1555
|
+
setEmotion?: (emotion: string) => void;
|
|
1556
|
+
/** Stream audio chunks to pipeline for playback + lip sync */
|
|
1557
|
+
send: (chunk: Uint8Array) => Promise<void>;
|
|
1558
|
+
/** Call when all audio has been sent */
|
|
1559
|
+
done: () => Promise<void>;
|
|
1560
|
+
/** Aborted on interruption or stop() */
|
|
1561
|
+
signal: AbortSignal;
|
|
1562
|
+
/** Session ID for backend correlation */
|
|
1563
|
+
sessionId: string;
|
|
1564
|
+
}): Promise<void>;
|
|
1565
|
+
}
|
|
1566
|
+
|
|
1567
|
+
/**
|
|
1568
|
+
* TTSSpeaker — Shared helper for OmoteAvatar TTS integration.
|
|
1569
|
+
*
|
|
1570
|
+
* Encapsulates createA2E + TTSPlayback lifecycle so that renderer adapters
|
|
1571
|
+
* (Three.js, Babylon.js) and the R3F hook can delegate with ~15 lines each.
|
|
1572
|
+
*
|
|
1573
|
+
* @category Audio
|
|
1574
|
+
*/
|
|
1575
|
+
|
|
1576
|
+
interface TTSSpeakerConfig {
|
|
1577
|
+
/** Per-character expression weight scaling */
|
|
1578
|
+
profile?: ExpressionProfile;
|
|
1579
|
+
/** Identity/style index for A2E model (default: 0) */
|
|
1580
|
+
identityIndex?: number;
|
|
1581
|
+
/** Audio playback delay in ms */
|
|
1582
|
+
audioDelayMs?: number;
|
|
1583
|
+
/** Enable neutral transition on playback complete */
|
|
1584
|
+
neutralTransitionEnabled?: boolean;
|
|
1585
|
+
/** Duration of neutral fade-out in ms */
|
|
1586
|
+
neutralTransitionMs?: number;
|
|
1587
|
+
/** Pre-built A2E backend (skip internal createA2E). */
|
|
1588
|
+
lam?: A2EBackend;
|
|
1589
|
+
/** LAM model config (only when lam not provided) */
|
|
1590
|
+
models?: CreateA2EConfig;
|
|
1591
|
+
/** Shared unified worker (recommended for iOS) */
|
|
1592
|
+
unifiedWorker?: UnifiedInferenceWorker;
|
|
1593
|
+
}
|
|
1594
|
+
declare class TTSSpeaker {
|
|
1595
|
+
private ttsPlayback;
|
|
1596
|
+
private tts;
|
|
1597
|
+
private ownedLam;
|
|
1598
|
+
private ownedWorker;
|
|
1599
|
+
private currentAbort;
|
|
1600
|
+
private _isSpeaking;
|
|
1601
|
+
private _audioOnly;
|
|
1602
|
+
private scheduler;
|
|
1603
|
+
/** Whether the speaker is currently playing audio. */
|
|
1604
|
+
get isSpeaking(): boolean;
|
|
1605
|
+
/** Whether this speaker is in audio-only mode (no lip sync). */
|
|
1606
|
+
get audioOnly(): boolean;
|
|
1607
|
+
/** The internal TTSPlayback (implements FrameSource). Null until connect() or in audio-only mode. */
|
|
1608
|
+
get frameSource(): FrameSource | null;
|
|
1779
1609
|
/**
|
|
1780
|
-
*
|
|
1610
|
+
* Connect a TTS backend.
|
|
1781
1611
|
*
|
|
1782
|
-
*
|
|
1783
|
-
*
|
|
1612
|
+
* When config includes `lam`, `unifiedWorker`, or `models`, the full lip sync
|
|
1613
|
+
* pipeline is created (LAM + TTSPlayback + PlaybackPipeline).
|
|
1784
1614
|
*
|
|
1785
|
-
*
|
|
1786
|
-
*
|
|
1787
|
-
|
|
1788
|
-
|
|
1789
|
-
|
|
1790
|
-
* Queue inference to serialize worker calls
|
|
1615
|
+
* When config is omitted or has none of those, audio-only mode is used:
|
|
1616
|
+
* TTS → AudioScheduler (speakers only, no blendshapes, no LAM download).
|
|
1617
|
+
*
|
|
1618
|
+
* @param tts - TTS backend to use for speech synthesis
|
|
1619
|
+
* @param config - Optional configuration for A2E, expression profile, etc.
|
|
1791
1620
|
*/
|
|
1792
|
-
|
|
1621
|
+
connect(tts: TTSBackend, config?: TTSSpeakerConfig): Promise<void>;
|
|
1793
1622
|
/**
|
|
1794
|
-
*
|
|
1623
|
+
* Synthesize and play text with lip sync.
|
|
1624
|
+
* Auto-aborts previous speak if still in progress.
|
|
1625
|
+
*
|
|
1626
|
+
* @param text - Text to synthesize and play
|
|
1627
|
+
* @param options - Optional voice override and abort signal
|
|
1795
1628
|
*/
|
|
1629
|
+
speak(text: string, options?: {
|
|
1630
|
+
signal?: AbortSignal;
|
|
1631
|
+
voice?: string;
|
|
1632
|
+
}): Promise<void>;
|
|
1633
|
+
/** Audio-only speak: TTS → resample → AudioScheduler (no blendshapes). */
|
|
1634
|
+
private speakAudioOnly;
|
|
1635
|
+
/** Poll scheduler until all audio has played. */
|
|
1636
|
+
private waitForSchedulerComplete;
|
|
1637
|
+
/**
|
|
1638
|
+
* Stream text token-by-token with automatic sentence buffering.
|
|
1639
|
+
* Designed for LLM token-by-token output. Sentences are detected at
|
|
1640
|
+
* boundary characters (.!?\n) with a minimum length threshold, then
|
|
1641
|
+
* synthesized and played with lip sync.
|
|
1642
|
+
*
|
|
1643
|
+
* Auto-aborts previous speak/streamText if still in progress.
|
|
1644
|
+
*
|
|
1645
|
+
* @param options - Optional voice override and abort signal
|
|
1646
|
+
* @returns Sink with push() and end() methods
|
|
1647
|
+
*/
|
|
1648
|
+
streamText(options: {
|
|
1649
|
+
signal?: AbortSignal;
|
|
1650
|
+
voice?: string;
|
|
1651
|
+
}): Promise<{
|
|
1652
|
+
push: (token: string) => void;
|
|
1653
|
+
end: () => Promise<void>;
|
|
1654
|
+
}>;
|
|
1655
|
+
/** streamText in audio-only mode: TTS → AudioScheduler (no blendshapes). */
|
|
1656
|
+
private streamTextAudioOnly;
|
|
1657
|
+
/** Abort current speak if any. */
|
|
1658
|
+
stop(): void;
|
|
1659
|
+
/** Clean teardown of all owned resources. */
|
|
1796
1660
|
dispose(): Promise<void>;
|
|
1797
|
-
/**
|
|
1798
|
-
* Check if Web Workers are supported
|
|
1799
|
-
*/
|
|
1800
|
-
static isSupported(): boolean;
|
|
1801
1661
|
}
|
|
1802
1662
|
|
|
1803
1663
|
/**
|
|
1804
|
-
*
|
|
1664
|
+
* createTTSPlayer — Zero-config TTS player for audio-only playback.
|
|
1805
1665
|
*
|
|
1806
|
-
*
|
|
1807
|
-
* own ORT WASM instance (~40MB each). On iOS this exceeds the ~1-1.5GB tab
|
|
1808
|
-
* limit, forcing main-thread fallback which blocks the render loop.
|
|
1666
|
+
* Speaks text through speakers without an avatar. No LAM download, no lip sync.
|
|
1809
1667
|
*
|
|
1810
|
-
*
|
|
1811
|
-
* ORT WASM instance. Same total model memory (~643MB), but inference runs
|
|
1812
|
-
* off-main-thread. Works on iOS because there's only one ORT instance.
|
|
1813
|
-
*
|
|
1814
|
-
* Consumer usage:
|
|
1668
|
+
* @example
|
|
1815
1669
|
* ```typescript
|
|
1816
|
-
*
|
|
1817
|
-
* await worker.init();
|
|
1670
|
+
* import { createTTSPlayer } from '@omote/core';
|
|
1818
1671
|
*
|
|
1819
|
-
* const
|
|
1820
|
-
*
|
|
1821
|
-
*
|
|
1672
|
+
* const player = createTTSPlayer();
|
|
1673
|
+
* await player.load();
|
|
1674
|
+
* await player.speak("Hello world!");
|
|
1675
|
+
*
|
|
1676
|
+
* // Streaming:
|
|
1677
|
+
* const stream = await player.streamText({});
|
|
1678
|
+
* stream.push("Hello ");
|
|
1679
|
+
* stream.push("world!");
|
|
1680
|
+
* await stream.end();
|
|
1822
1681
|
* ```
|
|
1823
1682
|
*
|
|
1824
|
-
* @category
|
|
1683
|
+
* @category Audio
|
|
1825
1684
|
*/
|
|
1826
1685
|
|
|
1827
|
-
|
|
1828
|
-
|
|
1829
|
-
|
|
1830
|
-
|
|
1831
|
-
|
|
1832
|
-
*/
|
|
1833
|
-
|
|
1834
|
-
private worker;
|
|
1835
|
-
private pendingRequests;
|
|
1836
|
-
private initialized;
|
|
1837
|
-
private poisoned;
|
|
1838
|
-
/**
|
|
1839
|
-
* Initialize the worker (load ORT WASM from CDN)
|
|
1840
|
-
*/
|
|
1841
|
-
init(): Promise<void>;
|
|
1842
|
-
loadSenseVoice(config: {
|
|
1843
|
-
modelUrl: string;
|
|
1844
|
-
tokensUrl: string;
|
|
1845
|
-
language: number;
|
|
1846
|
-
textNorm: number;
|
|
1847
|
-
}): Promise<SenseVoiceModelInfo>;
|
|
1848
|
-
transcribe(audio: Float32Array): Promise<SenseVoiceResult>;
|
|
1849
|
-
disposeSenseVoice(): Promise<void>;
|
|
1850
|
-
loadA2E(config: {
|
|
1851
|
-
modelUrl: string;
|
|
1852
|
-
externalDataUrl: string | null;
|
|
1853
|
-
}): Promise<A2EModelInfo>;
|
|
1854
|
-
inferA2E(audio: Float32Array): Promise<{
|
|
1855
|
-
blendshapes: Float32Array;
|
|
1856
|
-
numFrames: number;
|
|
1857
|
-
numBlendshapes: number;
|
|
1858
|
-
inferenceTimeMs: number;
|
|
1859
|
-
}>;
|
|
1860
|
-
disposeA2E(): Promise<void>;
|
|
1861
|
-
loadVAD(config: {
|
|
1862
|
-
modelUrl: string;
|
|
1863
|
-
sampleRate: number;
|
|
1864
|
-
}): Promise<VADWorkerModelInfo>;
|
|
1865
|
-
processVAD(audio: Float32Array, state: Float32Array, context: Float32Array): Promise<{
|
|
1866
|
-
probability: number;
|
|
1867
|
-
state: Float32Array;
|
|
1868
|
-
inferenceTimeMs: number;
|
|
1869
|
-
}>;
|
|
1870
|
-
resetVAD(): Promise<Float32Array>;
|
|
1871
|
-
disposeVAD(): Promise<void>;
|
|
1872
|
-
dispose(): Promise<void>;
|
|
1873
|
-
/** Check if the worker is initialized and not poisoned */
|
|
1874
|
-
get isReady(): boolean;
|
|
1875
|
-
/** Check if Web Workers are supported */
|
|
1876
|
-
static isSupported(): boolean;
|
|
1877
|
-
private assertReady;
|
|
1878
|
-
private createWorker;
|
|
1879
|
-
private handleWorkerMessage;
|
|
1880
|
-
private sendMessage;
|
|
1881
|
-
private rejectAllPending;
|
|
1882
|
-
private cleanup;
|
|
1883
|
-
}
|
|
1884
|
-
/**
|
|
1885
|
-
* SenseVoice adapter backed by UnifiedInferenceWorker
|
|
1886
|
-
*
|
|
1887
|
-
* Implements SenseVoiceBackend, delegating all inference to the shared worker.
|
|
1888
|
-
*/
|
|
1889
|
-
declare class SenseVoiceUnifiedAdapter implements SenseVoiceBackend {
|
|
1890
|
-
private worker;
|
|
1891
|
-
private config;
|
|
1892
|
-
private _isLoaded;
|
|
1893
|
-
private languageId;
|
|
1894
|
-
private textNormId;
|
|
1895
|
-
private inferenceQueue;
|
|
1896
|
-
constructor(worker: UnifiedInferenceWorker, config: SenseVoiceWorkerConfig);
|
|
1897
|
-
get isLoaded(): boolean;
|
|
1898
|
-
get backend(): 'wasm' | null;
|
|
1899
|
-
load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
|
|
1900
|
-
transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
|
|
1901
|
-
dispose(): Promise<void>;
|
|
1686
|
+
interface CreateTTSPlayerConfig {
|
|
1687
|
+
/** Voice to use (default: 'af_heart') */
|
|
1688
|
+
voice?: string;
|
|
1689
|
+
/** Model URL override */
|
|
1690
|
+
modelUrl?: string;
|
|
1691
|
+
/** Voice data base URL override */
|
|
1692
|
+
voiceBaseUrl?: string;
|
|
1902
1693
|
}
|
|
1903
1694
|
/**
|
|
1904
|
-
*
|
|
1695
|
+
* Zero-config TTS player. Speak text through speakers without an avatar.
|
|
1905
1696
|
*
|
|
1906
|
-
*
|
|
1697
|
+
* Uses Kokoro TTS (82M q8, ~92MB) with automatic worker selection.
|
|
1698
|
+
* No LAM model is downloaded — audio plays directly through AudioScheduler.
|
|
1907
1699
|
*/
|
|
1908
|
-
declare
|
|
1909
|
-
readonly modelId: "wav2arkit_cpu";
|
|
1910
|
-
readonly chunkSize: number;
|
|
1911
|
-
private worker;
|
|
1912
|
-
private config;
|
|
1913
|
-
private _isLoaded;
|
|
1914
|
-
private inferenceQueue;
|
|
1915
|
-
constructor(worker: UnifiedInferenceWorker, config: Wav2ArkitCpuWorkerConfig);
|
|
1916
|
-
get isLoaded(): boolean;
|
|
1917
|
-
get backend(): RuntimeBackend | null;
|
|
1918
|
-
load(): Promise<A2EModelInfo>;
|
|
1919
|
-
infer(audioSamples: Float32Array, _identityIndex?: number): Promise<A2EResult>;
|
|
1920
|
-
dispose(): Promise<void>;
|
|
1921
|
-
}
|
|
1700
|
+
declare function createTTSPlayer(config?: CreateTTSPlayerConfig): TTSPlayer;
|
|
1922
1701
|
/**
|
|
1923
|
-
*
|
|
1924
|
-
*
|
|
1925
|
-
* Implements SileroVADBackend, delegating all inference to the shared worker.
|
|
1702
|
+
* Thin wrapper: TTSSpeaker in audio-only mode + delegated load().
|
|
1926
1703
|
*/
|
|
1927
|
-
declare class
|
|
1928
|
-
private
|
|
1929
|
-
|
|
1930
|
-
|
|
1931
|
-
|
|
1932
|
-
|
|
1933
|
-
private readonly chunkSize;
|
|
1934
|
-
private readonly contextSize;
|
|
1935
|
-
private inferenceQueue;
|
|
1936
|
-
private preSpeechBuffer;
|
|
1937
|
-
private wasSpeaking;
|
|
1938
|
-
constructor(worker: UnifiedInferenceWorker, config: SileroVADConfig);
|
|
1704
|
+
declare class TTSPlayer extends TTSSpeaker {
|
|
1705
|
+
private backend;
|
|
1706
|
+
constructor(tts: TTSBackend);
|
|
1707
|
+
/** Load TTS model and connect in audio-only mode. */
|
|
1708
|
+
load(): Promise<void>;
|
|
1709
|
+
/** Whether the TTS model is loaded and ready. */
|
|
1939
1710
|
get isLoaded(): boolean;
|
|
1940
|
-
get backend(): RuntimeBackend | null;
|
|
1941
|
-
get sampleRate(): number;
|
|
1942
|
-
get threshold(): number;
|
|
1943
|
-
getChunkSize(): number;
|
|
1944
|
-
getChunkDurationMs(): number;
|
|
1945
|
-
load(): Promise<VADWorkerModelInfo>;
|
|
1946
|
-
process(audioChunk: Float32Array): Promise<VADResult>;
|
|
1947
|
-
reset(): Promise<void>;
|
|
1948
|
-
dispose(): Promise<void>;
|
|
1949
1711
|
}
|
|
1950
1712
|
|
|
1951
1713
|
/**
|
|
@@ -1986,69 +1748,521 @@ declare class SileroVADUnifiedAdapter implements SileroVADBackend {
|
|
|
1986
1748
|
*/
|
|
1987
1749
|
|
|
1988
1750
|
/**
|
|
1989
|
-
* Common interface for both SenseVoiceInference and SenseVoiceWorker
|
|
1751
|
+
* Common interface for both SenseVoiceInference and SenseVoiceWorker
|
|
1752
|
+
*/
|
|
1753
|
+
interface SenseVoiceBackend {
|
|
1754
|
+
/** Whether the model is loaded and ready for inference */
|
|
1755
|
+
readonly isLoaded: boolean;
|
|
1756
|
+
/** Current backend type ('wasm' for worker, full backend for main thread, null if not loaded) */
|
|
1757
|
+
readonly backend: 'wasm' | 'webgpu' | null;
|
|
1758
|
+
/**
|
|
1759
|
+
* Load the ONNX model
|
|
1760
|
+
* @param onProgress - Optional progress callback (fires once at 100% for worker)
|
|
1761
|
+
* @returns Model loading information
|
|
1762
|
+
*/
|
|
1763
|
+
load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
|
|
1764
|
+
/**
|
|
1765
|
+
* Transcribe audio samples to text
|
|
1766
|
+
* @param audioSamples - Float32Array of audio samples at 16kHz
|
|
1767
|
+
* @returns Transcription result
|
|
1768
|
+
*/
|
|
1769
|
+
transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
|
|
1770
|
+
/**
|
|
1771
|
+
* Dispose of the model and free resources
|
|
1772
|
+
*/
|
|
1773
|
+
dispose(): Promise<void>;
|
|
1774
|
+
}
|
|
1775
|
+
/**
|
|
1776
|
+
* Configuration for the SenseVoice factory
|
|
1777
|
+
*/
|
|
1778
|
+
interface CreateSenseVoiceConfig extends InferenceFactoryConfig {
|
|
1779
|
+
/** Path or URL to model.int8.onnx (239MB). Default: HuggingFace CDN */
|
|
1780
|
+
modelUrl?: string;
|
|
1781
|
+
/** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
|
|
1782
|
+
tokensUrl?: string;
|
|
1783
|
+
/** Language hint (default: 'auto') */
|
|
1784
|
+
language?: SenseVoiceLanguage;
|
|
1785
|
+
/** Text normalization (default: 'with_itn') */
|
|
1786
|
+
textNorm?: 'with_itn' | 'without_itn';
|
|
1787
|
+
}
|
|
1788
|
+
/**
|
|
1789
|
+
* Create a SenseVoice ASR instance with automatic implementation selection
|
|
1790
|
+
*
|
|
1791
|
+
* @param config - Factory configuration
|
|
1792
|
+
* @returns A SenseVoiceBackend instance (either Worker or main thread)
|
|
1793
|
+
*/
|
|
1794
|
+
declare function createSenseVoice(config?: CreateSenseVoiceConfig): SenseVoiceBackend;
|
|
1795
|
+
|
|
1796
|
+
/**
|
|
1797
|
+
* Factory function for Silero VAD with automatic Worker vs main thread selection
|
|
1798
|
+
*
|
|
1799
|
+
* Provides a unified API that automatically selects the optimal implementation:
|
|
1800
|
+
* - Desktop browsers: Uses SileroVADWorker (off-main-thread inference)
|
|
1801
|
+
* - Mobile devices: Uses SileroVADInference (main thread, avoids memory overhead)
|
|
1802
|
+
* - Fallback: Gracefully falls back to main thread if Worker fails
|
|
1803
|
+
*
|
|
1804
|
+
* @category Inference
|
|
1805
|
+
*
|
|
1806
|
+
* @example Basic usage (auto-detect)
|
|
1807
|
+
* ```typescript
|
|
1808
|
+
* import { createSileroVAD } from '@omote/core';
|
|
1809
|
+
*
|
|
1810
|
+
* const vad = createSileroVAD({
|
|
1811
|
+
* modelUrl: '/models/silero-vad.onnx',
|
|
1812
|
+
* threshold: 0.5,
|
|
1813
|
+
* });
|
|
1814
|
+
*
|
|
1815
|
+
* await vad.load();
|
|
1816
|
+
* const result = await vad.process(audioChunk);
|
|
1817
|
+
* if (result.isSpeech) {
|
|
1818
|
+
* console.log('Speech detected!', result.probability);
|
|
1819
|
+
* }
|
|
1820
|
+
* ```
|
|
1821
|
+
*
|
|
1822
|
+
* @example Force worker usage
|
|
1823
|
+
* ```typescript
|
|
1824
|
+
* const vad = createSileroVAD({
|
|
1825
|
+
* modelUrl: '/models/silero-vad.onnx',
|
|
1826
|
+
* useWorker: true, // Force Worker even on mobile
|
|
1827
|
+
* });
|
|
1828
|
+
* ```
|
|
1829
|
+
*
|
|
1830
|
+
* @example Force main thread
|
|
1831
|
+
* ```typescript
|
|
1832
|
+
* const vad = createSileroVAD({
|
|
1833
|
+
* modelUrl: '/models/silero-vad.onnx',
|
|
1834
|
+
* useWorker: false, // Force main thread
|
|
1835
|
+
* });
|
|
1836
|
+
* ```
|
|
1837
|
+
*/
|
|
1838
|
+
|
|
1839
|
+
/**
|
|
1840
|
+
* Common interface for both SileroVADInference and SileroVADWorker
|
|
1841
|
+
*
|
|
1842
|
+
* This interface defines the shared API that both implementations provide,
|
|
1843
|
+
* allowing consumers to use either interchangeably.
|
|
1844
|
+
*/
|
|
1845
|
+
interface SileroVADBackend {
|
|
1846
|
+
/** Current backend type (webgpu, wasm, or null if not loaded) */
|
|
1847
|
+
readonly backend: RuntimeBackend | null;
|
|
1848
|
+
/** Whether the model is loaded and ready for inference */
|
|
1849
|
+
readonly isLoaded: boolean;
|
|
1850
|
+
/** Audio sample rate (8000 or 16000 Hz) */
|
|
1851
|
+
readonly sampleRate: number;
|
|
1852
|
+
/** Speech detection threshold (0-1) */
|
|
1853
|
+
readonly threshold: number;
|
|
1854
|
+
/**
|
|
1855
|
+
* Load the ONNX model
|
|
1856
|
+
* @returns Model loading information
|
|
1857
|
+
*/
|
|
1858
|
+
load(): Promise<VADModelInfo | VADWorkerModelInfo>;
|
|
1859
|
+
/**
|
|
1860
|
+
* Process a single audio chunk
|
|
1861
|
+
* @param audioChunk - Float32Array of exactly chunkSize samples
|
|
1862
|
+
* @returns VAD result with speech probability
|
|
1863
|
+
*/
|
|
1864
|
+
process(audioChunk: Float32Array): Promise<VADResult>;
|
|
1865
|
+
/**
|
|
1866
|
+
* Reset state for new audio stream
|
|
1867
|
+
*/
|
|
1868
|
+
reset(): void | Promise<void>;
|
|
1869
|
+
/**
|
|
1870
|
+
* Dispose of the model and free resources
|
|
1871
|
+
*/
|
|
1872
|
+
dispose(): Promise<void>;
|
|
1873
|
+
/**
|
|
1874
|
+
* Get required chunk size in samples
|
|
1875
|
+
*/
|
|
1876
|
+
getChunkSize(): number;
|
|
1877
|
+
/**
|
|
1878
|
+
* Get chunk duration in milliseconds
|
|
1879
|
+
*/
|
|
1880
|
+
getChunkDurationMs(): number;
|
|
1881
|
+
}
|
|
1882
|
+
/**
|
|
1883
|
+
* Configuration for the Silero VAD factory
|
|
1884
|
+
*
|
|
1885
|
+
* Extends SileroVADConfig with worker-specific options.
|
|
1886
|
+
*/
|
|
1887
|
+
interface SileroVADFactoryConfig extends Omit<SileroVADConfig, 'modelUrl'>, InferenceFactoryConfig {
|
|
1888
|
+
/** Path or URL to the ONNX model. Default: HuggingFace CDN */
|
|
1889
|
+
modelUrl?: string;
|
|
1890
|
+
/**
|
|
1891
|
+
* Fallback to main thread on worker errors.
|
|
1892
|
+
*
|
|
1893
|
+
* When true (default), if the Worker fails to load or encounters an error,
|
|
1894
|
+
* the factory will automatically create a main thread instance instead.
|
|
1895
|
+
*
|
|
1896
|
+
* When false, worker errors will propagate as exceptions.
|
|
1897
|
+
*
|
|
1898
|
+
* Default: true
|
|
1899
|
+
*/
|
|
1900
|
+
fallbackOnError?: boolean;
|
|
1901
|
+
}
|
|
1902
|
+
/**
|
|
1903
|
+
* Check if the current environment supports VAD Web Workers
|
|
1904
|
+
*
|
|
1905
|
+
* Requirements:
|
|
1906
|
+
* - Worker constructor must exist
|
|
1907
|
+
* - Blob URL support (for inline worker script)
|
|
1908
|
+
*
|
|
1909
|
+
* @returns true if VAD Worker is supported
|
|
1910
|
+
*/
|
|
1911
|
+
declare function supportsVADWorker(): boolean;
|
|
1912
|
+
/**
|
|
1913
|
+
* Create a Silero VAD instance with automatic implementation selection
|
|
1914
|
+
*
|
|
1915
|
+
* This factory function automatically selects between:
|
|
1916
|
+
* - **SileroVADWorker**: Off-main-thread inference (better for desktop)
|
|
1917
|
+
* - **SileroVADInference**: Main thread inference (better for mobile)
|
|
1918
|
+
*
|
|
1919
|
+
* The selection is based on:
|
|
1920
|
+
* 1. Explicit `useWorker` config (if provided)
|
|
1921
|
+
* 2. Platform detection (mobile vs desktop)
|
|
1922
|
+
* 3. Worker API availability
|
|
1923
|
+
*
|
|
1924
|
+
* Both implementations share the same interface (SileroVADBackend),
|
|
1925
|
+
* so consumers can use either interchangeably.
|
|
1926
|
+
*
|
|
1927
|
+
* @param config - Factory configuration
|
|
1928
|
+
* @returns A SileroVAD instance (either Worker or main thread)
|
|
1929
|
+
*
|
|
1930
|
+
* @example
|
|
1931
|
+
* ```typescript
|
|
1932
|
+
* // Auto-detect (recommended)
|
|
1933
|
+
* const vad = createSileroVAD({ modelUrl: '/models/silero-vad.onnx' });
|
|
1934
|
+
*
|
|
1935
|
+
* // Force Worker
|
|
1936
|
+
* const vadWorker = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: true });
|
|
1937
|
+
*
|
|
1938
|
+
* // Force main thread
|
|
1939
|
+
* const vadMain = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: false });
|
|
1940
|
+
* ```
|
|
1941
|
+
*/
|
|
1942
|
+
declare function createSileroVAD(config?: SileroVADFactoryConfig): SileroVADBackend;
|
|
1943
|
+
|
|
1944
|
+
/**
|
|
1945
|
+
* SpeechListener — Standalone listening primitive.
|
|
1946
|
+
*
|
|
1947
|
+
* Composes: MicrophoneCapture → SileroVAD → SenseVoice ASR → transcript events.
|
|
1948
|
+
* Extracted from VoicePipeline's listening half so it can be used independently.
|
|
1949
|
+
*
|
|
1950
|
+
* Does NOT handle TTS, LAM, or response routing — those belong to TTSSpeaker
|
|
1951
|
+
* and VoicePipeline respectively.
|
|
1952
|
+
*
|
|
1953
|
+
* @category Audio
|
|
1954
|
+
*/
|
|
1955
|
+
|
|
1956
|
+
interface SpeechListenerConfig {
|
|
1957
|
+
/** Pre-built backends — skip internal factory creation. */
|
|
1958
|
+
backends?: {
|
|
1959
|
+
asr: SenseVoiceBackend;
|
|
1960
|
+
vad: SileroVADBackend;
|
|
1961
|
+
};
|
|
1962
|
+
/** External unified worker (reuse across pipelines). */
|
|
1963
|
+
unifiedWorker?: UnifiedInferenceWorker;
|
|
1964
|
+
/** URLs and options for model loading (when backends not provided). */
|
|
1965
|
+
models?: {
|
|
1966
|
+
senseVoice: {
|
|
1967
|
+
modelUrl: string;
|
|
1968
|
+
tokensUrl?: string;
|
|
1969
|
+
language?: string;
|
|
1970
|
+
};
|
|
1971
|
+
vad: {
|
|
1972
|
+
modelUrl: string;
|
|
1973
|
+
threshold?: number;
|
|
1974
|
+
preSpeechBufferChunks?: number;
|
|
1975
|
+
};
|
|
1976
|
+
};
|
|
1977
|
+
/** Base silence timeout in ms (default: 500) */
|
|
1978
|
+
silenceTimeoutMs?: number;
|
|
1979
|
+
/** Extended silence timeout for long utterances (default: 700) */
|
|
1980
|
+
silenceTimeoutExtendedMs?: number;
|
|
1981
|
+
/** Enable adaptive timeout based on speech duration (default: true) */
|
|
1982
|
+
adaptiveTimeout?: boolean;
|
|
1983
|
+
/** Minimum audio duration in seconds (default: 0.3) */
|
|
1984
|
+
minAudioDurationSec?: number;
|
|
1985
|
+
/** Minimum audio energy (default: 0.02) */
|
|
1986
|
+
minAudioEnergy?: number;
|
|
1987
|
+
/** Enable audio normalization for quiet audio (default: true) */
|
|
1988
|
+
normalizeAudio?: boolean;
|
|
1989
|
+
/** Progressive transcription interval — desktop (default: 500ms) */
|
|
1990
|
+
progressiveIntervalMs?: number;
|
|
1991
|
+
/** Progressive transcription interval — iOS (default: 800ms) */
|
|
1992
|
+
progressiveIntervalIosMs?: number;
|
|
1993
|
+
/** Coverage threshold to use progressive result (default: 0.8) */
|
|
1994
|
+
progressiveCoverageThreshold?: number;
|
|
1995
|
+
/** Minimum samples before progressive transcription starts (default: 8000) */
|
|
1996
|
+
progressiveMinSamples?: number;
|
|
1997
|
+
/** Timeout for individual transcribe() calls (default: 10000ms) */
|
|
1998
|
+
transcriptionTimeoutMs?: number;
|
|
1999
|
+
}
|
|
2000
|
+
type SpeechListenerState = 'idle' | 'loading' | 'ready' | 'listening' | 'processing' | 'paused';
|
|
2001
|
+
interface SpeechListenerEvents {
|
|
2002
|
+
'state': SpeechListenerState;
|
|
2003
|
+
'loading:progress': LoadingProgress;
|
|
2004
|
+
'transcript': TranscriptResult;
|
|
2005
|
+
'speech:start': void;
|
|
2006
|
+
'speech:end': {
|
|
2007
|
+
durationMs: number;
|
|
2008
|
+
};
|
|
2009
|
+
'audio:level': {
|
|
2010
|
+
rms: number;
|
|
2011
|
+
peak: number;
|
|
2012
|
+
};
|
|
2013
|
+
'audio:chunk': Float32Array;
|
|
2014
|
+
'error': Error;
|
|
2015
|
+
[key: string]: unknown;
|
|
2016
|
+
}
|
|
2017
|
+
declare class SpeechListener extends EventEmitter<SpeechListenerEvents> {
|
|
2018
|
+
private readonly config;
|
|
2019
|
+
private _state;
|
|
2020
|
+
private epoch;
|
|
2021
|
+
private asr;
|
|
2022
|
+
private vad;
|
|
2023
|
+
private ownedWorker;
|
|
2024
|
+
private mic;
|
|
2025
|
+
private omoteEvents;
|
|
2026
|
+
private _unsubChunk;
|
|
2027
|
+
private _unsubLevel;
|
|
2028
|
+
private static readonly MAX_AUDIO_BUFFER_SAMPLES;
|
|
2029
|
+
private audioBuffer;
|
|
2030
|
+
private audioBufferSamples;
|
|
2031
|
+
private speechStartTime;
|
|
2032
|
+
private silenceTimer;
|
|
2033
|
+
private isSpeechActive;
|
|
2034
|
+
private progressiveTimer;
|
|
2035
|
+
private progressivePromise;
|
|
2036
|
+
private lastProgressiveResult;
|
|
2037
|
+
private lastProgressiveSamples;
|
|
2038
|
+
private asrErrorCount;
|
|
2039
|
+
/** Current listener state */
|
|
2040
|
+
get state(): SpeechListenerState;
|
|
2041
|
+
constructor(config?: SpeechListenerConfig);
|
|
2042
|
+
/**
|
|
2043
|
+
* Load ASR + VAD models. Only loads speech recognition models,
|
|
2044
|
+
* NOT TTS or LAM (those belong to TTSSpeaker).
|
|
2045
|
+
*/
|
|
2046
|
+
loadModels(): Promise<void>;
|
|
2047
|
+
/** Start listening — activates mic + VAD. */
|
|
2048
|
+
start(): Promise<void>;
|
|
2049
|
+
/** Stop listening — deactivates mic, clears buffers. */
|
|
2050
|
+
stop(): void;
|
|
2051
|
+
/** Pause VAD/ASR but keep mic active for audio:chunk events (for interruption detection). */
|
|
2052
|
+
pause(): void;
|
|
2053
|
+
/** Resume VAD/ASR from paused state. */
|
|
2054
|
+
resume(): void;
|
|
2055
|
+
/** Dispose all resources. */
|
|
2056
|
+
dispose(): Promise<void>;
|
|
2057
|
+
private processAudioChunk;
|
|
2058
|
+
private getSilenceTimeout;
|
|
2059
|
+
private onSilenceDetected;
|
|
2060
|
+
private processEndOfSpeech;
|
|
2061
|
+
private startProgressiveTranscription;
|
|
2062
|
+
private stopProgressiveTranscription;
|
|
2063
|
+
private transcribeWithTimeout;
|
|
2064
|
+
private normalizeAudio;
|
|
2065
|
+
private setState;
|
|
2066
|
+
private emitProgress;
|
|
2067
|
+
private clearSilenceTimer;
|
|
2068
|
+
}
|
|
2069
|
+
|
|
2070
|
+
/**
|
|
2071
|
+
* Interruption Handler
|
|
2072
|
+
*
|
|
2073
|
+
* VAD-based barge-in detection for AI conversations:
|
|
2074
|
+
* - Monitors VAD probability for user speech
|
|
2075
|
+
* - Detects when user interrupts AI response
|
|
2076
|
+
* - Triggers interruption callbacks
|
|
2077
|
+
*/
|
|
2078
|
+
|
|
2079
|
+
interface InterruptionEvents {
|
|
2080
|
+
[key: string]: unknown;
|
|
2081
|
+
'speech.detected': {
|
|
2082
|
+
rms: number;
|
|
2083
|
+
};
|
|
2084
|
+
'speech.ended': {
|
|
2085
|
+
durationMs: number;
|
|
2086
|
+
};
|
|
2087
|
+
'interruption.triggered': {
|
|
2088
|
+
rms: number;
|
|
2089
|
+
durationMs: number;
|
|
2090
|
+
};
|
|
2091
|
+
}
|
|
2092
|
+
/**
|
|
2093
|
+
* Interruption handler configuration
|
|
2094
|
+
*
|
|
2095
|
+
* Industry standards applied:
|
|
2096
|
+
* - vadThreshold: 0.5 (Silero VAD default)
|
|
2097
|
+
* - minSpeechDurationMs: 200ms (Google/Amazon barge-in standard)
|
|
2098
|
+
* - silenceTimeoutMs: 500ms (OpenAI Realtime API standard)
|
|
2099
|
+
*/
|
|
2100
|
+
interface InterruptionConfig {
|
|
2101
|
+
/** VAD probability threshold for speech detection (default: 0.5, Silero standard) */
|
|
2102
|
+
vadThreshold?: number;
|
|
2103
|
+
/** Minimum speech duration to trigger interruption (default: 200ms, Google/Amazon standard) */
|
|
2104
|
+
minSpeechDurationMs?: number;
|
|
2105
|
+
/** Silence duration to end speech (default: 500ms, OpenAI standard) */
|
|
2106
|
+
silenceTimeoutMs?: number;
|
|
2107
|
+
/** Enable interruption detection (default: true) */
|
|
2108
|
+
enabled?: boolean;
|
|
2109
|
+
}
|
|
2110
|
+
declare class InterruptionHandler extends EventEmitter<InterruptionEvents> {
|
|
2111
|
+
private config;
|
|
2112
|
+
private isSpeaking;
|
|
2113
|
+
private speechStartTime;
|
|
2114
|
+
private lastSpeechTime;
|
|
2115
|
+
private silenceTimer;
|
|
2116
|
+
private aiIsSpeaking;
|
|
2117
|
+
private interruptionTriggeredThisSession;
|
|
2118
|
+
constructor(config?: InterruptionConfig);
|
|
2119
|
+
/**
|
|
2120
|
+
* Process raw audio energy for interruption detection (no VAD required).
|
|
2121
|
+
* Used during speaking state when the unified worker is busy with TTS.
|
|
2122
|
+
* Echo-cancelled mic input means energy above threshold = user speech.
|
|
2123
|
+
*
|
|
2124
|
+
* @param rms - RMS energy of audio chunk (0-1)
|
|
2125
|
+
* @param energyThreshold - Minimum energy to consider speech (default: 0.02)
|
|
2126
|
+
*/
|
|
2127
|
+
processAudioEnergy(rms: number, energyThreshold?: number): void;
|
|
2128
|
+
/**
|
|
2129
|
+
* Process VAD result for interruption detection
|
|
2130
|
+
* @param vadProbability - Speech probability from VAD (0-1)
|
|
2131
|
+
* @param audioEnergy - Optional RMS energy for logging (default: 0)
|
|
2132
|
+
*/
|
|
2133
|
+
processVADResult(vadProbability: number, audioEnergy?: number): void;
|
|
2134
|
+
/** Notify that AI started/stopped speaking */
|
|
2135
|
+
setAISpeaking(speaking: boolean): void;
|
|
2136
|
+
/** Enable/disable interruption detection */
|
|
2137
|
+
setEnabled(enabled: boolean): void;
|
|
2138
|
+
/** Update configuration */
|
|
2139
|
+
updateConfig(config: Partial<InterruptionConfig>): void;
|
|
2140
|
+
/** Reset state */
|
|
2141
|
+
reset(): void;
|
|
2142
|
+
/** Get current state */
|
|
2143
|
+
getState(): {
|
|
2144
|
+
isSpeaking: boolean;
|
|
2145
|
+
speechDurationMs: number;
|
|
2146
|
+
};
|
|
2147
|
+
private onSpeechDetected;
|
|
2148
|
+
private onSilenceDetected;
|
|
2149
|
+
}
|
|
2150
|
+
|
|
2151
|
+
/**
|
|
2152
|
+
* SenseVoice ASR Web Worker implementation
|
|
2153
|
+
*
|
|
2154
|
+
* Runs SenseVoice speech recognition in a dedicated Web Worker to prevent
|
|
2155
|
+
* main thread blocking. Uses inline worker script (Blob URL pattern) to
|
|
2156
|
+
* avoid separate file deployment.
|
|
2157
|
+
*
|
|
2158
|
+
* Key design decisions:
|
|
2159
|
+
* - WASM backend only (WebGPU doesn't work in Workers)
|
|
2160
|
+
* - All preprocessing (fbank, LFR, CMVN) and CTC decoding inlined in worker
|
|
2161
|
+
* - Audio copied (not transferred) to retain main thread access
|
|
2162
|
+
* - ONNX Runtime loaded from CDN in worker (no bundler complications)
|
|
2163
|
+
* - iOS: model URL passed as string to ORT (avoids 239MB JS heap allocation)
|
|
2164
|
+
*
|
|
2165
|
+
* @category Inference
|
|
2166
|
+
*
|
|
2167
|
+
* @example Basic usage
|
|
2168
|
+
* ```typescript
|
|
2169
|
+
* import { SenseVoiceWorker } from '@omote/core';
|
|
2170
|
+
*
|
|
2171
|
+
* const asr = new SenseVoiceWorker({
|
|
2172
|
+
* modelUrl: '/models/sensevoice/model.int8.onnx',
|
|
2173
|
+
* tokensUrl: '/models/sensevoice/tokens.txt',
|
|
2174
|
+
* });
|
|
2175
|
+
* await asr.load();
|
|
2176
|
+
*
|
|
2177
|
+
* const { text, emotion, language } = await asr.transcribe(audioSamples);
|
|
2178
|
+
* console.log(text); // "Hello world"
|
|
2179
|
+
* console.log(emotion); // "NEUTRAL"
|
|
2180
|
+
* console.log(language); // "en"
|
|
2181
|
+
* ```
|
|
2182
|
+
*/
|
|
2183
|
+
|
|
2184
|
+
/**
|
|
2185
|
+
* Configuration for SenseVoice Worker
|
|
1990
2186
|
*/
|
|
1991
|
-
interface
|
|
1992
|
-
/**
|
|
1993
|
-
|
|
1994
|
-
/**
|
|
1995
|
-
|
|
2187
|
+
interface SenseVoiceWorkerConfig {
|
|
2188
|
+
/** Path or URL to model.int8.onnx (239MB) */
|
|
2189
|
+
modelUrl: string;
|
|
2190
|
+
/** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
|
|
2191
|
+
tokensUrl?: string;
|
|
2192
|
+
/** Language hint (default: 'auto' for auto-detection) */
|
|
2193
|
+
language?: SenseVoiceLanguage;
|
|
2194
|
+
/** Text normalization: 'with_itn' applies inverse text normalization (default: 'with_itn') */
|
|
2195
|
+
textNorm?: 'with_itn' | 'without_itn';
|
|
2196
|
+
}
|
|
2197
|
+
/**
|
|
2198
|
+
* SenseVoice ASR Worker - Speech Recognition in a Web Worker
|
|
2199
|
+
*
|
|
2200
|
+
* Runs SenseVoice inference off the main thread to prevent UI blocking.
|
|
2201
|
+
* All preprocessing (fbank, LFR, CMVN) and CTC decoding run in the worker.
|
|
2202
|
+
*
|
|
2203
|
+
* @see SenseVoiceInference for main-thread version
|
|
2204
|
+
*/
|
|
2205
|
+
declare class SenseVoiceWorker {
|
|
2206
|
+
private worker;
|
|
2207
|
+
private config;
|
|
2208
|
+
private isLoading;
|
|
2209
|
+
private _isLoaded;
|
|
2210
|
+
private inferenceQueue;
|
|
2211
|
+
private poisoned;
|
|
2212
|
+
private pendingResolvers;
|
|
2213
|
+
private languageId;
|
|
2214
|
+
private textNormId;
|
|
2215
|
+
constructor(config: SenseVoiceWorkerConfig);
|
|
2216
|
+
get isLoaded(): boolean;
|
|
1996
2217
|
/**
|
|
1997
|
-
*
|
|
1998
|
-
|
|
1999
|
-
|
|
2218
|
+
* Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
|
|
2219
|
+
*/
|
|
2220
|
+
get backend(): 'wasm' | null;
|
|
2221
|
+
/**
|
|
2222
|
+
* Create the worker from inline script
|
|
2223
|
+
*/
|
|
2224
|
+
private createWorker;
|
|
2225
|
+
/**
|
|
2226
|
+
* Handle messages from worker
|
|
2227
|
+
*/
|
|
2228
|
+
private handleWorkerMessage;
|
|
2229
|
+
/**
|
|
2230
|
+
* Send message to worker and wait for response
|
|
2231
|
+
*/
|
|
2232
|
+
private sendMessage;
|
|
2233
|
+
/**
|
|
2234
|
+
* Load the ONNX model in the worker
|
|
2235
|
+
*
|
|
2236
|
+
* @param onProgress - Optional progress callback. Fires once at 100% when load completes
|
|
2237
|
+
* (the worker downloads and loads the model internally, so granular progress is not available).
|
|
2000
2238
|
*/
|
|
2001
2239
|
load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
|
|
2002
2240
|
/**
|
|
2003
2241
|
* Transcribe audio samples to text
|
|
2004
|
-
*
|
|
2005
|
-
* @
|
|
2242
|
+
*
|
|
2243
|
+
* @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
|
|
2244
|
+
* @returns Transcription result with text, emotion, language, and event
|
|
2006
2245
|
*/
|
|
2007
2246
|
transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
|
|
2008
2247
|
/**
|
|
2009
|
-
*
|
|
2248
|
+
* Queue inference to serialize worker calls
|
|
2010
2249
|
*/
|
|
2011
|
-
|
|
2012
|
-
}
|
|
2013
|
-
/**
|
|
2014
|
-
* Configuration for the SenseVoice factory
|
|
2015
|
-
*/
|
|
2016
|
-
interface CreateSenseVoiceConfig {
|
|
2017
|
-
/** Path or URL to model.int8.onnx (239MB). Default: HuggingFace CDN */
|
|
2018
|
-
modelUrl?: string;
|
|
2019
|
-
/** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
|
|
2020
|
-
tokensUrl?: string;
|
|
2021
|
-
/** Language hint (default: 'auto') */
|
|
2022
|
-
language?: SenseVoiceLanguage;
|
|
2023
|
-
/** Text normalization (default: 'with_itn') */
|
|
2024
|
-
textNorm?: 'with_itn' | 'without_itn';
|
|
2250
|
+
private queueInference;
|
|
2025
2251
|
/**
|
|
2026
|
-
*
|
|
2027
|
-
* - 'auto' (default): Use Worker if supported, else main thread
|
|
2028
|
-
* - true: Force Worker (throws if unsupported)
|
|
2029
|
-
* - false: Force main thread
|
|
2252
|
+
* Dispose of the worker and free resources
|
|
2030
2253
|
*/
|
|
2031
|
-
|
|
2254
|
+
dispose(): Promise<void>;
|
|
2032
2255
|
/**
|
|
2033
|
-
*
|
|
2034
|
-
* When provided, uses SenseVoiceUnifiedAdapter (shared single-ORT worker).
|
|
2035
|
-
* Takes precedence over useWorker setting.
|
|
2256
|
+
* Check if Web Workers are supported
|
|
2036
2257
|
*/
|
|
2037
|
-
|
|
2258
|
+
static isSupported(): boolean;
|
|
2038
2259
|
}
|
|
2039
|
-
/**
|
|
2040
|
-
* Create a SenseVoice ASR instance with automatic implementation selection
|
|
2041
|
-
*
|
|
2042
|
-
* @param config - Factory configuration
|
|
2043
|
-
* @returns A SenseVoiceBackend instance (either Worker or main thread)
|
|
2044
|
-
*/
|
|
2045
|
-
declare function createSenseVoice(config?: CreateSenseVoiceConfig): SenseVoiceBackend;
|
|
2046
2260
|
|
|
2047
2261
|
/**
|
|
2048
2262
|
* Shared blendshape constants and utilities for lip sync inference
|
|
2049
2263
|
*
|
|
2050
|
-
* Contains
|
|
2051
|
-
*
|
|
2264
|
+
* Contains ARKIT_BLENDSHAPES (canonical 52-blendshape ordering), symmetrization,
|
|
2265
|
+
* and interpolation utilities used by A2EInference and all renderer adapters.
|
|
2052
2266
|
*
|
|
2053
2267
|
* This module is the single source of truth for blendshape ordering to
|
|
2054
2268
|
* avoid circular dependencies between inference classes.
|
|
@@ -2056,12 +2270,12 @@ declare function createSenseVoice(config?: CreateSenseVoiceConfig): SenseVoiceBa
|
|
|
2056
2270
|
* @category Inference
|
|
2057
2271
|
*/
|
|
2058
2272
|
/**
|
|
2059
|
-
*
|
|
2060
|
-
*
|
|
2273
|
+
* ARKit blendshape names in alphabetical order (52 total)
|
|
2274
|
+
* This is the canonical ordering used by all A2E models in the SDK.
|
|
2061
2275
|
*/
|
|
2062
|
-
declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
|
|
2063
|
-
/** Alias for backwards compatibility */
|
|
2064
2276
|
declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
|
|
2277
|
+
/** @deprecated Use ARKIT_BLENDSHAPES instead */
|
|
2278
|
+
declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
|
|
2065
2279
|
/**
|
|
2066
2280
|
* Linearly interpolate between two blendshape weight arrays.
|
|
2067
2281
|
*
|
|
@@ -2077,28 +2291,30 @@ declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "bro
|
|
|
2077
2291
|
declare function lerpBlendshapes(current: Float32Array | number[], target: Float32Array | number[], factor?: number): number[];
|
|
2078
2292
|
|
|
2079
2293
|
/**
|
|
2080
|
-
*
|
|
2294
|
+
* A2E inference engine for Audio-to-Expression (LAM model)
|
|
2081
2295
|
*
|
|
2082
2296
|
* Runs entirely in the browser using WebGPU or WASM.
|
|
2083
2297
|
* Takes raw 16kHz audio and outputs 52 ARKit blendshapes for lip sync.
|
|
2298
|
+
* Uses the LAM (Large Animation Model) — see {@link A2EBackend} for the interface.
|
|
2084
2299
|
*
|
|
2300
|
+
* @see {@link createA2E} for the recommended zero-config factory
|
|
2301
|
+
* @see {@link A2EBackend} for the common interface
|
|
2085
2302
|
* @category Inference
|
|
2086
2303
|
*
|
|
2087
2304
|
* @example Basic usage
|
|
2088
2305
|
* ```typescript
|
|
2089
|
-
* import {
|
|
2306
|
+
* import { A2EInference } from '@omote/core';
|
|
2090
2307
|
*
|
|
2091
|
-
* const
|
|
2092
|
-
* await
|
|
2308
|
+
* const a2e = new A2EInference({ modelUrl: '/models/lam.onnx' });
|
|
2309
|
+
* await a2e.load();
|
|
2093
2310
|
*
|
|
2094
2311
|
* // Process 1 second of audio (16kHz = 16000 samples)
|
|
2095
|
-
* const result = await
|
|
2312
|
+
* const result = await a2e.infer(audioSamples);
|
|
2096
2313
|
* console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
|
|
2097
2314
|
* ```
|
|
2098
2315
|
*/
|
|
2099
2316
|
|
|
2100
|
-
|
|
2101
|
-
interface Wav2Vec2InferenceConfig {
|
|
2317
|
+
interface A2EInferenceConfig {
|
|
2102
2318
|
/** Path or URL to the ONNX model */
|
|
2103
2319
|
modelUrl: string;
|
|
2104
2320
|
/**
|
|
@@ -2109,7 +2325,7 @@ interface Wav2Vec2InferenceConfig {
|
|
|
2109
2325
|
*/
|
|
2110
2326
|
externalDataUrl?: string | false;
|
|
2111
2327
|
/** Preferred backend (auto will try WebGPU first, fallback to WASM) */
|
|
2112
|
-
backend?:
|
|
2328
|
+
backend?: BackendPreference;
|
|
2113
2329
|
/** Number of identity classes (default: 12 for streaming model) */
|
|
2114
2330
|
numIdentityClasses?: number;
|
|
2115
2331
|
/**
|
|
@@ -2119,28 +2335,9 @@ interface Wav2Vec2InferenceConfig {
|
|
|
2119
2335
|
*/
|
|
2120
2336
|
chunkSize?: number;
|
|
2121
2337
|
}
|
|
2122
|
-
interface ModelInfo {
|
|
2123
|
-
backend: 'webgpu' | 'wasm';
|
|
2124
|
-
loadTimeMs: number;
|
|
2125
|
-
inputNames: string[];
|
|
2126
|
-
outputNames: string[];
|
|
2127
|
-
}
|
|
2128
2338
|
|
|
2129
|
-
|
|
2130
|
-
|
|
2131
|
-
* @deprecated ASR is handled by SenseVoice. This will be removed in a future release.
|
|
2132
|
-
*/
|
|
2133
|
-
declare const CTC_VOCAB: string[];
|
|
2134
|
-
interface Wav2Vec2Result {
|
|
2135
|
-
/** Blendshape weights [frames, 52] - 30fps */
|
|
2136
|
-
blendshapes: Float32Array[];
|
|
2137
|
-
/** Number of blendshape frames (30fps) */
|
|
2138
|
-
numFrames: number;
|
|
2139
|
-
/** Inference time in ms */
|
|
2140
|
-
inferenceTimeMs: number;
|
|
2141
|
-
}
|
|
2142
|
-
declare class Wav2Vec2Inference implements A2EBackend {
|
|
2143
|
-
readonly modelId: "wav2vec2";
|
|
2339
|
+
declare class A2EInference implements A2EBackend {
|
|
2340
|
+
readonly modelId: "a2e";
|
|
2144
2341
|
private session;
|
|
2145
2342
|
private ort;
|
|
2146
2343
|
private config;
|
|
@@ -2151,7 +2348,7 @@ declare class Wav2Vec2Inference implements A2EBackend {
|
|
|
2151
2348
|
private inferenceQueue;
|
|
2152
2349
|
private poisoned;
|
|
2153
2350
|
private static readonly INFERENCE_TIMEOUT_MS;
|
|
2154
|
-
constructor(config:
|
|
2351
|
+
constructor(config: A2EInferenceConfig);
|
|
2155
2352
|
/**
|
|
2156
2353
|
* Check if WebGPU is available and working
|
|
2157
2354
|
* (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
|
|
@@ -2161,186 +2358,26 @@ declare class Wav2Vec2Inference implements A2EBackend {
|
|
|
2161
2358
|
get isLoaded(): boolean;
|
|
2162
2359
|
/** True if inference timed out and the session is permanently unusable */
|
|
2163
2360
|
get isSessionPoisoned(): boolean;
|
|
2164
|
-
/**
|
|
2165
|
-
* Load the ONNX model
|
|
2166
|
-
*/
|
|
2167
|
-
load(): Promise<ModelInfo>;
|
|
2168
|
-
/**
|
|
2169
|
-
* Run inference on raw audio
|
|
2170
|
-
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
2171
|
-
* @param identityIndex - Optional identity index (0-11, default 0 = neutral)
|
|
2172
|
-
*
|
|
2173
|
-
* Audio will be zero-padded or truncated to chunkSize samples.
|
|
2174
|
-
*/
|
|
2175
|
-
infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
|
|
2176
|
-
/**
|
|
2177
|
-
* Queue inference to serialize ONNX session calls
|
|
2178
|
-
*/
|
|
2179
|
-
private queueInference;
|
|
2180
|
-
/**
|
|
2181
|
-
* Get blendshape value by name for a specific frame
|
|
2182
|
-
*/
|
|
2183
|
-
getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
|
|
2184
|
-
/**
|
|
2185
|
-
* Dispose of the model and free resources
|
|
2186
|
-
*/
|
|
2187
|
-
dispose(): Promise<void>;
|
|
2188
|
-
}
|
|
2189
|
-
|
|
2190
|
-
/**
|
|
2191
|
-
* Default and user-configurable model URLs for all ONNX models
|
|
2192
|
-
*
|
|
2193
|
-
* Out of the box, models are served from HuggingFace CDN (`/resolve/main/`
|
|
2194
|
-
* endpoint with `Access-Control-Allow-Origin: *`). For production apps that
|
|
2195
|
-
* need faster or more reliable delivery, call {@link configureModelUrls} once
|
|
2196
|
-
* at startup to point any or all models at your own CDN.
|
|
2197
|
-
*
|
|
2198
|
-
* @category Inference
|
|
2199
|
-
*
|
|
2200
|
-
* @example Use HuggingFace defaults (zero-config)
|
|
2201
|
-
* ```typescript
|
|
2202
|
-
* import { createA2E } from '@omote/core';
|
|
2203
|
-
* const a2e = createA2E(); // fetches from HuggingFace CDN
|
|
2204
|
-
* ```
|
|
2205
|
-
*
|
|
2206
|
-
* @example Self-host on your own CDN
|
|
2207
|
-
* ```typescript
|
|
2208
|
-
* import { configureModelUrls, createA2E } from '@omote/core';
|
|
2209
|
-
*
|
|
2210
|
-
* configureModelUrls({
|
|
2211
|
-
* lam: 'https://cdn.example.com/models/model_fp16.onnx',
|
|
2212
|
-
* senseVoice: 'https://cdn.example.com/models/sensevoice.int8.onnx',
|
|
2213
|
-
* // omitted keys keep HuggingFace defaults
|
|
2214
|
-
* });
|
|
2215
|
-
*
|
|
2216
|
-
* const a2e = createA2E(); // now fetches from your CDN
|
|
2217
|
-
* ```
|
|
2218
|
-
*/
|
|
2219
|
-
/** Model URL keys that can be configured */
|
|
2220
|
-
type ModelUrlKey = 'lam' | 'wav2arkitCpu' | 'senseVoice' | 'sileroVad';
|
|
2221
|
-
/**
|
|
2222
|
-
* Resolved model URLs — user overrides take priority, HuggingFace CDN is fallback.
|
|
2223
|
-
*
|
|
2224
|
-
* All SDK factories (`createA2E`, `createSenseVoice`, `createSileroVAD`) and
|
|
2225
|
-
* orchestrators (`VoicePipeline`) read from this object. Call
|
|
2226
|
-
* {@link configureModelUrls} before constructing any pipelines to point
|
|
2227
|
-
* models at your own CDN.
|
|
2228
|
-
*/
|
|
2229
|
-
declare const DEFAULT_MODEL_URLS: Readonly<Record<ModelUrlKey, string>>;
|
|
2230
|
-
/**
|
|
2231
|
-
* Configure custom model URLs. Overrides persist for the lifetime of the page.
|
|
2232
|
-
* Omitted keys keep their HuggingFace CDN defaults.
|
|
2233
|
-
*
|
|
2234
|
-
* Call this **once** at app startup, before constructing any pipelines.
|
|
2235
|
-
*
|
|
2236
|
-
* @example Self-host all models
|
|
2237
|
-
* ```typescript
|
|
2238
|
-
* configureModelUrls({
|
|
2239
|
-
* lam: 'https://cdn.example.com/models/model_fp16.onnx',
|
|
2240
|
-
* wav2arkitCpu: 'https://cdn.example.com/models/wav2arkit_cpu.onnx',
|
|
2241
|
-
* senseVoice: 'https://cdn.example.com/models/sensevoice.int8.onnx',
|
|
2242
|
-
* sileroVad: 'https://cdn.example.com/models/silero-vad.onnx',
|
|
2243
|
-
* });
|
|
2244
|
-
* ```
|
|
2245
|
-
*
|
|
2246
|
-
* @example Override only one model
|
|
2247
|
-
* ```typescript
|
|
2248
|
-
* configureModelUrls({
|
|
2249
|
-
* lam: '/models/model_fp16.onnx', // self-hosted, same origin
|
|
2250
|
-
* });
|
|
2251
|
-
* ```
|
|
2252
|
-
*/
|
|
2253
|
-
declare function configureModelUrls(urls: Partial<Record<ModelUrlKey, string>>): void;
|
|
2254
|
-
/**
|
|
2255
|
-
* Reset all model URL overrides back to HuggingFace CDN defaults.
|
|
2256
|
-
* Mainly useful for testing.
|
|
2257
|
-
*/
|
|
2258
|
-
declare function resetModelUrls(): void;
|
|
2259
|
-
/**
|
|
2260
|
-
* Get the immutable HuggingFace CDN URLs (ignoring any overrides).
|
|
2261
|
-
* Useful for documentation or fallback logic.
|
|
2262
|
-
*/
|
|
2263
|
-
declare const HF_CDN_URLS: Readonly<Record<ModelUrlKey, string>>;
|
|
2264
|
-
|
|
2265
|
-
/**
|
|
2266
|
-
* CPU-optimized lip sync inference using wav2arkit_cpu model
|
|
2267
|
-
*
|
|
2268
|
-
* A Safari/iOS-compatible alternative to Wav2Vec2Inference (192MB fp16) designed
|
|
2269
|
-
* for platforms where WebGPU crashes due to ONNX Runtime JSEP bugs.
|
|
2270
|
-
*
|
|
2271
|
-
* The model uses ONNX external data format:
|
|
2272
|
-
* - wav2arkit_cpu.onnx (1.86MB graph structure)
|
|
2273
|
-
* - wav2arkit_cpu.onnx.data (402MB weights)
|
|
2274
|
-
* Both files are fetched and cached automatically.
|
|
2275
|
-
*
|
|
2276
|
-
* Key differences from Wav2Vec2Inference:
|
|
2277
|
-
* - WASM-only backend (CPU-optimized, single-threaded, no WebGPU)
|
|
2278
|
-
* - No identity input (baked to identity 11)
|
|
2279
|
-
* - No ASR output (lip sync only)
|
|
2280
|
-
* - Dynamic input length (not fixed to 16000 samples)
|
|
2281
|
-
* - Different native blendshape ordering (remapped to LAM_BLENDSHAPES)
|
|
2282
|
-
*
|
|
2283
|
-
* @category Inference
|
|
2284
|
-
*
|
|
2285
|
-
* @example
|
|
2286
|
-
* ```typescript
|
|
2287
|
-
* import { Wav2ArkitCpuInference } from '@omote/core';
|
|
2288
|
-
*
|
|
2289
|
-
* const lam = new Wav2ArkitCpuInference({
|
|
2290
|
-
* modelUrl: '/models/wav2arkit_cpu.onnx',
|
|
2291
|
-
* });
|
|
2292
|
-
* await lam.load();
|
|
2293
|
-
*
|
|
2294
|
-
* const { blendshapes } = await lam.infer(audioSamples);
|
|
2295
|
-
* // blendshapes: Float32Array[] in LAM_BLENDSHAPES order, 30fps
|
|
2296
|
-
* ```
|
|
2297
|
-
*/
|
|
2298
|
-
|
|
2299
|
-
interface Wav2ArkitCpuConfig {
|
|
2300
|
-
/** Path or URL to the wav2arkit_cpu ONNX model (.onnx graph file) */
|
|
2301
|
-
modelUrl: string;
|
|
2302
|
-
/**
|
|
2303
|
-
* Path or URL to external model data file (.onnx.data weights).
|
|
2304
|
-
* Default: `${modelUrl}.data` (e.g., /models/wav2arkit_cpu.onnx.data)
|
|
2305
|
-
*
|
|
2306
|
-
* Set to `false` to skip external data loading (single-file models only).
|
|
2307
|
-
*/
|
|
2308
|
-
externalDataUrl?: string | false;
|
|
2309
|
-
/** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
|
|
2310
|
-
backend?: BackendPreference;
|
|
2311
|
-
}
|
|
2312
|
-
declare class Wav2ArkitCpuInference implements A2EBackend {
|
|
2313
|
-
readonly modelId: "wav2arkit_cpu";
|
|
2314
|
-
readonly chunkSize: number;
|
|
2315
|
-
private session;
|
|
2316
|
-
private ort;
|
|
2317
|
-
private config;
|
|
2318
|
-
private _backend;
|
|
2319
|
-
private isLoading;
|
|
2320
|
-
private inferenceQueue;
|
|
2321
|
-
private poisoned;
|
|
2322
|
-
private static readonly INFERENCE_TIMEOUT_MS;
|
|
2323
|
-
constructor(config: Wav2ArkitCpuConfig);
|
|
2324
|
-
get backend(): RuntimeBackend | null;
|
|
2325
|
-
get isLoaded(): boolean;
|
|
2326
2361
|
/**
|
|
2327
2362
|
* Load the ONNX model
|
|
2328
2363
|
*/
|
|
2329
2364
|
load(): Promise<A2EModelInfo>;
|
|
2330
2365
|
/**
|
|
2331
2366
|
* Run inference on raw audio
|
|
2332
|
-
*
|
|
2333
|
-
* Accepts variable-length audio (not fixed to 16000 samples).
|
|
2334
|
-
* Output frames = ceil(30 * numSamples / 16000).
|
|
2335
|
-
*
|
|
2336
2367
|
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
2337
|
-
* @param
|
|
2368
|
+
* @param identityIndex - Optional identity index (0-11, default 0 = neutral)
|
|
2369
|
+
*
|
|
2370
|
+
* Audio will be zero-padded or truncated to chunkSize samples.
|
|
2338
2371
|
*/
|
|
2339
|
-
infer(audioSamples: Float32Array,
|
|
2372
|
+
infer(audioSamples: Float32Array, identityIndex?: number): Promise<A2EResult>;
|
|
2340
2373
|
/**
|
|
2341
2374
|
* Queue inference to serialize ONNX session calls
|
|
2342
2375
|
*/
|
|
2343
2376
|
private queueInference;
|
|
2377
|
+
/**
|
|
2378
|
+
* Get blendshape value by name for a specific frame
|
|
2379
|
+
*/
|
|
2380
|
+
getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
|
|
2344
2381
|
/**
|
|
2345
2382
|
* Dispose of the model and free resources
|
|
2346
2383
|
*/
|
|
@@ -2348,92 +2385,78 @@ declare class Wav2ArkitCpuInference implements A2EBackend {
|
|
|
2348
2385
|
}
|
|
2349
2386
|
|
|
2350
2387
|
/**
|
|
2351
|
-
*
|
|
2352
|
-
*
|
|
2353
|
-
* Provides a unified API that always tries Wav2Vec2 (LAM fp16) first:
|
|
2354
|
-
* - All platforms: Tries Wav2Vec2Inference (192MB fp16, external data format)
|
|
2355
|
-
* - Fallback: Gracefully falls back to wav2arkit_cpu if GPU model fails to load
|
|
2388
|
+
* Default and user-configurable model URLs for all ONNX models
|
|
2356
2389
|
*
|
|
2357
|
-
*
|
|
2358
|
-
*
|
|
2359
|
-
*
|
|
2360
|
-
*
|
|
2361
|
-
* wav2arkit_cpu (1.86MB graph + 402MB weights) as a safe fallback.
|
|
2390
|
+
* Out of the box, models are served from HuggingFace CDN (`/resolve/main/`
|
|
2391
|
+
* endpoint with `Access-Control-Allow-Origin: *`). For production apps that
|
|
2392
|
+
* need faster or more reliable delivery, call {@link configureModelUrls} once
|
|
2393
|
+
* at startup to point any or all models at your own CDN.
|
|
2362
2394
|
*
|
|
2363
2395
|
* @category Inference
|
|
2364
2396
|
*
|
|
2365
|
-
* @example
|
|
2397
|
+
* @example Use HuggingFace defaults (zero-config)
|
|
2366
2398
|
* ```typescript
|
|
2367
2399
|
* import { createA2E } from '@omote/core';
|
|
2368
|
-
*
|
|
2369
|
-
* const a2e = createA2E(); // uses HF CDN defaults (192MB fp16 GPU, 404MB CPU fallback)
|
|
2370
|
-
* await a2e.load();
|
|
2371
|
-
* const { blendshapes } = await a2e.infer(audioSamples);
|
|
2400
|
+
* const a2e = createA2E(); // fetches from HuggingFace CDN
|
|
2372
2401
|
* ```
|
|
2373
2402
|
*
|
|
2374
|
-
* @example
|
|
2403
|
+
* @example Self-host on your own CDN
|
|
2375
2404
|
* ```typescript
|
|
2376
|
-
*
|
|
2405
|
+
* import { configureModelUrls, createA2E } from '@omote/core';
|
|
2406
|
+
*
|
|
2407
|
+
* configureModelUrls({
|
|
2408
|
+
* lam: 'https://cdn.example.com/models/model_fp16.onnx',
|
|
2409
|
+
* senseVoice: 'https://cdn.example.com/models/sensevoice.int8.onnx',
|
|
2410
|
+
* // omitted keys keep HuggingFace defaults
|
|
2411
|
+
* });
|
|
2412
|
+
*
|
|
2413
|
+
* const a2e = createA2E(); // now fetches from your CDN
|
|
2377
2414
|
* ```
|
|
2378
2415
|
*/
|
|
2379
|
-
|
|
2416
|
+
/** Model URL keys that can be configured */
|
|
2417
|
+
type ModelUrlKey = 'lam' | 'senseVoice' | 'sileroVad' | 'kokoroTTS' | 'kokoroVoices';
|
|
2380
2418
|
/**
|
|
2381
|
-
*
|
|
2419
|
+
* Resolved model URLs — user overrides take priority, HuggingFace CDN is fallback.
|
|
2420
|
+
*
|
|
2421
|
+
* All SDK factories (`createA2E`, `createSenseVoice`, `createSileroVAD`) and
|
|
2422
|
+
* orchestrators (`VoicePipeline`) read from this object. Call
|
|
2423
|
+
* {@link configureModelUrls} before constructing any pipelines to point
|
|
2424
|
+
* models at your own CDN.
|
|
2382
2425
|
*/
|
|
2383
|
-
|
|
2384
|
-
/** URL for the GPU model (Wav2Vec2, used on Chrome/Firefox/Edge). Default: HuggingFace CDN */
|
|
2385
|
-
gpuModelUrl?: string;
|
|
2386
|
-
/**
|
|
2387
|
-
* URL for GPU model external data file (.onnx.data weights).
|
|
2388
|
-
* Default: `${gpuModelUrl}.data`
|
|
2389
|
-
*
|
|
2390
|
-
* Set to `false` to skip external data loading (single-file models only).
|
|
2391
|
-
*/
|
|
2392
|
-
gpuExternalDataUrl?: string | false;
|
|
2393
|
-
/** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS). Default: HuggingFace CDN */
|
|
2394
|
-
cpuModelUrl?: string;
|
|
2395
|
-
/**
|
|
2396
|
-
* Model selection mode:
|
|
2397
|
-
* - 'auto': Safari/iOS -> CPU, everything else -> GPU (default)
|
|
2398
|
-
* - 'gpu': Force GPU model (Wav2Vec2Inference)
|
|
2399
|
-
* - 'cpu': Force CPU model (Wav2ArkitCpuInference)
|
|
2400
|
-
*/
|
|
2401
|
-
mode?: 'auto' | 'gpu' | 'cpu';
|
|
2402
|
-
/** Backend preference for GPU model (default: 'auto') */
|
|
2403
|
-
gpuBackend?: BackendPreference;
|
|
2404
|
-
/** Number of identity classes for GPU model (default: 12) */
|
|
2405
|
-
numIdentityClasses?: number;
|
|
2406
|
-
/**
|
|
2407
|
-
* Fall back to CPU model if GPU model fails to load (default: true)
|
|
2408
|
-
* Only applies when mode is 'auto' or 'gpu'
|
|
2409
|
-
*/
|
|
2410
|
-
fallbackOnError?: boolean;
|
|
2411
|
-
/**
|
|
2412
|
-
* Use Web Worker for CPU model inference (default: false)
|
|
2413
|
-
*
|
|
2414
|
-
* When true, Wav2ArkitCpuWorker is used instead of Wav2ArkitCpuInference,
|
|
2415
|
-
* running inference off the main thread to prevent UI blocking during
|
|
2416
|
-
* model loading and inference.
|
|
2417
|
-
*
|
|
2418
|
-
* Only applies when the CPU model is selected (mode: 'cpu', auto on Safari/iOS,
|
|
2419
|
-
* or fallback from GPU).
|
|
2420
|
-
*/
|
|
2421
|
-
useWorker?: boolean;
|
|
2422
|
-
/**
|
|
2423
|
-
* Unified inference worker instance.
|
|
2424
|
-
* When provided and CPU model is selected, uses Wav2ArkitCpuUnifiedAdapter.
|
|
2425
|
-
* Takes precedence over useWorker setting for the CPU model path.
|
|
2426
|
-
* GPU model (Wav2Vec2) always stays on main thread (WebGPU).
|
|
2427
|
-
*/
|
|
2428
|
-
unifiedWorker?: UnifiedInferenceWorker;
|
|
2429
|
-
}
|
|
2426
|
+
declare const DEFAULT_MODEL_URLS: Readonly<Record<ModelUrlKey, string>>;
|
|
2430
2427
|
/**
|
|
2431
|
-
*
|
|
2428
|
+
* Configure custom model URLs. Overrides persist for the lifetime of the page.
|
|
2429
|
+
* Omitted keys keep their HuggingFace CDN defaults.
|
|
2432
2430
|
*
|
|
2433
|
-
*
|
|
2434
|
-
*
|
|
2431
|
+
* Call this **once** at app startup, before constructing any pipelines.
|
|
2432
|
+
*
|
|
2433
|
+
* @example Self-host all models
|
|
2434
|
+
* ```typescript
|
|
2435
|
+
* configureModelUrls({
|
|
2436
|
+
* lam: 'https://cdn.example.com/models/lam.onnx',
|
|
2437
|
+
* senseVoice: 'https://cdn.example.com/models/sensevoice.int8.onnx',
|
|
2438
|
+
* sileroVad: 'https://cdn.example.com/models/silero-vad.onnx',
|
|
2439
|
+
* });
|
|
2440
|
+
* ```
|
|
2441
|
+
*
|
|
2442
|
+
* @example Override only one model
|
|
2443
|
+
* ```typescript
|
|
2444
|
+
* configureModelUrls({
|
|
2445
|
+
* lam: '/models/model_fp16.onnx', // self-hosted, same origin
|
|
2446
|
+
* });
|
|
2447
|
+
* ```
|
|
2435
2448
|
*/
|
|
2436
|
-
declare function
|
|
2449
|
+
declare function configureModelUrls(urls: Partial<Record<ModelUrlKey, string>>): void;
|
|
2450
|
+
/**
|
|
2451
|
+
* Reset all model URL overrides back to HuggingFace CDN defaults.
|
|
2452
|
+
* Mainly useful for testing.
|
|
2453
|
+
*/
|
|
2454
|
+
declare function resetModelUrls(): void;
|
|
2455
|
+
/**
|
|
2456
|
+
* Get the immutable HuggingFace CDN URLs (ignoring any overrides).
|
|
2457
|
+
* Useful for documentation or fallback logic.
|
|
2458
|
+
*/
|
|
2459
|
+
declare const HF_CDN_URLS: Readonly<Record<ModelUrlKey, string>>;
|
|
2437
2460
|
|
|
2438
2461
|
/**
|
|
2439
2462
|
* A2EProcessor — Engine-agnostic audio-to-expression processor
|
|
@@ -2484,9 +2507,6 @@ interface A2EProcessorConfig {
|
|
|
2484
2507
|
* The LAM model uses a one-hot identity vector (12 classes, indices 0-11) as
|
|
2485
2508
|
* style conditioning alongside audio features. Different indices produce
|
|
2486
2509
|
* different expression intensity across face regions (brows, eyes, cheeks).
|
|
2487
|
-
*
|
|
2488
|
-
* Only affects Wav2Vec2Inference (GPU model). Wav2ArkitCpuInference has
|
|
2489
|
-
* identity 11 baked into the model weights.
|
|
2490
2510
|
*/
|
|
2491
2511
|
identityIndex?: number;
|
|
2492
2512
|
/** Callback fired with each blendshape frame (push mode) */
|
|
@@ -2495,6 +2515,7 @@ interface A2EProcessorConfig {
|
|
|
2495
2515
|
onError?: (error: Error) => void;
|
|
2496
2516
|
}
|
|
2497
2517
|
declare class A2EProcessor {
|
|
2518
|
+
private static readonly MAX_PENDING_CHUNKS;
|
|
2498
2519
|
private readonly backend;
|
|
2499
2520
|
private readonly sampleRate;
|
|
2500
2521
|
private readonly chunkSize;
|
|
@@ -2510,6 +2531,8 @@ declare class A2EProcessor {
|
|
|
2510
2531
|
private _latestFrame;
|
|
2511
2532
|
private dripInterval;
|
|
2512
2533
|
private lastPulledFrame;
|
|
2534
|
+
private lastDequeuedTime;
|
|
2535
|
+
private decayBuffer;
|
|
2513
2536
|
private inferenceRunning;
|
|
2514
2537
|
private pendingChunks;
|
|
2515
2538
|
private getFrameCallCount;
|
|
@@ -2655,110 +2678,273 @@ declare class BlendshapeSmoother {
|
|
|
2655
2678
|
}
|
|
2656
2679
|
|
|
2657
2680
|
/**
|
|
2658
|
-
*
|
|
2659
|
-
*
|
|
2660
|
-
* Manages the mic capture + A2E inference loop independently of any
|
|
2661
|
-
* 3D renderer. Adapter packages (@omote/three, @omote/babylon) wrap this
|
|
2662
|
-
* thinly and pipe `latestWeights` into their renderer-specific blendshape
|
|
2663
|
-
* controllers.
|
|
2664
|
-
*
|
|
2665
|
-
* Internally delegates all buffer accumulation, inference, and frame
|
|
2666
|
-
* drip-feeding to {@link A2EProcessor}. This class only handles mic capture
|
|
2667
|
-
* (getUserMedia, ScriptProcessorNode, resampling).
|
|
2668
|
-
*
|
|
2669
|
-
* @deprecated Use {@link MicLipSync} from `@omote/core` instead. MicLipSync provides
|
|
2670
|
-
* the same mic → A2E composition with proper MicrophoneCapture integration, VAD support,
|
|
2671
|
-
* ExpressionProfile scaling, and pause/resume. This class will be removed in a future version.
|
|
2681
|
+
* SenseVoice adapter backed by UnifiedInferenceWorker
|
|
2672
2682
|
*
|
|
2673
|
-
*
|
|
2683
|
+
* Implements SenseVoiceBackend, delegating all inference to the shared worker.
|
|
2674
2684
|
*/
|
|
2675
2685
|
|
|
2676
|
-
|
|
2677
|
-
|
|
2678
|
-
|
|
2679
|
-
|
|
2680
|
-
|
|
2681
|
-
|
|
2686
|
+
declare class SenseVoiceUnifiedAdapter implements SenseVoiceBackend {
|
|
2687
|
+
private worker;
|
|
2688
|
+
private config;
|
|
2689
|
+
private _isLoaded;
|
|
2690
|
+
private loadedGeneration;
|
|
2691
|
+
private languageId;
|
|
2692
|
+
private textNormId;
|
|
2693
|
+
/** Per-adapter inference queue — ensures sequential state updates. */
|
|
2694
|
+
private inferenceQueue;
|
|
2695
|
+
constructor(worker: UnifiedInferenceWorker, config: SenseVoiceWorkerConfig);
|
|
2696
|
+
get isLoaded(): boolean;
|
|
2697
|
+
get backend(): 'wasm' | null;
|
|
2698
|
+
load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
|
|
2699
|
+
transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
|
|
2700
|
+
dispose(): Promise<void>;
|
|
2701
|
+
private assertLoaded;
|
|
2682
2702
|
}
|
|
2703
|
+
|
|
2683
2704
|
/**
|
|
2684
|
-
*
|
|
2705
|
+
* A2E adapter backed by UnifiedInferenceWorker
|
|
2706
|
+
*
|
|
2707
|
+
* Implements A2EBackend, delegating all inference to the shared worker.
|
|
2708
|
+
* Used on iOS to run A2E inference off the main thread via the unified worker.
|
|
2685
2709
|
*/
|
|
2686
|
-
|
|
2687
|
-
|
|
2688
|
-
|
|
2689
|
-
|
|
2690
|
-
|
|
2691
|
-
|
|
2692
|
-
|
|
2693
|
-
|
|
2694
|
-
|
|
2695
|
-
|
|
2696
|
-
|
|
2697
|
-
/**
|
|
2698
|
-
|
|
2699
|
-
|
|
2700
|
-
|
|
2701
|
-
|
|
2702
|
-
|
|
2703
|
-
|
|
2704
|
-
|
|
2705
|
-
|
|
2706
|
-
|
|
2710
|
+
|
|
2711
|
+
declare class A2EUnifiedAdapter implements A2EBackend {
|
|
2712
|
+
readonly modelId: "a2e";
|
|
2713
|
+
readonly chunkSize: number;
|
|
2714
|
+
private worker;
|
|
2715
|
+
private modelUrl;
|
|
2716
|
+
private externalDataUrl;
|
|
2717
|
+
private numIdentityClasses;
|
|
2718
|
+
private _isLoaded;
|
|
2719
|
+
private _backend;
|
|
2720
|
+
private loadedGeneration;
|
|
2721
|
+
/** Per-adapter inference queue — ensures sequential state updates. */
|
|
2722
|
+
private inferenceQueue;
|
|
2723
|
+
constructor(worker: UnifiedInferenceWorker, config: {
|
|
2724
|
+
modelUrl: string;
|
|
2725
|
+
externalDataUrl?: string | false;
|
|
2726
|
+
numIdentityClasses?: number;
|
|
2727
|
+
chunkSize?: number;
|
|
2728
|
+
});
|
|
2729
|
+
get isLoaded(): boolean;
|
|
2730
|
+
get backend(): RuntimeBackend | null;
|
|
2731
|
+
load(): Promise<A2EModelInfo>;
|
|
2732
|
+
infer(audioSamples: Float32Array, identityIndex?: number): Promise<A2EResult>;
|
|
2733
|
+
dispose(): Promise<void>;
|
|
2734
|
+
private assertLoaded;
|
|
2707
2735
|
}
|
|
2736
|
+
|
|
2708
2737
|
/**
|
|
2709
|
-
*
|
|
2738
|
+
* Kokoro TTS inference using ONNX Runtime Web
|
|
2739
|
+
*
|
|
2740
|
+
* Pure ONNX pipeline for browser-based text-to-speech. No transformers.js dependency.
|
|
2741
|
+
* Uses eSpeak-NG WASM for phonemization and Kokoro-82M (q8, 92MB) for synthesis.
|
|
2742
|
+
*
|
|
2743
|
+
* Pipeline: Text → Normalize → Phonemize (eSpeak WASM) → Tokenize → Voice Style → ONNX → Audio
|
|
2710
2744
|
*
|
|
2711
|
-
*
|
|
2712
|
-
* Adapters read `latestWeights` each frame to apply to their meshes.
|
|
2745
|
+
* @category Inference
|
|
2713
2746
|
*
|
|
2714
|
-
* @example
|
|
2747
|
+
* @example Basic usage
|
|
2715
2748
|
* ```typescript
|
|
2716
|
-
*
|
|
2717
|
-
*
|
|
2718
|
-
*
|
|
2719
|
-
*
|
|
2720
|
-
*
|
|
2721
|
-
* await
|
|
2722
|
-
*
|
|
2749
|
+
* import { KokoroTTSInference } from '@omote/core';
|
|
2750
|
+
*
|
|
2751
|
+
* const tts = new KokoroTTSInference({ defaultVoice: 'af_heart' });
|
|
2752
|
+
* await tts.load();
|
|
2753
|
+
*
|
|
2754
|
+
* const { audio, duration } = await tts.synthesize("Hello world");
|
|
2755
|
+
* // audio: Float32Array @ 24kHz
|
|
2756
|
+
* ```
|
|
2757
|
+
*
|
|
2758
|
+
* @example Streaming (sentence-by-sentence)
|
|
2759
|
+
* ```typescript
|
|
2760
|
+
* for await (const chunk of tts.stream("First sentence. Second sentence.")) {
|
|
2761
|
+
* playbackPipeline.feedBuffer(chunk.audio);
|
|
2762
|
+
* }
|
|
2723
2763
|
* ```
|
|
2764
|
+
*
|
|
2765
|
+
* @module inference/KokoroTTSInference
|
|
2724
2766
|
*/
|
|
2725
|
-
|
|
2726
|
-
|
|
2727
|
-
|
|
2728
|
-
|
|
2729
|
-
|
|
2730
|
-
|
|
2731
|
-
|
|
2732
|
-
|
|
2733
|
-
|
|
2734
|
-
|
|
2767
|
+
|
|
2768
|
+
interface KokoroTTSConfig {
|
|
2769
|
+
/** ONNX model URL (default: HF CDN q8, 92MB) */
|
|
2770
|
+
modelUrl?: string;
|
|
2771
|
+
/** Voice files base URL (default: HF CDN voices directory) */
|
|
2772
|
+
voiceBaseUrl?: string;
|
|
2773
|
+
/** Default voice (default: 'af_heart') */
|
|
2774
|
+
defaultVoice?: string;
|
|
2775
|
+
/** Backend preference (default: 'wasm' — WebGPU crashes on int64 input_ids) */
|
|
2776
|
+
backend?: BackendPreference;
|
|
2777
|
+
/** Speech speed multiplier (default: 1.0) */
|
|
2778
|
+
speed?: number;
|
|
2779
|
+
}
|
|
2780
|
+
interface KokoroTTSResult {
|
|
2781
|
+
/** Audio samples at 24kHz */
|
|
2782
|
+
audio: Float32Array;
|
|
2783
|
+
/** Duration in seconds */
|
|
2784
|
+
duration: number;
|
|
2785
|
+
/** Inference time in ms */
|
|
2786
|
+
inferenceTimeMs: number;
|
|
2787
|
+
}
|
|
2788
|
+
interface KokoroStreamChunk {
|
|
2789
|
+
/** Audio for this sentence */
|
|
2790
|
+
audio: Float32Array;
|
|
2791
|
+
/** Original text segment */
|
|
2792
|
+
text: string;
|
|
2793
|
+
/** Phonemes for this segment */
|
|
2794
|
+
phonemes: string;
|
|
2795
|
+
/** Duration in seconds */
|
|
2796
|
+
duration: number;
|
|
2797
|
+
}
|
|
2798
|
+
interface KokoroTTSModelInfo {
|
|
2799
|
+
/** Resolved backend */
|
|
2800
|
+
backend: string;
|
|
2801
|
+
/** Model load time in ms */
|
|
2802
|
+
loadTimeMs: number;
|
|
2803
|
+
/** Default voice */
|
|
2804
|
+
defaultVoice: string;
|
|
2805
|
+
}
|
|
2806
|
+
interface SynthesizeOptions {
|
|
2807
|
+
/** Voice to use (overrides defaultVoice) */
|
|
2808
|
+
voice?: string;
|
|
2809
|
+
/** Speed multiplier (overrides config speed) */
|
|
2810
|
+
speed?: number;
|
|
2811
|
+
}
|
|
2812
|
+
/**
|
|
2813
|
+
* Validate TTS input parameters at API boundaries.
|
|
2814
|
+
* Returns trimmed text on success, throws on invalid input.
|
|
2815
|
+
*/
|
|
2816
|
+
declare function validateTTSInput(text: unknown, voiceName: string, speed: number, availableVoices?: string[]): string;
|
|
2817
|
+
declare class KokoroTTSInference implements TTSBackend {
|
|
2818
|
+
private readonly config;
|
|
2819
|
+
private readonly modelUrl;
|
|
2820
|
+
private readonly voiceBaseUrl;
|
|
2821
|
+
private ort;
|
|
2822
|
+
private session;
|
|
2735
2823
|
private _backend;
|
|
2736
|
-
private
|
|
2737
|
-
|
|
2738
|
-
|
|
2739
|
-
|
|
2740
|
-
|
|
2741
|
-
|
|
2742
|
-
|
|
2743
|
-
|
|
2744
|
-
|
|
2745
|
-
get
|
|
2824
|
+
private isLoading;
|
|
2825
|
+
private poisoned;
|
|
2826
|
+
private inferenceQueue;
|
|
2827
|
+
private phonemizerReady;
|
|
2828
|
+
private defaultVoiceLoaded;
|
|
2829
|
+
/** Cached voice data (voice name → Float32Array) */
|
|
2830
|
+
private loadedVoices;
|
|
2831
|
+
constructor(config?: KokoroTTSConfig);
|
|
2832
|
+
get isLoaded(): boolean;
|
|
2833
|
+
get sampleRate(): number;
|
|
2746
2834
|
/**
|
|
2747
|
-
* Load the
|
|
2835
|
+
* Load the ONNX model, phonemizer WASM, and default voice.
|
|
2836
|
+
* Safe to call multiple times (no-ops after first successful load).
|
|
2748
2837
|
*/
|
|
2749
|
-
load(): Promise<
|
|
2838
|
+
load(): Promise<KokoroTTSModelInfo>;
|
|
2750
2839
|
/**
|
|
2751
|
-
*
|
|
2840
|
+
* Lazily initialize phonemizer and default voice on first use.
|
|
2841
|
+
* Moves 100-200ms of main-thread blocking out of load() into first synthesis.
|
|
2752
2842
|
*/
|
|
2753
|
-
|
|
2843
|
+
private ensureReady;
|
|
2754
2844
|
/**
|
|
2755
|
-
*
|
|
2845
|
+
* Synthesize speech from text (one-shot, full audio output).
|
|
2846
|
+
*
|
|
2847
|
+
* @param text - Input text to synthesize
|
|
2848
|
+
* @param options - Voice and speed overrides
|
|
2849
|
+
* @returns Audio Float32Array at 24kHz with duration
|
|
2756
2850
|
*/
|
|
2757
|
-
|
|
2851
|
+
synthesize(text: string, options?: SynthesizeOptions): Promise<KokoroTTSResult>;
|
|
2852
|
+
/**
|
|
2853
|
+
* Stream synthesis sentence-by-sentence (async generator).
|
|
2854
|
+
* Splits text on sentence boundaries and yields audio for each.
|
|
2855
|
+
*
|
|
2856
|
+
* Compatible with both `SynthesizeOptions` (legacy) and `TTSStreamOptions` (TTSBackend).
|
|
2857
|
+
*
|
|
2858
|
+
* @param text - Input text (can be multiple sentences)
|
|
2859
|
+
* @param options - Voice, speed, and abort signal overrides
|
|
2860
|
+
*/
|
|
2861
|
+
stream(text: string, options?: SynthesizeOptions & TTSStreamOptions): AsyncGenerator<KokoroStreamChunk & TTSChunk>;
|
|
2862
|
+
/**
|
|
2863
|
+
* Preload a voice (fetches and caches the .bin file).
|
|
2864
|
+
*/
|
|
2865
|
+
preloadVoice(voiceName: string): Promise<void>;
|
|
2866
|
+
/**
|
|
2867
|
+
* List available voice names.
|
|
2868
|
+
*/
|
|
2869
|
+
listVoices(): string[];
|
|
2870
|
+
/**
|
|
2871
|
+
* Release the ONNX session and clear cached voices.
|
|
2872
|
+
*/
|
|
2873
|
+
dispose(): Promise<void>;
|
|
2874
|
+
private ensureVoice;
|
|
2875
|
+
private queueInference;
|
|
2876
|
+
private runInference;
|
|
2877
|
+
}
|
|
2878
|
+
|
|
2879
|
+
/**
|
|
2880
|
+
* Kokoro TTS adapter backed by UnifiedInferenceWorker
|
|
2881
|
+
*
|
|
2882
|
+
* Implements TTSBackend, delegating ONNX inference to the shared worker.
|
|
2883
|
+
* Phonemization, tokenization, and voice loading stay on main thread (fast, <10ms).
|
|
2884
|
+
* Only the heavy `session.run()` (~1-2s per sentence) goes to the worker.
|
|
2885
|
+
*/
|
|
2886
|
+
|
|
2887
|
+
declare class KokoroTTSUnifiedAdapter implements TTSBackend {
|
|
2888
|
+
private worker;
|
|
2889
|
+
private readonly config;
|
|
2890
|
+
private readonly modelUrl;
|
|
2891
|
+
private readonly voiceBaseUrl;
|
|
2892
|
+
private _isLoaded;
|
|
2893
|
+
private loadedGeneration;
|
|
2894
|
+
/** Per-adapter inference queue — ensures sequential state updates. */
|
|
2895
|
+
private inferenceQueue;
|
|
2896
|
+
private loadedVoices;
|
|
2897
|
+
private phonemizerReady;
|
|
2898
|
+
private defaultVoiceLoaded;
|
|
2899
|
+
constructor(worker: UnifiedInferenceWorker, config?: KokoroTTSConfig);
|
|
2900
|
+
get isLoaded(): boolean;
|
|
2901
|
+
get sampleRate(): number;
|
|
2902
|
+
load(): Promise<KokoroTTSModelInfo>;
|
|
2903
|
+
stream(text: string, options?: SynthesizeOptions & TTSStreamOptions): AsyncGenerator<KokoroStreamChunk & TTSChunk>;
|
|
2904
|
+
dispose(): Promise<void>;
|
|
2905
|
+
private ensureVoice;
|
|
2906
|
+
private assertLoaded;
|
|
2907
|
+
private runWorkerInference;
|
|
2908
|
+
}
|
|
2909
|
+
|
|
2910
|
+
/**
|
|
2911
|
+
* Silero VAD adapter backed by UnifiedInferenceWorker
|
|
2912
|
+
*
|
|
2913
|
+
* Implements SileroVADBackend, delegating all inference to the shared worker.
|
|
2914
|
+
*/
|
|
2915
|
+
|
|
2916
|
+
declare class SileroVADUnifiedAdapter implements SileroVADBackend {
|
|
2917
|
+
private worker;
|
|
2918
|
+
private config;
|
|
2919
|
+
private _isLoaded;
|
|
2920
|
+
private loadedGeneration;
|
|
2921
|
+
private state;
|
|
2922
|
+
private context;
|
|
2923
|
+
private readonly chunkSize;
|
|
2924
|
+
private readonly contextSize;
|
|
2758
2925
|
/**
|
|
2759
|
-
*
|
|
2926
|
+
* Per-adapter inference queue — ensures sequential state updates.
|
|
2927
|
+
*
|
|
2928
|
+
* The unified worker processes messages serially (single thread), but this queue
|
|
2929
|
+
* guarantees per-adapter state consistency. Example: VAD LSTM state from call N
|
|
2930
|
+
* must be applied before call N+1 starts. Without the queue, two rapid process()
|
|
2931
|
+
* calls could both read the same stale state.
|
|
2760
2932
|
*/
|
|
2933
|
+
private inferenceQueue;
|
|
2934
|
+
private preSpeechBuffer;
|
|
2935
|
+
private wasSpeaking;
|
|
2936
|
+
constructor(worker: UnifiedInferenceWorker, config: SileroVADConfig);
|
|
2937
|
+
get isLoaded(): boolean;
|
|
2938
|
+
get backend(): RuntimeBackend | null;
|
|
2939
|
+
get sampleRate(): number;
|
|
2940
|
+
get threshold(): number;
|
|
2941
|
+
getChunkSize(): number;
|
|
2942
|
+
getChunkDurationMs(): number;
|
|
2943
|
+
load(): Promise<VADWorkerModelInfo>;
|
|
2944
|
+
process(audioChunk: Float32Array): Promise<VADResult>;
|
|
2945
|
+
reset(): Promise<void>;
|
|
2761
2946
|
dispose(): Promise<void>;
|
|
2947
|
+
private assertLoaded;
|
|
2762
2948
|
}
|
|
2763
2949
|
|
|
2764
2950
|
/**
|
|
@@ -2961,14 +3147,208 @@ declare class SafariSpeechRecognition {
|
|
|
2961
3147
|
*/
|
|
2962
3148
|
private setupEventHandlers;
|
|
2963
3149
|
/**
|
|
2964
|
-
* Emit result to all registered callbacks
|
|
3150
|
+
* Emit result to all registered callbacks
|
|
3151
|
+
*/
|
|
3152
|
+
private emitResult;
|
|
3153
|
+
/**
|
|
3154
|
+
* Emit error to all registered callbacks
|
|
3155
|
+
*/
|
|
3156
|
+
private emitError;
|
|
3157
|
+
}
|
|
3158
|
+
|
|
3159
|
+
/**
|
|
3160
|
+
* Kokoro TTS Web Worker implementation
|
|
3161
|
+
*
|
|
3162
|
+
* Moves the heavy ONNX `session.run()` to a dedicated Web Worker to prevent
|
|
3163
|
+
* main thread blocking (~1-2s per sentence on WASM). Phonemizer, tokenizer,
|
|
3164
|
+
* and voice logic stay on the main thread (fast, <10ms combined).
|
|
3165
|
+
*
|
|
3166
|
+
* Architecture:
|
|
3167
|
+
* ```
|
|
3168
|
+
* Main Thread (KokoroTTSWorker): Worker (WORKER_SCRIPT):
|
|
3169
|
+
* stream(text) →
|
|
3170
|
+
* splitSentences(text)
|
|
3171
|
+
* for each sentence:
|
|
3172
|
+
* phonemize(sentence) → phonemes
|
|
3173
|
+
* tokenize(phonemes) → tokens
|
|
3174
|
+
* ensureVoice() → style
|
|
3175
|
+
* postMessage(tokens, style, speed) ──→ session.run(feeds)
|
|
3176
|
+
* await result ←── postMessage(audio)
|
|
3177
|
+
* yield {audio, text, phonemes, duration}
|
|
3178
|
+
* ```
|
|
3179
|
+
*
|
|
3180
|
+
* @category Inference
|
|
3181
|
+
*
|
|
3182
|
+
* @example Basic usage
|
|
3183
|
+
* ```typescript
|
|
3184
|
+
* import { KokoroTTSWorker } from '@omote/core';
|
|
3185
|
+
*
|
|
3186
|
+
* const tts = new KokoroTTSWorker({ defaultVoice: 'af_heart' });
|
|
3187
|
+
* await tts.load();
|
|
3188
|
+
*
|
|
3189
|
+
* for await (const chunk of tts.stream("Hello world!")) {
|
|
3190
|
+
* playbackPipeline.feedBuffer(chunk.audio);
|
|
3191
|
+
* }
|
|
3192
|
+
* ```
|
|
3193
|
+
*
|
|
3194
|
+
* @module inference/KokoroTTSWorker
|
|
3195
|
+
*/
|
|
3196
|
+
|
|
3197
|
+
/**
|
|
3198
|
+
* Kokoro TTS Worker — off-main-thread ONNX inference for non-blocking TTS.
|
|
3199
|
+
*
|
|
3200
|
+
* Phonemizer/tokenizer/voice logic run on the main thread (fast, <10ms).
|
|
3201
|
+
* Only the heavy ONNX `session.run()` is delegated to the worker.
|
|
3202
|
+
*
|
|
3203
|
+
* Implements the same TTSBackend interface as KokoroTTSInference.
|
|
3204
|
+
*
|
|
3205
|
+
* @see KokoroTTSInference for main-thread version
|
|
3206
|
+
*/
|
|
3207
|
+
declare class KokoroTTSWorker implements TTSBackend {
|
|
3208
|
+
private readonly config;
|
|
3209
|
+
private readonly modelUrl;
|
|
3210
|
+
private readonly voiceBaseUrl;
|
|
3211
|
+
private worker;
|
|
3212
|
+
private _isLoaded;
|
|
3213
|
+
private isLoading;
|
|
3214
|
+
private poisoned;
|
|
3215
|
+
/** Serializes all worker calls (stream sentence chunks + synthesize) */
|
|
3216
|
+
private inferenceQueue;
|
|
3217
|
+
/** Cached voice data (voice name → Float32Array) */
|
|
3218
|
+
private loadedVoices;
|
|
3219
|
+
/** Pending message handlers */
|
|
3220
|
+
private pendingResolvers;
|
|
3221
|
+
constructor(config?: KokoroTTSConfig);
|
|
3222
|
+
get isLoaded(): boolean;
|
|
3223
|
+
get sampleRate(): number;
|
|
3224
|
+
load(): Promise<KokoroTTSModelInfo>;
|
|
3225
|
+
synthesize(text: string, options?: SynthesizeOptions): Promise<KokoroTTSResult>;
|
|
3226
|
+
stream(text: string, options?: SynthesizeOptions & TTSStreamOptions): AsyncGenerator<KokoroStreamChunk & TTSChunk>;
|
|
3227
|
+
preloadVoice(voiceName: string): Promise<void>;
|
|
3228
|
+
listVoices(): string[];
|
|
3229
|
+
dispose(): Promise<void>;
|
|
3230
|
+
static isSupported(): boolean;
|
|
3231
|
+
private ensureVoice;
|
|
3232
|
+
private createWorker;
|
|
3233
|
+
private handleWorkerMessage;
|
|
3234
|
+
private sendMessage;
|
|
3235
|
+
/**
|
|
3236
|
+
* Queue worker inference through the serialization queue.
|
|
3237
|
+
* Sends pre-computed tokens + style to worker, returns audio.
|
|
2965
3238
|
*/
|
|
2966
|
-
private
|
|
3239
|
+
private runWorkerInference;
|
|
2967
3240
|
/**
|
|
2968
|
-
*
|
|
3241
|
+
* One-shot synthesis (phonemize + tokenize + worker inference).
|
|
2969
3242
|
*/
|
|
2970
|
-
private
|
|
3243
|
+
private queueInference;
|
|
3244
|
+
}
|
|
3245
|
+
|
|
3246
|
+
/**
|
|
3247
|
+
* Factory function for Kokoro TTS with automatic Worker vs main thread selection
|
|
3248
|
+
*
|
|
3249
|
+
* Provides a unified API that automatically selects the optimal implementation:
|
|
3250
|
+
* - Desktop: Uses KokoroTTSWorker (off-main-thread inference, no render hitching)
|
|
3251
|
+
* - iOS: Uses KokoroTTSInference (main thread, shared ORT instance to avoid OOM)
|
|
3252
|
+
*
|
|
3253
|
+
* @category Inference
|
|
3254
|
+
*
|
|
3255
|
+
* @example Auto-detect (recommended)
|
|
3256
|
+
* ```typescript
|
|
3257
|
+
* import { createKokoroTTS } from '@omote/core';
|
|
3258
|
+
*
|
|
3259
|
+
* const tts = createKokoroTTS({ defaultVoice: 'af_heart' });
|
|
3260
|
+
* await tts.load();
|
|
3261
|
+
*
|
|
3262
|
+
* for await (const chunk of tts.stream("Hello world!")) {
|
|
3263
|
+
* playbackPipeline.feedBuffer(chunk.audio);
|
|
3264
|
+
* }
|
|
3265
|
+
* ```
|
|
3266
|
+
*
|
|
3267
|
+
* @example Force worker
|
|
3268
|
+
* ```typescript
|
|
3269
|
+
* const tts = createKokoroTTS({ defaultVoice: 'af_heart', useWorker: true });
|
|
3270
|
+
* ```
|
|
3271
|
+
*
|
|
3272
|
+
* @example Force main thread
|
|
3273
|
+
* ```typescript
|
|
3274
|
+
* const tts = createKokoroTTS({ defaultVoice: 'af_heart', useWorker: false });
|
|
3275
|
+
* ```
|
|
3276
|
+
*/
|
|
3277
|
+
|
|
3278
|
+
/**
|
|
3279
|
+
* Configuration for the Kokoro TTS factory
|
|
3280
|
+
*/
|
|
3281
|
+
interface CreateKokoroTTSConfig extends KokoroTTSConfig, InferenceFactoryConfig {
|
|
2971
3282
|
}
|
|
3283
|
+
/**
|
|
3284
|
+
* Create a Kokoro TTS instance with automatic implementation selection.
|
|
3285
|
+
*
|
|
3286
|
+
* @param config - Factory configuration
|
|
3287
|
+
* @returns A TTSBackend instance (either Worker or main thread)
|
|
3288
|
+
*/
|
|
3289
|
+
declare function createKokoroTTS(config?: CreateKokoroTTSConfig): TTSBackend;
|
|
3290
|
+
|
|
3291
|
+
/** Available Kokoro v1.0 voices */
|
|
3292
|
+
declare const KOKORO_VOICES: {
|
|
3293
|
+
readonly af_heart: "af_heart";
|
|
3294
|
+
readonly af_alloy: "af_alloy";
|
|
3295
|
+
readonly af_aoede: "af_aoede";
|
|
3296
|
+
readonly af_bella: "af_bella";
|
|
3297
|
+
readonly af_jessica: "af_jessica";
|
|
3298
|
+
readonly af_kore: "af_kore";
|
|
3299
|
+
readonly af_nicole: "af_nicole";
|
|
3300
|
+
readonly af_nova: "af_nova";
|
|
3301
|
+
readonly af_river: "af_river";
|
|
3302
|
+
readonly af_sarah: "af_sarah";
|
|
3303
|
+
readonly af_sky: "af_sky";
|
|
3304
|
+
readonly am_adam: "am_adam";
|
|
3305
|
+
readonly am_echo: "am_echo";
|
|
3306
|
+
readonly am_eric: "am_eric";
|
|
3307
|
+
readonly am_fenrir: "am_fenrir";
|
|
3308
|
+
readonly am_liam: "am_liam";
|
|
3309
|
+
readonly am_michael: "am_michael";
|
|
3310
|
+
readonly am_onyx: "am_onyx";
|
|
3311
|
+
readonly am_puck: "am_puck";
|
|
3312
|
+
readonly am_santa: "am_santa";
|
|
3313
|
+
readonly bf_alice: "bf_alice";
|
|
3314
|
+
readonly bf_emma: "bf_emma";
|
|
3315
|
+
readonly bf_isabella: "bf_isabella";
|
|
3316
|
+
readonly bf_lily: "bf_lily";
|
|
3317
|
+
readonly bm_daniel: "bm_daniel";
|
|
3318
|
+
readonly bm_fable: "bm_fable";
|
|
3319
|
+
readonly bm_george: "bm_george";
|
|
3320
|
+
readonly bm_lewis: "bm_lewis";
|
|
3321
|
+
};
|
|
3322
|
+
type KokoroVoiceName = keyof typeof KOKORO_VOICES;
|
|
3323
|
+
/**
|
|
3324
|
+
* List all available voice names.
|
|
3325
|
+
*/
|
|
3326
|
+
declare function listVoices(): string[];
|
|
3327
|
+
|
|
3328
|
+
/**
|
|
3329
|
+
* ORT CDN configuration
|
|
3330
|
+
*
|
|
3331
|
+
* Allows consumers to override the CDN base URL used for loading
|
|
3332
|
+
* ONNX Runtime WASM/WebGPU binaries. By default, ORT loads from
|
|
3333
|
+
* its bundled CDN path. Use {@link configureOrtCdn} to point at
|
|
3334
|
+
* a self-hosted or enterprise CDN.
|
|
3335
|
+
*
|
|
3336
|
+
* @category Inference
|
|
3337
|
+
*/
|
|
3338
|
+
/**
|
|
3339
|
+
* Override the CDN base URL for ONNX Runtime WASM/WebGPU binaries.
|
|
3340
|
+
*
|
|
3341
|
+
* Must be an HTTPS URL or a relative path (starts with `/` or `./`).
|
|
3342
|
+
* Call this once at app startup, before loading any models.
|
|
3343
|
+
*
|
|
3344
|
+
* @param cdnPath - HTTPS URL or relative path to ORT binaries directory
|
|
3345
|
+
* @throws If cdnPath is not HTTPS or a relative path
|
|
3346
|
+
*/
|
|
3347
|
+
declare function configureOrtCdn(cdnPath: string): void;
|
|
3348
|
+
/**
|
|
3349
|
+
* Get the current ORT CDN base URL override, or null if using defaults.
|
|
3350
|
+
*/
|
|
3351
|
+
declare function getOrtCdnBase(): string | null;
|
|
2972
3352
|
|
|
2973
3353
|
/**
|
|
2974
3354
|
* Emotion - Helper for creating emotion vectors for avatar animation
|
|
@@ -3009,6 +3389,8 @@ type EmotionName = typeof EMOTION_NAMES[number];
|
|
|
3009
3389
|
type EmotionWeights = Partial<Record<EmotionName, number>>;
|
|
3010
3390
|
/** Total emotion vector size */
|
|
3011
3391
|
declare const EMOTION_VECTOR_SIZE = 26;
|
|
3392
|
+
/** Number of explicit emotion channels */
|
|
3393
|
+
declare const EXPLICIT_EMOTION_COUNT = 10;
|
|
3012
3394
|
/**
|
|
3013
3395
|
* Create an emotion vector from named weights
|
|
3014
3396
|
*
|
|
@@ -3507,7 +3889,48 @@ declare const MetricNames: {
|
|
|
3507
3889
|
readonly CACHE_HITS: "omote.cache.hits";
|
|
3508
3890
|
/** Counter: Cache misses */
|
|
3509
3891
|
readonly CACHE_MISSES: "omote.cache.misses";
|
|
3892
|
+
/** Histogram: VoicePipeline turn latency (speech end → transcript ready, excludes playback) */
|
|
3893
|
+
readonly VOICE_TURN_LATENCY: "omote.voice.turn.latency";
|
|
3894
|
+
/** Histogram: ASR transcription latency in ms */
|
|
3895
|
+
readonly VOICE_TRANSCRIPTION_LATENCY: "omote.voice.transcription.latency";
|
|
3896
|
+
/** Histogram: Response handler latency in ms */
|
|
3897
|
+
readonly VOICE_RESPONSE_LATENCY: "omote.voice.response.latency";
|
|
3898
|
+
/** Counter: Total transcriptions */
|
|
3899
|
+
readonly VOICE_TRANSCRIPTIONS: "omote.voice.transcriptions";
|
|
3900
|
+
/** Counter: Total interruptions */
|
|
3901
|
+
readonly VOICE_INTERRUPTIONS: "omote.voice.interruptions";
|
|
3902
|
+
/** Histogram: PlaybackPipeline session duration in ms */
|
|
3903
|
+
readonly PLAYBACK_SESSION_DURATION: "omote.playback.session.duration";
|
|
3904
|
+
/** Histogram: Audio chunk processing latency in ms */
|
|
3905
|
+
readonly PLAYBACK_CHUNK_LATENCY: "omote.playback.chunk.latency";
|
|
3906
|
+
/** Histogram: TTSSpeaker.connect() latency in ms */
|
|
3907
|
+
readonly TTS_CONNECT_LATENCY: "omote.tts.connect.latency";
|
|
3908
|
+
/** Histogram: TTSSpeaker.speak() latency in ms */
|
|
3909
|
+
readonly TTS_SPEAK_LATENCY: "omote.tts.speak.latency";
|
|
3910
|
+
/** Counter: speak() calls aborted by TTSSpeaker.stop() */
|
|
3911
|
+
readonly TTS_SPEAK_ABORTED: "omote.tts.speak.aborted";
|
|
3912
|
+
/** Counter: MicLipSync sessions started */
|
|
3913
|
+
readonly MIC_SESSIONS: "omote.mic.sessions";
|
|
3914
|
+
/** Histogram: CharacterController.update() latency in µs */
|
|
3915
|
+
readonly AVATAR_FRAME_LATENCY: "omote.avatar.frame.latency_us";
|
|
3916
|
+
/** Histogram: FaceCompositor.compose() latency in µs */
|
|
3917
|
+
readonly COMPOSITOR_COMPOSE_LATENCY: "omote.compositor.compose.latency_us";
|
|
3918
|
+
/** Counter: Frames exceeding budget threshold */
|
|
3919
|
+
readonly AVATAR_FRAME_DROPS: "omote.avatar.frame.drops";
|
|
3920
|
+
};
|
|
3921
|
+
/**
|
|
3922
|
+
* Centralized error type taxonomy for structured error reporting.
|
|
3923
|
+
*/
|
|
3924
|
+
declare const ErrorTypes: {
|
|
3925
|
+
readonly INFERENCE: "inference_error";
|
|
3926
|
+
readonly NETWORK: "network_error";
|
|
3927
|
+
readonly TIMEOUT: "timeout";
|
|
3928
|
+
readonly USER: "user_error";
|
|
3929
|
+
readonly RUNTIME: "runtime_error";
|
|
3930
|
+
readonly MEDIA: "media_error";
|
|
3931
|
+
readonly MODEL: "model_error";
|
|
3510
3932
|
};
|
|
3933
|
+
type ErrorType = typeof ErrorTypes[keyof typeof ErrorTypes];
|
|
3511
3934
|
/**
|
|
3512
3935
|
* Histogram buckets for inference latency (ms)
|
|
3513
3936
|
*/
|
|
@@ -3585,6 +4008,7 @@ declare class OmoteTelemetry {
|
|
|
3585
4008
|
private exporter;
|
|
3586
4009
|
private activeTraceId;
|
|
3587
4010
|
private metricsIntervalId;
|
|
4011
|
+
private spanStack;
|
|
3588
4012
|
private counters;
|
|
3589
4013
|
private histograms;
|
|
3590
4014
|
constructor(config: TelemetryConfig);
|
|
@@ -3682,6 +4106,14 @@ declare class OmoteTelemetry {
|
|
|
3682
4106
|
* Get current configuration
|
|
3683
4107
|
*/
|
|
3684
4108
|
getConfig(): TelemetryConfig;
|
|
4109
|
+
/**
|
|
4110
|
+
* Get the active span context for log-to-span correlation.
|
|
4111
|
+
* Returns the most recent (top of stack) active span, or null if none.
|
|
4112
|
+
*/
|
|
4113
|
+
getActiveContext(): {
|
|
4114
|
+
traceId: string;
|
|
4115
|
+
spanId: string;
|
|
4116
|
+
} | null;
|
|
3685
4117
|
}
|
|
3686
4118
|
|
|
3687
4119
|
/**
|
|
@@ -4294,6 +4726,7 @@ declare class ProceduralLifeLayer {
|
|
|
4294
4726
|
private noiseTime;
|
|
4295
4727
|
private previousEnergy;
|
|
4296
4728
|
private emphasisLevel;
|
|
4729
|
+
private readonly _outputBlendshapes;
|
|
4297
4730
|
constructor(config?: LifeLayerConfig);
|
|
4298
4731
|
/**
|
|
4299
4732
|
* Update the life layer and produce output for this frame.
|
|
@@ -4336,6 +4769,113 @@ declare class ProceduralLifeLayer {
|
|
|
4336
4769
|
private updateBrowNoise;
|
|
4337
4770
|
}
|
|
4338
4771
|
|
|
4772
|
+
/**
|
|
4773
|
+
* Body Animation — Renderer-agnostic interfaces and utilities.
|
|
4774
|
+
*
|
|
4775
|
+
* Defines the contract for body animation controllers that each renderer
|
|
4776
|
+
* adapter (@omote/three, @omote/babylon, @omote/r3f) implements natively.
|
|
4777
|
+
*
|
|
4778
|
+
* Also provides the shared bone filtering logic used during animation
|
|
4779
|
+
* retargeting — stripping head/neck/eye tracks so body animations don't
|
|
4780
|
+
* conflict with the face pipeline (FaceCompositor, gaze, ProceduralLifeLayer).
|
|
4781
|
+
*
|
|
4782
|
+
* @module animation
|
|
4783
|
+
*/
|
|
4784
|
+
/**
|
|
4785
|
+
* Renderer-agnostic animation controller interface.
|
|
4786
|
+
*
|
|
4787
|
+
* Each renderer adapter implements this against its native animation system:
|
|
4788
|
+
* - @omote/three → THREE.AnimationMixer + AnimationAction
|
|
4789
|
+
* - @omote/babylon → Babylon.js AnimationGroup
|
|
4790
|
+
* - @omote/r3f → React hook wrapping the Three.js implementation
|
|
4791
|
+
*
|
|
4792
|
+
* Python/Node ports implement this against their own runtimes.
|
|
4793
|
+
*/
|
|
4794
|
+
interface AnimationController {
|
|
4795
|
+
/** Play an animation by id. */
|
|
4796
|
+
play(id: string, options?: {
|
|
4797
|
+
fadeInDuration?: number;
|
|
4798
|
+
}): void;
|
|
4799
|
+
/** Stop all playing animations. */
|
|
4800
|
+
stop(fadeOutDuration?: number): void;
|
|
4801
|
+
/** Crossfade from current animation to target. */
|
|
4802
|
+
crossfadeTo(id: string, duration?: number): void;
|
|
4803
|
+
/** Check if a specific animation is currently playing. */
|
|
4804
|
+
isPlaying(id: string): boolean;
|
|
4805
|
+
/** Check if an animation with this id is loaded. */
|
|
4806
|
+
hasAnimation(id: string): boolean;
|
|
4807
|
+
/** List of loaded animation ids. */
|
|
4808
|
+
readonly availableAnimations: string[];
|
|
4809
|
+
}
|
|
4810
|
+
/**
|
|
4811
|
+
* Describes an external animation asset to load and configure.
|
|
4812
|
+
* Renderer-agnostic — loaders are adapter-specific.
|
|
4813
|
+
*/
|
|
4814
|
+
interface AnimationSource {
|
|
4815
|
+
/** Unique identifier for this animation. */
|
|
4816
|
+
id: string;
|
|
4817
|
+
/** URL to the animation file (FBX, GLB, etc.). */
|
|
4818
|
+
url: string;
|
|
4819
|
+
/** Clip name within the file (if it contains multiple clips). */
|
|
4820
|
+
clipName?: string;
|
|
4821
|
+
/** Playback options. */
|
|
4822
|
+
options?: AnimationSourceOptions;
|
|
4823
|
+
}
|
|
4824
|
+
interface AnimationSourceOptions {
|
|
4825
|
+
loop?: boolean;
|
|
4826
|
+
timeScale?: number;
|
|
4827
|
+
fadeInDuration?: number;
|
|
4828
|
+
fadeOutDuration?: number;
|
|
4829
|
+
clampWhenFinished?: boolean;
|
|
4830
|
+
}
|
|
4831
|
+
/**
|
|
4832
|
+
* Configuration for filtering bone tracks from body animations.
|
|
4833
|
+
*
|
|
4834
|
+
* The face pipeline (FaceCompositor, gaze tracking, ProceduralLifeLayer) owns
|
|
4835
|
+
* certain bones (head, neck, eyes). Body animations must strip these tracks
|
|
4836
|
+
* to prevent conflicts.
|
|
4837
|
+
*/
|
|
4838
|
+
interface BoneFilterConfig {
|
|
4839
|
+
/** Bone names owned by the face pipeline (e.g., ['Head', 'Neck', 'LeftEye', 'RightEye']). */
|
|
4840
|
+
proceduralBones: string[];
|
|
4841
|
+
/** Whether to strip .position tracks (keep only quaternion/rotation). */
|
|
4842
|
+
filterPositionTracks: boolean;
|
|
4843
|
+
/** Whether to strip morphTargetInfluences tracks. */
|
|
4844
|
+
filterMorphTargets: boolean;
|
|
4845
|
+
}
|
|
4846
|
+
/** Mixamo bone name prefix (stripped during retargeting). */
|
|
4847
|
+
declare const MIXAMO_PREFIX = "mixamorig";
|
|
4848
|
+
/**
|
|
4849
|
+
* Bones that need position tracks preserved during retargeting.
|
|
4850
|
+
* Stripping finger/hand position tracks causes fingers to splay to bind pose.
|
|
4851
|
+
*/
|
|
4852
|
+
declare const PRESERVE_POSITION_BONES: Set<string>;
|
|
4853
|
+
/** Default bone filter for RPM/Mixamo avatars. */
|
|
4854
|
+
declare const DEFAULT_BONE_FILTER: BoneFilterConfig;
|
|
4855
|
+
/**
|
|
4856
|
+
* A generic animation track descriptor. Renderers map their native track
|
|
4857
|
+
* objects to this shape for filtering, then map back.
|
|
4858
|
+
*/
|
|
4859
|
+
interface TrackDescriptor {
|
|
4860
|
+
/** Full track name, e.g. "mixamorigHips.quaternion" or "Head.position". */
|
|
4861
|
+
name: string;
|
|
4862
|
+
}
|
|
4863
|
+
/**
|
|
4864
|
+
* Filter animation tracks according to a BoneFilterConfig.
|
|
4865
|
+
*
|
|
4866
|
+
* This is the renderer-agnostic core of `retargetClip`. Renderer adapters
|
|
4867
|
+
* call this with their native track names and use the result to decide
|
|
4868
|
+
* which tracks to keep.
|
|
4869
|
+
*
|
|
4870
|
+
* @returns true if the track should be KEPT (not filtered out).
|
|
4871
|
+
*/
|
|
4872
|
+
declare function shouldKeepTrack(trackName: string, config: BoneFilterConfig): boolean;
|
|
4873
|
+
/**
|
|
4874
|
+
* Strip Mixamo prefix from a track name.
|
|
4875
|
+
* "mixamorigHips.quaternion" → "Hips.quaternion"
|
|
4876
|
+
*/
|
|
4877
|
+
declare function stripMixamoPrefix(trackName: string): string;
|
|
4878
|
+
|
|
4339
4879
|
/**
|
|
4340
4880
|
* FACS (Facial Action Coding System) to ARKit Blendshape Mapping
|
|
4341
4881
|
*
|
|
@@ -4555,6 +5095,172 @@ declare class FaceCompositor {
|
|
|
4555
5095
|
private applyProfileArrays;
|
|
4556
5096
|
}
|
|
4557
5097
|
|
|
5098
|
+
/**
|
|
5099
|
+
* TextEmotionAnalyzer — Lightweight keyword heuristic for mapping AI response
|
|
5100
|
+
* text to an emotion label.
|
|
5101
|
+
*
|
|
5102
|
+
* Returns null if no strong signal is detected (keeps current emotion).
|
|
5103
|
+
*
|
|
5104
|
+
* @category Face
|
|
5105
|
+
*/
|
|
5106
|
+
/**
|
|
5107
|
+
* Analyze AI response text for emotional content.
|
|
5108
|
+
*
|
|
5109
|
+
* @param text - The AI response text to analyze
|
|
5110
|
+
* @returns An emotion label string, or null if no strong signal detected
|
|
5111
|
+
*/
|
|
5112
|
+
declare function analyzeTextEmotion(text: string): string | null;
|
|
5113
|
+
|
|
5114
|
+
/**
|
|
5115
|
+
* EmotionTagParser — Strips `[tag]` emotion annotations from LLM response text.
|
|
5116
|
+
*
|
|
5117
|
+
* LLMs can self-annotate responses with emotion tags like `[excited]` or `[sad]`.
|
|
5118
|
+
* This parser extracts the first valid tag and returns clean display text.
|
|
5119
|
+
*
|
|
5120
|
+
* @category Face
|
|
5121
|
+
*/
|
|
5122
|
+
/**
|
|
5123
|
+
* Parse emotion tags from LLM response text.
|
|
5124
|
+
*
|
|
5125
|
+
* @param text - Raw LLM response text, possibly containing `[emotion]` tags
|
|
5126
|
+
* @returns Object with clean display text and extracted emotion label (or null)
|
|
5127
|
+
*/
|
|
5128
|
+
declare function parseEmotionTags(text: string): {
|
|
5129
|
+
cleanText: string;
|
|
5130
|
+
emotion: string | null;
|
|
5131
|
+
};
|
|
5132
|
+
|
|
5133
|
+
/**
|
|
5134
|
+
* CharacterController — Renderer-agnostic avatar composition loop
|
|
5135
|
+
*
|
|
5136
|
+
* Extracted from r3f's useOmoteAvatar + useGazeTracking.
|
|
5137
|
+
* Owns FaceCompositor, emotion resolution, eye angle math, head smoothing.
|
|
5138
|
+
* Renderer-agnostic: input → output, with no renderer side effects (the controller itself keeps smoothing state between frames).
|
|
5139
|
+
*
|
|
5140
|
+
* @category Character
|
|
5141
|
+
*/
|
|
5142
|
+
|
|
5143
|
+
/**
|
|
5144
|
+
* Convert an emotion label string or EmotionWeights object to EmotionWeights.
|
|
5145
|
+
* Cached to avoid per-frame string allocation.
|
|
5146
|
+
*/
|
|
5147
|
+
declare function resolveEmotion(emotion: string | EmotionWeights | null | undefined): EmotionWeights | undefined;
|
|
5148
|
+
/** Simple 3D vector (renderer-agnostic) */
|
|
5149
|
+
interface Vec3 {
|
|
5150
|
+
x: number;
|
|
5151
|
+
y: number;
|
|
5152
|
+
z: number;
|
|
5153
|
+
}
|
|
5154
|
+
/** Quaternion (renderer-agnostic, for head rotation) */
|
|
5155
|
+
interface Quat {
|
|
5156
|
+
x: number;
|
|
5157
|
+
y: number;
|
|
5158
|
+
z: number;
|
|
5159
|
+
w: number;
|
|
5160
|
+
}
|
|
5161
|
+
interface CharacterControllerConfig {
|
|
5162
|
+
/** FaceCompositor configuration */
|
|
5163
|
+
compositor?: FaceCompositorConfig;
|
|
5164
|
+
/** Gaze tracking config */
|
|
5165
|
+
gaze?: {
|
|
5166
|
+
enabled?: boolean;
|
|
5167
|
+
yawInfluence?: number;
|
|
5168
|
+
pitchInfluence?: number;
|
|
5169
|
+
smoothing?: number;
|
|
5170
|
+
};
|
|
5171
|
+
}
|
|
5172
|
+
interface CharacterUpdateInput {
|
|
5173
|
+
/** Time since last frame in seconds */
|
|
5174
|
+
deltaTime: number;
|
|
5175
|
+
/** Scaled blendshapes from pipeline frame (or null when no frame) */
|
|
5176
|
+
baseBlendshapes: Float32Array | null;
|
|
5177
|
+
/** Raw blendshapes before profile scaling (optional) */
|
|
5178
|
+
rawBlendshapes?: Float32Array | null;
|
|
5179
|
+
/** Current emotion (string preset or weights object) */
|
|
5180
|
+
emotion?: string | EmotionWeights | null;
|
|
5181
|
+
/** Whether the avatar is currently speaking */
|
|
5182
|
+
isSpeaking: boolean;
|
|
5183
|
+
/** Current conversational state */
|
|
5184
|
+
state: ConversationalState;
|
|
5185
|
+
/** Audio energy level (0-1, drives emphasis/gesture intensity) */
|
|
5186
|
+
audioEnergy?: number;
|
|
5187
|
+
/** Camera world position (renderer provides in its own coords) */
|
|
5188
|
+
cameraWorldPos?: Vec3;
|
|
5189
|
+
/** Head bone world position (renderer provides in its own coords) */
|
|
5190
|
+
headWorldPos?: Vec3;
|
|
5191
|
+
/** Head bone world quaternion (for eye gaze local-space transform) */
|
|
5192
|
+
headWorldQuat?: Quat;
|
|
5193
|
+
/** Current avatar Y rotation in radians (for gaze compensation) */
|
|
5194
|
+
avatarRotationY?: number;
|
|
5195
|
+
}
|
|
5196
|
+
interface CharacterUpdateOutput {
|
|
5197
|
+
/** 52 ARKit blendshape values, clamped to [0, 1] — apply to morph targets */
|
|
5198
|
+
blendshapes: Float32Array;
|
|
5199
|
+
/** Head rotation delta (radians) — apply to head bone */
|
|
5200
|
+
headDelta: {
|
|
5201
|
+
yaw: number;
|
|
5202
|
+
pitch: number;
|
|
5203
|
+
};
|
|
5204
|
+
/** Normalized eye targets for eye blendshapes */
|
|
5205
|
+
eyeTargets: {
|
|
5206
|
+
x: number;
|
|
5207
|
+
y: number;
|
|
5208
|
+
};
|
|
5209
|
+
}
|
|
5210
|
+
declare class CharacterController {
|
|
5211
|
+
private readonly _compositor;
|
|
5212
|
+
private readonly gazeEnabled;
|
|
5213
|
+
private readonly gazeYawInfluence;
|
|
5214
|
+
private readonly gazePitchInfluence;
|
|
5215
|
+
private readonly gazeSmoothing;
|
|
5216
|
+
private readonly frameTimes;
|
|
5217
|
+
private frameTimeIdx;
|
|
5218
|
+
private frameTimeFill;
|
|
5219
|
+
private readonly zeroBase;
|
|
5220
|
+
private readonly outputBuffer;
|
|
5221
|
+
private readonly compositorInput;
|
|
5222
|
+
private gazeHeadYaw;
|
|
5223
|
+
private gazeHeadPitch;
|
|
5224
|
+
constructor(config?: CharacterControllerConfig);
|
|
5225
|
+
/**
|
|
5226
|
+
* Call each frame: input → output, with no renderer side effects. Not strictly pure — the controller carries smoothing state between calls (cleared by reset()).
|
|
5227
|
+
*
|
|
5228
|
+
* Composes A2E blendshapes, emotion, procedural life, gaze tracking
|
|
5229
|
+
* into a single output frame.
|
|
5230
|
+
*/
|
|
5231
|
+
update(input: CharacterUpdateInput): CharacterUpdateOutput;
|
|
5232
|
+
/** Set emotion (string preset or weights object). */
|
|
5233
|
+
setEmotion(emotion: string | EmotionWeights): void;
|
|
5234
|
+
/** Update character profile at runtime. */
|
|
5235
|
+
setProfile(profile: CharacterProfile): void;
|
|
5236
|
+
/** Access underlying FaceCompositor for advanced use. */
|
|
5237
|
+
get compositor(): FaceCompositor;
|
|
5238
|
+
/**
|
|
5239
|
+
* Get a snapshot of frame budget performance (rolling 2-second window).
|
|
5240
|
+
* Useful for runtime diagnostics / dev overlays.
|
|
5241
|
+
*/
|
|
5242
|
+
getPerformanceSnapshot(): {
|
|
5243
|
+
avgFrameUs: number;
|
|
5244
|
+
maxFrameUs: number;
|
|
5245
|
+
p95FrameUs: number;
|
|
5246
|
+
droppedFrames: number;
|
|
5247
|
+
totalFrames: number;
|
|
5248
|
+
};
|
|
5249
|
+
/** Reset all state (smoothing, life layer, emotions). */
|
|
5250
|
+
reset(): void;
|
|
5251
|
+
dispose(): void;
|
|
5252
|
+
/**
|
|
5253
|
+
* Compute normalized eye targets from camera and head positions.
|
|
5254
|
+
* Pure atan2/asin math — no renderer dependency.
|
|
5255
|
+
*/
|
|
5256
|
+
private computeEyeTargets;
|
|
5257
|
+
/**
|
|
5258
|
+
* Compute smoothed head rotation. Returns target yaw/pitch values.
|
|
5259
|
+
* Renderer is responsible for applying these to the head bone.
|
|
5260
|
+
*/
|
|
5261
|
+
private computeHeadGaze;
|
|
5262
|
+
}
|
|
5263
|
+
|
|
4558
5264
|
/**
|
|
4559
5265
|
* MicLipSync - Microphone → VAD → A2E → blendshapes
|
|
4560
5266
|
*
|
|
@@ -4576,7 +5282,7 @@ interface MicLipSyncConfig {
|
|
|
4576
5282
|
micChunkSize?: number;
|
|
4577
5283
|
/** Per-character expression weight scaling */
|
|
4578
5284
|
profile?: ExpressionProfile;
|
|
4579
|
-
/** Identity/style index for
|
|
5285
|
+
/** Identity/style index for the A2E model (default: 0) */
|
|
4580
5286
|
identityIndex?: number;
|
|
4581
5287
|
}
|
|
4582
5288
|
interface MicLipSyncFrame {
|
|
@@ -4615,8 +5321,10 @@ declare class MicLipSync extends EventEmitter<MicLipSyncEvents> {
|
|
|
4615
5321
|
private _state;
|
|
4616
5322
|
private _isSpeaking;
|
|
4617
5323
|
private _currentFrame;
|
|
4618
|
-
private _currentRawFrame;
|
|
4619
5324
|
private profile;
|
|
5325
|
+
private _firstFrameEmitted;
|
|
5326
|
+
private readonly _profileBuffer;
|
|
5327
|
+
private vadQueue;
|
|
4620
5328
|
private speechStartTime;
|
|
4621
5329
|
private vadChunkSize;
|
|
4622
5330
|
private vadBuffer;
|
|
@@ -4646,47 +5354,6 @@ declare class MicLipSync extends EventEmitter<MicLipSyncEvents> {
|
|
|
4646
5354
|
private setState;
|
|
4647
5355
|
}
|
|
4648
5356
|
|
|
4649
|
-
/**
|
|
4650
|
-
* Shared types for orchestration layer
|
|
4651
|
-
*
|
|
4652
|
-
* @category Orchestration
|
|
4653
|
-
*/
|
|
4654
|
-
|
|
4655
|
-
type VoicePipelineState = 'idle' | 'loading' | 'ready' | 'listening' | 'thinking' | 'speaking' | 'error';
|
|
4656
|
-
interface LoadingProgress {
|
|
4657
|
-
currentModel: string;
|
|
4658
|
-
progress: number;
|
|
4659
|
-
totalModels: number;
|
|
4660
|
-
modelsLoaded: number;
|
|
4661
|
-
}
|
|
4662
|
-
interface TranscriptResult {
|
|
4663
|
-
text: string;
|
|
4664
|
-
emotion?: string;
|
|
4665
|
-
language?: string;
|
|
4666
|
-
event?: string;
|
|
4667
|
-
isFinal: boolean;
|
|
4668
|
-
inferenceTimeMs?: number;
|
|
4669
|
-
}
|
|
4670
|
-
/**
|
|
4671
|
-
* Consumer's response handler. VoicePipeline calls this with transcribed text.
|
|
4672
|
-
* Consumer must stream audio back for playback + lip sync.
|
|
4673
|
-
*/
|
|
4674
|
-
interface ResponseHandler {
|
|
4675
|
-
(params: {
|
|
4676
|
-
text: string;
|
|
4677
|
-
emotion?: string;
|
|
4678
|
-
event?: string;
|
|
4679
|
-
/** Stream audio chunks to pipeline for playback + lip sync */
|
|
4680
|
-
send: (chunk: Uint8Array) => Promise<void>;
|
|
4681
|
-
/** Call when all audio has been sent */
|
|
4682
|
-
done: () => Promise<void>;
|
|
4683
|
-
/** Aborted on interruption or stop() */
|
|
4684
|
-
signal: AbortSignal;
|
|
4685
|
-
/** Session ID for backend correlation */
|
|
4686
|
-
sessionId: string;
|
|
4687
|
-
}): Promise<void>;
|
|
4688
|
-
}
|
|
4689
|
-
|
|
4690
5357
|
/**
|
|
4691
5358
|
* VoicePipeline - Full conversational agent loop
|
|
4692
5359
|
*
|
|
@@ -4700,19 +5367,28 @@ interface ResponseHandler {
|
|
|
4700
5367
|
* @category Orchestration
|
|
4701
5368
|
*/
|
|
4702
5369
|
|
|
4703
|
-
|
|
4704
|
-
|
|
4705
|
-
models
|
|
5370
|
+
/** Shared config options for all VoicePipeline modes */
|
|
5371
|
+
interface VoicePipelineBaseConfig {
|
|
5372
|
+
/** Pre-built backends — skip internal factory creation. Takes precedence over `models`. */
|
|
5373
|
+
backends?: {
|
|
5374
|
+
asr: SenseVoiceBackend;
|
|
5375
|
+
lam: A2EBackend;
|
|
5376
|
+
vad: SileroVADBackend;
|
|
5377
|
+
tts?: TTSBackend;
|
|
5378
|
+
};
|
|
5379
|
+
/** External unified worker (reuse across pipelines). Takes precedence over internal creation. */
|
|
5380
|
+
unifiedWorker?: UnifiedInferenceWorker;
|
|
5381
|
+
/** URLs and options for model loading. Required if `backends` is not provided. */
|
|
5382
|
+
models?: {
|
|
4706
5383
|
senseVoice: {
|
|
4707
5384
|
modelUrl: string;
|
|
4708
5385
|
tokensUrl?: string;
|
|
4709
5386
|
language?: string;
|
|
4710
5387
|
};
|
|
4711
5388
|
lam: {
|
|
4712
|
-
|
|
4713
|
-
|
|
4714
|
-
|
|
4715
|
-
mode?: 'auto' | 'gpu' | 'cpu';
|
|
5389
|
+
modelUrl: string;
|
|
5390
|
+
externalDataUrl?: string | false;
|
|
5391
|
+
backend?: 'auto' | 'webgpu' | 'wasm';
|
|
4716
5392
|
};
|
|
4717
5393
|
vad: {
|
|
4718
5394
|
modelUrl: string;
|
|
@@ -4720,14 +5396,10 @@ interface VoicePipelineConfig {
|
|
|
4720
5396
|
preSpeechBufferChunks?: number;
|
|
4721
5397
|
};
|
|
4722
5398
|
};
|
|
4723
|
-
/** Consumer's response handler */
|
|
4724
|
-
onResponse: ResponseHandler;
|
|
4725
5399
|
/** Per-character expression weight scaling */
|
|
4726
5400
|
profile?: ExpressionProfile;
|
|
4727
|
-
/** Identity/style index for
|
|
5401
|
+
/** Identity/style index for the A2E model (default: 0) */
|
|
4728
5402
|
identityIndex?: number;
|
|
4729
|
-
/** LAM load timeout in ms — CPU fallback on timeout (default: 30000) */
|
|
4730
|
-
lamLoadTimeoutMs?: number;
|
|
4731
5403
|
/** Base silence timeout in ms (default: 500) */
|
|
4732
5404
|
silenceTimeoutMs?: number;
|
|
4733
5405
|
/** Extended silence timeout for long utterances (default: 700) */
|
|
@@ -4763,6 +5435,40 @@ interface VoicePipelineConfig {
|
|
|
4763
5435
|
/** Duration of neutral fade-out (default: 250ms) */
|
|
4764
5436
|
neutralTransitionMs?: number;
|
|
4765
5437
|
}
|
|
5438
|
+
/** Cloud TTS mode: consumer handles response + audio streaming */
|
|
5439
|
+
interface VoicePipelineCloudConfig extends VoicePipelineBaseConfig {
|
|
5440
|
+
mode: 'cloud';
|
|
5441
|
+
/** Consumer's response handler (streams audio back) */
|
|
5442
|
+
onResponse: ResponseHandler;
|
|
5443
|
+
}
|
|
5444
|
+
/** Local TTS mode: SDK handles synthesis internally via TTSBackend */
|
|
5445
|
+
interface VoicePipelineLocalConfig extends VoicePipelineBaseConfig {
|
|
5446
|
+
mode: 'local';
|
|
5447
|
+
/**
|
|
5448
|
+
* TTS backend (e.g., KokoroTTSInference). Provide either `tts` or `ttsConfig`.
|
|
5449
|
+
*
|
|
5450
|
+
* When `tts` is provided, VoicePipeline uses it as-is. On iOS, this means
|
|
5451
|
+
* inference runs on the main thread (may cause UI freezes).
|
|
5452
|
+
*
|
|
5453
|
+
* Prefer `ttsConfig` for automatic unified worker integration on iOS.
|
|
5454
|
+
*/
|
|
5455
|
+
tts?: TTSBackend;
|
|
5456
|
+
/**
|
|
5457
|
+
* Kokoro TTS configuration. When provided, VoicePipeline creates the TTS
|
|
5458
|
+
* internally and passes the unified worker on iOS for off-main-thread inference.
|
|
5459
|
+
*
|
|
5460
|
+
* Takes precedence over `tts` if both are provided.
|
|
5461
|
+
*/
|
|
5462
|
+
ttsConfig?: {
|
|
5463
|
+
defaultVoice?: string;
|
|
5464
|
+
speed?: number;
|
|
5465
|
+
modelUrl?: string;
|
|
5466
|
+
voiceBaseUrl?: string;
|
|
5467
|
+
};
|
|
5468
|
+
/** Optional text transform (e.g., LLM call). Receives transcript, returns response text. */
|
|
5469
|
+
onTranscript?: (text: string) => string | Promise<string>;
|
|
5470
|
+
}
|
|
5471
|
+
type VoicePipelineConfig = VoicePipelineCloudConfig | VoicePipelineLocalConfig;
|
|
4766
5472
|
interface VoicePipelineEvents {
|
|
4767
5473
|
'state': VoicePipelineState;
|
|
4768
5474
|
'loading:progress': LoadingProgress;
|
|
@@ -4787,6 +5493,7 @@ interface VoicePipelineEvents {
|
|
|
4787
5493
|
}
|
|
4788
5494
|
declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
|
|
4789
5495
|
private readonly config;
|
|
5496
|
+
private readonly isLocalMode;
|
|
4790
5497
|
private _state;
|
|
4791
5498
|
private stopped;
|
|
4792
5499
|
private epoch;
|
|
@@ -4799,6 +5506,7 @@ declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
|
|
|
4799
5506
|
private interruption;
|
|
4800
5507
|
private omoteEvents;
|
|
4801
5508
|
private mic;
|
|
5509
|
+
private static readonly MAX_AUDIO_BUFFER_SAMPLES;
|
|
4802
5510
|
private audioBuffer;
|
|
4803
5511
|
private audioBufferSamples;
|
|
4804
5512
|
private speechStartTime;
|
|
@@ -4810,6 +5518,8 @@ declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
|
|
|
4810
5518
|
private lastProgressiveSamples;
|
|
4811
5519
|
private asrErrorCount;
|
|
4812
5520
|
private responseAbortController;
|
|
5521
|
+
private _unsubChunk;
|
|
5522
|
+
private _unsubLevel;
|
|
4813
5523
|
private _currentFrame;
|
|
4814
5524
|
/** Current pipeline state */
|
|
4815
5525
|
get state(): VoicePipelineState;
|
|
@@ -4821,6 +5531,15 @@ declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
|
|
|
4821
5531
|
get sessionId(): string | null;
|
|
4822
5532
|
constructor(config: VoicePipelineConfig);
|
|
4823
5533
|
loadModels(): Promise<void>;
|
|
5534
|
+
/**
|
|
5535
|
+
* Load from pre-built backends (dependency injection path).
|
|
5536
|
+
* Loads any backends that aren't loaded yet.
|
|
5537
|
+
*/
|
|
5538
|
+
private loadFromBackends;
|
|
5539
|
+
/**
|
|
5540
|
+
* Load from factories (original path). Loads SenseVoice, LAM, and VAD in parallel.
|
|
5541
|
+
*/
|
|
5542
|
+
private loadFromFactories;
|
|
4824
5543
|
start(): Promise<void>;
|
|
4825
5544
|
stop(): void;
|
|
4826
5545
|
setProfile(profile: ExpressionProfile): void;
|
|
@@ -4830,6 +5549,10 @@ declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
|
|
|
4830
5549
|
private onSilenceDetected;
|
|
4831
5550
|
private processEndOfSpeech;
|
|
4832
5551
|
private callResponseHandler;
|
|
5552
|
+
/** Cloud mode: delegate to consumer's onResponse handler */
|
|
5553
|
+
private handleCloudResponse;
|
|
5554
|
+
/** Local mode: synthesize text with TTSBackend, stream to PlaybackPipeline */
|
|
5555
|
+
private handleLocalResponse;
|
|
4833
5556
|
private handleInterruption;
|
|
4834
5557
|
private startProgressiveTranscription;
|
|
4835
5558
|
private stopProgressiveTranscription;
|
|
@@ -4840,4 +5563,86 @@ declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
|
|
|
4840
5563
|
private clearSilenceTimer;
|
|
4841
5564
|
}
|
|
4842
5565
|
|
|
4843
|
-
|
|
5566
|
+
/**
|
|
5567
|
+
* VoiceOrchestrator — Shared voice wiring for OmoteAvatar adapters.
|
|
5568
|
+
*
|
|
5569
|
+
* Composes TTSSpeaker (local mode) or PlaybackPipeline (cloud mode) with
|
|
5570
|
+
* SpeechListener and InterruptionHandler. Supports both local TTS and
|
|
5571
|
+
* cloud TTS via discriminated union config.
|
|
5572
|
+
*
|
|
5573
|
+
* Extracted from the ~70 identical lines duplicated across three/babylon/r3f
|
|
5574
|
+
* adapters into a single reusable class.
|
|
5575
|
+
*
|
|
5576
|
+
* @category Orchestration
|
|
5577
|
+
*/
|
|
5578
|
+
|
|
5579
|
+
interface VoiceOrchestratorBaseConfig {
|
|
5580
|
+
listener?: SpeechListenerConfig;
|
|
5581
|
+
interruptionEnabled?: boolean;
|
|
5582
|
+
profile?: ExpressionProfile;
|
|
5583
|
+
}
|
|
5584
|
+
interface VoiceOrchestratorLocalConfig extends VoiceOrchestratorBaseConfig {
|
|
5585
|
+
mode?: 'local';
|
|
5586
|
+
tts: TTSBackend;
|
|
5587
|
+
speaker?: TTSSpeakerConfig;
|
|
5588
|
+
onTranscript: (text: string, emotion?: string) => string | Promise<string> | AsyncGenerator<string>;
|
|
5589
|
+
}
|
|
5590
|
+
interface VoiceOrchestratorCloudConfig extends VoiceOrchestratorBaseConfig {
|
|
5591
|
+
mode: 'cloud';
|
|
5592
|
+
onResponse: ResponseHandler;
|
|
5593
|
+
lam?: {
|
|
5594
|
+
modelUrl?: string;
|
|
5595
|
+
externalDataUrl?: string | false;
|
|
5596
|
+
};
|
|
5597
|
+
}
|
|
5598
|
+
type VoiceOrchestratorConfig = VoiceOrchestratorLocalConfig | VoiceOrchestratorCloudConfig;
|
|
5599
|
+
interface VoiceOrchestratorEvents {
|
|
5600
|
+
'state': ConversationalState;
|
|
5601
|
+
'transcript': TranscriptResult;
|
|
5602
|
+
[key: string]: unknown;
|
|
5603
|
+
}
|
|
5604
|
+
declare class VoiceOrchestrator extends EventEmitter<VoiceOrchestratorEvents> {
|
|
5605
|
+
private speechListener;
|
|
5606
|
+
private interruption;
|
|
5607
|
+
private ttsSpeaker;
|
|
5608
|
+
private playbackPipeline;
|
|
5609
|
+
private ownedLam;
|
|
5610
|
+
private transcriptUnsub;
|
|
5611
|
+
private audioChunkUnsub;
|
|
5612
|
+
private connectEpoch;
|
|
5613
|
+
private responseAbortController;
|
|
5614
|
+
private _state;
|
|
5615
|
+
private _isSpeaking;
|
|
5616
|
+
private _frameSource;
|
|
5617
|
+
private _mode;
|
|
5618
|
+
private _sessionId;
|
|
5619
|
+
get state(): ConversationalState;
|
|
5620
|
+
get isSpeaking(): boolean;
|
|
5621
|
+
get frameSource(): FrameSource | null;
|
|
5622
|
+
/** Access the internal SpeechListener. */
|
|
5623
|
+
get listener(): SpeechListener | null;
|
|
5624
|
+
/** Access the internal TTSSpeaker (local mode only). */
|
|
5625
|
+
get speaker(): TTSSpeaker | null;
|
|
5626
|
+
connect(config: VoiceOrchestratorConfig): Promise<void>;
|
|
5627
|
+
disconnect(): Promise<void>;
|
|
5628
|
+
startListening(): Promise<void>;
|
|
5629
|
+
stopListening(): void;
|
|
5630
|
+
speak(text: string, options?: {
|
|
5631
|
+
signal?: AbortSignal;
|
|
5632
|
+
voice?: string;
|
|
5633
|
+
}): Promise<void>;
|
|
5634
|
+
streamText(options?: {
|
|
5635
|
+
signal?: AbortSignal;
|
|
5636
|
+
voice?: string;
|
|
5637
|
+
}): Promise<{
|
|
5638
|
+
push: (token: string) => void;
|
|
5639
|
+
end: () => Promise<void>;
|
|
5640
|
+
}>;
|
|
5641
|
+
stopSpeaking(): void;
|
|
5642
|
+
private wireLocalTranscript;
|
|
5643
|
+
private wireCloudTranscript;
|
|
5644
|
+
private handleInterruption;
|
|
5645
|
+
private setState;
|
|
5646
|
+
}
|
|
5647
|
+
|
|
5648
|
+
export { type A2EBackend, A2EInference, type A2EInferenceConfig, type A2EModelInfo, A2EProcessor, type A2EProcessorConfig, type A2EResult, A2EUnifiedAdapter, ALL_AUS, ARKIT_BLENDSHAPES, type AUActivation, AU_TO_ARKIT, type ActiveSpan, type AnimationClip, type AnimationController, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationSource, type AnimationSourceOptions, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, type BoneFilterConfig, type CacheConfig, type CacheSpanAttributes, CharacterController, type CharacterControllerConfig, type CharacterProfile, type CharacterUpdateInput, type CharacterUpdateOutput, ConsoleExporter, type ConversationalState, type CreateA2EConfig, type CreateKokoroTTSConfig, type CreateSenseVoiceConfig, type CreateTTSPlayerConfig, DEFAULT_ANIMATION_CONFIG, DEFAULT_BONE_FILTER, DEFAULT_MODEL_URLS, EMOTION_NAMES, EMOTION_TO_AU, EMOTION_VECTOR_SIZE, EXPLICIT_EMOTION_COUNT, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionResolver, type EmotionWeights, EmphasisDetector, type ErrorType, ErrorTypes, EventEmitter, type ExpressionProfile, FaceCompositor, type FaceCompositorConfig, type FaceCompositorInput, type FaceCompositorOutput, type FetchWithCacheOptions, type FrameSource, type FullFaceFrame, HF_CDN_URLS, INFERENCE_LATENCY_BUCKETS, type InferenceFactoryConfig, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, KOKORO_VOICES, type KokoroStreamChunk, type KokoroTTSConfig, KokoroTTSInference, type KokoroTTSModelInfo, type KokoroTTSResult, KokoroTTSUnifiedAdapter, 
KokoroTTSWorker, type KokoroVoiceName, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LoadingProgress, MIXAMO_PREFIX, MODEL_LOAD_TIME_BUCKETS, type MetricData, MetricNames, MicLipSync, type MicLipSyncConfig, type MicLipSyncEvents, type MicLipSyncFrame, type MicLipSyncState, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, type ModelUrlKey, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, PRESERVE_POSITION_BONES, PlaybackPipeline, type PlaybackPipelineConfig, type PlaybackPipelineEvents, type PlaybackState, ProceduralLifeLayer, type Quat, type QuotaInfo, type ResolvedEmotion, type ResponseHandler, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, SpeechListener, type SpeechListenerConfig, type SpeechListenerEvents, type SpeechListenerState, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type SynthesizeOptions, type TTSBackend, type TTSChunk, TTSPlayback, type TTSPlaybackConfig, type TTSPlaybackEvents, TTSPlayer, TTSSpeaker, type TTSSpeakerConfig, type TTSStreamOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TrackDescriptor, type TranscriptResult, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type Vec3, VoiceOrchestrator, type VoiceOrchestratorCloudConfig, type VoiceOrchestratorConfig, type VoiceOrchestratorEvents, type 
VoiceOrchestratorLocalConfig, VoicePipeline, type VoicePipelineCloudConfig, type VoicePipelineConfig, type VoicePipelineEvents, type VoicePipelineLocalConfig, type VoicePipelineState, A2EInference as Wav2Vec2Inference, type WorkerHealthState, analyzeTextEmotion, applyProfile, blendEmotions, calculatePeak, calculateRMS, configureCacheLimit, configureModelUrls, configureOrtCdn, configureTelemetry, createA2E, createEmotionVector, createKokoroTTS, createSenseVoice, createSileroVAD, createTTSPlayer, fetchWithCache, float32ToPcm16, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getOrtCdnBase, getRecommendedBackend, getTelemetry, hasWebGPUApi, int16ToFloat32, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, listVoices as listKokoroVoices, parseEmotionTags, pcm16ToFloat32, preloadModels, resampleLinear, resetModelUrls, resolveBackend, resolveEmotion, shouldEnableWasmProxy, shouldKeepTrack, shouldUseNativeASR, shouldUseServerA2E, stripMixamoPrefix, supportsVADWorker, ttsToPlaybackFormat, validateTTSInput };
|