@omote/core 0.5.7 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +484 -35
- package/dist/index.d.ts +484 -35
- package/dist/index.js +1184 -488
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +1179 -483
- package/dist/index.mjs.map +1 -1
- package/package.json +3 -2
package/dist/index.d.mts
CHANGED
|
@@ -492,38 +492,13 @@ interface A2EBackend {
|
|
|
492
492
|
}
|
|
493
493
|
|
|
494
494
|
/**
|
|
495
|
-
*
|
|
495
|
+
* ExpressionProfile - Per-character weight scaling for A2E blendshape output
|
|
496
496
|
*
|
|
497
|
-
*
|
|
498
|
-
*
|
|
499
|
-
* 2. Running A2E inference in background (fire-and-forget via A2EProcessor)
|
|
500
|
-
* 3. Applying per-character ExpressionProfile scaling to raw A2E output
|
|
501
|
-
*
|
|
502
|
-
* The A2E model outputs all 52 ARKit blendshapes from audio — brows, eyes, cheeks,
|
|
503
|
-
* mouth, jaw, everything. ExpressionProfile allows per-character weight scaling
|
|
504
|
-
* by group (eyes, brows, jaw, mouth, cheeks, nose, tongue) with per-blendshape overrides.
|
|
497
|
+
* Maps blendshape groups (eyes, brows, jaw, mouth, cheeks, nose, tongue)
|
|
498
|
+
* to weight scalers. Used by PlaybackPipeline, MicLipSync, and VoicePipeline.
|
|
505
499
|
*
|
|
506
500
|
* @category Audio
|
|
507
|
-
*
|
|
508
|
-
* @example Basic usage
|
|
509
|
-
* ```typescript
|
|
510
|
-
* import { FullFacePipeline } from '@omote/core';
|
|
511
|
-
*
|
|
512
|
-
* const pipeline = new FullFacePipeline({
|
|
513
|
-
* lam,
|
|
514
|
-
* profile: { mouth: 1.2, brows: 0.8 },
|
|
515
|
-
* });
|
|
516
|
-
* await pipeline.initialize();
|
|
517
|
-
*
|
|
518
|
-
* pipeline.on('full_frame_ready', (frame) => {
|
|
519
|
-
* applyToAvatar(frame.blendshapes);
|
|
520
|
-
* });
|
|
521
|
-
*
|
|
522
|
-
* pipeline.start();
|
|
523
|
-
* await pipeline.onAudioChunk(audioData);
|
|
524
|
-
* ```
|
|
525
501
|
*/
|
|
526
|
-
|
|
527
502
|
type BlendshapeGroup = 'eyes' | 'brows' | 'jaw' | 'mouth' | 'cheeks' | 'nose' | 'tongue';
|
|
528
503
|
/**
|
|
529
504
|
* Per-character weight scaling for A2E blendshape output.
|
|
@@ -555,6 +530,53 @@ interface ExpressionProfile {
|
|
|
555
530
|
* Built once at module load from prefix matching.
|
|
556
531
|
*/
|
|
557
532
|
declare const BLENDSHAPE_TO_GROUP: Map<string, BlendshapeGroup>;
|
|
533
|
+
/**
|
|
534
|
+
* Apply ExpressionProfile scaling to raw A2E blendshapes.
|
|
535
|
+
*
|
|
536
|
+
* For each blendshape:
|
|
537
|
+
* 1. If an override exists for the blendshape name, use override as scaler
|
|
538
|
+
* 2. Otherwise, use the group scaler (default 1.0)
|
|
539
|
+
* 3. Clamp result to [0, 1]
|
|
540
|
+
*/
|
|
541
|
+
declare function applyProfile(raw: Float32Array, profile: ExpressionProfile): Float32Array;
|
|
542
|
+
|
|
543
|
+
/**
|
|
544
|
+
* FullFacePipeline - A2E expression pipeline with ExpressionProfile weight scaling
|
|
545
|
+
*
|
|
546
|
+
* Orchestrates full-face animation by:
|
|
547
|
+
* 1. Scheduling audio for playback immediately (audio-first, never waits for A2E)
|
|
548
|
+
* 2. Running A2E inference in background (fire-and-forget via A2EProcessor)
|
|
549
|
+
* 3. Applying per-character ExpressionProfile scaling to raw A2E output
|
|
550
|
+
*
|
|
551
|
+
* The A2E model outputs all 52 ARKit blendshapes from audio — brows, eyes, cheeks,
|
|
552
|
+
* mouth, jaw, everything. ExpressionProfile allows per-character weight scaling
|
|
553
|
+
* by group (eyes, brows, jaw, mouth, cheeks, nose, tongue) with per-blendshape overrides.
|
|
554
|
+
*
|
|
555
|
+
* @deprecated Use {@link PlaybackPipeline} from `@omote/core` instead. PlaybackPipeline
|
|
556
|
+
* is a superset with sync mode (`feedBuffer`), state tracking, and opt-in neutral transition.
|
|
557
|
+
* FullFacePipeline will continue to work but is no longer actively developed.
|
|
558
|
+
*
|
|
559
|
+
* @category Audio
|
|
560
|
+
*
|
|
561
|
+
* @example Basic usage
|
|
562
|
+
* ```typescript
|
|
563
|
+
* import { FullFacePipeline } from '@omote/core';
|
|
564
|
+
*
|
|
565
|
+
* const pipeline = new FullFacePipeline({
|
|
566
|
+
* lam,
|
|
567
|
+
* profile: { mouth: 1.2, brows: 0.8 },
|
|
568
|
+
* });
|
|
569
|
+
* await pipeline.initialize();
|
|
570
|
+
*
|
|
571
|
+
* pipeline.on('full_frame_ready', (frame) => {
|
|
572
|
+
* applyToAvatar(frame.blendshapes);
|
|
573
|
+
* });
|
|
574
|
+
*
|
|
575
|
+
* pipeline.start();
|
|
576
|
+
* await pipeline.onAudioChunk(audioData);
|
|
577
|
+
* ```
|
|
578
|
+
*/
|
|
579
|
+
|
|
558
580
|
/**
|
|
559
581
|
* Configuration for FullFacePipeline
|
|
560
582
|
*/
|
|
@@ -605,7 +627,7 @@ interface FullFacePipelineOptions {
|
|
|
605
627
|
/**
|
|
606
628
|
* Full face frame with scaled blendshapes
|
|
607
629
|
*/
|
|
608
|
-
interface FullFaceFrame {
|
|
630
|
+
interface FullFaceFrame$1 {
|
|
609
631
|
/** Scaled 52 ARKit blendshapes (ExpressionProfile applied) */
|
|
610
632
|
blendshapes: Float32Array;
|
|
611
633
|
/** Raw A2E output (52 blendshapes, before profile scaling) */
|
|
@@ -618,7 +640,7 @@ interface FullFaceFrame {
|
|
|
618
640
|
*/
|
|
619
641
|
interface FullFacePipelineEvents {
|
|
620
642
|
/** New merged frame ready for display */
|
|
621
|
-
full_frame_ready: FullFaceFrame;
|
|
643
|
+
full_frame_ready: FullFaceFrame$1;
|
|
622
644
|
/** Raw LAM frame ready (for debugging/monitoring) */
|
|
623
645
|
lam_frame_ready: Float32Array;
|
|
624
646
|
/** Playback has completed */
|
|
@@ -664,10 +686,7 @@ declare class FullFacePipeline extends EventEmitter<FullFacePipelineEvents> {
|
|
|
664
686
|
/**
|
|
665
687
|
* Apply ExpressionProfile scaling to raw A2E blendshapes.
|
|
666
688
|
*
|
|
667
|
-
*
|
|
668
|
-
* 1. If an override exists for the blendshape name, use override as scaler
|
|
669
|
-
* 2. Otherwise, use the group scaler (default 1.0)
|
|
670
|
-
* 3. Clamp result to [0, 1]
|
|
689
|
+
* Delegates to the standalone applyProfile() utility from expressionProfile.ts.
|
|
671
690
|
*/
|
|
672
691
|
applyProfile(raw: Float32Array): Float32Array;
|
|
673
692
|
/**
|
|
@@ -727,6 +746,147 @@ declare class FullFacePipeline extends EventEmitter<FullFacePipelineEvents> {
|
|
|
727
746
|
dispose(): void;
|
|
728
747
|
}
|
|
729
748
|
|
|
749
|
+
/**
|
|
750
|
+
* PlaybackPipeline - Audio playback + A2E lip sync with ExpressionProfile scaling
|
|
751
|
+
*
|
|
752
|
+
* Refactored superset of FullFacePipeline. Adds:
|
|
753
|
+
* - Sync mode (`feedBuffer`) for pre-recorded audio
|
|
754
|
+
* - State tracking (idle → playing → stopping)
|
|
755
|
+
* - Opt-in neutral transition animation on playback complete
|
|
756
|
+
* - Idempotent `start()` (no spurious playback:complete on restart)
|
|
757
|
+
*
|
|
758
|
+
* @category Audio
|
|
759
|
+
*/
|
|
760
|
+
|
|
761
|
+
type PlaybackState = 'idle' | 'playing' | 'stopping';
|
|
762
|
+
interface PlaybackPipelineConfig {
|
|
763
|
+
/** A2E inference backend (from createA2E) */
|
|
764
|
+
lam: A2EBackend;
|
|
765
|
+
/** Sample rate in Hz (default: 16000) */
|
|
766
|
+
sampleRate?: number;
|
|
767
|
+
/** Target chunk duration for coalescing in ms (default: 200) */
|
|
768
|
+
chunkTargetMs?: number;
|
|
769
|
+
/** Audio playback delay in ms (default: auto-detected from backend) */
|
|
770
|
+
audioDelayMs?: number;
|
|
771
|
+
/** A2E inference chunk size in samples (default: 16000) */
|
|
772
|
+
chunkSize?: number;
|
|
773
|
+
/** Identity/style index for Wav2Vec2 (default: 0) */
|
|
774
|
+
identityIndex?: number;
|
|
775
|
+
/** Per-character expression weight scaling */
|
|
776
|
+
profile?: ExpressionProfile;
|
|
777
|
+
/** Enable neutral transition on playback complete (default: false) */
|
|
778
|
+
neutralTransitionEnabled?: boolean;
|
|
779
|
+
/** Duration of neutral fade-out in ms (default: 250). Only applies when neutralTransitionEnabled=true. */
|
|
780
|
+
neutralTransitionMs?: number;
|
|
781
|
+
/** Stale frame warning threshold in ms (default: 2000) */
|
|
782
|
+
staleThresholdMs?: number;
|
|
783
|
+
}
|
|
784
|
+
/**
|
|
785
|
+
* Full face frame with scaled blendshapes
|
|
786
|
+
*/
|
|
787
|
+
interface FullFaceFrame {
|
|
788
|
+
/** Scaled 52 ARKit blendshapes (ExpressionProfile applied) */
|
|
789
|
+
blendshapes: Float32Array;
|
|
790
|
+
/** Raw A2E output (52 blendshapes, before profile scaling) */
|
|
791
|
+
rawBlendshapes: Float32Array;
|
|
792
|
+
/** AudioContext timestamp for this frame */
|
|
793
|
+
timestamp: number;
|
|
794
|
+
}
|
|
795
|
+
interface PlaybackPipelineEvents {
|
|
796
|
+
/** New frame ready for display (scaled by ExpressionProfile) */
|
|
797
|
+
'frame': FullFaceFrame;
|
|
798
|
+
/** Raw A2E frame (before profile scaling) */
|
|
799
|
+
'frame:raw': Float32Array;
|
|
800
|
+
/** Playback started (first audio scheduled) */
|
|
801
|
+
'playback:start': {
|
|
802
|
+
time: number;
|
|
803
|
+
};
|
|
804
|
+
/** Playback completed naturally */
|
|
805
|
+
'playback:complete': void;
|
|
806
|
+
/** Playback stopped (user-initiated) */
|
|
807
|
+
'playback:stop': void;
|
|
808
|
+
/** Error occurred */
|
|
809
|
+
'error': Error;
|
|
810
|
+
/** State changed */
|
|
811
|
+
'state': PlaybackState;
|
|
812
|
+
'full_frame_ready': FullFaceFrame;
|
|
813
|
+
'lam_frame_ready': Float32Array;
|
|
814
|
+
'playback_complete': void;
|
|
815
|
+
'playback_start': number;
|
|
816
|
+
[key: string]: unknown;
|
|
817
|
+
}
|
|
818
|
+
declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
|
|
819
|
+
private readonly config;
|
|
820
|
+
private scheduler;
|
|
821
|
+
private coalescer;
|
|
822
|
+
private processor;
|
|
823
|
+
private readonly sampleRate;
|
|
824
|
+
private _state;
|
|
825
|
+
private playbackStarted;
|
|
826
|
+
private monitorInterval;
|
|
827
|
+
private frameAnimationId;
|
|
828
|
+
private lastNewFrameTime;
|
|
829
|
+
private lastKnownLamFrame;
|
|
830
|
+
private staleWarningEmitted;
|
|
831
|
+
private readonly staleThresholdMs;
|
|
832
|
+
private frameLoopCount;
|
|
833
|
+
private profile;
|
|
834
|
+
private readonly neutralTransitionEnabled;
|
|
835
|
+
private readonly neutralTransitionMs;
|
|
836
|
+
private neutralTransitionFrame;
|
|
837
|
+
private neutralTransitionStart;
|
|
838
|
+
private neutralAnimationId;
|
|
839
|
+
private _currentFrame;
|
|
840
|
+
private _currentRawFrame;
|
|
841
|
+
/** Current pipeline state */
|
|
842
|
+
get state(): PlaybackState;
|
|
843
|
+
/** Current scaled blendshapes (updated in-place for perf) */
|
|
844
|
+
get currentFrame(): Float32Array | null;
|
|
845
|
+
/** Raw A2E blendshapes (before profile scaling) */
|
|
846
|
+
get currentRawFrame(): Float32Array | null;
|
|
847
|
+
constructor(config: PlaybackPipelineConfig);
|
|
848
|
+
/** Initialize AudioContext (lazy, call after user gesture) */
|
|
849
|
+
initialize(): Promise<void>;
|
|
850
|
+
/** Update ExpressionProfile at runtime */
|
|
851
|
+
setProfile(profile: ExpressionProfile): void;
|
|
852
|
+
/**
|
|
853
|
+
* Start a new playback session.
|
|
854
|
+
* Idempotent — calling during playback resets cleanly without emitting
|
|
855
|
+
* spurious playback:complete.
|
|
856
|
+
*/
|
|
857
|
+
start(): void;
|
|
858
|
+
/** Feed a streaming audio chunk (PCM16 Uint8Array) */
|
|
859
|
+
onAudioChunk(chunk: Uint8Array): Promise<void>;
|
|
860
|
+
/** Signal end of audio stream (flushes remaining audio) */
|
|
861
|
+
end(): Promise<void>;
|
|
862
|
+
/**
|
|
863
|
+
* Feed a complete audio buffer. Chunks into 200ms pieces, schedules each
|
|
864
|
+
* for playback, runs A2E inference, then waits for completion.
|
|
865
|
+
*/
|
|
866
|
+
feedBuffer(audio: ArrayBuffer | Float32Array): Promise<void>;
|
|
867
|
+
/** Stop playback immediately with fade-out */
|
|
868
|
+
stop(fadeOutMs?: number): Promise<void>;
|
|
869
|
+
/** Cleanup all resources */
|
|
870
|
+
dispose(): void;
|
|
871
|
+
/** Get pipeline debug state */
|
|
872
|
+
getDebugState(): {
|
|
873
|
+
state: PlaybackState;
|
|
874
|
+
playbackStarted: boolean;
|
|
875
|
+
coalescerFill: number;
|
|
876
|
+
processorFill: number;
|
|
877
|
+
queuedFrames: number;
|
|
878
|
+
currentTime: number;
|
|
879
|
+
playbackEndTime: number;
|
|
880
|
+
};
|
|
881
|
+
private startFrameLoop;
|
|
882
|
+
private startMonitoring;
|
|
883
|
+
private onPlaybackComplete;
|
|
884
|
+
private startNeutralTransition;
|
|
885
|
+
private cancelNeutralTransition;
|
|
886
|
+
private stopInternal;
|
|
887
|
+
private setState;
|
|
888
|
+
}
|
|
889
|
+
|
|
730
890
|
/**
|
|
731
891
|
* Interruption Handler
|
|
732
892
|
*
|
|
@@ -2455,6 +2615,10 @@ declare class BlendshapeSmoother {
|
|
|
2455
2615
|
* drip-feeding to {@link A2EProcessor}. This class only handles mic capture
|
|
2456
2616
|
* (getUserMedia, ScriptProcessorNode, resampling).
|
|
2457
2617
|
*
|
|
2618
|
+
* @deprecated Use {@link MicLipSync} from `@omote/core` instead. MicLipSync provides
|
|
2619
|
+
* the same mic → A2E composition with proper MicrophoneCapture integration, VAD support,
|
|
2620
|
+
* ExpressionProfile scaling, and pause/resume. This class will be removed in a future version.
|
|
2621
|
+
*
|
|
2458
2622
|
* @category Inference
|
|
2459
2623
|
*/
|
|
2460
2624
|
|
|
@@ -4090,4 +4254,289 @@ declare class ProceduralLifeLayer {
|
|
|
4090
4254
|
private updateBrowNoise;
|
|
4091
4255
|
}
|
|
4092
4256
|
|
|
4093
|
-
|
|
4257
|
+
/**
|
|
4258
|
+
* MicLipSync - Microphone → VAD → A2E → blendshapes
|
|
4259
|
+
*
|
|
4260
|
+
* Simple composition class for live mic lip sync ("mirror mode").
|
|
4261
|
+
* Replaces A2EOrchestrator with proper MicrophoneCapture integration.
|
|
4262
|
+
*
|
|
4263
|
+
* @category Orchestration
|
|
4264
|
+
*/
|
|
4265
|
+
|
|
4266
|
+
type MicLipSyncState = 'idle' | 'active' | 'paused';
|
|
4267
|
+
interface MicLipSyncConfig {
|
|
4268
|
+
/** A2E inference backend (from createA2E) — required */
|
|
4269
|
+
lam: A2EBackend;
|
|
4270
|
+
/** VAD backend for speech boundary detection (optional) */
|
|
4271
|
+
vad?: SileroVADBackend;
|
|
4272
|
+
/** Mic sample rate (default: 16000) */
|
|
4273
|
+
sampleRate?: number;
|
|
4274
|
+
/** Mic chunk size in samples (default: 512, required by Silero VAD) */
|
|
4275
|
+
micChunkSize?: number;
|
|
4276
|
+
/** Per-character expression weight scaling */
|
|
4277
|
+
profile?: ExpressionProfile;
|
|
4278
|
+
/** Identity/style index for Wav2Vec2 (default: 0) */
|
|
4279
|
+
identityIndex?: number;
|
|
4280
|
+
}
|
|
4281
|
+
interface MicLipSyncFrame {
|
|
4282
|
+
blendshapes: Float32Array;
|
|
4283
|
+
rawBlendshapes: Float32Array;
|
|
4284
|
+
}
|
|
4285
|
+
interface MicLipSyncEvents {
|
|
4286
|
+
/** New blendshape frame ready */
|
|
4287
|
+
'frame': MicLipSyncFrame;
|
|
4288
|
+
/** Speech started (VAD) */
|
|
4289
|
+
'speech:start': void;
|
|
4290
|
+
/** Speech ended (VAD) */
|
|
4291
|
+
'speech:end': {
|
|
4292
|
+
durationMs: number;
|
|
4293
|
+
};
|
|
4294
|
+
/** Microphone started */
|
|
4295
|
+
'mic:start': void;
|
|
4296
|
+
/** Microphone stopped */
|
|
4297
|
+
'mic:stop': void;
|
|
4298
|
+
/** Audio level update */
|
|
4299
|
+
'audio:level': {
|
|
4300
|
+
rms: number;
|
|
4301
|
+
peak: number;
|
|
4302
|
+
};
|
|
4303
|
+
/** State changed */
|
|
4304
|
+
'state': MicLipSyncState;
|
|
4305
|
+
/** Error occurred */
|
|
4306
|
+
'error': Error;
|
|
4307
|
+
[key: string]: unknown;
|
|
4308
|
+
}
|
|
4309
|
+
declare class MicLipSync extends EventEmitter<MicLipSyncEvents> {
|
|
4310
|
+
private omoteEvents;
|
|
4311
|
+
private mic;
|
|
4312
|
+
private processor;
|
|
4313
|
+
private vad?;
|
|
4314
|
+
private _state;
|
|
4315
|
+
private _isSpeaking;
|
|
4316
|
+
private _currentFrame;
|
|
4317
|
+
private _currentRawFrame;
|
|
4318
|
+
private profile;
|
|
4319
|
+
private speechStartTime;
|
|
4320
|
+
private vadChunkSize;
|
|
4321
|
+
private vadBuffer;
|
|
4322
|
+
private vadBufferOffset;
|
|
4323
|
+
/** Current state */
|
|
4324
|
+
get state(): MicLipSyncState;
|
|
4325
|
+
/** Latest blendshape frame (null before first inference) */
|
|
4326
|
+
get currentFrame(): Float32Array | null;
|
|
4327
|
+
/** Whether speech is currently detected (requires VAD) */
|
|
4328
|
+
get isSpeaking(): boolean;
|
|
4329
|
+
/** Current backend type */
|
|
4330
|
+
get backend(): string | null;
|
|
4331
|
+
constructor(config: MicLipSyncConfig);
|
|
4332
|
+
/** Start microphone capture and inference loop */
|
|
4333
|
+
start(): Promise<void>;
|
|
4334
|
+
/** Stop microphone and inference */
|
|
4335
|
+
stop(): void;
|
|
4336
|
+
/** Pause inference (mic stays open for faster resume) */
|
|
4337
|
+
pause(): void;
|
|
4338
|
+
/** Resume inference after pause */
|
|
4339
|
+
resume(): void;
|
|
4340
|
+
/** Update ExpressionProfile at runtime */
|
|
4341
|
+
setProfile(profile: ExpressionProfile): void;
|
|
4342
|
+
/** Dispose of all resources */
|
|
4343
|
+
dispose(): Promise<void>;
|
|
4344
|
+
private processVAD;
|
|
4345
|
+
private setState;
|
|
4346
|
+
}
|
|
4347
|
+
|
|
4348
|
+
/**
|
|
4349
|
+
* Shared types for orchestration layer
|
|
4350
|
+
*
|
|
4351
|
+
* @category Orchestration
|
|
4352
|
+
*/
|
|
4353
|
+
|
|
4354
|
+
type VoicePipelineState = 'idle' | 'loading' | 'ready' | 'listening' | 'thinking' | 'speaking' | 'error';
|
|
4355
|
+
interface LoadingProgress {
|
|
4356
|
+
currentModel: string;
|
|
4357
|
+
progress: number;
|
|
4358
|
+
totalModels: number;
|
|
4359
|
+
modelsLoaded: number;
|
|
4360
|
+
}
|
|
4361
|
+
interface TranscriptResult {
|
|
4362
|
+
text: string;
|
|
4363
|
+
emotion?: string;
|
|
4364
|
+
language?: string;
|
|
4365
|
+
event?: string;
|
|
4366
|
+
isFinal: boolean;
|
|
4367
|
+
inferenceTimeMs?: number;
|
|
4368
|
+
}
|
|
4369
|
+
/**
|
|
4370
|
+
* Consumer's response handler. VoicePipeline calls this with transcribed text.
|
|
4371
|
+
* Consumer must stream audio back for playback + lip sync.
|
|
4372
|
+
*/
|
|
4373
|
+
interface ResponseHandler {
|
|
4374
|
+
(params: {
|
|
4375
|
+
text: string;
|
|
4376
|
+
emotion?: string;
|
|
4377
|
+
event?: string;
|
|
4378
|
+
/** Stream audio chunks to pipeline for playback + lip sync */
|
|
4379
|
+
send: (chunk: Uint8Array) => Promise<void>;
|
|
4380
|
+
/** Call when all audio has been sent */
|
|
4381
|
+
done: () => Promise<void>;
|
|
4382
|
+
/** Aborted on interruption or stop() */
|
|
4383
|
+
signal: AbortSignal;
|
|
4384
|
+
/** Session ID for backend correlation */
|
|
4385
|
+
sessionId: string;
|
|
4386
|
+
}): Promise<void>;
|
|
4387
|
+
}
|
|
4388
|
+
|
|
4389
|
+
/**
|
|
4390
|
+
* VoicePipeline - Full conversational agent loop
|
|
4391
|
+
*
|
|
4392
|
+
* Composes: MicrophoneCapture → SileroVAD → SenseVoice ASR → PlaybackPipeline (A2E)
|
|
4393
|
+
*
|
|
4394
|
+
* State machine: idle → loading → ready → listening → thinking → speaking → listening → ...
|
|
4395
|
+
*
|
|
4396
|
+
* The consumer provides an `onResponse` callback that receives transcribed text
|
|
4397
|
+
* and streams audio back for playback + lip sync. VoicePipeline is backend-agnostic.
|
|
4398
|
+
*
|
|
4399
|
+
* @category Orchestration
|
|
4400
|
+
*/
|
|
4401
|
+
|
|
4402
|
+
interface VoicePipelineConfig {
|
|
4403
|
+
/** URLs and options for model loading */
|
|
4404
|
+
models: {
|
|
4405
|
+
senseVoice: {
|
|
4406
|
+
modelUrl: string;
|
|
4407
|
+
tokensUrl?: string;
|
|
4408
|
+
language?: string;
|
|
4409
|
+
};
|
|
4410
|
+
lam: {
|
|
4411
|
+
gpuModelUrl: string;
|
|
4412
|
+
gpuExternalDataUrl?: string | false;
|
|
4413
|
+
cpuModelUrl: string;
|
|
4414
|
+
mode?: 'auto' | 'gpu' | 'cpu';
|
|
4415
|
+
};
|
|
4416
|
+
vad: {
|
|
4417
|
+
modelUrl: string;
|
|
4418
|
+
threshold?: number;
|
|
4419
|
+
preSpeechBufferChunks?: number;
|
|
4420
|
+
};
|
|
4421
|
+
};
|
|
4422
|
+
/** Consumer's response handler */
|
|
4423
|
+
onResponse: ResponseHandler;
|
|
4424
|
+
/** Per-character expression weight scaling */
|
|
4425
|
+
profile?: ExpressionProfile;
|
|
4426
|
+
/** Identity/style index for Wav2Vec2 (default: 0) */
|
|
4427
|
+
identityIndex?: number;
|
|
4428
|
+
/** LAM load timeout in ms — CPU fallback on timeout (default: 30000) */
|
|
4429
|
+
lamLoadTimeoutMs?: number;
|
|
4430
|
+
/** Base silence timeout in ms (default: 500) */
|
|
4431
|
+
silenceTimeoutMs?: number;
|
|
4432
|
+
/** Extended silence timeout for long utterances (default: 700) */
|
|
4433
|
+
silenceTimeoutExtendedMs?: number;
|
|
4434
|
+
/** Enable adaptive timeout based on speech duration (default: true) */
|
|
4435
|
+
adaptiveTimeout?: boolean;
|
|
4436
|
+
/** Minimum audio duration in seconds (default: 0.3) */
|
|
4437
|
+
minAudioDurationSec?: number;
|
|
4438
|
+
/** Minimum audio energy (default: 0.02) */
|
|
4439
|
+
minAudioEnergy?: number;
|
|
4440
|
+
/** Enable audio normalization for quiet audio (default: true) */
|
|
4441
|
+
normalizeAudio?: boolean;
|
|
4442
|
+
/** Progressive transcription interval — desktop (default: 500ms) */
|
|
4443
|
+
progressiveIntervalMs?: number;
|
|
4444
|
+
/** Progressive transcription interval — iOS (default: 800ms) */
|
|
4445
|
+
progressiveIntervalIosMs?: number;
|
|
4446
|
+
/** Coverage threshold to use progressive result (default: 0.8) */
|
|
4447
|
+
progressiveCoverageThreshold?: number;
|
|
4448
|
+
/** Minimum samples before progressive transcription starts (default: 8000) */
|
|
4449
|
+
progressiveMinSamples?: number;
|
|
4450
|
+
/** Timeout for individual SenseVoice.transcribe() calls (default: 10000ms) */
|
|
4451
|
+
transcriptionTimeoutMs?: number;
|
|
4452
|
+
/** Enable barge-in detection (default: true) */
|
|
4453
|
+
interruptionEnabled?: boolean;
|
|
4454
|
+
/** Minimum speech duration for interruption (default: 200ms) */
|
|
4455
|
+
interruptionMinSpeechMs?: number;
|
|
4456
|
+
/** Audio playback delay (default: auto-detected) */
|
|
4457
|
+
audioDelayMs?: number;
|
|
4458
|
+
/** Coalescer target duration (default: 200ms) */
|
|
4459
|
+
chunkTargetMs?: number;
|
|
4460
|
+
/** Enable neutral transition on playback complete (default: true) */
|
|
4461
|
+
neutralTransitionEnabled?: boolean;
|
|
4462
|
+
/** Duration of neutral fade-out (default: 250ms) */
|
|
4463
|
+
neutralTransitionMs?: number;
|
|
4464
|
+
}
|
|
4465
|
+
interface VoicePipelineEvents {
|
|
4466
|
+
'state': VoicePipelineState;
|
|
4467
|
+
'loading:progress': LoadingProgress;
|
|
4468
|
+
'transcript': TranscriptResult;
|
|
4469
|
+
'frame': FullFaceFrame;
|
|
4470
|
+
'frame:raw': Float32Array;
|
|
4471
|
+
'speech:start': void;
|
|
4472
|
+
'speech:end': {
|
|
4473
|
+
durationMs: number;
|
|
4474
|
+
};
|
|
4475
|
+
'playback:start': {
|
|
4476
|
+
time: number;
|
|
4477
|
+
};
|
|
4478
|
+
'playback:complete': void;
|
|
4479
|
+
'interruption': void;
|
|
4480
|
+
'audio:level': {
|
|
4481
|
+
rms: number;
|
|
4482
|
+
peak: number;
|
|
4483
|
+
};
|
|
4484
|
+
'error': Error;
|
|
4485
|
+
[key: string]: unknown;
|
|
4486
|
+
}
|
|
4487
|
+
declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
|
|
4488
|
+
private readonly config;
|
|
4489
|
+
private _state;
|
|
4490
|
+
private stopped;
|
|
4491
|
+
private epoch;
|
|
4492
|
+
private _sessionId;
|
|
4493
|
+
private asr;
|
|
4494
|
+
private lam;
|
|
4495
|
+
private vad;
|
|
4496
|
+
private unifiedWorker;
|
|
4497
|
+
private playback;
|
|
4498
|
+
private interruption;
|
|
4499
|
+
private omoteEvents;
|
|
4500
|
+
private mic;
|
|
4501
|
+
private audioBuffer;
|
|
4502
|
+
private audioBufferSamples;
|
|
4503
|
+
private speechStartTime;
|
|
4504
|
+
private silenceTimer;
|
|
4505
|
+
private isSpeaking;
|
|
4506
|
+
private progressiveTimer;
|
|
4507
|
+
private progressivePromise;
|
|
4508
|
+
private lastProgressiveResult;
|
|
4509
|
+
private lastProgressiveSamples;
|
|
4510
|
+
private asrErrorCount;
|
|
4511
|
+
private responseAbortController;
|
|
4512
|
+
private _currentFrame;
|
|
4513
|
+
/** Current pipeline state */
|
|
4514
|
+
get state(): VoicePipelineState;
|
|
4515
|
+
/** Latest blendshape frame */
|
|
4516
|
+
get currentFrame(): Float32Array | null;
|
|
4517
|
+
/** Whether user is currently speaking */
|
|
4518
|
+
get isSpeechActive(): boolean;
|
|
4519
|
+
/** Session ID (generated on start(), null before) */
|
|
4520
|
+
get sessionId(): string | null;
|
|
4521
|
+
constructor(config: VoicePipelineConfig);
|
|
4522
|
+
loadModels(): Promise<void>;
|
|
4523
|
+
start(): Promise<void>;
|
|
4524
|
+
stop(): void;
|
|
4525
|
+
setProfile(profile: ExpressionProfile): void;
|
|
4526
|
+
dispose(): Promise<void>;
|
|
4527
|
+
private processAudioChunk;
|
|
4528
|
+
private getSilenceTimeout;
|
|
4529
|
+
private onSilenceDetected;
|
|
4530
|
+
private processEndOfSpeech;
|
|
4531
|
+
private callResponseHandler;
|
|
4532
|
+
private handleInterruption;
|
|
4533
|
+
private startProgressiveTranscription;
|
|
4534
|
+
private stopProgressiveTranscription;
|
|
4535
|
+
private transcribeWithTimeout;
|
|
4536
|
+
private normalizeAudio;
|
|
4537
|
+
private setState;
|
|
4538
|
+
private emitProgress;
|
|
4539
|
+
private clearSilenceTimer;
|
|
4540
|
+
}
|
|
4541
|
+
|
|
4542
|
+
export { type A2EBackend, type A2EModelInfo, A2EOrchestrator, type A2EOrchestratorConfig, A2EProcessor, type A2EProcessorConfig, type A2EProgressEvent, type A2EResult, ARKIT_BLENDSHAPES, type ActiveSpan, type AnimationClip, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type CreateA2EConfig, type CreateSenseVoiceConfig, DEFAULT_ANIMATION_CONFIG, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, type EmotionWeights, EmphasisDetector, EventEmitter, type ExpressionProfile, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LoadingProgress, MODEL_LOAD_TIME_BUCKETS, type MetricData, MetricNames, MicLipSync, type MicLipSyncConfig, type MicLipSyncEvents, type MicLipSyncFrame, type MicLipSyncState, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, PlaybackPipeline, type PlaybackPipelineConfig, type PlaybackPipelineEvents, type PlaybackState, ProceduralLifeLayer, type QuotaInfo, type ResponseHandler, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TranscriptResult, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, VoicePipeline, type VoicePipelineConfig, type VoicePipelineEvents, type VoicePipelineState, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2ArkitCpuUnifiedAdapter, Wav2ArkitCpuWorker, type Wav2ArkitCpuWorkerConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, applyProfile, blendEmotions, calculatePeak, calculateRMS, configureCacheLimit, configureTelemetry, createA2E, createEmotionVector, createSenseVoice, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getRecommendedBackend, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, preloadModels, resolveBackend, shouldEnableWasmProxy, shouldUseCpuA2E, shouldUseNativeASR, shouldUseServerA2E, supportsVADWorker };
|