@omote/core 0.4.7 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -485,730 +485,279 @@ var AudioChunkCoalescer = class {
485
485
  }
486
486
  };
487
487
 
488
- // src/audio/LAMPipeline.ts
489
- var LAMPipeline = class {
490
- constructor(options = {}) {
491
- this.options = options;
492
- this.REQUIRED_SAMPLES = 16e3;
493
- // 1.0s at 16kHz (LAM requirement)
494
- this.FRAME_RATE = 30;
495
- // LAM outputs 30fps
496
- this.buffer = new Float32Array(0);
488
+ // src/inference/A2EProcessor.ts
489
+ var logger = createLogger("A2EProcessor");
490
+ var FRAME_RATE = 30;
491
+ var DRIP_INTERVAL_MS = 33;
492
+ var A2EProcessor = class {
493
+ constructor(config) {
494
+ this.writeOffset = 0;
497
495
  this.bufferStartTime = 0;
498
- this.frameQueue = [];
499
- /**
500
- * Last successfully retrieved frame
501
- * Used as fallback when no new frame is available to prevent avatar freezing
502
- */
503
- this.lastFrame = null;
504
- }
505
- /**
506
- * Push audio samples into the pipeline
496
+ // Frame queues (timestamped for pull mode, plain for drip mode)
497
+ this.timestampedQueue = [];
498
+ this.plainQueue = [];
499
+ // Push mode state
500
+ this._latestFrame = null;
501
+ this.dripInterval = null;
502
+ // Last-frame-hold for pull mode (prevents avatar freezing between frames)
503
+ this.lastPulledFrame = null;
504
+ // Inference serialization
505
+ this.inferenceRunning = false;
506
+ this.pendingChunks = [];
507
+ // Diagnostic: track getFrameForTime calls
508
+ this.getFrameCallCount = 0;
509
+ this.disposed = false;
510
+ this.backend = config.backend;
511
+ this.sampleRate = config.sampleRate ?? 16e3;
512
+ this.chunkSize = config.chunkSize ?? config.backend.chunkSize ?? 16e3;
513
+ this.onFrame = config.onFrame;
514
+ this.onError = config.onError;
515
+ this.bufferCapacity = this.chunkSize * 2;
516
+ this.buffer = new Float32Array(this.bufferCapacity);
517
+ }
518
+ // ═══════════════════════════════════════════════════════════════════════
519
+ // Audio Input
520
+ // ═══════════════════════════════════════════════════════════════════════
521
+ /**
522
+ * Push audio samples for inference (any source: mic, TTS, file).
507
523
  *
508
- * Accumulates samples and triggers LAM inference when buffer is full.
509
- * Multiple calls may be needed to accumulate enough samples.
524
+ * - With `timestamp`: frames stored with timestamps (pull mode)
525
+ * - Without `timestamp`: frames stored in plain queue (drip/push mode)
510
526
  *
511
- * @param samples - Float32Array of audio samples
512
- * @param timestamp - AudioContext time when these samples start playing
513
- * @param lam - LAM inference engine
527
+ * Fire-and-forget: returns immediately, inference runs async.
514
528
  */
515
- async push(samples, timestamp, lam) {
516
- if (this.buffer.length === 0) {
529
+ pushAudio(samples, timestamp) {
530
+ if (this.disposed) return;
531
+ if (this.writeOffset === 0 && timestamp !== void 0) {
517
532
  this.bufferStartTime = timestamp;
518
533
  }
519
- const newBuffer = new Float32Array(this.buffer.length + samples.length);
520
- newBuffer.set(this.buffer, 0);
521
- newBuffer.set(samples, this.buffer.length);
522
- this.buffer = newBuffer;
523
- while (this.buffer.length >= this.REQUIRED_SAMPLES) {
524
- await this.processBuffer(lam);
525
- if (this.buffer.length >= this.REQUIRED_SAMPLES) {
526
- await new Promise((r) => setTimeout(r, 0));
527
- }
528
- }
529
- }
530
- /**
531
- * Process accumulated buffer through LAM inference
532
- */
533
- async processBuffer(lam) {
534
- try {
535
- const toProcess = this.buffer.slice(0, this.REQUIRED_SAMPLES);
536
- const processedStartTime = this.bufferStartTime;
537
- this.buffer = this.buffer.slice(this.REQUIRED_SAMPLES);
538
- const processedDuration = this.REQUIRED_SAMPLES / (this.options.sampleRate ?? 16e3);
539
- this.bufferStartTime = processedStartTime + processedDuration;
540
- const result = await lam.infer(toProcess);
541
- const frameDuration = 1 / this.FRAME_RATE;
542
- for (let i = 0; i < result.blendshapes.length; i++) {
543
- const frame = result.blendshapes[i];
544
- const timestamp = processedStartTime + i * frameDuration;
545
- this.frameQueue.push({ frame, timestamp });
546
- }
547
- this.options.onInference?.(result.blendshapes.length);
548
- } catch (error) {
549
- this.options.onError?.(error);
550
- this.buffer = new Float32Array(0);
551
- this.bufferStartTime = 0;
552
- }
553
- }
554
- /**
555
- * Get the frame that should be displayed at the current time
556
- *
557
- * Automatically removes frames that have already been displayed.
558
- * This prevents memory leaks from accumulating old frames.
559
- *
560
- * Discard Window (prevents premature frame discarding):
561
- * - WebGPU: 0.5s (LAM inference 20-100ms + RAF jitter + React stalls)
562
- * - WASM: 1.0s (LAM inference 50-500ms + higher variability)
563
- *
564
- * Last-Frame-Hold: Returns last valid frame instead of null to prevent
565
- * avatar freezing when between frames (RAF at 60fps vs LAM at 30fps).
566
- *
567
- * @param currentTime - Current AudioContext time
568
- * @param lam - LAM inference engine (optional, for backend detection)
569
- * @returns Current frame, or last frame as fallback, or null if no frames yet
570
- */
571
- getFrameForTime(currentTime, lam) {
572
- const discardWindow = lam?.backend === "wasm" ? 1 : 0.5;
573
- let discardedCount = 0;
574
- while (this.frameQueue.length > 0 && this.frameQueue[0].timestamp < currentTime - discardWindow) {
575
- const discarded = this.frameQueue.shift();
576
- discardedCount++;
577
- if (discardedCount === 1) {
578
- const ageMs = ((currentTime - discarded.timestamp) * 1e3).toFixed(0);
579
- console.warn("[LAM] Frame(s) discarded as too old", {
580
- ageMs,
581
- discardWindowMs: discardWindow * 1e3,
582
- queueLength: this.frameQueue.length,
583
- backend: lam?.backend ?? "unknown"
584
- });
585
- }
586
- }
587
- if (this.frameQueue.length > 0 && this.frameQueue[0].timestamp <= currentTime) {
588
- const { frame } = this.frameQueue.shift();
589
- this.lastFrame = frame;
590
- return frame;
591
- }
592
- return this.lastFrame;
593
- }
594
- /**
595
- * Get all frames in the queue (for debugging/monitoring)
596
- */
597
- getQueuedFrames() {
598
- return [...this.frameQueue];
599
- }
600
- /**
601
- * Get current buffer fill level (0-1)
602
- */
603
- get fillLevel() {
604
- return Math.min(1, this.buffer.length / this.REQUIRED_SAMPLES);
605
- }
606
- /**
607
- * Get number of frames queued
608
- */
609
- get queuedFrameCount() {
610
- return this.frameQueue.length;
611
- }
612
- /**
613
- * Get buffered audio duration in seconds
614
- */
615
- get bufferedDuration() {
616
- return this.buffer.length / (this.options.sampleRate ?? 16e3);
617
- }
618
- /**
619
- * Flush remaining buffered audio
620
- *
621
- * Processes any remaining audio in the buffer, even if less than REQUIRED_SAMPLES.
622
- * This ensures the final audio chunk generates blendshape frames.
623
- *
624
- * Should be called when audio stream ends to prevent losing the last 0-1 seconds.
625
- *
626
- * @param lam - LAM inference engine
627
- */
628
- async flush(lam) {
629
- if (this.buffer.length === 0) {
630
- return;
631
- }
632
- const padded = new Float32Array(this.REQUIRED_SAMPLES);
633
- padded.set(this.buffer, 0);
634
- const processedStartTime = this.bufferStartTime;
635
- try {
636
- const result = await lam.infer(padded);
637
- const actualDuration = this.buffer.length / (this.options.sampleRate ?? 16e3);
638
- const frameDuration = 1 / this.FRAME_RATE;
639
- const actualFrameCount = Math.ceil(actualDuration * this.FRAME_RATE);
640
- for (let i = 0; i < Math.min(actualFrameCount, result.blendshapes.length); i++) {
641
- const frame = result.blendshapes[i];
642
- const timestamp = processedStartTime + i * frameDuration;
643
- this.frameQueue.push({ frame, timestamp });
534
+ if (this.writeOffset + samples.length > this.bufferCapacity) {
535
+ this.bufferCapacity = (this.writeOffset + samples.length) * 2;
536
+ const grown = new Float32Array(this.bufferCapacity);
537
+ grown.set(this.buffer.subarray(0, this.writeOffset));
538
+ this.buffer = grown;
539
+ }
540
+ this.buffer.set(samples, this.writeOffset);
541
+ this.writeOffset += samples.length;
542
+ logger.debug("pushAudio", {
543
+ samplesIn: samples.length,
544
+ writeOffset: this.writeOffset,
545
+ chunkSize: this.chunkSize,
546
+ willExtract: this.writeOffset >= this.chunkSize,
547
+ inferenceRunning: this.inferenceRunning,
548
+ pendingChunks: this.pendingChunks.length,
549
+ queuedFrames: this.timestampedQueue.length + this.plainQueue.length
550
+ });
551
+ while (this.writeOffset >= this.chunkSize) {
552
+ const chunk = this.buffer.slice(0, this.chunkSize);
553
+ this.buffer.copyWithin(0, this.chunkSize, this.writeOffset);
554
+ this.writeOffset -= this.chunkSize;
555
+ const chunkTimestamp = timestamp !== void 0 ? this.bufferStartTime : void 0;
556
+ this.pendingChunks.push({ chunk, timestamp: chunkTimestamp });
557
+ logger.info("Chunk queued for inference", {
558
+ chunkSize: chunk.length,
559
+ chunkTimestamp,
560
+ pendingChunks: this.pendingChunks.length,
561
+ remainderOffset: this.writeOffset
562
+ });
563
+ if (timestamp !== void 0) {
564
+ this.bufferStartTime += this.chunkSize / this.sampleRate;
644
565
  }
645
- this.buffer = new Float32Array(0);
646
- this.bufferStartTime = 0;
647
- this.options.onInference?.(Math.min(actualFrameCount, result.blendshapes.length));
648
- } catch (error) {
649
- this.options.onError?.(error);
650
- this.buffer = new Float32Array(0);
651
- this.bufferStartTime = 0;
652
566
  }
567
+ this.drainPendingChunks();
653
568
  }
654
569
  /**
655
- * Adjust all queued frame timestamps by an offset
656
- *
657
- * Used for synchronization when audio scheduling time differs from
658
- * the estimated time used during LAM processing.
570
+ * Flush remaining buffered audio (pads to chunkSize).
571
+ * Call at end of stream to process final partial chunk.
659
572
  *
660
- * @param offset - Time offset in seconds to add to all timestamps
573
+ * Routes through the serialized pendingChunks pipeline to maintain
574
+ * correct frame ordering. Without this, flush() could push frames
575
+ * with the latest timestamp to the queue before drainPendingChunks()
576
+ * finishes pushing frames with earlier timestamps — causing
577
+ * getFrameForTime() to see out-of-order timestamps and stall.
661
578
  */
662
- adjustTimestamps(offset) {
663
- for (const frame of this.frameQueue) {
664
- frame.timestamp += offset;
665
- }
579
+ async flush() {
580
+ if (this.disposed || this.writeOffset === 0) return;
581
+ const padded = new Float32Array(this.chunkSize);
582
+ padded.set(this.buffer.subarray(0, this.writeOffset), 0);
583
+ const chunkTimestamp = this.bufferStartTime > 0 ? this.bufferStartTime : void 0;
584
+ logger.info("flush: routing through drain pipeline", {
585
+ actualSamples: this.writeOffset,
586
+ chunkTimestamp: chunkTimestamp?.toFixed(3),
587
+ pendingChunks: this.pendingChunks.length,
588
+ inferenceRunning: this.inferenceRunning
589
+ });
590
+ this.writeOffset = 0;
591
+ this.bufferStartTime = 0;
592
+ this.pendingChunks.push({ chunk: padded, timestamp: chunkTimestamp });
593
+ this.drainPendingChunks();
666
594
  }
667
595
  /**
668
- * Reset the pipeline
596
+ * Reset buffer and frame queues
669
597
  */
670
598
  reset() {
671
- this.buffer = new Float32Array(0);
599
+ this.writeOffset = 0;
672
600
  this.bufferStartTime = 0;
673
- this.frameQueue = [];
674
- this.lastFrame = null;
675
- }
676
- };
677
-
678
- // src/audio/audioUtils.ts
679
- function pcm16ToFloat32(buffer) {
680
- const byteLen = buffer.byteLength & ~1;
681
- const int16 = byteLen === buffer.byteLength ? new Int16Array(buffer) : new Int16Array(buffer, 0, byteLen / 2);
682
- const float32 = new Float32Array(int16.length);
683
- for (let i = 0; i < int16.length; i++) {
684
- float32[i] = int16[i] / 32768;
685
- }
686
- return float32;
687
- }
688
- function int16ToFloat32(int16) {
689
- const float32 = new Float32Array(int16.length);
690
- for (let i = 0; i < int16.length; i++) {
691
- float32[i] = int16[i] / 32768;
692
- }
693
- return float32;
694
- }
695
-
696
- // src/audio/SyncedAudioPipeline.ts
697
- var SyncedAudioPipeline = class extends EventEmitter {
698
- constructor(options) {
699
- super();
700
- this.options = options;
701
- this.playbackStarted = false;
702
- this.monitorInterval = null;
703
- this.frameAnimationId = null;
704
- const sampleRate = options.sampleRate ?? 16e3;
705
- const autoDelay = options.lam.modelId === "wav2arkit_cpu" ? 750 : options.lam.backend === "wasm" ? 350 : 50;
706
- const audioDelayMs = options.audioDelayMs ?? autoDelay;
707
- this.scheduler = new AudioScheduler({
708
- sampleRate,
709
- initialLookaheadSec: audioDelayMs / 1e3
710
- });
711
- this.coalescer = new AudioChunkCoalescer({
712
- sampleRate,
713
- targetDurationMs: options.chunkTargetMs ?? 200
714
- });
715
- this.lamPipeline = new LAMPipeline({
716
- sampleRate,
717
- onError: (error) => {
718
- this.emit("error", error);
719
- }
720
- });
721
- }
722
- /**
723
- * Initialize the pipeline
724
- */
725
- async initialize() {
726
- await this.scheduler.initialize();
727
- }
728
- /**
729
- * Start a new playback session
730
- *
731
- * Resets all state and prepares for incoming audio chunks.
732
- * Audio will be scheduled immediately as chunks arrive (no buffering).
733
- */
734
- start() {
735
- this.stopMonitoring();
736
- this.scheduler.reset();
737
- this.coalescer.reset();
738
- this.lamPipeline.reset();
739
- this.playbackStarted = false;
740
- this.scheduler.warmup();
741
- this.startFrameLoop();
742
- this.startMonitoring();
743
- }
744
- /**
745
- * Receive audio chunk from network
601
+ this.timestampedQueue = [];
602
+ this.plainQueue = [];
603
+ this._latestFrame = null;
604
+ this.lastPulledFrame = null;
605
+ this.pendingChunks = [];
606
+ this.inferenceRunning = false;
607
+ this.getFrameCallCount = 0;
608
+ }
609
+ // ═══════════════════════════════════════════════════════════════════════
610
+ // Frame Output Pull Mode (TTS playback)
611
+ // ═══════════════════════════════════════════════════════════════════════
612
+ /**
613
+ * Get frame synced to external clock (e.g. AudioContext.currentTime).
746
614
  *
747
- * Audio-first design: schedules audio immediately, LAM runs in background.
748
- * This prevents LAM inference (50-300ms) from blocking audio scheduling,
749
- * which caused audible stuttering with continuous audio streams.
615
+ * Discards frames that are too old, returns the current frame,
616
+ * or holds last frame as fallback to prevent avatar freezing.
750
617
  *
751
- * @param chunk - Uint8Array containing Int16 PCM audio
752
- */
753
- async onAudioChunk(chunk) {
754
- const combined = this.coalescer.add(chunk);
755
- if (!combined) {
756
- return;
618
+ * @param currentTime - Current playback time (seconds)
619
+ * @returns Blendshape frame, or null if no frames yet
620
+ */
621
+ getFrameForTime(currentTime) {
622
+ this.getFrameCallCount++;
623
+ const discardWindow = this.backend.backend === "wasm" ? 1 : 0.5;
624
+ let discardCount = 0;
625
+ while (this.timestampedQueue.length > 0 && this.timestampedQueue[0].timestamp < currentTime - discardWindow) {
626
+ this.timestampedQueue.shift();
627
+ discardCount++;
628
+ }
629
+ if (discardCount > 0) {
630
+ logger.warn("getFrameForTime DISCARDED stale frames", {
631
+ discardCount,
632
+ currentTime: currentTime.toFixed(3),
633
+ discardWindow,
634
+ remainingFrames: this.timestampedQueue.length,
635
+ nextFrameTs: this.timestampedQueue.length > 0 ? this.timestampedQueue[0].timestamp.toFixed(3) : "none"
636
+ });
757
637
  }
758
- const float32 = pcm16ToFloat32(combined);
759
- const scheduleTime = await this.scheduler.schedule(float32);
760
- if (!this.playbackStarted) {
761
- this.playbackStarted = true;
762
- this.emit("playback_start", scheduleTime);
638
+ if (this.timestampedQueue.length > 0 && this.timestampedQueue[0].timestamp <= currentTime) {
639
+ const { frame } = this.timestampedQueue.shift();
640
+ this.lastPulledFrame = frame;
641
+ return frame;
763
642
  }
764
- this.lamPipeline.push(float32, scheduleTime, this.options.lam).catch((err) => {
765
- this.emit("error", err);
766
- });
767
- }
768
- /**
769
- * End of audio stream
770
- *
771
- * Flushes any remaining buffered data.
772
- */
773
- async end() {
774
- const remaining = this.coalescer.flush();
775
- if (remaining) {
776
- const chunk = new Uint8Array(remaining);
777
- await this.onAudioChunk(chunk);
643
+ if (this.timestampedQueue.length > 0 && this.getFrameCallCount % 60 === 0) {
644
+ logger.warn("getFrameForTime: frames in queue but NOT consumable", {
645
+ queueLen: this.timestampedQueue.length,
646
+ frontTimestamp: this.timestampedQueue[0].timestamp.toFixed(4),
647
+ currentTime: currentTime.toFixed(4),
648
+ delta: (this.timestampedQueue[0].timestamp - currentTime).toFixed(4),
649
+ callCount: this.getFrameCallCount
650
+ });
778
651
  }
779
- await this.lamPipeline.flush(this.options.lam);
652
+ return this.lastPulledFrame;
780
653
  }
781
- /**
782
- * Stop playback immediately with smooth fade-out
783
- *
784
- * Gracefully cancels all audio playback and LAM processing:
785
- * - Fades out audio over specified duration (default: 50ms)
786
- * - Cancels pending LAM inferences
787
- * - Clears all buffers and queues
788
- * - Emits 'playback_complete' event
789
- *
790
- * Use this for interruptions (e.g., user barge-in during AI speech).
791
- *
792
- * @param fadeOutMs - Fade-out duration in milliseconds (default: 50ms)
793
- * @returns Promise that resolves when fade-out completes
794
- */
795
- async stop(fadeOutMs = 50) {
796
- this.stopMonitoring();
797
- await this.scheduler.cancelAll(fadeOutMs);
798
- this.coalescer.reset();
799
- this.lamPipeline.reset();
800
- this.playbackStarted = false;
801
- this.emit("playback_complete", void 0);
654
+ // ═══════════════════════════════════════════════════════════════════════
655
+ // Frame Output Push Mode (live mic, game loop)
656
+ // ═══════════════════════════════════════════════════════════════════════
657
+ /** Latest frame from drip-feed (live mic, game loop) */
658
+ get latestFrame() {
659
+ return this._latestFrame;
802
660
  }
803
- /**
804
- * Start frame animation loop
805
- *
806
- * Uses requestAnimationFrame to check for new LAM frames.
807
- * Synchronized to AudioContext clock (not visual refresh rate).
808
- *
809
- * Frame Emission Strategy:
810
- * - LAMPipeline uses last-frame-hold to prevent null returns
811
- * - Always emit frames (even repeated frames) to maintain smooth animation
812
- * - Renderer is responsible for detecting duplicate frames if needed
813
- */
814
- startFrameLoop() {
815
- const updateFrame = () => {
816
- const currentTime = this.scheduler.getCurrentTime();
817
- const frame = this.lamPipeline.getFrameForTime(currentTime, this.options.lam);
661
+ /** Start 30fps drip-feed timer (push mode) */
662
+ startDrip() {
663
+ if (this.dripInterval) return;
664
+ this.dripInterval = setInterval(() => {
665
+ const frame = this.plainQueue.shift();
818
666
  if (frame) {
819
- this.emit("frame_ready", frame);
667
+ this._latestFrame = frame;
668
+ this.onFrame?.(frame);
820
669
  }
821
- this.frameAnimationId = requestAnimationFrame(updateFrame);
822
- };
823
- this.frameAnimationId = requestAnimationFrame(updateFrame);
670
+ }, DRIP_INTERVAL_MS);
824
671
  }
825
- /**
826
- * Start monitoring for playback completion
827
- */
828
- startMonitoring() {
829
- if (this.monitorInterval) {
830
- clearInterval(this.monitorInterval);
672
+ /** Stop drip-feed timer */
673
+ stopDrip() {
674
+ if (this.dripInterval) {
675
+ clearInterval(this.dripInterval);
676
+ this.dripInterval = null;
831
677
  }
832
- this.monitorInterval = window.setInterval(() => {
833
- if (this.scheduler.isComplete() && this.lamPipeline.queuedFrameCount === 0) {
834
- this.emit("playback_complete", void 0);
835
- this.stopMonitoring();
836
- }
837
- }, 100);
838
678
  }
839
- /**
840
- * Stop monitoring
841
- */
842
- stopMonitoring() {
843
- if (this.monitorInterval) {
844
- clearInterval(this.monitorInterval);
845
- this.monitorInterval = null;
846
- }
847
- if (this.frameAnimationId) {
848
- cancelAnimationFrame(this.frameAnimationId);
849
- this.frameAnimationId = null;
850
- }
679
+ // ═══════════════════════════════════════════════════════════════════════
680
+ // State
681
+ // ═══════════════════════════════════════════════════════════════════════
682
+ /** Number of frames waiting in queue (both modes combined) */
683
+ get queuedFrameCount() {
684
+ return this.timestampedQueue.length + this.plainQueue.length;
851
685
  }
852
- /**
853
- * Get current pipeline state (for debugging/monitoring)
854
- */
855
- getState() {
856
- return {
857
- playbackStarted: this.playbackStarted,
858
- coalescerFill: this.coalescer.fillLevel,
859
- lamFill: this.lamPipeline.fillLevel,
860
- queuedFrames: this.lamPipeline.queuedFrameCount,
861
- currentTime: this.scheduler.getCurrentTime(),
862
- playbackEndTime: this.scheduler.getPlaybackEndTime()
863
- };
686
+ /** Buffer fill level as fraction of chunkSize (0-1) */
687
+ get fillLevel() {
688
+ return Math.min(1, this.writeOffset / this.chunkSize);
864
689
  }
865
- /**
866
- * Cleanup resources
867
- */
690
+ /** Dispose resources */
868
691
  dispose() {
869
- this.stopMonitoring();
870
- this.scheduler.dispose();
871
- this.coalescer.reset();
872
- this.lamPipeline.reset();
873
- }
874
- };
875
-
876
- // src/animation/EmotionToBlendshapeMapper.ts
877
- var UPPER_FACE_BLENDSHAPES = [
878
- // Brows (5)
879
- "browDownLeft",
880
- "browDownRight",
881
- "browInnerUp",
882
- "browOuterUpLeft",
883
- "browOuterUpRight",
884
- // Eyes (4)
885
- "eyeSquintLeft",
886
- "eyeSquintRight",
887
- "eyeWideLeft",
888
- "eyeWideRight",
889
- // Cheeks (2)
890
- "cheekSquintLeft",
891
- "cheekSquintRight"
892
- ];
893
- var EMOTION_ARKIT_MAP = {
894
- happy: {
895
- // AU6 - Cheek raiser (primary Duchenne smile marker)
896
- cheekSquintLeft: 0.5,
897
- cheekSquintRight: 0.5,
898
- // Slight eye squint from genuine smile (orbicularis oculi activation)
899
- eyeSquintLeft: 0.2,
900
- eyeSquintRight: 0.2
901
- },
902
- angry: {
903
- // AU4 - Brow lowerer (intense, primary anger marker)
904
- browDownLeft: 0.7,
905
- browDownRight: 0.7,
906
- // AU5 - Upper lid raiser (wide eyes, part of the "glare")
907
- eyeWideLeft: 0.4,
908
- eyeWideRight: 0.4,
909
- // AU7 - Lid tightener (tense stare, combines with AU5 for angry glare)
910
- eyeSquintLeft: 0.3,
911
- eyeSquintRight: 0.3
912
- },
913
- sad: {
914
- // AU1 - Inner brow raiser (primary sadness marker)
915
- browInnerUp: 0.6,
916
- // AU4 - Brow lowerer (brows drawn together)
917
- browDownLeft: 0.3,
918
- browDownRight: 0.3
919
- },
920
- neutral: {}
921
- // All zeros - no expression overlay
922
- };
923
- var DEFAULT_CONFIG = {
924
- smoothingFactor: 0.15,
925
- confidenceThreshold: 0.3,
926
- intensity: 1,
927
- blendMode: "dominant",
928
- minBlendProbability: 0.1,
929
- energyModulation: false,
930
- minEnergyScale: 0.3,
931
- maxEnergyScale: 1
932
- };
933
- function createZeroBlendshapes() {
934
- const result = {};
935
- for (const name of UPPER_FACE_BLENDSHAPES) {
936
- result[name] = 0;
937
- }
938
- return result;
939
- }
940
- function clamp01(value) {
941
- return Math.max(0, Math.min(1, value));
942
- }
943
- var EmotionToBlendshapeMapper = class {
944
- /**
945
- * Create a new EmotionToBlendshapeMapper
946
- *
947
- * @param config - Optional configuration
948
- */
949
- constructor(config) {
950
- this.currentEnergy = 1;
951
- this.config = {
952
- ...DEFAULT_CONFIG,
953
- ...config
954
- };
955
- this.targetBlendshapes = createZeroBlendshapes();
956
- this.currentBlendshapes = createZeroBlendshapes();
957
- }
958
- /**
959
- * Map an emotion frame to target blendshapes
960
- *
961
- * This sets the target values that the mapper will smoothly interpolate
962
- * towards. Call update() each frame to apply smoothing.
963
- *
964
- * @param frame - Emotion frame from Emotion2VecInference
965
- * @param audioEnergy - Optional audio energy (0-1) for energy modulation
966
- * @returns Target upper face blendshapes (before smoothing)
967
- */
968
- mapFrame(frame, audioEnergy) {
969
- this.targetBlendshapes = createZeroBlendshapes();
970
- if (audioEnergy !== void 0) {
971
- this.currentEnergy = clamp01(audioEnergy);
972
- }
973
- if (!frame) {
974
- return { ...this.targetBlendshapes };
975
- }
976
- if (this.config.blendMode === "weighted") {
977
- this.mapFrameWeighted(frame);
978
- } else {
979
- this.mapFrameDominant(frame);
980
- }
981
- if (this.config.energyModulation) {
982
- this.applyEnergyModulation();
983
- }
984
- return { ...this.targetBlendshapes };
692
+ if (this.disposed) return;
693
+ this.disposed = true;
694
+ this.stopDrip();
695
+ this.reset();
985
696
  }
697
+ // ═══════════════════════════════════════════════════════════════════════
698
+ // Private
699
+ // ═══════════════════════════════════════════════════════════════════════
986
700
  /**
987
- * Map using dominant emotion only (original behavior)
701
+ * Process pending chunks sequentially.
702
+ * Fire-and-forget — called from pushAudio() without awaiting.
988
703
  */
989
- mapFrameDominant(frame) {
990
- if (frame.confidence < this.config.confidenceThreshold) {
991
- return;
992
- }
993
- const emotion = frame.emotion;
994
- const mapping = EMOTION_ARKIT_MAP[emotion];
995
- if (!mapping) {
996
- return;
997
- }
998
- const scale = this.config.intensity * frame.confidence;
999
- for (const [name, value] of Object.entries(mapping)) {
1000
- const blendshapeName = name;
1001
- if (value !== void 0) {
1002
- this.targetBlendshapes[blendshapeName] = clamp01(value * scale);
704
+ drainPendingChunks() {
705
+ if (this.inferenceRunning || this.pendingChunks.length === 0) {
706
+ if (this.inferenceRunning && this.pendingChunks.length > 0) {
707
+ logger.debug("drainPendingChunks skipped (inference running)", {
708
+ pendingChunks: this.pendingChunks.length
709
+ });
1003
710
  }
1004
- }
1005
- }
1006
- /**
1007
- * Map using weighted blend of all emotions by probability
1008
- * Creates more nuanced expressions (e.g., bittersweet = happy + sad)
1009
- */
1010
- mapFrameWeighted(frame) {
1011
- if (!frame.probabilities) {
1012
- this.mapFrameDominant(frame);
1013
711
  return;
1014
712
  }
1015
- for (const [emotion, probability] of Object.entries(frame.probabilities)) {
1016
- if (probability < this.config.minBlendProbability) {
1017
- continue;
1018
- }
1019
- const mapping = EMOTION_ARKIT_MAP[emotion];
1020
- if (!mapping) {
1021
- continue;
1022
- }
1023
- const scale = this.config.intensity * probability;
1024
- for (const [name, value] of Object.entries(mapping)) {
1025
- const blendshapeName = name;
1026
- if (value !== void 0) {
1027
- this.targetBlendshapes[blendshapeName] += value * scale;
713
+ this.inferenceRunning = true;
714
+ logger.info("drainPendingChunks starting", { pendingChunks: this.pendingChunks.length });
715
+ const processNext = async () => {
716
+ while (this.pendingChunks.length > 0 && !this.disposed) {
717
+ const { chunk, timestamp } = this.pendingChunks.shift();
718
+ try {
719
+ const t0 = performance.now();
720
+ const result = await this.backend.infer(chunk);
721
+ const inferMs = Math.round(performance.now() - t0);
722
+ const actualDuration = chunk.length / this.sampleRate;
723
+ const actualFrameCount = Math.ceil(actualDuration * FRAME_RATE);
724
+ const framesToQueue = Math.min(actualFrameCount, result.blendshapes.length);
725
+ logger.info("Inference complete", {
726
+ inferMs,
727
+ modelFrames: result.blendshapes.length,
728
+ framesToQueue,
729
+ timestamp,
730
+ totalQueued: this.timestampedQueue.length + framesToQueue,
731
+ remainingPending: this.pendingChunks.length
732
+ });
733
+ for (let i = 0; i < framesToQueue; i++) {
734
+ if (timestamp !== void 0) {
735
+ this.timestampedQueue.push({
736
+ frame: result.blendshapes[i],
737
+ timestamp: timestamp + i / FRAME_RATE
738
+ });
739
+ } else {
740
+ this.plainQueue.push(result.blendshapes[i]);
741
+ }
742
+ }
743
+ } catch (err) {
744
+ this.handleError(err);
745
+ }
746
+ if (this.pendingChunks.length > 0) {
747
+ await new Promise((r) => setTimeout(r, 0));
1028
748
  }
1029
749
  }
1030
- }
1031
- for (const name of UPPER_FACE_BLENDSHAPES) {
1032
- this.targetBlendshapes[name] = clamp01(this.targetBlendshapes[name]);
1033
- }
1034
- }
1035
- /**
1036
- * Apply energy modulation to scale emotion intensity by audio energy
1037
- * Louder speech = stronger expressions
1038
- */
1039
- applyEnergyModulation() {
1040
- const { minEnergyScale, maxEnergyScale } = this.config;
1041
- const energyScale = minEnergyScale + this.currentEnergy * (maxEnergyScale - minEnergyScale);
1042
- for (const name of UPPER_FACE_BLENDSHAPES) {
1043
- this.targetBlendshapes[name] = clamp01(this.targetBlendshapes[name] * energyScale);
1044
- }
1045
- }
1046
- /**
1047
- * Apply smoothing to interpolate current values towards target
1048
- *
1049
- * Uses exponential moving average:
1050
- * current = current + smoothingFactor * (target - current)
1051
- *
1052
- * @param _deltaMs - Delta time in milliseconds (reserved for future time-based smoothing)
1053
- */
1054
- update(_deltaMs) {
1055
- const factor = this.config.smoothingFactor;
1056
- for (const name of UPPER_FACE_BLENDSHAPES) {
1057
- const target = this.targetBlendshapes[name];
1058
- const current = this.currentBlendshapes[name];
1059
- this.currentBlendshapes[name] = clamp01(current + factor * (target - current));
1060
- }
1061
- }
1062
- /**
1063
- * Get current smoothed blendshape values
1064
- *
1065
- * @returns Current upper face blendshapes (after smoothing)
1066
- */
1067
- getCurrentBlendshapes() {
1068
- return { ...this.currentBlendshapes };
1069
- }
1070
- /**
1071
- * Reset mapper to neutral state
1072
- *
1073
- * Sets both target and current blendshapes to zero.
1074
- */
1075
- reset() {
1076
- this.targetBlendshapes = createZeroBlendshapes();
1077
- this.currentBlendshapes = createZeroBlendshapes();
1078
- this.currentEnergy = 1;
1079
- }
1080
- /**
1081
- * Get current configuration
1082
- */
1083
- getConfig() {
1084
- return { ...this.config };
1085
- }
1086
- /**
1087
- * Update configuration
1088
- *
1089
- * @param config - Partial configuration to update
1090
- */
1091
- setConfig(config) {
1092
- this.config = {
1093
- ...this.config,
1094
- ...config
1095
- };
1096
- }
1097
- };
1098
-
1099
- // src/animation/audioEnergy.ts
1100
- function calculateRMS(samples) {
1101
- if (samples.length === 0) return 0;
1102
- let sumSquares = 0;
1103
- for (let i = 0; i < samples.length; i++) {
1104
- sumSquares += samples[i] * samples[i];
1105
- }
1106
- return Math.sqrt(sumSquares / samples.length);
1107
- }
1108
- function calculatePeak(samples) {
1109
- let peak = 0;
1110
- for (let i = 0; i < samples.length; i++) {
1111
- const abs = Math.abs(samples[i]);
1112
- if (abs > peak) peak = abs;
1113
- }
1114
- return peak;
1115
- }
1116
- var AudioEnergyAnalyzer = class {
1117
- /**
1118
- * @param smoothingFactor How much to smooth (0 = no smoothing, 1 = infinite smoothing). Default 0.85
1119
- * @param noiseFloor Minimum energy threshold to consider as signal. Default 0.01
1120
- */
1121
- constructor(smoothingFactor = 0.85, noiseFloor = 0.01) {
1122
- this.smoothedRMS = 0;
1123
- this.smoothedPeak = 0;
1124
- this.smoothingFactor = Math.max(0, Math.min(0.99, smoothingFactor));
1125
- this.noiseFloor = noiseFloor;
1126
- }
1127
- /**
1128
- * Process audio samples and return smoothed energy values
1129
- * @param samples Audio samples (Float32Array)
1130
- * @returns Object with rms and peak values
1131
- */
1132
- process(samples) {
1133
- const instantRMS = calculateRMS(samples);
1134
- const instantPeak = calculatePeak(samples);
1135
- const gatedRMS = instantRMS > this.noiseFloor ? instantRMS : 0;
1136
- const gatedPeak = instantPeak > this.noiseFloor ? instantPeak : 0;
1137
- if (gatedRMS > this.smoothedRMS) {
1138
- this.smoothedRMS = this.smoothedRMS * 0.5 + gatedRMS * 0.5;
1139
- } else {
1140
- this.smoothedRMS = this.smoothedRMS * this.smoothingFactor + gatedRMS * (1 - this.smoothingFactor);
1141
- }
1142
- if (gatedPeak > this.smoothedPeak) {
1143
- this.smoothedPeak = this.smoothedPeak * 0.3 + gatedPeak * 0.7;
1144
- } else {
1145
- this.smoothedPeak = this.smoothedPeak * this.smoothingFactor + gatedPeak * (1 - this.smoothingFactor);
1146
- }
1147
- const energy = this.smoothedRMS * 0.7 + this.smoothedPeak * 0.3;
1148
- return {
1149
- rms: this.smoothedRMS,
1150
- peak: this.smoothedPeak,
1151
- energy: Math.min(1, energy * 2)
1152
- // Scale up and clamp
1153
- };
1154
- }
1155
- /**
1156
- * Reset analyzer state
1157
- */
1158
- reset() {
1159
- this.smoothedRMS = 0;
1160
- this.smoothedPeak = 0;
1161
- }
1162
- /**
1163
- * Get current smoothed RMS value
1164
- */
1165
- get rms() {
1166
- return this.smoothedRMS;
1167
- }
1168
- /**
1169
- * Get current smoothed peak value
1170
- */
1171
- get peak() {
1172
- return this.smoothedPeak;
1173
- }
1174
- };
1175
- var EmphasisDetector = class {
1176
- /**
1177
- * @param historySize Number of frames to track. Default 10
1178
- * @param emphasisThreshold Minimum energy increase to count as emphasis. Default 0.15
1179
- */
1180
- constructor(historySize = 10, emphasisThreshold = 0.15) {
1181
- this.energyHistory = [];
1182
- this.historySize = historySize;
1183
- this.emphasisThreshold = emphasisThreshold;
1184
- }
1185
- /**
1186
- * Process energy value and detect emphasis
1187
- * @param energy Current energy value (0-1)
1188
- * @returns Object with isEmphasis flag and emphasisStrength
1189
- */
1190
- process(energy) {
1191
- this.energyHistory.push(energy);
1192
- if (this.energyHistory.length > this.historySize) {
1193
- this.energyHistory.shift();
1194
- }
1195
- if (this.energyHistory.length < 3) {
1196
- return { isEmphasis: false, emphasisStrength: 0 };
1197
- }
1198
- const prevFrames = this.energyHistory.slice(0, -1);
1199
- const avgPrev = prevFrames.reduce((a, b) => a + b, 0) / prevFrames.length;
1200
- const increase = energy - avgPrev;
1201
- const isEmphasis = increase > this.emphasisThreshold;
1202
- return {
1203
- isEmphasis,
1204
- emphasisStrength: isEmphasis ? Math.min(1, increase / 0.3) : 0
750
+ this.inferenceRunning = false;
751
+ if (this.pendingChunks.length > 0) {
752
+ this.drainPendingChunks();
753
+ }
1205
754
  };
755
+ processNext().catch((err) => this.handleError(err));
1206
756
  }
1207
- /**
1208
- * Reset detector state
1209
- */
1210
- reset() {
1211
- this.energyHistory = [];
757
+ handleError(err) {
758
+ const error = err instanceof Error ? err : new Error(String(err));
759
+ logger.warn("A2EProcessor inference error", { error: error.message });
760
+ this.onError?.(error);
1212
761
  }
1213
762
  };
1214
763
 
@@ -2461,7 +2010,7 @@ function isSafari() {
2461
2010
  const ua = navigator.userAgent.toLowerCase();
2462
2011
  return /safari/.test(ua) && !/chrome|crios|fxios|chromium|edg/.test(ua);
2463
2012
  }
2464
- function shouldUseCpuLipSync() {
2013
+ function shouldUseCpuA2E() { // Returns true when either isSafari() or isIOS() reports true, false otherwise.
2465
2014
  return isSafari() || isIOS();
2466
2015
  }
2467
2016
  function isSpeechRecognitionAvailable() {
@@ -2471,22 +2020,22 @@ function isSpeechRecognitionAvailable() {
2471
2020
  function shouldUseNativeASR() {
2472
2021
  return (isIOS() || isSafari()) && isSpeechRecognitionAvailable();
2473
2022
  }
2474
- function shouldUseServerLipSync() {
2023
+ function shouldUseServerA2E() { // Returns the result of isIOS() unchanged; unlike shouldUseCpuA2E, Safari alone does not qualify.
2475
2024
  return isIOS();
2476
2025
  }
2477
2026
 
2478
2027
  // src/inference/onnxLoader.ts
2479
- var logger = createLogger("OnnxLoader");
2028
+ var logger2 = createLogger("OnnxLoader");
2480
2029
  var ortInstance = null;
2481
2030
  var loadedBackend = null;
2482
2031
  var WASM_CDN_PATH = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
2483
2032
  async function isWebGPUAvailable() {
2484
2033
  if (isIOS()) {
2485
- logger.debug("WebGPU check: disabled on iOS (asyncify bundle crashes WebKit)");
2034
+ logger2.debug("WebGPU check: disabled on iOS (asyncify bundle crashes WebKit)");
2486
2035
  return false;
2487
2036
  }
2488
2037
  if (!hasWebGPUApi()) {
2489
- logger.debug("WebGPU check: navigator.gpu not available", {
2038
+ logger2.debug("WebGPU check: navigator.gpu not available", {
2490
2039
  isSecureContext: typeof window !== "undefined" ? window.isSecureContext : "N/A"
2491
2040
  });
2492
2041
  return false;
@@ -2494,19 +2043,19 @@ async function isWebGPUAvailable() {
2494
2043
  try {
2495
2044
  const adapter = await navigator.gpu.requestAdapter();
2496
2045
  if (!adapter) {
2497
- logger.debug("WebGPU check: No adapter available");
2046
+ logger2.debug("WebGPU check: No adapter available");
2498
2047
  return false;
2499
2048
  }
2500
2049
  const device = await adapter.requestDevice();
2501
2050
  if (!device) {
2502
- logger.debug("WebGPU check: Could not create device");
2051
+ logger2.debug("WebGPU check: Could not create device");
2503
2052
  return false;
2504
2053
  }
2505
2054
  device.destroy();
2506
- logger.debug("WebGPU check: Available and working");
2055
+ logger2.debug("WebGPU check: Available and working");
2507
2056
  return true;
2508
2057
  } catch (err) {
2509
- logger.debug("WebGPU check: Error during availability check", { error: err });
2058
+ logger2.debug("WebGPU check: Error during availability check", { error: err });
2510
2059
  return false;
2511
2060
  }
2512
2061
  }
@@ -2516,11 +2065,11 @@ function applyIOSWasmMemoryPatch() {
2516
2065
  iosWasmPatched = true;
2517
2066
  const OrigMemory = WebAssembly.Memory;
2518
2067
  const MAX_IOS_PAGES = 32768;
2519
- logger.info("Applying iOS WASM memory patch (max\u21922GB, shared preserved)");
2068
+ logger2.info("Applying iOS WASM memory patch (max\u21922GB, shared preserved)");
2520
2069
  WebAssembly.Memory = function IOSPatchedMemory(descriptor) {
2521
2070
  const patched = { ...descriptor };
2522
2071
  if (patched.maximum !== void 0 && patched.maximum > MAX_IOS_PAGES) {
2523
- logger.info("iOS memory patch: capping maximum", {
2072
+ logger2.info("iOS memory patch: capping maximum", {
2524
2073
  original: patched.maximum,
2525
2074
  capped: MAX_IOS_PAGES,
2526
2075
  shared: patched.shared,
@@ -2539,7 +2088,7 @@ function configureWasm(ort) {
2539
2088
  ort.env.wasm.numThreads = numThreads;
2540
2089
  ort.env.wasm.simd = true;
2541
2090
  ort.env.wasm.proxy = enableProxy;
2542
- logger.info("WASM configured", {
2091
+ logger2.info("WASM configured", {
2543
2092
  numThreads,
2544
2093
  simd: true,
2545
2094
  proxy: enableProxy,
@@ -2551,12 +2100,12 @@ async function getOnnxRuntime(backend) {
2551
2100
  return ortInstance;
2552
2101
  }
2553
2102
  if (ortInstance && loadedBackend !== backend) {
2554
- logger.warn(
2103
+ logger2.warn(
2555
2104
  `ONNX Runtime already loaded with ${loadedBackend} backend. Cannot switch to ${backend}. Returning existing instance.`
2556
2105
  );
2557
2106
  return ortInstance;
2558
2107
  }
2559
- logger.info(`Loading ONNX Runtime with ${backend} backend...`);
2108
+ logger2.info(`Loading ONNX Runtime with ${backend} backend...`);
2560
2109
  applyIOSWasmMemoryPatch();
2561
2110
  try {
2562
2111
  if (backend === "wasm" && (isIOS() || isSafari())) {
@@ -2571,10 +2120,10 @@ async function getOnnxRuntime(backend) {
2571
2120
  }
2572
2121
  loadedBackend = backend;
2573
2122
  configureWasm(ortInstance);
2574
- logger.info(`ONNX Runtime loaded successfully`, { backend });
2123
+ logger2.info(`ONNX Runtime loaded successfully`, { backend });
2575
2124
  return ortInstance;
2576
2125
  } catch (err) {
2577
- logger.error(`Failed to load ONNX Runtime with ${backend} backend`, {
2126
+ logger2.error(`Failed to load ONNX Runtime with ${backend} backend`, {
2578
2127
  error: err
2579
2128
  });
2580
2129
  throw new Error(
@@ -2585,7 +2134,7 @@ async function getOnnxRuntime(backend) {
2585
2134
  async function getOnnxRuntimeForPreference(preference = "auto") {
2586
2135
  const webgpuAvailable = await isWebGPUAvailable();
2587
2136
  const backend = resolveBackend(preference, webgpuAvailable);
2588
- logger.info("Resolved backend preference", {
2137
+ logger2.info("Resolved backend preference", {
2589
2138
  preference,
2590
2139
  webgpuAvailable,
2591
2140
  resolvedBackend: backend
@@ -2619,42 +2168,6 @@ function getSessionOptions(backend) {
2619
2168
  graphOptimizationLevel: "all"
2620
2169
  };
2621
2170
  }
2622
- async function createSessionWithFallback(modelBuffer, preferredBackend) {
2623
- const ort = await getOnnxRuntime(preferredBackend);
2624
- const modelData = new Uint8Array(modelBuffer);
2625
- if (preferredBackend === "webgpu") {
2626
- try {
2627
- const options2 = getSessionOptions("webgpu");
2628
- const session2 = await ort.InferenceSession.create(modelData, options2);
2629
- logger.info("Session created with WebGPU backend");
2630
- return { session: session2, backend: "webgpu" };
2631
- } catch (err) {
2632
- logger.warn("WebGPU session creation failed, falling back to WASM", {
2633
- error: err instanceof Error ? err.message : String(err)
2634
- });
2635
- }
2636
- }
2637
- const options = getSessionOptions("wasm");
2638
- const session = await ort.InferenceSession.create(modelData, options);
2639
- logger.info("Session created with WASM backend");
2640
- return { session, backend: "wasm" };
2641
- }
2642
- function getLoadedBackend() {
2643
- return loadedBackend;
2644
- }
2645
- function isOnnxRuntimeLoaded() {
2646
- return ortInstance !== null;
2647
- }
2648
- async function preloadOnnxRuntime(preference = "auto") {
2649
- if (ortInstance) {
2650
- logger.info("ONNX Runtime already preloaded", { backend: loadedBackend });
2651
- return loadedBackend;
2652
- }
2653
- logger.info("Preloading ONNX Runtime...", { preference });
2654
- const { backend } = await getOnnxRuntimeForPreference(preference);
2655
- logger.info("ONNX Runtime preloaded", { backend });
2656
- return backend;
2657
- }
2658
2171
 
2659
2172
  // src/inference/blendshapeUtils.ts
2660
2173
  var LAM_BLENDSHAPES = [
@@ -2804,16 +2317,19 @@ var WAV2ARKIT_BLENDSHAPES = [
2804
2317
  var REMAP_WAV2ARKIT_TO_LAM = WAV2ARKIT_BLENDSHAPES.map(
2805
2318
  (name) => LAM_BLENDSHAPES.indexOf(name)
2806
2319
  );
2807
- function remapWav2ArkitToLam(frame) {
2808
- const result = new Float32Array(52);
2809
- for (let i = 0; i < 52; i++) {
2810
- result[REMAP_WAV2ARKIT_TO_LAM[i]] = frame[i];
2320
+ function lerpBlendshapes(current, target, factor = 0.3) { // Linearly interpolates each element of `current` toward `target` by `factor`; returns a new array and mutates neither input.
2321
+ const len = Math.max(current.length, target.length); // output is as long as the longer input
2322
+ const result = new Array(len); // plain Array, not a typed array
2323
+ for (let i = 0; i < len; i++) {
2324
+ const c = current[i] ?? 0; // entries missing from the shorter input are treated as 0
2325
+ const t = target[i] ?? 0;
2326
+ result[i] = c + (t - c) * factor; // factor 0 keeps `current`, factor 1 yields `target`; factor is not clamped
2811
2327
  }
2812
2328
  return result;
2813
2329
  }
2814
2330
 
2815
2331
  // src/inference/Wav2Vec2Inference.ts
2816
- var logger2 = createLogger("Wav2Vec2");
2332
+ var logger3 = createLogger("Wav2Vec2");
2817
2333
  var CTC_VOCAB = [
2818
2334
  "<pad>",
2819
2335
  "<s>",
@@ -2863,6 +2379,7 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
2863
2379
  this.poisoned = false;
2864
2380
  this.config = config;
2865
2381
  this.numIdentityClasses = config.numIdentityClasses ?? 12;
2382
+ this.chunkSize = config.chunkSize ?? 16e3;
2866
2383
  }
2867
2384
  get backend() {
2868
2385
  return this.session ? this._backend : null;
@@ -2892,30 +2409,30 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
2892
2409
  "model.backend_requested": this.config.backend || "auto"
2893
2410
  });
2894
2411
  try {
2895
- logger2.info("Loading ONNX Runtime...", { preference: this.config.backend || "auto" });
2412
+ logger3.info("Loading ONNX Runtime...", { preference: this.config.backend || "auto" });
2896
2413
  const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend || "auto");
2897
2414
  this.ort = ort;
2898
2415
  this._backend = backend;
2899
- logger2.info("ONNX Runtime loaded", { backend: this._backend });
2416
+ logger3.info("ONNX Runtime loaded", { backend: this._backend });
2900
2417
  const modelUrl = this.config.modelUrl;
2901
2418
  const dataUrl = this.config.externalDataUrl !== false ? typeof this.config.externalDataUrl === "string" ? this.config.externalDataUrl : `${modelUrl}.data` : null;
2902
2419
  const sessionOptions = getSessionOptions(this._backend);
2903
2420
  let isCached = false;
2904
2421
  if (isIOS()) {
2905
- logger2.info("iOS: passing model URLs directly to ORT (low-memory path)", {
2422
+ logger3.info("iOS: passing model URLs directly to ORT (low-memory path)", {
2906
2423
  modelUrl,
2907
2424
  dataUrl
2908
2425
  });
2909
2426
  if (dataUrl) {
2910
2427
  const dataFilename = dataUrl.split("/").pop();
2911
- logger2.info("iOS: setting externalData", { dataFilename, dataUrl });
2428
+ logger3.info("iOS: setting externalData", { dataFilename, dataUrl });
2912
2429
  sessionOptions.externalData = [{
2913
2430
  path: dataFilename,
2914
2431
  data: dataUrl
2915
2432
  // URL string — ORT fetches directly into WASM
2916
2433
  }];
2917
2434
  }
2918
- logger2.info("iOS: calling InferenceSession.create() with URL string", {
2435
+ logger3.info("iOS: calling InferenceSession.create() with URL string", {
2919
2436
  modelUrl,
2920
2437
  sessionOptions: JSON.stringify(
2921
2438
  sessionOptions,
@@ -2925,14 +2442,14 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
2925
2442
  try {
2926
2443
  this.session = await this.ort.InferenceSession.create(modelUrl, sessionOptions);
2927
2444
  } catch (sessionErr) {
2928
- logger2.error("iOS: InferenceSession.create() failed", {
2445
+ logger3.error("iOS: InferenceSession.create() failed", {
2929
2446
  error: sessionErr instanceof Error ? sessionErr.message : String(sessionErr),
2930
2447
  errorType: sessionErr?.constructor?.name,
2931
2448
  stack: sessionErr instanceof Error ? sessionErr.stack : void 0
2932
2449
  });
2933
2450
  throw sessionErr;
2934
2451
  }
2935
- logger2.info("iOS: session created successfully", {
2452
+ logger3.info("iOS: session created successfully", {
2936
2453
  inputNames: this.session.inputNames,
2937
2454
  outputNames: this.session.outputNames
2938
2455
  });
@@ -2941,15 +2458,15 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
2941
2458
  isCached = await cache.has(modelUrl);
2942
2459
  let modelBuffer;
2943
2460
  if (isCached) {
2944
- logger2.debug("Loading model from cache", { modelUrl });
2461
+ logger3.debug("Loading model from cache", { modelUrl });
2945
2462
  modelBuffer = await cache.get(modelUrl);
2946
2463
  if (!modelBuffer) {
2947
- logger2.warn("Cache corruption detected, clearing and retrying", { modelUrl });
2464
+ logger3.warn("Cache corruption detected, clearing and retrying", { modelUrl });
2948
2465
  await cache.delete(modelUrl);
2949
2466
  modelBuffer = await fetchWithCache(modelUrl);
2950
2467
  }
2951
2468
  } else {
2952
- logger2.debug("Fetching and caching model", { modelUrl });
2469
+ logger3.debug("Fetching and caching model", { modelUrl });
2953
2470
  modelBuffer = await fetchWithCache(modelUrl);
2954
2471
  }
2955
2472
  if (!modelBuffer) {
@@ -2960,31 +2477,31 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
2960
2477
  try {
2961
2478
  const isDataCached = await cache.has(dataUrl);
2962
2479
  if (isDataCached) {
2963
- logger2.debug("Loading external data from cache", { dataUrl });
2480
+ logger3.debug("Loading external data from cache", { dataUrl });
2964
2481
  externalDataBuffer = await cache.get(dataUrl);
2965
2482
  if (!externalDataBuffer) {
2966
- logger2.warn("Cache corruption for external data, retrying", { dataUrl });
2483
+ logger3.warn("Cache corruption for external data, retrying", { dataUrl });
2967
2484
  await cache.delete(dataUrl);
2968
2485
  externalDataBuffer = await fetchWithCache(dataUrl);
2969
2486
  }
2970
2487
  } else {
2971
- logger2.info("Fetching external model data", {
2488
+ logger3.info("Fetching external model data", {
2972
2489
  dataUrl,
2973
2490
  note: "This may be a large download (383MB+)"
2974
2491
  });
2975
2492
  externalDataBuffer = await fetchWithCache(dataUrl);
2976
2493
  }
2977
- logger2.info("External data loaded", {
2494
+ logger3.info("External data loaded", {
2978
2495
  size: formatBytes(externalDataBuffer.byteLength)
2979
2496
  });
2980
2497
  } catch (err) {
2981
- logger2.debug("No external data file found (single-file model)", {
2498
+ logger3.debug("No external data file found (single-file model)", {
2982
2499
  dataUrl,
2983
2500
  error: err.message
2984
2501
  });
2985
2502
  }
2986
2503
  }
2987
- logger2.debug("Creating ONNX session", {
2504
+ logger3.debug("Creating ONNX session", {
2988
2505
  graphSize: formatBytes(modelBuffer.byteLength),
2989
2506
  externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
2990
2507
  backend: this._backend
@@ -2999,12 +2516,12 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
2999
2516
  const modelData = new Uint8Array(modelBuffer);
3000
2517
  this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
3001
2518
  }
3002
- logger2.info("ONNX session created successfully", {
2519
+ logger3.info("ONNX session created successfully", {
3003
2520
  executionProvider: this._backend,
3004
2521
  backend: this._backend
3005
2522
  });
3006
2523
  const loadTimeMs = performance.now() - startTime;
3007
- logger2.info("Model loaded successfully", {
2524
+ logger3.info("Model loaded successfully", {
3008
2525
  backend: this._backend,
3009
2526
  loadTimeMs: Math.round(loadTimeMs),
3010
2527
  inputs: this.session.inputNames,
@@ -3020,13 +2537,13 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
3020
2537
  model: "wav2vec2",
3021
2538
  backend: this._backend
3022
2539
  });
3023
- logger2.debug("Running warmup inference to initialize GPU context");
2540
+ logger3.debug("Running warmup inference to initialize GPU context");
3024
2541
  const warmupStart = performance.now();
3025
- const warmupAudio = new Float32Array(16e3);
2542
+ const warmupAudio = new Float32Array(this.chunkSize);
3026
2543
  const warmupIdentity = new Float32Array(this.numIdentityClasses);
3027
2544
  warmupIdentity[0] = 1;
3028
2545
  const warmupFeeds = {
3029
- "audio": new this.ort.Tensor("float32", warmupAudio, [1, 16e3]),
2546
+ "audio": new this.ort.Tensor("float32", warmupAudio, [1, this.chunkSize]),
3030
2547
  "identity": new this.ort.Tensor("float32", warmupIdentity, [1, this.numIdentityClasses])
3031
2548
  };
3032
2549
  const WARMUP_TIMEOUT_MS = 15e3;
@@ -3036,12 +2553,12 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
3036
2553
  ]);
3037
2554
  const warmupTimeMs = performance.now() - warmupStart;
3038
2555
  if (warmupResult === "timeout") {
3039
- logger2.warn("Warmup inference timed out \u2014 GPU may be unresponsive. Continuing without warmup.", {
2556
+ logger3.warn("Warmup inference timed out \u2014 GPU may be unresponsive. Continuing without warmup.", {
3040
2557
  timeoutMs: WARMUP_TIMEOUT_MS,
3041
2558
  backend: this._backend
3042
2559
  });
3043
2560
  } else {
3044
- logger2.info("Warmup inference complete", {
2561
+ logger3.info("Warmup inference complete", {
3045
2562
  warmupTimeMs: Math.round(warmupTimeMs),
3046
2563
  backend: this._backend
3047
2564
  });
@@ -3069,11 +2586,10 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
3069
2586
  }
3070
2587
  /**
3071
2588
  * Run inference on raw audio
3072
- * @param audioSamples - Float32Array of raw audio at 16kHz (16000 samples = 1 second)
2589
+ * @param audioSamples - Float32Array of raw audio at 16kHz
3073
2590
  * @param identityIndex - Optional identity index (0-11, default 0 = neutral)
3074
2591
  *
3075
- * Note: Model expects 1-second chunks (16000 samples) for optimal performance.
3076
- * Audio will be zero-padded or truncated to 16000 samples.
2592
+ * Audio will be zero-padded or truncated to chunkSize samples.
3077
2593
  */
3078
2594
  async infer(audioSamples, identityIndex = 0) {
3079
2595
  if (!this.session) {
@@ -3084,20 +2600,20 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
3084
2600
  }
3085
2601
  const audioSamplesCopy = new Float32Array(audioSamples);
3086
2602
  let audio;
3087
- if (audioSamplesCopy.length === 16e3) {
2603
+ if (audioSamplesCopy.length === this.chunkSize) {
3088
2604
  audio = audioSamplesCopy;
3089
- } else if (audioSamplesCopy.length < 16e3) {
3090
- audio = new Float32Array(16e3);
2605
+ } else if (audioSamplesCopy.length < this.chunkSize) {
2606
+ audio = new Float32Array(this.chunkSize);
3091
2607
  audio.set(audioSamplesCopy, 0);
3092
2608
  } else {
3093
- audio = audioSamplesCopy.slice(0, 16e3);
2609
+ audio = audioSamplesCopy.slice(0, this.chunkSize);
3094
2610
  }
3095
2611
  const identity = new Float32Array(this.numIdentityClasses);
3096
2612
  identity[Math.max(0, Math.min(identityIndex, this.numIdentityClasses - 1))] = 1;
3097
2613
  const audioCopy = new Float32Array(audio);
3098
2614
  const identityCopy = new Float32Array(identity);
3099
2615
  const feeds = {
3100
- "audio": new this.ort.Tensor("float32", audioCopy, [1, 16e3]),
2616
+ "audio": new this.ort.Tensor("float32", audioCopy, [1, this.chunkSize]),
3101
2617
  "identity": new this.ort.Tensor("float32", identityCopy, [1, this.numIdentityClasses])
3102
2618
  };
3103
2619
  return this.queueInference(feeds);
@@ -3133,7 +2649,7 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
3133
2649
  const telemetry = getTelemetry();
3134
2650
  const span = telemetry?.startSpan("Wav2Vec2.infer", {
3135
2651
  "inference.backend": this._backend,
3136
- "inference.input_samples": 16e3
2652
+ "inference.input_samples": this.chunkSize
3137
2653
  });
3138
2654
  try {
3139
2655
  const startTime = performance.now();
@@ -3172,7 +2688,7 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
3172
2688
  blendshapes.push(symmetrizeBlendshapes(rawFrame));
3173
2689
  }
3174
2690
  const text = this.decodeCTC(asrLogits);
3175
- logger2.trace("Inference completed", {
2691
+ logger3.trace("Inference completed", {
3176
2692
  inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
3177
2693
  numA2EFrames,
3178
2694
  numASRFrames,
@@ -3206,12 +2722,12 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
3206
2722
  const errMsg = err instanceof Error ? err.message : String(err);
3207
2723
  if (errMsg.includes("timed out")) {
3208
2724
  this.poisoned = true;
3209
- logger2.error("CRITICAL: Inference session timed out \u2014 LAM is dead. Page reload required.", {
2725
+ logger3.error("CRITICAL: Inference session timed out \u2014 LAM is dead. Page reload required.", {
3210
2726
  backend: this._backend,
3211
2727
  timeoutMs: _Wav2Vec2Inference.INFERENCE_TIMEOUT_MS
3212
2728
  });
3213
2729
  } else {
3214
- logger2.error("Inference failed", { error: errMsg, backend: this._backend });
2730
+ logger3.error("Inference failed", { error: errMsg, backend: this._backend });
3215
2731
  }
3216
2732
  span?.endWithError(err instanceof Error ? err : new Error(String(err)));
3217
2733
  telemetry?.incrementCounter("omote.inference.total", 1, {
@@ -3252,56 +2768,74 @@ _Wav2Vec2Inference.INFERENCE_TIMEOUT_MS = 5e3;
3252
2768
  _Wav2Vec2Inference.isWebGPUAvailable = isWebGPUAvailable;
3253
2769
  var Wav2Vec2Inference = _Wav2Vec2Inference;
3254
2770
 
2771
+ // src/audio/audioUtils.ts
2772
+ function pcm16ToFloat32(buffer) { // Converts 16-bit PCM bytes into a new Float32Array of samples. NOTE(review): assumes `buffer` is an ArrayBuffer (it is handed to the Int16Array constructor with offset/length) - confirm callers.
2773
+ const byteLen = buffer.byteLength & ~1; // round down to an even byte count so a trailing odd byte is dropped
2774
+ const int16 = byteLen === buffer.byteLength ? new Int16Array(buffer) : new Int16Array(buffer, 0, byteLen / 2); // even length: view the whole buffer; odd length: view only the first byteLen / 2 whole samples
2775
+ const float32 = new Float32Array(int16.length); // one output sample per 16-bit input sample
2776
+ for (let i = 0; i < int16.length; i++) {
2777
+ float32[i] = int16[i] / 32768; // scales the int16 range [-32768, 32767] into [-1, 1)
2778
+ }
2779
+ return float32;
2780
+ }
2781
+ function int16ToFloat32(int16) { // Converts an indexable sequence of 16-bit sample values into a new Float32Array of the same length; the input is not modified.
2782
+ const float32 = new Float32Array(int16.length);
2783
+ for (let i = 0; i < int16.length; i++) {
2784
+ float32[i] = int16[i] / 32768; // scales the int16 range [-32768, 32767] into [-1, 1)
2785
+ }
2786
+ return float32;
2787
+ }
2788
+
3255
2789
  // src/audio/FullFacePipeline.ts
3256
- var logger3 = createLogger("FullFacePipeline");
3257
- var BLENDSHAPE_INDEX_MAP = /* @__PURE__ */ new Map();
3258
- LAM_BLENDSHAPES.forEach((name, index) => {
3259
- BLENDSHAPE_INDEX_MAP.set(name, index);
3260
- });
3261
- var UPPER_FACE_SET = new Set(UPPER_FACE_BLENDSHAPES);
3262
- var EMOTION_LABEL_MAP = {
3263
- // Direct labels
3264
- happy: "happy",
3265
- sad: "sad",
3266
- angry: "angry",
3267
- neutral: "neutral",
3268
- // Natural language synonyms
3269
- excited: "happy",
3270
- joyful: "happy",
3271
- cheerful: "happy",
3272
- delighted: "happy",
3273
- amused: "happy",
3274
- melancholic: "sad",
3275
- sorrowful: "sad",
3276
- disappointed: "sad",
3277
- frustrated: "angry",
3278
- irritated: "angry",
3279
- furious: "angry",
3280
- annoyed: "angry",
3281
- // SenseVoice labels
3282
- fearful: "sad",
3283
- disgusted: "angry",
3284
- surprised: "happy"
3285
- };
3286
- var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
2790
+ var logger4 = createLogger("FullFacePipeline");
2791
+ var BLENDSHAPE_TO_GROUP = /* @__PURE__ */ new Map(); // blendshape name -> group key, filled once at module load by the loop below
2792
+ for (const name of LAM_BLENDSHAPES) { // classify every LAM blendshape by its name prefix; the first matching prefix wins
2793
+ if (name.startsWith("eye")) {
2794
+ BLENDSHAPE_TO_GROUP.set(name, "eyes");
2795
+ } else if (name.startsWith("brow")) {
2796
+ BLENDSHAPE_TO_GROUP.set(name, "brows");
2797
+ } else if (name.startsWith("jaw")) {
2798
+ BLENDSHAPE_TO_GROUP.set(name, "jaw");
2799
+ } else if (name.startsWith("mouth")) {
2800
+ BLENDSHAPE_TO_GROUP.set(name, "mouth");
2801
+ } else if (name.startsWith("cheek")) {
2802
+ BLENDSHAPE_TO_GROUP.set(name, "cheeks");
2803
+ } else if (name.startsWith("nose")) {
2804
+ BLENDSHAPE_TO_GROUP.set(name, "nose");
2805
+ } else if (name.startsWith("tongue")) {
2806
+ BLENDSHAPE_TO_GROUP.set(name, "tongue");
2807
+ } // names matching none of the prefixes get no entry, so a later .get(name) returns undefined for them
2808
+ }
2809
+ var FullFacePipeline = class extends EventEmitter {
3287
2810
  constructor(options) {
3288
2811
  super();
3289
2812
  this.options = options;
3290
2813
  this.playbackStarted = false;
3291
2814
  this.monitorInterval = null;
3292
2815
  this.frameAnimationId = null;
3293
- // Emotion state
3294
- this.lastEmotionFrame = null;
3295
- this.currentAudioEnergy = 0;
3296
2816
  // Stale frame detection
3297
2817
  this.lastNewFrameTime = 0;
3298
2818
  this.lastKnownLamFrame = null;
3299
2819
  this.staleWarningEmitted = false;
2820
+ // Diagnostic logging counter
2821
+ this.frameLoopCount = 0;
3300
2822
  const sampleRate = options.sampleRate ?? 16e3;
3301
- this.emotionBlendFactor = options.emotionBlendFactor ?? 0.8;
3302
- this.lamBlendFactor = options.lamBlendFactor ?? 0.2;
3303
- const autoDelay = options.lam.modelId === "wav2arkit_cpu" ? 750 : options.lam.backend === "wasm" ? 350 : 50;
2823
+ this.profile = options.profile ?? {};
2824
+ this.staleThresholdMs = options.staleThresholdMs ?? 2e3;
2825
+ const isCpuModel = options.lam.modelId === "wav2arkit_cpu";
2826
+ const chunkSize = options.chunkSize ?? options.lam.chunkSize ?? 16e3;
2827
+ const chunkAccumulationMs = chunkSize / sampleRate * 1e3;
2828
+ const inferenceEstimateMs = isCpuModel ? 300 : options.lam.backend === "wasm" ? 250 : 80;
2829
+ const marginMs = 100;
2830
+ const autoDelay = Math.ceil(chunkAccumulationMs + inferenceEstimateMs + marginMs);
3304
2831
  const audioDelayMs = options.audioDelayMs ?? autoDelay;
2832
+ logger4.info("FullFacePipeline config", {
2833
+ chunkSize,
2834
+ audioDelayMs,
2835
+ autoDelay,
2836
+ backend: options.lam.backend,
2837
+ modelId: options.lam.modelId
2838
+ });
3305
2839
  this.scheduler = new AudioScheduler({
3306
2840
  sampleRate,
3307
2841
  initialLookaheadSec: audioDelayMs / 1e3
@@ -3310,20 +2844,15 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
3310
2844
  sampleRate,
3311
2845
  targetDurationMs: options.chunkTargetMs ?? 200
3312
2846
  });
3313
- this.lamPipeline = new LAMPipeline({
2847
+ this.processor = new A2EProcessor({
2848
+ backend: options.lam,
3314
2849
  sampleRate,
2850
+ chunkSize,
3315
2851
  onError: (error) => {
3316
- logger3.error("LAM inference error", { message: error.message, stack: error.stack });
2852
+ logger4.error("A2E inference error", { message: error.message, stack: error.stack });
3317
2853
  this.emit("error", error);
3318
2854
  }
3319
2855
  });
3320
- this.emotionMapper = new EmotionToBlendshapeMapper({
3321
- smoothingFactor: 0.15,
3322
- confidenceThreshold: 0.3,
3323
- intensity: 1,
3324
- energyModulation: true
3325
- });
3326
- this.energyAnalyzer = new AudioEnergyAnalyzer();
3327
2856
  }
3328
2857
  /**
3329
2858
  * Initialize the pipeline
@@ -3332,40 +2861,33 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
3332
2861
  await this.scheduler.initialize();
3333
2862
  }
3334
2863
  /**
3335
- * Set emotion label from backend (e.g., LLM response emotion).
3336
- *
3337
- * Converts a natural language emotion label into an EmotionFrame
3338
- * that drives upper face blendshapes for the duration of the utterance.
3339
- *
3340
- * Supported labels: happy, excited, joyful, sad, melancholic, angry,
3341
- * frustrated, neutral, etc.
3342
- *
3343
- * @param label - Emotion label string (case-insensitive)
3344
- */
3345
- setEmotionLabel(label) {
3346
- const normalized = label.toLowerCase();
3347
- const mapped = EMOTION_LABEL_MAP[normalized] ?? "neutral";
3348
- const probabilities = {
3349
- neutral: 0.1,
3350
- happy: 0.1,
3351
- angry: 0.1,
3352
- sad: 0.1
3353
- };
3354
- probabilities[mapped] = 0.7;
3355
- const frame = {
3356
- emotion: mapped,
3357
- confidence: 0.7,
3358
- probabilities
3359
- };
3360
- this.lastEmotionFrame = frame;
3361
- logger3.info("Emotion label set", { label, mapped });
2864
+ * Update the ExpressionProfile at runtime (e.g., character switch).
2865
+ */
2866
+ setProfile(profile) { // Replaces the stored profile object wholesale; nothing is merged with the previous profile.
2867
+ this.profile = profile; // applyProfile reads this.profile on every call, so the new value applies from the next call on
3362
2868
  }
3363
2869
  /**
3364
- * Clear any set emotion label.
3365
- * Falls back to prosody-only upper face animation.
2870
+ * Apply ExpressionProfile scaling to raw A2E blendshapes.
2871
+ *
2872
+ * For each blendshape:
2873
+ * 1. If an override exists for the blendshape name, use override as scaler
2874
+ * 2. Otherwise, use the group scaler (default 1.0)
2875
+ * 3. Clamp result to [0, 1]
3366
2876
  */
3367
- clearEmotionLabel() {
3368
- this.lastEmotionFrame = null;
2877
+ applyProfile(raw) { // Returns a new 52-entry Float32Array with each raw value scaled by the profile and clamped to [0, 1]; `raw` is not modified.
2878
+ const scaled = new Float32Array(52);
2879
+ for (let i = 0; i < 52; i++) { // NOTE(review): raw[i] is read for i = 0..51; a shorter `raw` would yield NaN entries - confirm callers always pass 52 values
2880
+ const name = LAM_BLENDSHAPES[i]; // index i in `raw` is interpreted as the i-th LAM blendshape name
2881
+ let scaler;
2882
+ if (this.profile.overrides && this.profile.overrides[name] !== void 0) { // a per-blendshape override takes precedence over the group scaler
2883
+ scaler = this.profile.overrides[name];
2884
+ } else {
2885
+ const group = BLENDSHAPE_TO_GROUP.get(name); // undefined when the name matched no group prefix
2886
+ scaler = group ? this.profile[group] ?? 1 : 1; // ungrouped names and groups absent from the profile default to a scaler of 1
2887
+ }
2888
+ scaled[i] = Math.min(1, Math.max(0, raw[i] * scaler)); // clamp to [0, 1] after scaling
2889
+ }
2890
+ return scaled;
3369
2891
  }
3370
2892
  /**
3371
2893
  * Start a new playback session
@@ -3377,15 +2899,12 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
3377
2899
  this.stopMonitoring();
3378
2900
  this.scheduler.reset();
3379
2901
  this.coalescer.reset();
3380
- this.lamPipeline.reset();
2902
+ this.processor.reset();
3381
2903
  this.playbackStarted = false;
3382
- this.lastEmotionFrame = null;
3383
- this.currentAudioEnergy = 0;
3384
- this.emotionMapper.reset();
3385
- this.energyAnalyzer.reset();
3386
2904
  this.lastNewFrameTime = 0;
3387
2905
  this.lastKnownLamFrame = null;
3388
2906
  this.staleWarningEmitted = false;
2907
+ this.frameLoopCount = 0;
3389
2908
  this.scheduler.warmup();
3390
2909
  this.startFrameLoop();
3391
2910
  this.startMonitoring();
@@ -3393,8 +2912,8 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
3393
2912
  /**
3394
2913
  * Receive audio chunk from network
3395
2914
  *
3396
- * Audio-first design: schedules audio immediately, LAM runs in background.
3397
- * This prevents LAM inference (50-300ms) from blocking audio scheduling.
2915
+ * Audio-first design: schedules audio immediately, A2E runs in background.
2916
+ * This prevents A2E inference (50-300ms) from blocking audio scheduling.
3398
2917
  *
3399
2918
  * @param chunk - Uint8Array containing Int16 PCM audio
3400
2919
  */
@@ -3409,100 +2928,69 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
3409
2928
  this.playbackStarted = true;
3410
2929
  this.emit("playback_start", scheduleTime);
3411
2930
  }
3412
- const { energy } = this.energyAnalyzer.process(float32);
3413
- this.currentAudioEnergy = energy;
3414
- this.lamPipeline.push(float32, scheduleTime, this.options.lam).catch((err) => {
3415
- this.emit("error", err);
2931
+ logger4.info("onAudioChunk \u2192 pushAudio", {
2932
+ float32Samples: float32.length,
2933
+ scheduleTime: scheduleTime.toFixed(3),
2934
+ currentTime: this.scheduler.getCurrentTime().toFixed(3),
2935
+ deltaToPlayback: (scheduleTime - this.scheduler.getCurrentTime()).toFixed(3)
3416
2936
  });
3417
- }
3418
- /**
3419
- * Get emotion frame for current animation.
3420
- *
3421
- * Priority:
3422
- * 1. Explicit emotion label from setEmotionLabel()
3423
- * 2. Prosody fallback: subtle brow movement from audio energy
3424
- */
3425
- getEmotionFrame() {
3426
- if (this.lastEmotionFrame) {
3427
- return { frame: this.lastEmotionFrame, energy: this.currentAudioEnergy };
3428
- }
3429
- return { frame: null, energy: this.currentAudioEnergy };
3430
- }
3431
- /**
3432
- * Merge LAM blendshapes with emotion upper face blendshapes
3433
- */
3434
- mergeBlendshapes(lamFrame, emotionFrame, audioEnergy) {
3435
- const merged = new Float32Array(52);
3436
- let emotionBlendshapes;
3437
- if (emotionFrame) {
3438
- this.emotionMapper.mapFrame(emotionFrame, audioEnergy);
3439
- this.emotionMapper.update(33);
3440
- emotionBlendshapes = this.emotionMapper.getCurrentBlendshapes();
3441
- } else {
3442
- emotionBlendshapes = {};
3443
- for (const name of UPPER_FACE_BLENDSHAPES) {
3444
- emotionBlendshapes[name] = 0;
3445
- }
3446
- }
3447
- for (let i = 0; i < 52; i++) {
3448
- const name = LAM_BLENDSHAPES[i];
3449
- if (UPPER_FACE_SET.has(name)) {
3450
- const emotionValue = emotionBlendshapes[name] ?? 0;
3451
- const lamValue = lamFrame[i];
3452
- merged[i] = emotionValue * this.emotionBlendFactor + lamValue * this.lamBlendFactor;
3453
- } else {
3454
- merged[i] = lamFrame[i];
3455
- }
3456
- }
3457
- return { merged, emotionBlendshapes };
2937
+ this.processor.pushAudio(float32, scheduleTime);
3458
2938
  }
3459
2939
  /**
3460
2940
  * Start frame animation loop
2941
+ *
2942
+ * Polls A2EProcessor at render rate (60fps) for the latest inference frame
2943
+ * matching the current AudioContext time. Between inference batches (~30fps
2944
+ * bursts), getFrameForTime() holds the last frame.
3461
2945
  */
3462
2946
  startFrameLoop() {
3463
2947
  const updateFrame = () => {
2948
+ this.frameLoopCount++;
3464
2949
  const currentTime = this.scheduler.getCurrentTime();
3465
- const lamFrame = this.lamPipeline.getFrameForTime(currentTime, this.options.lam);
3466
- if (lamFrame) {
3467
- if (lamFrame !== this.lastKnownLamFrame) {
3468
- this.lastNewFrameTime = performance.now();
3469
- this.lastKnownLamFrame = lamFrame;
3470
- this.staleWarningEmitted = false;
2950
+ const lamFrame = this.processor.getFrameForTime(currentTime);
2951
+ if (lamFrame && lamFrame !== this.lastKnownLamFrame) {
2952
+ this.lastNewFrameTime = performance.now();
2953
+ this.lastKnownLamFrame = lamFrame;
2954
+ this.staleWarningEmitted = false;
2955
+ logger4.info("New A2E frame", {
2956
+ jawOpen: lamFrame[24]?.toFixed(3),
2957
+ mouthClose: lamFrame[26]?.toFixed(3),
2958
+ browInnerUp: lamFrame[2]?.toFixed(3),
2959
+ browDownL: lamFrame[0]?.toFixed(3),
2960
+ browOuterUpL: lamFrame[3]?.toFixed(3),
2961
+ currentTime: currentTime.toFixed(3),
2962
+ queuedFrames: this.processor.queuedFrameCount
2963
+ });
2964
+ }
2965
+ if (this.frameLoopCount % 60 === 0) {
2966
+ logger4.info("Frame loop heartbeat", {
2967
+ frameLoopCount: this.frameLoopCount,
2968
+ currentTime: currentTime.toFixed(3),
2969
+ playbackEndTime: this.scheduler.getPlaybackEndTime().toFixed(3),
2970
+ queuedFrames: this.processor.queuedFrameCount,
2971
+ playbackStarted: this.playbackStarted,
2972
+ msSinceNewFrame: this.lastNewFrameTime > 0 ? Math.round(performance.now() - this.lastNewFrameTime) : -1,
2973
+ processorFill: this.processor.fillLevel.toFixed(2)
2974
+ });
2975
+ }
2976
+ if (this.playbackStarted && this.lastNewFrameTime > 0 && performance.now() - this.lastNewFrameTime > this.staleThresholdMs) {
2977
+ if (!this.staleWarningEmitted) {
2978
+ this.staleWarningEmitted = true;
2979
+ logger4.warn("A2E stalled \u2014 no new inference frames", {
2980
+ staleDurationMs: Math.round(performance.now() - this.lastNewFrameTime),
2981
+ queuedFrames: this.processor.queuedFrameCount
2982
+ });
3471
2983
  }
3472
- const { frame: emotionFrame, energy } = this.getEmotionFrame();
3473
- const { merged, emotionBlendshapes } = this.mergeBlendshapes(lamFrame, emotionFrame, energy);
2984
+ }
2985
+ if (lamFrame) {
2986
+ const scaled = this.applyProfile(lamFrame);
3474
2987
  const fullFrame = {
3475
- blendshapes: merged,
3476
- lamBlendshapes: lamFrame,
3477
- emotionBlendshapes,
3478
- emotion: emotionFrame,
2988
+ blendshapes: scaled,
2989
+ rawBlendshapes: lamFrame,
3479
2990
  timestamp: currentTime
3480
2991
  };
3481
2992
  this.emit("full_frame_ready", fullFrame);
3482
2993
  this.emit("lam_frame_ready", lamFrame);
3483
- if (emotionFrame) {
3484
- this.emit("emotion_frame_ready", emotionFrame);
3485
- }
3486
- } else if (this.playbackStarted && !this.lastKnownLamFrame) {
3487
- const { frame: emotionFrame, energy } = this.getEmotionFrame();
3488
- if (emotionFrame && energy > 0.05) {
3489
- const startupFrame = new Float32Array(52);
3490
- const { merged, emotionBlendshapes } = this.mergeBlendshapes(startupFrame, emotionFrame, energy);
3491
- this.emit("full_frame_ready", {
3492
- blendshapes: merged,
3493
- lamBlendshapes: startupFrame,
3494
- emotionBlendshapes,
3495
- emotion: emotionFrame,
3496
- timestamp: currentTime
3497
- });
3498
- }
3499
- }
3500
- if (this.playbackStarted && this.lastNewFrameTime > 0 && !this.staleWarningEmitted && performance.now() - this.lastNewFrameTime > _FullFacePipeline.STALE_FRAME_THRESHOLD_MS) {
3501
- this.staleWarningEmitted = true;
3502
- logger3.warn("LAM appears stalled \u2014 no new frames for 3+ seconds during playback", {
3503
- staleDurationMs: Math.round(performance.now() - this.lastNewFrameTime),
3504
- queuedFrames: this.lamPipeline.queuedFrameCount
3505
- });
3506
2994
  }
3507
2995
  this.frameAnimationId = requestAnimationFrame(updateFrame);
3508
2996
  };
@@ -3517,7 +3005,7 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
3517
3005
  const chunk = new Uint8Array(remaining);
3518
3006
  await this.onAudioChunk(chunk);
3519
3007
  }
3520
- await this.lamPipeline.flush(this.options.lam);
3008
+ await this.processor.flush();
3521
3009
  }
3522
3010
  /**
3523
3011
  * Stop playback immediately with smooth fade-out
@@ -3526,12 +3014,8 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
3526
3014
  this.stopMonitoring();
3527
3015
  await this.scheduler.cancelAll(fadeOutMs);
3528
3016
  this.coalescer.reset();
3529
- this.lamPipeline.reset();
3017
+ this.processor.reset();
3530
3018
  this.playbackStarted = false;
3531
- this.lastEmotionFrame = null;
3532
- this.currentAudioEnergy = 0;
3533
- this.emotionMapper.reset();
3534
- this.energyAnalyzer.reset();
3535
3019
  this.lastNewFrameTime = 0;
3536
3020
  this.lastKnownLamFrame = null;
3537
3021
  this.staleWarningEmitted = false;
@@ -3545,7 +3029,7 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
3545
3029
  clearInterval(this.monitorInterval);
3546
3030
  }
3547
3031
  this.monitorInterval = setInterval(() => {
3548
- if (this.scheduler.isComplete() && this.lamPipeline.queuedFrameCount === 0) {
3032
+ if (this.scheduler.isComplete() && this.processor.queuedFrameCount === 0) {
3549
3033
  this.emit("playback_complete", void 0);
3550
3034
  this.stopMonitoring();
3551
3035
  }
@@ -3571,20 +3055,12 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
3571
3055
  return {
3572
3056
  playbackStarted: this.playbackStarted,
3573
3057
  coalescerFill: this.coalescer.fillLevel,
3574
- lamFill: this.lamPipeline.fillLevel,
3575
- queuedLAMFrames: this.lamPipeline.queuedFrameCount,
3576
- emotionLabel: this.lastEmotionFrame?.emotion ?? null,
3577
- currentAudioEnergy: this.currentAudioEnergy,
3058
+ processorFill: this.processor.fillLevel,
3059
+ queuedFrames: this.processor.queuedFrameCount,
3578
3060
  currentTime: this.scheduler.getCurrentTime(),
3579
3061
  playbackEndTime: this.scheduler.getPlaybackEndTime()
3580
3062
  };
3581
3063
  }
3582
- /**
3583
- * Check if an explicit emotion label is currently set
3584
- */
3585
- get hasEmotionLabel() {
3586
- return this.lastEmotionFrame !== null;
3587
- }
3588
3064
  /**
3589
3065
  * Cleanup resources
3590
3066
  */
@@ -3592,13 +3068,9 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
3592
3068
  this.stopMonitoring();
3593
3069
  this.scheduler.dispose();
3594
3070
  this.coalescer.reset();
3595
- this.lamPipeline.reset();
3596
- this.lastEmotionFrame = null;
3597
- this.currentAudioEnergy = 0;
3071
+ this.processor.dispose();
3598
3072
  }
3599
3073
  };
3600
- _FullFacePipeline.STALE_FRAME_THRESHOLD_MS = 3e3;
3601
- var FullFacePipeline = _FullFacePipeline;
3602
3074
 
3603
3075
  // src/inference/kaldiFbank.ts
3604
3076
  function fft(re, im) {
@@ -3885,7 +3357,7 @@ function ctcGreedyDecode(logits, seqLen, vocabSize, tokenMap) {
3885
3357
  }
3886
3358
 
3887
3359
  // src/inference/SenseVoiceInference.ts
3888
- var logger4 = createLogger("SenseVoice");
3360
+ var logger5 = createLogger("SenseVoice");
3889
3361
  var _SenseVoiceInference = class _SenseVoiceInference {
3890
3362
  constructor(config) {
3891
3363
  this.session = null;
@@ -3938,26 +3410,26 @@ var _SenseVoiceInference = class _SenseVoiceInference {
3938
3410
  "model.backend_requested": this.config.backend
3939
3411
  });
3940
3412
  try {
3941
- logger4.info("Loading ONNX Runtime...", { preference: this.config.backend });
3413
+ logger5.info("Loading ONNX Runtime...", { preference: this.config.backend });
3942
3414
  const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
3943
3415
  this.ort = ort;
3944
3416
  this._backend = backend;
3945
- logger4.info("ONNX Runtime loaded", { backend: this._backend });
3946
- logger4.debug("Fetching tokens vocabulary", { tokensUrl: this.config.tokensUrl });
3417
+ logger5.info("ONNX Runtime loaded", { backend: this._backend });
3418
+ logger5.debug("Fetching tokens vocabulary", { tokensUrl: this.config.tokensUrl });
3947
3419
  const tokensResponse = await fetch(this.config.tokensUrl);
3948
3420
  if (!tokensResponse.ok) {
3949
3421
  throw new Error(`Failed to fetch tokens.txt: ${tokensResponse.status} ${tokensResponse.statusText}`);
3950
3422
  }
3951
3423
  const tokensText = await tokensResponse.text();
3952
3424
  this.tokenMap = parseTokensFile(tokensText);
3953
- logger4.debug("Tokens loaded", { vocabSize: this.tokenMap.size });
3425
+ logger5.debug("Tokens loaded", { vocabSize: this.tokenMap.size });
3954
3426
  const sessionOptions = getSessionOptions(this._backend);
3955
3427
  if (this._backend === "webgpu") {
3956
3428
  sessionOptions.graphOptimizationLevel = "basic";
3957
3429
  }
3958
3430
  let isCached = false;
3959
3431
  if (isIOS()) {
3960
- logger4.info("iOS: passing model URL directly to ORT (low-memory path)", {
3432
+ logger5.info("iOS: passing model URL directly to ORT (low-memory path)", {
3961
3433
  modelUrl: this.config.modelUrl
3962
3434
  });
3963
3435
  this.session = await this.ort.InferenceSession.create(
@@ -3969,14 +3441,14 @@ var _SenseVoiceInference = class _SenseVoiceInference {
3969
3441
  isCached = await cache.has(this.config.modelUrl);
3970
3442
  let modelBuffer;
3971
3443
  if (isCached) {
3972
- logger4.debug("Loading model from cache", { modelUrl: this.config.modelUrl });
3444
+ logger5.debug("Loading model from cache", { modelUrl: this.config.modelUrl });
3973
3445
  modelBuffer = await cache.get(this.config.modelUrl);
3974
3446
  onProgress?.(modelBuffer.byteLength, modelBuffer.byteLength);
3975
3447
  } else {
3976
- logger4.debug("Fetching and caching model", { modelUrl: this.config.modelUrl });
3448
+ logger5.debug("Fetching and caching model", { modelUrl: this.config.modelUrl });
3977
3449
  modelBuffer = await fetchWithCache(this.config.modelUrl, onProgress);
3978
3450
  }
3979
- logger4.debug("Creating ONNX session", {
3451
+ logger5.debug("Creating ONNX session", {
3980
3452
  size: formatBytes(modelBuffer.byteLength),
3981
3453
  backend: this._backend
3982
3454
  });
@@ -3989,15 +3461,15 @@ var _SenseVoiceInference = class _SenseVoiceInference {
3989
3461
  const cmvn = parseCMVNFromMetadata(metadata.neg_mean, metadata.inv_stddev);
3990
3462
  this.negMean = cmvn.negMean;
3991
3463
  this.invStddev = cmvn.invStddev;
3992
- logger4.debug("CMVN loaded from model metadata", { dim: this.negMean.length });
3464
+ logger5.debug("CMVN loaded from model metadata", { dim: this.negMean.length });
3993
3465
  } else {
3994
- logger4.warn("CMVN not found in model metadata \u2014 features will not be normalized");
3466
+ logger5.warn("CMVN not found in model metadata \u2014 features will not be normalized");
3995
3467
  }
3996
3468
  } catch (cmvnErr) {
3997
- logger4.warn("Failed to read CMVN from model metadata", { error: cmvnErr });
3469
+ logger5.warn("Failed to read CMVN from model metadata", { error: cmvnErr });
3998
3470
  }
3999
3471
  const loadTimeMs = performance.now() - startTime;
4000
- logger4.info("SenseVoice model loaded", {
3472
+ logger5.info("SenseVoice model loaded", {
4001
3473
  backend: this._backend,
4002
3474
  loadTimeMs: Math.round(loadTimeMs),
4003
3475
  vocabSize: this.tokenMap.size,
@@ -4108,7 +3580,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
4108
3580
  const vocabSize = logitsDims[2];
4109
3581
  const decoded = ctcGreedyDecode(logitsData, seqLen, vocabSize, this.tokenMap);
4110
3582
  const inferenceTimeMs = performance.now() - startTime;
4111
- logger4.trace("Transcription complete", {
3583
+ logger5.trace("Transcription complete", {
4112
3584
  text: decoded.text.substring(0, 50),
4113
3585
  language: decoded.language,
4114
3586
  emotion: decoded.emotion,
@@ -4146,7 +3618,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
4146
3618
  const errMsg = err instanceof Error ? err.message : String(err);
4147
3619
  if (errMsg.includes("timed out")) {
4148
3620
  this.poisoned = true;
4149
- logger4.error("CRITICAL: Inference session timed out \u2014 SenseVoice is dead. Page reload required.", {
3621
+ logger5.error("CRITICAL: Inference session timed out \u2014 SenseVoice is dead. Page reload required.", {
4150
3622
  backend: this._backend,
4151
3623
  timeoutMs: _SenseVoiceInference.INFERENCE_TIMEOUT_MS
4152
3624
  });
@@ -4154,7 +3626,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
4154
3626
  const oomError = new Error(
4155
3627
  `SenseVoice inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
4156
3628
  );
4157
- logger4.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
3629
+ logger5.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
4158
3630
  pointer: `0x${err.toString(16)}`,
4159
3631
  backend: this._backend
4160
3632
  });
@@ -4167,7 +3639,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
4167
3639
  reject(oomError);
4168
3640
  return;
4169
3641
  } else {
4170
- logger4.error("Inference failed", { error: errMsg, backend: this._backend });
3642
+ logger5.error("Inference failed", { error: errMsg, backend: this._backend });
4171
3643
  }
4172
3644
  span?.endWithError(err instanceof Error ? err : new Error(String(err)));
4173
3645
  telemetry?.incrementCounter("omote.inference.total", 1, {
@@ -4196,7 +3668,7 @@ _SenseVoiceInference.INFERENCE_TIMEOUT_MS = 1e4;
4196
3668
  var SenseVoiceInference = _SenseVoiceInference;
4197
3669
 
4198
3670
  // src/inference/SenseVoiceWorker.ts
4199
- var logger5 = createLogger("SenseVoiceWorker");
3671
+ var logger6 = createLogger("SenseVoiceWorker");
4200
3672
  var WASM_CDN_PATH2 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
4201
3673
  var LOAD_TIMEOUT_MS = 3e4;
4202
3674
  var INFERENCE_TIMEOUT_MS = 1e4;
@@ -4929,7 +4401,7 @@ var SenseVoiceWorker = class {
4929
4401
  this.handleWorkerMessage(event.data);
4930
4402
  };
4931
4403
  worker.onerror = (error) => {
4932
- logger5.error("Worker error", { error: error.message });
4404
+ logger6.error("Worker error", { error: error.message });
4933
4405
  for (const [, resolver] of this.pendingResolvers) {
4934
4406
  resolver.reject(new Error(`Worker error: ${error.message}`));
4935
4407
  }
@@ -5009,9 +4481,9 @@ var SenseVoiceWorker = class {
5009
4481
  "model.language": this.config.language
5010
4482
  });
5011
4483
  try {
5012
- logger5.info("Creating SenseVoice worker...");
4484
+ logger6.info("Creating SenseVoice worker...");
5013
4485
  this.worker = this.createWorker();
5014
- logger5.info("Loading model in worker...", {
4486
+ logger6.info("Loading model in worker...", {
5015
4487
  modelUrl: this.config.modelUrl,
5016
4488
  tokensUrl: this.config.tokensUrl,
5017
4489
  language: this.config.language,
@@ -5033,7 +4505,7 @@ var SenseVoiceWorker = class {
5033
4505
  this._isLoaded = true;
5034
4506
  const loadTimeMs = performance.now() - startTime;
5035
4507
  onProgress?.(1, 1);
5036
- logger5.info("SenseVoice worker loaded successfully", {
4508
+ logger6.info("SenseVoice worker loaded successfully", {
5037
4509
  backend: "wasm",
5038
4510
  loadTimeMs: Math.round(loadTimeMs),
5039
4511
  workerLoadTimeMs: Math.round(result.loadTimeMs),
@@ -5112,7 +4584,7 @@ var SenseVoiceWorker = class {
5112
4584
  INFERENCE_TIMEOUT_MS
5113
4585
  );
5114
4586
  const totalTimeMs = performance.now() - startTime;
5115
- logger5.trace("Worker transcription complete", {
4587
+ logger6.trace("Worker transcription complete", {
5116
4588
  text: result.text.substring(0, 50),
5117
4589
  language: result.language,
5118
4590
  emotion: result.emotion,
@@ -5148,11 +4620,11 @@ var SenseVoiceWorker = class {
5148
4620
  } catch (err) {
5149
4621
  const errMsg = err instanceof Error ? err.message : String(err);
5150
4622
  if (errMsg.includes("timed out")) {
5151
- logger5.error("CRITICAL: Worker inference timed out \u2014 SenseVoice worker is dead. Page reload required.", {
4623
+ logger6.error("CRITICAL: Worker inference timed out \u2014 SenseVoice worker is dead. Page reload required.", {
5152
4624
  timeoutMs: INFERENCE_TIMEOUT_MS
5153
4625
  });
5154
4626
  } else {
5155
- logger5.error("Worker inference failed", { error: errMsg });
4627
+ logger6.error("Worker inference failed", { error: errMsg });
5156
4628
  }
5157
4629
  span?.endWithError(err instanceof Error ? err : new Error(String(err)));
5158
4630
  telemetry?.incrementCounter("omote.inference.total", 1, {
@@ -5190,7 +4662,7 @@ var SenseVoiceWorker = class {
5190
4662
  };
5191
4663
 
5192
4664
  // src/inference/UnifiedInferenceWorker.ts
5193
- var logger6 = createLogger("UnifiedInferenceWorker");
4665
+ var logger7 = createLogger("UnifiedInferenceWorker");
5194
4666
  var WASM_CDN_PATH3 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
5195
4667
  var INIT_TIMEOUT_MS = 15e3;
5196
4668
  var SV_LOAD_TIMEOUT_MS = 3e4;
@@ -5886,7 +5358,7 @@ var UnifiedInferenceWorker = class {
5886
5358
  const telemetry = getTelemetry();
5887
5359
  const span = telemetry?.startSpan("UnifiedInferenceWorker.init");
5888
5360
  try {
5889
- logger6.info("Creating unified inference worker...");
5361
+ logger7.info("Creating unified inference worker...");
5890
5362
  this.worker = this.createWorker();
5891
5363
  await this.sendMessage(
5892
5364
  { type: "init", wasmPaths: WASM_CDN_PATH3, isIOS: isIOS() },
@@ -5895,7 +5367,7 @@ var UnifiedInferenceWorker = class {
5895
5367
  );
5896
5368
  this.initialized = true;
5897
5369
  const loadTimeMs = performance.now() - startTime;
5898
- logger6.info("Unified worker initialized", { loadTimeMs: Math.round(loadTimeMs) });
5370
+ logger7.info("Unified worker initialized", { loadTimeMs: Math.round(loadTimeMs) });
5899
5371
  span?.setAttributes({ "worker.init_time_ms": loadTimeMs });
5900
5372
  span?.end();
5901
5373
  } catch (error) {
@@ -5949,8 +5421,8 @@ var UnifiedInferenceWorker = class {
5949
5421
  if (!this.worker) return;
5950
5422
  await this.sendMessage({ type: "sv:dispose" }, "sv:disposed", DISPOSE_TIMEOUT_MS);
5951
5423
  }
5952
- // ── Wav2ArkitCpu (Lip Sync) ──────────────────────────────────────────
5953
- async loadLipSync(config) {
5424
+ // ── Wav2ArkitCpu (A2E) ──────────────────────────────────────────────
5425
+ async loadA2E(config) {
5954
5426
  this.assertReady();
5955
5427
  const startTime = performance.now();
5956
5428
  const result = await this.sendMessage(
@@ -5971,7 +5443,7 @@ var UnifiedInferenceWorker = class {
5971
5443
  outputNames: result.outputNames
5972
5444
  };
5973
5445
  }
5974
- async inferLipSync(audio) {
5446
+ async inferA2E(audio) {
5975
5447
  this.assertReady();
5976
5448
  return this.sendMessage(
5977
5449
  { type: "cpu:infer", audio },
@@ -5979,7 +5451,7 @@ var UnifiedInferenceWorker = class {
5979
5451
  CPU_INFER_TIMEOUT_MS
5980
5452
  );
5981
5453
  }
5982
- async disposeLipSync() {
5454
+ async disposeA2E() {
5983
5455
  if (!this.worker) return;
5984
5456
  await this.sendMessage({ type: "cpu:dispose" }, "cpu:disposed", DISPOSE_TIMEOUT_MS);
5985
5457
  }
@@ -6069,7 +5541,7 @@ var UnifiedInferenceWorker = class {
6069
5541
  this.handleWorkerMessage(event.data);
6070
5542
  };
6071
5543
  worker.onerror = (error) => {
6072
- logger6.error("Unified worker error", { error: error.message });
5544
+ logger7.error("Unified worker error", { error: error.message });
6073
5545
  this.rejectAllPending(`Worker error: ${error.message}`);
6074
5546
  };
6075
5547
  return worker;
@@ -6083,7 +5555,7 @@ var UnifiedInferenceWorker = class {
6083
5555
  this.pendingRequests.delete(requestId);
6084
5556
  pending.reject(new Error(data.error));
6085
5557
  } else {
6086
- logger6.error("Worker broadcast error", { error: data.error });
5558
+ logger7.error("Worker broadcast error", { error: data.error });
6087
5559
  this.rejectAllPending(data.error);
6088
5560
  }
6089
5561
  return;
@@ -6105,7 +5577,7 @@ var UnifiedInferenceWorker = class {
6105
5577
  const timeout = setTimeout(() => {
6106
5578
  this.pendingRequests.delete(requestId);
6107
5579
  this.poisoned = true;
6108
- logger6.error("CRITICAL: Worker operation timed out \u2014 worker is dead", {
5580
+ logger7.error("CRITICAL: Worker operation timed out \u2014 worker is dead", {
6109
5581
  type: message.type,
6110
5582
  timeoutMs
6111
5583
  });
@@ -6171,7 +5643,7 @@ var SenseVoiceUnifiedAdapter = class {
6171
5643
  });
6172
5644
  this._isLoaded = true;
6173
5645
  onProgress?.(1, 1);
6174
- logger6.info("SenseVoice loaded via unified worker", {
5646
+ logger7.info("SenseVoice loaded via unified worker", {
6175
5647
  backend: "wasm",
6176
5648
  loadTimeMs: Math.round(result.loadTimeMs),
6177
5649
  vocabSize: result.vocabSize
@@ -6212,6 +5684,7 @@ var SenseVoiceUnifiedAdapter = class {
6212
5684
  var Wav2ArkitCpuUnifiedAdapter = class {
6213
5685
  constructor(worker, config) {
6214
5686
  this.modelId = "wav2arkit_cpu";
5687
+ this.chunkSize = 16e3;
6215
5688
  this._isLoaded = false;
6216
5689
  this.inferenceQueue = Promise.resolve();
6217
5690
  this.worker = worker;
@@ -6230,12 +5703,12 @@ var Wav2ArkitCpuUnifiedAdapter = class {
6230
5703
  });
6231
5704
  try {
6232
5705
  const externalDataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${this.config.modelUrl}.data` : null;
6233
- const result = await this.worker.loadLipSync({
5706
+ const result = await this.worker.loadA2E({
6234
5707
  modelUrl: this.config.modelUrl,
6235
5708
  externalDataUrl: externalDataUrl || null
6236
5709
  });
6237
5710
  this._isLoaded = true;
6238
- logger6.info("Wav2ArkitCpu loaded via unified worker", {
5711
+ logger7.info("Wav2ArkitCpu loaded via unified worker", {
6239
5712
  backend: "wasm",
6240
5713
  loadTimeMs: Math.round(result.loadTimeMs)
6241
5714
  });
@@ -6262,7 +5735,7 @@ var Wav2ArkitCpuUnifiedAdapter = class {
6262
5735
  });
6263
5736
  try {
6264
5737
  const startTime = performance.now();
6265
- const result = await this.worker.inferLipSync(audioCopy);
5738
+ const result = await this.worker.inferA2E(audioCopy);
6266
5739
  const inferenceTimeMs = performance.now() - startTime;
6267
5740
  const flatBuffer = result.blendshapes;
6268
5741
  const { numFrames, numBlendshapes } = result;
@@ -6285,7 +5758,7 @@ var Wav2ArkitCpuUnifiedAdapter = class {
6285
5758
  }
6286
5759
  async dispose() {
6287
5760
  if (this._isLoaded) {
6288
- await this.worker.disposeLipSync();
5761
+ await this.worker.disposeA2E();
6289
5762
  this._isLoaded = false;
6290
5763
  }
6291
5764
  }
@@ -6341,7 +5814,7 @@ var SileroVADUnifiedAdapter = class {
6341
5814
  sampleRate: this.config.sampleRate
6342
5815
  });
6343
5816
  this._isLoaded = true;
6344
- logger6.info("SileroVAD loaded via unified worker", {
5817
+ logger7.info("SileroVAD loaded via unified worker", {
6345
5818
  backend: "wasm",
6346
5819
  loadTimeMs: Math.round(result.loadTimeMs),
6347
5820
  sampleRate: this.config.sampleRate,
@@ -6422,10 +5895,10 @@ var SileroVADUnifiedAdapter = class {
6422
5895
  };
6423
5896
 
6424
5897
  // src/inference/createSenseVoice.ts
6425
- var logger7 = createLogger("createSenseVoice");
5898
+ var logger8 = createLogger("createSenseVoice");
6426
5899
  function createSenseVoice(config) {
6427
5900
  if (config.unifiedWorker) {
6428
- logger7.info("Creating SenseVoiceUnifiedAdapter (shared unified worker)");
5901
+ logger8.info("Creating SenseVoiceUnifiedAdapter (shared unified worker)");
6429
5902
  return new SenseVoiceUnifiedAdapter(config.unifiedWorker, {
6430
5903
  modelUrl: config.modelUrl,
6431
5904
  tokensUrl: config.tokensUrl,
@@ -6438,7 +5911,7 @@ function createSenseVoice(config) {
6438
5911
  if (!SenseVoiceWorker.isSupported()) {
6439
5912
  throw new Error("Web Workers are not supported in this environment");
6440
5913
  }
6441
- logger7.info("Creating SenseVoiceWorker (off-main-thread)");
5914
+ logger8.info("Creating SenseVoiceWorker (off-main-thread)");
6442
5915
  return new SenseVoiceWorker({
6443
5916
  modelUrl: config.modelUrl,
6444
5917
  tokensUrl: config.tokensUrl,
@@ -6447,7 +5920,7 @@ function createSenseVoice(config) {
6447
5920
  });
6448
5921
  }
6449
5922
  if (useWorker === false) {
6450
- logger7.info("Creating SenseVoiceInference (main thread)");
5923
+ logger8.info("Creating SenseVoiceInference (main thread)");
6451
5924
  return new SenseVoiceInference({
6452
5925
  modelUrl: config.modelUrl,
6453
5926
  tokensUrl: config.tokensUrl,
@@ -6456,7 +5929,7 @@ function createSenseVoice(config) {
6456
5929
  });
6457
5930
  }
6458
5931
  if (SenseVoiceWorker.isSupported() && !isIOS()) {
6459
- logger7.info("Auto-detected: creating SenseVoiceWorker (off-main-thread)");
5932
+ logger8.info("Auto-detected: creating SenseVoiceWorker (off-main-thread)");
6460
5933
  return new SenseVoiceWorker({
6461
5934
  modelUrl: config.modelUrl,
6462
5935
  tokensUrl: config.tokensUrl,
@@ -6464,7 +5937,7 @@ function createSenseVoice(config) {
6464
5937
  textNorm: config.textNorm
6465
5938
  });
6466
5939
  }
6467
- logger7.info("Auto-detected: creating SenseVoiceInference (main thread)", {
5940
+ logger8.info("Auto-detected: creating SenseVoiceInference (main thread)", {
6468
5941
  reason: isIOS() ? "iOS (shared ORT instance)" : "Worker unsupported"
6469
5942
  });
6470
5943
  return new SenseVoiceInference({
@@ -6476,10 +5949,11 @@ function createSenseVoice(config) {
6476
5949
  }
6477
5950
 
6478
5951
  // src/inference/Wav2ArkitCpuInference.ts
6479
- var logger8 = createLogger("Wav2ArkitCpu");
5952
+ var logger9 = createLogger("Wav2ArkitCpu");
6480
5953
  var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
6481
5954
  constructor(config) {
6482
5955
  this.modelId = "wav2arkit_cpu";
5956
+ this.chunkSize = 16e3;
6483
5957
  this.session = null;
6484
5958
  this.ort = null;
6485
5959
  this._backend = "wasm";
@@ -6517,16 +5991,16 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
6517
5991
  });
6518
5992
  try {
6519
5993
  const preference = this.config.backend || "wasm";
6520
- logger8.info("Loading ONNX Runtime...", { preference });
5994
+ logger9.info("Loading ONNX Runtime...", { preference });
6521
5995
  const { ort, backend } = await getOnnxRuntimeForPreference(preference);
6522
5996
  this.ort = ort;
6523
5997
  this._backend = backend;
6524
- logger8.info("ONNX Runtime loaded", { backend: this._backend });
5998
+ logger9.info("ONNX Runtime loaded", { backend: this._backend });
6525
5999
  const modelUrl = this.config.modelUrl;
6526
6000
  const dataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${modelUrl}.data` : null;
6527
6001
  const sessionOptions = getSessionOptions(this._backend);
6528
6002
  if (isIOS()) {
6529
- logger8.info("iOS: passing model URLs directly to ORT (low-memory path)", {
6003
+ logger9.info("iOS: passing model URLs directly to ORT (low-memory path)", {
6530
6004
  modelUrl,
6531
6005
  dataUrl
6532
6006
  });
@@ -6544,15 +6018,15 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
6544
6018
  const isCached = await cache.has(modelUrl);
6545
6019
  let modelBuffer;
6546
6020
  if (isCached) {
6547
- logger8.debug("Loading model from cache", { modelUrl });
6021
+ logger9.debug("Loading model from cache", { modelUrl });
6548
6022
  modelBuffer = await cache.get(modelUrl);
6549
6023
  if (!modelBuffer) {
6550
- logger8.warn("Cache corruption detected, clearing and retrying", { modelUrl });
6024
+ logger9.warn("Cache corruption detected, clearing and retrying", { modelUrl });
6551
6025
  await cache.delete(modelUrl);
6552
6026
  modelBuffer = await fetchWithCache(modelUrl);
6553
6027
  }
6554
6028
  } else {
6555
- logger8.debug("Fetching and caching model graph", { modelUrl });
6029
+ logger9.debug("Fetching and caching model graph", { modelUrl });
6556
6030
  modelBuffer = await fetchWithCache(modelUrl);
6557
6031
  }
6558
6032
  if (!modelBuffer) {
@@ -6563,31 +6037,31 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
6563
6037
  try {
6564
6038
  const isDataCached = await cache.has(dataUrl);
6565
6039
  if (isDataCached) {
6566
- logger8.debug("Loading external data from cache", { dataUrl });
6040
+ logger9.debug("Loading external data from cache", { dataUrl });
6567
6041
  externalDataBuffer = await cache.get(dataUrl);
6568
6042
  if (!externalDataBuffer) {
6569
- logger8.warn("Cache corruption for external data, retrying", { dataUrl });
6043
+ logger9.warn("Cache corruption for external data, retrying", { dataUrl });
6570
6044
  await cache.delete(dataUrl);
6571
6045
  externalDataBuffer = await fetchWithCache(dataUrl);
6572
6046
  }
6573
6047
  } else {
6574
- logger8.info("Fetching external model data", {
6048
+ logger9.info("Fetching external model data", {
6575
6049
  dataUrl,
6576
6050
  note: "This may be a large download (400MB+)"
6577
6051
  });
6578
6052
  externalDataBuffer = await fetchWithCache(dataUrl);
6579
6053
  }
6580
- logger8.info("External data loaded", {
6054
+ logger9.info("External data loaded", {
6581
6055
  size: formatBytes(externalDataBuffer.byteLength)
6582
6056
  });
6583
6057
  } catch (err) {
6584
- logger8.debug("No external data file found (single-file model)", {
6058
+ logger9.debug("No external data file found (single-file model)", {
6585
6059
  dataUrl,
6586
6060
  error: err.message
6587
6061
  });
6588
6062
  }
6589
6063
  }
6590
- logger8.debug("Creating ONNX session", {
6064
+ logger9.debug("Creating ONNX session", {
6591
6065
  graphSize: formatBytes(modelBuffer.byteLength),
6592
6066
  externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
6593
6067
  backend: this._backend
@@ -6603,7 +6077,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
6603
6077
  this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
6604
6078
  }
6605
6079
  const loadTimeMs = performance.now() - startTime;
6606
- logger8.info("Model loaded successfully", {
6080
+ logger9.info("Model loaded successfully", {
6607
6081
  backend: this._backend,
6608
6082
  loadTimeMs: Math.round(loadTimeMs),
6609
6083
  inputs: this.session.inputNames,
@@ -6619,12 +6093,12 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
6619
6093
  model: "wav2arkit_cpu",
6620
6094
  backend: this._backend
6621
6095
  });
6622
- logger8.debug("Running warmup inference");
6096
+ logger9.debug("Running warmup inference");
6623
6097
  const warmupStart = performance.now();
6624
6098
  const silentAudio = new Float32Array(16e3);
6625
6099
  await this.infer(silentAudio);
6626
6100
  const warmupTimeMs = performance.now() - warmupStart;
6627
- logger8.info("Warmup inference complete", {
6101
+ logger9.info("Warmup inference complete", {
6628
6102
  warmupTimeMs: Math.round(warmupTimeMs),
6629
6103
  backend: this._backend
6630
6104
  });
@@ -6711,7 +6185,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
6711
6185
  const symmetrized = symmetrizeBlendshapes(rawFrame);
6712
6186
  blendshapes.push(symmetrized);
6713
6187
  }
6714
- logger8.trace("Inference completed", {
6188
+ logger9.trace("Inference completed", {
6715
6189
  inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
6716
6190
  numFrames,
6717
6191
  inputSamples
@@ -6739,7 +6213,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
6739
6213
  const errMsg = err instanceof Error ? err.message : String(err);
6740
6214
  if (errMsg.includes("timed out")) {
6741
6215
  this.poisoned = true;
6742
- logger8.error("CRITICAL: Inference session timed out \u2014 Wav2ArkitCpu is dead. Page reload required.", {
6216
+ logger9.error("CRITICAL: Inference session timed out \u2014 Wav2ArkitCpu is dead. Page reload required.", {
6743
6217
  backend: this._backend,
6744
6218
  timeoutMs: _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS
6745
6219
  });
@@ -6747,7 +6221,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
6747
6221
  const oomError = new Error(
6748
6222
  `Wav2ArkitCpu inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
6749
6223
  );
6750
- logger8.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
6224
+ logger9.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
6751
6225
  pointer: `0x${err.toString(16)}`,
6752
6226
  backend: this._backend
6753
6227
  });
@@ -6760,7 +6234,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
6760
6234
  reject(oomError);
6761
6235
  return;
6762
6236
  } else {
6763
- logger8.error("Inference failed", { error: errMsg, backend: this._backend });
6237
+ logger9.error("Inference failed", { error: errMsg, backend: this._backend });
6764
6238
  }
6765
6239
  span?.endWithError(err instanceof Error ? err : new Error(String(err)));
6766
6240
  telemetry?.incrementCounter("omote.inference.total", 1, {
@@ -6787,7 +6261,7 @@ _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS = 5e3;
6787
6261
  var Wav2ArkitCpuInference = _Wav2ArkitCpuInference;
6788
6262
 
6789
6263
  // src/inference/Wav2ArkitCpuWorker.ts
6790
- var logger9 = createLogger("Wav2ArkitCpuWorker");
6264
+ var logger10 = createLogger("Wav2ArkitCpuWorker");
6791
6265
  var WASM_CDN_PATH4 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
6792
6266
  var LOAD_TIMEOUT_MS2 = 6e4;
6793
6267
  var INFERENCE_TIMEOUT_MS2 = 5e3;
@@ -7033,6 +6507,7 @@ self.onerror = function(err) {
7033
6507
  var Wav2ArkitCpuWorker = class {
7034
6508
  constructor(config) {
7035
6509
  this.modelId = "wav2arkit_cpu";
6510
+ this.chunkSize = 16e3;
7036
6511
  this.worker = null;
7037
6512
  this.isLoading = false;
7038
6513
  this._isLoaded = false;
@@ -7067,7 +6542,7 @@ var Wav2ArkitCpuWorker = class {
7067
6542
  this.handleWorkerMessage(event.data);
7068
6543
  };
7069
6544
  worker.onerror = (error) => {
7070
- logger9.error("Worker error", { error: error.message });
6545
+ logger10.error("Worker error", { error: error.message });
7071
6546
  for (const [, resolver] of this.pendingResolvers) {
7072
6547
  resolver.reject(new Error(`Worker error: ${error.message}`));
7073
6548
  }
@@ -7143,10 +6618,10 @@ var Wav2ArkitCpuWorker = class {
7143
6618
  "model.backend_requested": "wasm"
7144
6619
  });
7145
6620
  try {
7146
- logger9.info("Creating wav2arkit_cpu worker...");
6621
+ logger10.info("Creating wav2arkit_cpu worker...");
7147
6622
  this.worker = this.createWorker();
7148
6623
  const externalDataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${this.config.modelUrl}.data` : null;
7149
- logger9.info("Loading model in worker...", {
6624
+ logger10.info("Loading model in worker...", {
7150
6625
  modelUrl: this.config.modelUrl,
7151
6626
  externalDataUrl,
7152
6627
  isIOS: isIOS()
@@ -7164,7 +6639,7 @@ var Wav2ArkitCpuWorker = class {
7164
6639
  );
7165
6640
  this._isLoaded = true;
7166
6641
  const loadTimeMs = performance.now() - startTime;
7167
- logger9.info("Wav2ArkitCpu worker loaded successfully", {
6642
+ logger10.info("Wav2ArkitCpu worker loaded successfully", {
7168
6643
  backend: "wasm",
7169
6644
  loadTimeMs: Math.round(loadTimeMs),
7170
6645
  workerLoadTimeMs: Math.round(result.loadTimeMs),
@@ -7249,7 +6724,7 @@ var Wav2ArkitCpuWorker = class {
7249
6724
  for (let f = 0; f < numFrames; f++) {
7250
6725
  blendshapes.push(flatBuffer.slice(f * numBlendshapes, (f + 1) * numBlendshapes));
7251
6726
  }
7252
- logger9.trace("Worker inference completed", {
6727
+ logger10.trace("Worker inference completed", {
7253
6728
  inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
7254
6729
  workerTimeMs: Math.round(result.inferenceTimeMs * 100) / 100,
7255
6730
  numFrames,
@@ -7279,12 +6754,12 @@ var Wav2ArkitCpuWorker = class {
7279
6754
  const errMsg = err instanceof Error ? err.message : String(err);
7280
6755
  if (errMsg.includes("timed out")) {
7281
6756
  this.poisoned = true;
7282
- logger9.error("CRITICAL: Worker inference timed out \u2014 Wav2ArkitCpu worker is dead. Page reload required.", {
6757
+ logger10.error("CRITICAL: Worker inference timed out \u2014 Wav2ArkitCpu worker is dead. Page reload required.", {
7283
6758
  backend: "wasm",
7284
6759
  timeoutMs: INFERENCE_TIMEOUT_MS2
7285
6760
  });
7286
6761
  } else {
7287
- logger9.error("Worker inference failed", { error: errMsg, backend: "wasm" });
6762
+ logger10.error("Worker inference failed", { error: errMsg, backend: "wasm" });
7288
6763
  }
7289
6764
  span?.endWithError(err instanceof Error ? err : new Error(String(err)));
7290
6765
  telemetry?.incrementCounter("omote.inference.total", 1, {
@@ -7321,39 +6796,39 @@ var Wav2ArkitCpuWorker = class {
7321
6796
  }
7322
6797
  };
7323
6798
 
7324
- // src/inference/createLipSync.ts
7325
- var logger10 = createLogger("createLipSync");
7326
- function createLipSync(config) {
6799
+ // src/inference/createA2E.ts
6800
+ var logger11 = createLogger("createA2E");
6801
+ function createA2E(config) {
7327
6802
  const mode = config.mode ?? "auto";
7328
6803
  const fallbackOnError = config.fallbackOnError ?? true;
7329
6804
  let useCpu;
7330
6805
  if (mode === "cpu") {
7331
6806
  useCpu = true;
7332
- logger10.info("Forcing CPU lip sync model (wav2arkit_cpu)");
6807
+ logger11.info("Forcing CPU A2E model (wav2arkit_cpu)");
7333
6808
  } else if (mode === "gpu") {
7334
6809
  useCpu = false;
7335
- logger10.info("Forcing GPU lip sync model (Wav2Vec2)");
6810
+ logger11.info("Forcing GPU A2E model (Wav2Vec2)");
7336
6811
  } else {
7337
- useCpu = shouldUseCpuLipSync();
7338
- logger10.info("Auto-detected lip sync model", {
6812
+ useCpu = shouldUseCpuA2E();
6813
+ logger11.info("Auto-detected A2E model", {
7339
6814
  useCpu,
7340
6815
  isSafari: isSafari()
7341
6816
  });
7342
6817
  }
7343
6818
  if (useCpu) {
7344
6819
  if (config.unifiedWorker) {
7345
- logger10.info("Creating Wav2ArkitCpuUnifiedAdapter (404MB, WASM, shared unified worker)");
6820
+ logger11.info("Creating Wav2ArkitCpuUnifiedAdapter (404MB, WASM, shared unified worker)");
7346
6821
  return new Wav2ArkitCpuUnifiedAdapter(config.unifiedWorker, {
7347
6822
  modelUrl: config.cpuModelUrl
7348
6823
  });
7349
6824
  }
7350
6825
  if (config.useWorker && Wav2ArkitCpuWorker.isSupported() && !isIOS()) {
7351
- logger10.info("Creating Wav2ArkitCpuWorker (404MB, WASM, off-main-thread)");
6826
+ logger11.info("Creating Wav2ArkitCpuWorker (404MB, WASM, off-main-thread)");
7352
6827
  return new Wav2ArkitCpuWorker({
7353
6828
  modelUrl: config.cpuModelUrl
7354
6829
  });
7355
6830
  }
7356
- logger10.info("Creating Wav2ArkitCpuInference (404MB, WASM)");
6831
+ logger11.info("Creating Wav2ArkitCpuInference (404MB, WASM)");
7357
6832
  return new Wav2ArkitCpuInference({
7358
6833
  modelUrl: config.cpuModelUrl
7359
6834
  });
@@ -7365,13 +6840,13 @@ function createLipSync(config) {
7365
6840
  numIdentityClasses: config.numIdentityClasses
7366
6841
  });
7367
6842
  if (fallbackOnError) {
7368
- logger10.info("Creating Wav2Vec2Inference with CPU fallback");
7369
- return new LipSyncWithFallback(gpuInstance, config);
6843
+ logger11.info("Creating Wav2Vec2Inference with CPU fallback");
6844
+ return new A2EWithFallback(gpuInstance, config);
7370
6845
  }
7371
- logger10.info("Creating Wav2Vec2Inference (no fallback)");
6846
+ logger11.info("Creating Wav2Vec2Inference (no fallback)");
7372
6847
  return gpuInstance;
7373
6848
  }
7374
- var LipSyncWithFallback = class {
6849
+ var A2EWithFallback = class {
7375
6850
  constructor(gpuInstance, config) {
7376
6851
  this.hasFallenBack = false;
7377
6852
  this.implementation = gpuInstance;
@@ -7380,6 +6855,9 @@ var LipSyncWithFallback = class {
7380
6855
  get modelId() {
7381
6856
  return this.implementation.modelId;
7382
6857
  }
6858
+ get chunkSize() {
6859
+ return this.implementation.chunkSize;
6860
+ }
7383
6861
  get backend() {
7384
6862
  return this.implementation.backend;
7385
6863
  }
@@ -7394,7 +6872,7 @@ var LipSyncWithFallback = class {
7394
6872
  }
7395
6873
  }
7396
6874
  async fallbackToCpu(reason) {
7397
- logger10.warn("GPU model load failed, falling back to CPU model", { reason });
6875
+ logger11.warn("GPU model load failed, falling back to CPU model", { reason });
7398
6876
  try {
7399
6877
  await this.implementation.dispose();
7400
6878
  } catch {
@@ -7403,17 +6881,17 @@ var LipSyncWithFallback = class {
7403
6881
  this.implementation = new Wav2ArkitCpuUnifiedAdapter(this.config.unifiedWorker, {
7404
6882
  modelUrl: this.config.cpuModelUrl
7405
6883
  });
7406
- logger10.info("Fallback to Wav2ArkitCpuUnifiedAdapter successful");
6884
+ logger11.info("Fallback to Wav2ArkitCpuUnifiedAdapter successful");
7407
6885
  } else if (this.config.useWorker && Wav2ArkitCpuWorker.isSupported() && !isIOS()) {
7408
6886
  this.implementation = new Wav2ArkitCpuWorker({
7409
6887
  modelUrl: this.config.cpuModelUrl
7410
6888
  });
7411
- logger10.info("Fallback to Wav2ArkitCpuWorker successful");
6889
+ logger11.info("Fallback to Wav2ArkitCpuWorker successful");
7412
6890
  } else {
7413
6891
  this.implementation = new Wav2ArkitCpuInference({
7414
6892
  modelUrl: this.config.cpuModelUrl
7415
6893
  });
7416
- logger10.info("Fallback to Wav2ArkitCpuInference successful");
6894
+ logger11.info("Fallback to Wav2ArkitCpuInference successful");
7417
6895
  }
7418
6896
  this.hasFallenBack = true;
7419
6897
  return await this.implementation.load();
@@ -7426,8 +6904,198 @@ var LipSyncWithFallback = class {
7426
6904
  }
7427
6905
  };
7428
6906
 
6907
+ // src/inference/BlendshapeSmoother.ts
6908
/** Number of blendshape channels tracked by the smoother. */
var NUM_BLENDSHAPES = 52;
/**
 * Smooths a stream of blendshape frames with one damped spring per channel.
 *
 * Inference output is handed in through `setTarget()`; the render loop calls
 * `update(dt)` every frame and reads back values that ease toward the latest
 * target instead of jumping to it.
 */
var BlendshapeSmoother = class {
  /**
   * @param config - Optional settings object.
   *   `config.halflife` is the smoothing half-life, expressed in the same
   *   time unit as the `dt` passed to `update()` (seconds). Defaults to 0.06.
   *   A value <= 0 disables smoothing: `update()` then snaps to the target.
   */
  constructor(config) {
    /** Whether any target has been set */
    this._hasTarget = false;
    this.halflife = config?.halflife ?? 0.06;
    this.values = new Float32Array(NUM_BLENDSHAPES);
    this.velocities = new Float32Array(NUM_BLENDSHAPES);
    this.targets = new Float32Array(NUM_BLENDSHAPES);
  }
  /** Whether a target frame has been set (false until first setTarget call) */
  get hasTarget() {
    return this._hasTarget;
  }
  /**
   * Set new target frame from inference output.
   * Springs will converge toward these values on subsequent update() calls.
   *
   * Only the first 52 entries are read, so a frame that carries extra trailing
   * channels is accepted instead of throwing a RangeError. A shorter frame
   * overwrites just the channels it provides; the remaining targets keep
   * their previous values.
   *
   * @param frame - Array-like of blendshape weights (typed array or plain array)
   */
  setTarget(frame) {
    const count = Math.min(frame.length, NUM_BLENDSHAPES);
    for (let i = 0; i < count; i++) {
      this.targets[i] = frame[i];
    }
    this._hasTarget = true;
  }
  /**
   * Advance all 52 springs by `dt` seconds and return the smoothed frame.
   *
   * Call this every render frame (e.g., inside requestAnimationFrame).
   * Returns the internal values buffer — do NOT mutate the returned array.
   *
   * A `dt` that is NaN, infinite, zero or negative leaves the state untouched:
   * integrating such a step would write NaN into `values`/`velocities`, and
   * that NaN would then survive every later update until `reset()`.
   *
   * @param dt - Time step in seconds (e.g., 1/60 for 60fps)
   * @returns Smoothed blendshape values (Float32Array of 52)
   */
  update(dt) {
    if (!this._hasTarget) {
      return this.values;
    }
    if (this.halflife <= 0) {
      // Smoothing disabled: jump straight to the target and drop any momentum.
      this.values.set(this.targets);
      this.velocities.fill(0);
      return this.values;
    }
    if (!Number.isFinite(dt) || dt <= 0) {
      // Bad or empty time step (e.g. first frame of a render loop): hold state.
      return this.values;
    }
    const damping = Math.LN2 / this.halflife;
    const eydt = Math.exp(-damping * dt);
    for (let i = 0; i < NUM_BLENDSHAPES; i++) {
      // j0: offset from the target; j1: velocity plus damping-scaled offset.
      const j0 = this.values[i] - this.targets[i];
      const j1 = this.velocities[i] + j0 * damping;
      const next = eydt * (j0 + j1 * dt) + this.targets[i];
      this.velocities[i] = eydt * (this.velocities[i] - j1 * damping * dt);
      // Keep the published weight inside the [0, 1] range.
      this.values[i] = Math.max(0, Math.min(1, next));
    }
    return this.values;
  }
  /**
   * Decay all spring targets to neutral (0).
   *
   * Call when inference stalls (no new frames for threshold duration).
   * The springs will smoothly close the mouth / relax the face over
   * the halflife period rather than freezing.
   */
  decayToNeutral() {
    this.targets.fill(0);
  }
  /**
   * Reset all state (values, velocities, targets).
   * Call when starting a new playback session.
   */
  reset() {
    this.values.fill(0);
    this.velocities.fill(0);
    this.targets.fill(0);
    this._hasTarget = false;
  }
};
6980
+
6981
+ // src/animation/audioEnergy.ts
6982
/**
 * Root-mean-square level of a block of audio samples.
 *
 * @param samples - Indexable sample buffer (e.g. Float32Array)
 * @returns sqrt(mean(sample^2)), or 0 for an empty buffer
 */
function calculateRMS(samples) {
  const count = samples.length;
  if (count === 0) {
    return 0;
  }
  let total = 0;
  let idx = 0;
  while (idx < count) {
    const value = samples[idx];
    total += value * value;
    idx += 1;
  }
  const meanSquare = total / count;
  return Math.sqrt(meanSquare);
}
6990
/**
 * Largest absolute sample value in a block of audio.
 *
 * @param samples - Indexable sample buffer (e.g. Float32Array)
 * @returns Maximum of |sample|, or 0 for an empty buffer
 */
function calculatePeak(samples) {
  const count = samples.length;
  let loudest = 0;
  let idx = 0;
  while (idx < count) {
    const magnitude = Math.abs(samples[idx]);
    // Strict comparison (not Math.max) so a NaN sample is simply ignored.
    loudest = magnitude > loudest ? magnitude : loudest;
    idx += 1;
  }
  return loudest;
}
6998
/**
 * Tracks smoothed RMS and peak levels of an audio stream.
 *
 * Rising levels are followed quickly (fixed attack weights), falling levels
 * decay according to the configured smoothing factor.
 */
var AudioEnergyAnalyzer = class {
  /**
   * @param smoothingFactor How much to smooth (0 = no smoothing, 1 = infinite smoothing).
   *   Clamped to the range [0, 0.99]. Default 0.85
   * @param noiseFloor Minimum energy threshold to consider as signal. Default 0.01
   */
  constructor(smoothingFactor = 0.85, noiseFloor = 0.01) {
    this.smoothedRMS = 0;
    this.smoothedPeak = 0;
    const cappedAbove = Math.min(0.99, smoothingFactor);
    this.smoothingFactor = Math.max(0, cappedAbove);
    this.noiseFloor = noiseFloor;
  }
  /**
   * Feed one block of samples and get the updated smoothed levels.
   *
   * @param samples Audio samples (Float32Array)
   * @returns Object with `rms`, `peak`, and a combined `energy` value
   *   (0.7 * rms + 0.3 * peak, doubled and capped at 1)
   */
  process(samples) {
    const floor = this.noiseFloor;
    const release = this.smoothingFactor;
    const rawRMS = calculateRMS(samples);
    const rawPeak = calculatePeak(samples);
    // Anything at or below the noise floor is treated as silence.
    const rmsIn = rawRMS > floor ? rawRMS : 0;
    const peakIn = rawPeak > floor ? rawPeak : 0;
    // Attack: blend quickly toward a louder input. Release: decay slowly.
    this.smoothedRMS = rmsIn > this.smoothedRMS
      ? this.smoothedRMS * 0.5 + rmsIn * 0.5
      : this.smoothedRMS * release + rmsIn * (1 - release);
    this.smoothedPeak = peakIn > this.smoothedPeak
      ? this.smoothedPeak * 0.3 + peakIn * 0.7
      : this.smoothedPeak * release + peakIn * (1 - release);
    const combined = this.smoothedRMS * 0.7 + this.smoothedPeak * 0.3;
    return {
      rms: this.smoothedRMS,
      peak: this.smoothedPeak,
      // Scale up and clamp
      energy: Math.min(1, combined * 2)
    };
  }
  /**
   * Clear the smoothed levels back to zero.
   */
  reset() {
    this.smoothedPeak = 0;
    this.smoothedRMS = 0;
  }
  /**
   * Current smoothed RMS value.
   */
  get rms() {
    return this.smoothedRMS;
  }
  /**
   * Current smoothed peak value.
   */
  get peak() {
    return this.smoothedPeak;
  }
};
7057
/**
 * Flags sudden rises in audio energy relative to the recent average.
 */
var EmphasisDetector = class {
  /**
   * @param historySize Number of frames to track. Default 10
   * @param emphasisThreshold Minimum energy increase to count as emphasis. Default 0.15
   */
  constructor(historySize = 10, emphasisThreshold = 0.15) {
    this.energyHistory = [];
    this.historySize = historySize;
    this.emphasisThreshold = emphasisThreshold;
  }
  /**
   * Record one energy reading and compare it with the mean of the readings
   * already in the history window.
   *
   * @param energy Current energy value (0-1)
   * @returns Object with isEmphasis flag and emphasisStrength
   *   (increase / 0.3, capped at 1; 0 when no emphasis was detected)
   */
  process(energy) {
    const history = this.energyHistory;
    history.push(energy);
    if (history.length > this.historySize) {
      history.shift();
    }
    // Everything except the newest entry counts as "previous".
    const priorCount = history.length - 1;
    if (priorCount < 2) {
      // Fewer than three readings in total: not enough context yet.
      return { isEmphasis: false, emphasisStrength: 0 };
    }
    let priorSum = 0;
    for (let idx = 0; idx < priorCount; idx++) {
      priorSum += history[idx];
    }
    const rise = energy - priorSum / priorCount;
    if (rise > this.emphasisThreshold) {
      return { isEmphasis: true, emphasisStrength: Math.min(1, rise / 0.3) };
    }
    return { isEmphasis: false, emphasisStrength: 0 };
  }
  /**
   * Forget all recorded energy readings.
   */
  reset() {
    this.energyHistory = [];
  }
};
7096
+
7429
7097
  // src/inference/SileroVADInference.ts
7430
- var logger11 = createLogger("SileroVAD");
7098
+ var logger12 = createLogger("SileroVAD");
7431
7099
  var SileroVADInference = class {
7432
7100
  constructor(config) {
7433
7101
  this.session = null;
@@ -7501,23 +7169,23 @@ var SileroVADInference = class {
7501
7169
  "model.sample_rate": this.config.sampleRate
7502
7170
  });
7503
7171
  try {
7504
- logger11.info("Loading ONNX Runtime...", { preference: this.config.backend });
7172
+ logger12.info("Loading ONNX Runtime...", { preference: this.config.backend });
7505
7173
  const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
7506
7174
  this.ort = ort;
7507
7175
  this._backend = backend;
7508
- logger11.info("ONNX Runtime loaded", { backend: this._backend });
7176
+ logger12.info("ONNX Runtime loaded", { backend: this._backend });
7509
7177
  const cache = getModelCache();
7510
7178
  const modelUrl = this.config.modelUrl;
7511
7179
  const isCached = await cache.has(modelUrl);
7512
7180
  let modelBuffer;
7513
7181
  if (isCached) {
7514
- logger11.debug("Loading model from cache", { modelUrl });
7182
+ logger12.debug("Loading model from cache", { modelUrl });
7515
7183
  modelBuffer = await cache.get(modelUrl);
7516
7184
  } else {
7517
- logger11.debug("Fetching and caching model", { modelUrl });
7185
+ logger12.debug("Fetching and caching model", { modelUrl });
7518
7186
  modelBuffer = await fetchWithCache(modelUrl);
7519
7187
  }
7520
- logger11.debug("Creating ONNX session", {
7188
+ logger12.debug("Creating ONNX session", {
7521
7189
  size: formatBytes(modelBuffer.byteLength),
7522
7190
  backend: this._backend
7523
7191
  });
@@ -7526,7 +7194,7 @@ var SileroVADInference = class {
7526
7194
  this.session = await ort.InferenceSession.create(modelData, sessionOptions);
7527
7195
  this.reset();
7528
7196
  const loadTimeMs = performance.now() - startTime;
7529
- logger11.info("Model loaded successfully", {
7197
+ logger12.info("Model loaded successfully", {
7530
7198
  backend: this._backend,
7531
7199
  loadTimeMs: Math.round(loadTimeMs),
7532
7200
  sampleRate: this.config.sampleRate,
@@ -7581,7 +7249,7 @@ var SileroVADInference = class {
7581
7249
  []
7582
7250
  );
7583
7251
  } catch (e) {
7584
- logger11.warn("BigInt64Array not available, using bigint array fallback", {
7252
+ logger12.warn("BigInt64Array not available, using bigint array fallback", {
7585
7253
  error: e instanceof Error ? e.message : String(e)
7586
7254
  });
7587
7255
  this.srTensor = new this.ort.Tensor(
@@ -7687,7 +7355,7 @@ var SileroVADInference = class {
7687
7355
  this.preSpeechBuffer.shift();
7688
7356
  }
7689
7357
  }
7690
- logger11.trace("Skipping VAD inference - audio too quiet", {
7358
+ logger12.trace("Skipping VAD inference - audio too quiet", {
7691
7359
  rms: Math.round(rms * 1e4) / 1e4,
7692
7360
  threshold: MIN_ENERGY_THRESHOLD
7693
7361
  });
@@ -7741,7 +7409,7 @@ var SileroVADInference = class {
7741
7409
  if (isSpeech && !this.wasSpeaking) {
7742
7410
  preSpeechChunks = [...this.preSpeechBuffer];
7743
7411
  this.preSpeechBuffer = [];
7744
- logger11.debug("Speech started with pre-speech buffer", {
7412
+ logger12.debug("Speech started with pre-speech buffer", {
7745
7413
  preSpeechChunks: preSpeechChunks.length,
7746
7414
  durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
7747
7415
  });
@@ -7754,7 +7422,7 @@ var SileroVADInference = class {
7754
7422
  this.preSpeechBuffer = [];
7755
7423
  }
7756
7424
  this.wasSpeaking = isSpeech;
7757
- logger11.trace("VAD inference completed", {
7425
+ logger12.trace("VAD inference completed", {
7758
7426
  probability: Math.round(probability * 1e3) / 1e3,
7759
7427
  isSpeech,
7760
7428
  inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100
@@ -7785,7 +7453,7 @@ var SileroVADInference = class {
7785
7453
  const oomError = new Error(
7786
7454
  `SileroVAD inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reducing concurrent model sessions or reloading the page.`
7787
7455
  );
7788
- logger11.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
7456
+ logger12.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
7789
7457
  pointer: `0x${err.toString(16)}`,
7790
7458
  backend: this._backend
7791
7459
  });
@@ -7828,7 +7496,7 @@ var SileroVADInference = class {
7828
7496
  SileroVADInference.isWebGPUAvailable = isWebGPUAvailable;
7829
7497
 
7830
7498
  // src/inference/SileroVADWorker.ts
7831
- var logger12 = createLogger("SileroVADWorker");
7499
+ var logger13 = createLogger("SileroVADWorker");
7832
7500
  var WASM_CDN_PATH5 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
7833
7501
  var LOAD_TIMEOUT_MS3 = 1e4;
7834
7502
  var INFERENCE_TIMEOUT_MS3 = 1e3;
@@ -8106,7 +7774,7 @@ var SileroVADWorker = class {
8106
7774
  this.handleWorkerMessage(event.data);
8107
7775
  };
8108
7776
  worker.onerror = (error) => {
8109
- logger12.error("Worker error", { error: error.message });
7777
+ logger13.error("Worker error", { error: error.message });
8110
7778
  for (const [, resolver] of this.pendingResolvers) {
8111
7779
  resolver.reject(new Error(`Worker error: ${error.message}`));
8112
7780
  }
@@ -8182,9 +7850,9 @@ var SileroVADWorker = class {
8182
7850
  "model.sample_rate": this.config.sampleRate
8183
7851
  });
8184
7852
  try {
8185
- logger12.info("Creating VAD worker...");
7853
+ logger13.info("Creating VAD worker...");
8186
7854
  this.worker = this.createWorker();
8187
- logger12.info("Loading model in worker...", {
7855
+ logger13.info("Loading model in worker...", {
8188
7856
  modelUrl: this.config.modelUrl,
8189
7857
  sampleRate: this.config.sampleRate
8190
7858
  });
@@ -8200,7 +7868,7 @@ var SileroVADWorker = class {
8200
7868
  );
8201
7869
  this._isLoaded = true;
8202
7870
  const loadTimeMs = performance.now() - startTime;
8203
- logger12.info("VAD worker loaded successfully", {
7871
+ logger13.info("VAD worker loaded successfully", {
8204
7872
  backend: "wasm",
8205
7873
  loadTimeMs: Math.round(loadTimeMs),
8206
7874
  workerLoadTimeMs: Math.round(result.loadTimeMs),
@@ -8307,7 +7975,7 @@ var SileroVADWorker = class {
8307
7975
  if (isSpeech && !this.wasSpeaking) {
8308
7976
  preSpeechChunks = [...this.preSpeechBuffer];
8309
7977
  this.preSpeechBuffer = [];
8310
- logger12.debug("Speech started with pre-speech buffer", {
7978
+ logger13.debug("Speech started with pre-speech buffer", {
8311
7979
  preSpeechChunks: preSpeechChunks.length,
8312
7980
  durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
8313
7981
  });
@@ -8320,7 +7988,7 @@ var SileroVADWorker = class {
8320
7988
  this.preSpeechBuffer = [];
8321
7989
  }
8322
7990
  this.wasSpeaking = isSpeech;
8323
- logger12.trace("VAD worker inference completed", {
7991
+ logger13.trace("VAD worker inference completed", {
8324
7992
  probability: Math.round(result.probability * 1e3) / 1e3,
8325
7993
  isSpeech,
8326
7994
  inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
@@ -8388,44 +8056,44 @@ var SileroVADWorker = class {
8388
8056
  };
8389
8057
 
8390
8058
  // src/inference/createSileroVAD.ts
8391
- var logger13 = createLogger("createSileroVAD");
8059
+ var logger14 = createLogger("createSileroVAD");
8392
8060
  function supportsVADWorker() {
8393
8061
  if (typeof Worker === "undefined") {
8394
- logger13.debug("Worker not supported: Worker constructor undefined");
8062
+ logger14.debug("Worker not supported: Worker constructor undefined");
8395
8063
  return false;
8396
8064
  }
8397
8065
  if (typeof URL === "undefined" || typeof URL.createObjectURL === "undefined") {
8398
- logger13.debug("Worker not supported: URL.createObjectURL unavailable");
8066
+ logger14.debug("Worker not supported: URL.createObjectURL unavailable");
8399
8067
  return false;
8400
8068
  }
8401
8069
  if (typeof Blob === "undefined") {
8402
- logger13.debug("Worker not supported: Blob constructor unavailable");
8070
+ logger14.debug("Worker not supported: Blob constructor unavailable");
8403
8071
  return false;
8404
8072
  }
8405
8073
  return true;
8406
8074
  }
8407
8075
  function createSileroVAD(config) {
8408
8076
  if (config.unifiedWorker) {
8409
- logger13.info("Creating SileroVADUnifiedAdapter (shared unified worker)");
8077
+ logger14.info("Creating SileroVADUnifiedAdapter (shared unified worker)");
8410
8078
  return new SileroVADUnifiedAdapter(config.unifiedWorker, config);
8411
8079
  }
8412
8080
  const fallbackOnError = config.fallbackOnError ?? true;
8413
8081
  let useWorker;
8414
8082
  if (config.useWorker !== void 0) {
8415
8083
  useWorker = config.useWorker;
8416
- logger13.debug("Worker preference explicitly set", { useWorker });
8084
+ logger14.debug("Worker preference explicitly set", { useWorker });
8417
8085
  } else {
8418
8086
  const workerSupported = supportsVADWorker();
8419
8087
  const onMobile = isMobile();
8420
8088
  useWorker = workerSupported && !onMobile;
8421
- logger13.debug("Auto-detected Worker preference", {
8089
+ logger14.debug("Auto-detected Worker preference", {
8422
8090
  useWorker,
8423
8091
  workerSupported,
8424
8092
  onMobile
8425
8093
  });
8426
8094
  }
8427
8095
  if (useWorker) {
8428
- logger13.info("Creating SileroVADWorker (off-main-thread)");
8096
+ logger14.info("Creating SileroVADWorker (off-main-thread)");
8429
8097
  const worker = new SileroVADWorker({
8430
8098
  modelUrl: config.modelUrl,
8431
8099
  sampleRate: config.sampleRate,
@@ -8437,7 +8105,7 @@ function createSileroVAD(config) {
8437
8105
  }
8438
8106
  return worker;
8439
8107
  }
8440
- logger13.info("Creating SileroVADInference (main thread)");
8108
+ logger14.info("Creating SileroVADInference (main thread)");
8441
8109
  return new SileroVADInference(config);
8442
8110
  }
8443
8111
  var VADWorkerWithFallback = class {
@@ -8463,7 +8131,7 @@ var VADWorkerWithFallback = class {
8463
8131
  try {
8464
8132
  return await this.implementation.load();
8465
8133
  } catch (error) {
8466
- logger13.warn("Worker load failed, falling back to main thread", {
8134
+ logger14.warn("Worker load failed, falling back to main thread", {
8467
8135
  error: error instanceof Error ? error.message : String(error)
8468
8136
  });
8469
8137
  try {
@@ -8472,7 +8140,7 @@ var VADWorkerWithFallback = class {
8472
8140
  }
8473
8141
  this.implementation = new SileroVADInference(this.config);
8474
8142
  this.hasFallenBack = true;
8475
- logger13.info("Fallback to SileroVADInference successful");
8143
+ logger14.info("Fallback to SileroVADInference successful");
8476
8144
  return await this.implementation.load();
8477
8145
  }
8478
8146
  }
@@ -8493,8 +8161,175 @@ var VADWorkerWithFallback = class {
8493
8161
  }
8494
8162
  };
8495
8163
 
8164
+ // src/inference/A2EOrchestrator.ts
8165
+ var logger15 = createLogger("A2EOrchestrator");
8166
/**
 * Microphone-driven audio-to-expression orchestrator.
 *
 * Owns three things: the A2E backend created by `createA2E`, an
 * `A2EProcessor` that consumes audio and produces blendshape frames, and a raw
 * Web Audio capture graph (MediaStream -> ScriptProcessorNode) that feeds
 * microphone samples into the processor.
 *
 * Lifecycle: `load()` -> `start()` -> `stop()` (repeatable) -> `dispose()`.
 */
var A2EOrchestrator = class {
  /**
   * @param config Orchestrator options. Fields read by this class:
   *   `gpuModelUrl`, `gpuExternalDataUrl`, `cpuModelUrl`, `a2eConfig`,
   *   `sampleRate` (defaults to 16000), `chunkSize`, and the optional
   *   callbacks `onFrame`, `onError`, `onReady`.
   */
  constructor(config) {
    this.a2e = null;
    this.processor = null;
    // Mic capture state (lightweight: raw Web Audio, no dependency on the
    // MicrophoneCapture class, which requires an external EventEmitter).
    this.stream = null;
    this.audioContext = null;
    this.scriptProcessor = null;
    this.nativeSampleRate = 0;
    this._isReady = false;
    this._isStreaming = false;
    this._backend = null;
    this.disposed = false;
    this.config = {
      ...config,
      // Resolve the default AFTER spreading so that an explicit
      // `sampleRate: undefined` cannot erase it; an undefined rate would turn
      // the resample ratio into NaN and silently produce empty buffers.
      sampleRate: config?.sampleRate ?? 16e3
    };
  }
  /** Latest blendshape weights from inference (null if none yet) */
  get latestWeights() {
    return this.processor?.latestFrame ?? null;
  }
  /** Whether the model is loaded and ready for inference */
  get isReady() {
    return this._isReady;
  }
  /** Whether mic is active and inference loop is running */
  get isStreaming() {
    return this._isStreaming;
  }
  /** Current backend type (webgpu, wasm, or null) */
  get backend() {
    return this._backend;
  }
  /**
   * Load the A2E model and create the processor.
   *
   * Invokes `config.onReady` once the processor exists.
   *
   * @throws {Error} If the orchestrator was disposed before the call or while
   *   the model was loading, or if the backend fails to load.
   */
  async load() {
    if (this.disposed) throw new Error("A2EOrchestrator has been disposed");
    logger15.info("Loading A2E model...");
    const a2e = createA2E({
      gpuModelUrl: this.config.gpuModelUrl,
      gpuExternalDataUrl: this.config.gpuExternalDataUrl,
      // Reuse the GPU model URL when no dedicated CPU model is configured.
      cpuModelUrl: this.config.cpuModelUrl ?? this.config.gpuModelUrl,
      ...this.config.a2eConfig
    });
    // Publish the backend before awaiting so a concurrent dispose() can
    // release it.
    this.a2e = a2e;
    const info = await a2e.load();
    if (this.disposed) {
      // dispose() ran during the await and already released the backend; do
      // not build a processor or report readiness on a dead instance.
      throw new Error("A2EOrchestrator has been disposed");
    }
    this._backend = info.backend;
    this.processor = new A2EProcessor({
      backend: a2e,
      sampleRate: this.config.sampleRate,
      chunkSize: this.config.chunkSize,
      onFrame: this.config.onFrame,
      onError: this.config.onError
    });
    this._isReady = true;
    logger15.info("A2E model loaded", {
      backend: info.backend,
      loadTimeMs: info.loadTimeMs,
      modelId: a2e.modelId
    });
    this.config.onReady?.();
  }
  /**
   * Start mic capture and inference loop.
   *
   * No-op when already streaming. On any failure the partially built capture
   * graph is torn down (mic tracks stopped, AudioContext closed) before the
   * error is reported through `config.onError` and rethrown.
   *
   * @throws {Error} If disposed, if `load()` has not completed, or if
   *   microphone / Web Audio setup fails.
   */
  async start() {
    if (this.disposed) throw new Error("A2EOrchestrator has been disposed");
    if (!this._isReady || !this.processor) throw new Error("Model not loaded. Call load() first.");
    if (this._isStreaming) return;
    try {
      this.stream = await navigator.mediaDevices.getUserMedia({
        audio: {
          sampleRate: { ideal: this.config.sampleRate },
          channelCount: 1,
          echoCancellation: true,
          noiseSuppression: true,
          autoGainControl: true
        }
      });
      // dispose() may have run while the permission prompt was open.
      if (this.disposed || !this.processor) {
        throw new Error("A2EOrchestrator has been disposed");
      }
      this.audioContext = new AudioContext({ sampleRate: this.config.sampleRate });
      if (this.audioContext.state === "suspended") {
        await this.audioContext.resume();
      }
      // The context may not honor the requested rate; remember what we got so
      // the audio callback can resample.
      this.nativeSampleRate = this.audioContext.sampleRate;
      const source = this.audioContext.createMediaStreamSource(this.stream);
      this.scriptProcessor = this.audioContext.createScriptProcessor(4096, 1, 1);
      this.scriptProcessor.onaudioprocess = (e) => {
        if (!this._isStreaming || !this.processor) return;
        const input = e.inputBuffer.getChannelData(0);
        this.processor.pushAudio(this._resampleToTarget(input));
      };
      source.connect(this.scriptProcessor);
      this.scriptProcessor.connect(this.audioContext.destination);
      this._isStreaming = true;
      this.processor.startDrip();
      logger15.info("Mic capture started", { sampleRate: this.nativeSampleRate });
    } catch (err) {
      const error = err instanceof Error ? err : new Error(String(err));
      logger15.error("Failed to start mic capture", { error: error.message });
      // Undo whatever was set up before the failure so the microphone is not
      // left recording and the AudioContext is not leaked.
      if (this._isStreaming) {
        this._isStreaming = false;
        this.processor?.stopDrip();
      }
      this._releaseCapture();
      this.config.onError?.(error);
      throw error;
    }
  }
  /**
   * Stop mic capture and inference loop.
   *
   * Safe to call when nothing is running. The loaded model and processor are
   * kept so `start()` can be called again.
   */
  stop() {
    this._isStreaming = false;
    if (this.processor) {
      this.processor.stopDrip();
      this.processor.reset();
    }
    this._releaseCapture();
    logger15.info("Mic capture stopped");
  }
  /**
   * Dispose of all resources.
   *
   * Stops capture, disposes the processor and the backend, and marks the
   * instance unusable. Subsequent calls are no-ops.
   */
  async dispose() {
    if (this.disposed) return;
    this.disposed = true;
    this.stop();
    if (this.processor) {
      this.processor.dispose();
      this.processor = null;
    }
    if (this.a2e) {
      await this.a2e.dispose();
      this.a2e = null;
    }
    this._isReady = false;
    this._backend = null;
  }
  /**
   * Tear down the Web Audio capture graph and release the microphone.
   * Each resource is checked individually, so this is safe on a partially
   * constructed graph. Internal helper shared by `stop()` and the `start()`
   * failure path.
   */
  _releaseCapture() {
    if (this.scriptProcessor) {
      this.scriptProcessor.disconnect();
      this.scriptProcessor.onaudioprocess = null;
      this.scriptProcessor = null;
    }
    if (this.stream) {
      this.stream.getTracks().forEach((t) => t.stop());
      this.stream = null;
    }
    if (this.audioContext) {
      // Best-effort: a failed close() leaves nothing further to clean up.
      this.audioContext.close().catch(() => {
      });
      this.audioContext = null;
    }
  }
  /**
   * Convert one capture buffer from the context's sample rate to
   * `config.sampleRate` using linear interpolation. Always returns a new
   * Float32Array (a plain copy when the rates already match), so the caller
   * never holds a reference to the audio engine's buffer.
   *
   * @param input Mono samples at `nativeSampleRate`.
   * @returns Mono samples at `config.sampleRate`.
   */
  _resampleToTarget(input) {
    const targetRate = this.config.sampleRate;
    if (this.nativeSampleRate === targetRate) {
      return new Float32Array(input);
    }
    const ratio = targetRate / this.nativeSampleRate;
    const outLength = Math.round(input.length * ratio);
    const output = new Float32Array(outLength);
    const lastIdx = input.length - 1;
    for (let i = 0; i < outLength; i++) {
      const srcPos = i / ratio;
      // Clamp both neighbours so rounding of outLength can never index past
      // the end of the input buffer.
      const lo = Math.min(Math.floor(srcPos), lastIdx);
      const hi = Math.min(lo + 1, lastIdx);
      const frac = srcPos - lo;
      output[i] = input[lo] * (1 - frac) + input[hi] * frac;
    }
    return output;
  }
};
8330
+
8496
8331
  // src/inference/SafariSpeechRecognition.ts
8497
- var logger14 = createLogger("SafariSpeech");
8332
+ var logger16 = createLogger("SafariSpeech");
8498
8333
  var SafariSpeechRecognition = class _SafariSpeechRecognition {
8499
8334
  constructor(config = {}) {
8500
8335
  this.recognition = null;
@@ -8513,7 +8348,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
8513
8348
  interimResults: config.interimResults ?? true,
8514
8349
  maxAlternatives: config.maxAlternatives ?? 1
8515
8350
  };
8516
- logger14.debug("SafariSpeechRecognition created", {
8351
+ logger16.debug("SafariSpeechRecognition created", {
8517
8352
  language: this.config.language,
8518
8353
  continuous: this.config.continuous
8519
8354
  });
@@ -8574,7 +8409,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
8574
8409
  */
8575
8410
  async start() {
8576
8411
  if (this.isListening) {
8577
- logger14.warn("Already listening");
8412
+ logger16.warn("Already listening");
8578
8413
  return;
8579
8414
  }
8580
8415
  if (!_SafariSpeechRecognition.isAvailable()) {
@@ -8604,7 +8439,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
8604
8439
  this.isListening = true;
8605
8440
  this.startTime = performance.now();
8606
8441
  this.accumulatedText = "";
8607
- logger14.info("Speech recognition started", {
8442
+ logger16.info("Speech recognition started", {
8608
8443
  language: this.config.language
8609
8444
  });
8610
8445
  span?.end();
@@ -8619,7 +8454,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
8619
8454
  */
8620
8455
  async stop() {
8621
8456
  if (!this.isListening || !this.recognition) {
8622
- logger14.warn("Not currently listening");
8457
+ logger16.warn("Not currently listening");
8623
8458
  return {
8624
8459
  text: this.accumulatedText,
8625
8460
  language: this.config.language,
@@ -8648,7 +8483,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
8648
8483
  if (this.recognition && this.isListening) {
8649
8484
  this.recognition.abort();
8650
8485
  this.isListening = false;
8651
- logger14.info("Speech recognition aborted");
8486
+ logger16.info("Speech recognition aborted");
8652
8487
  }
8653
8488
  }
8654
8489
  /**
@@ -8679,7 +8514,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
8679
8514
  this.isListening = false;
8680
8515
  this.resultCallbacks = [];
8681
8516
  this.errorCallbacks = [];
8682
- logger14.debug("SafariSpeechRecognition disposed");
8517
+ logger16.debug("SafariSpeechRecognition disposed");
8683
8518
  }
8684
8519
  /**
8685
8520
  * Set up event handlers for the recognition instance
@@ -8707,7 +8542,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
8707
8542
  confidence: alternative.confidence
8708
8543
  };
8709
8544
  this.emitResult(speechResult);
8710
- logger14.trace("Speech result", {
8545
+ logger16.trace("Speech result", {
8711
8546
  text: text.substring(0, 50),
8712
8547
  isFinal,
8713
8548
  confidence: alternative.confidence
@@ -8717,12 +8552,12 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
8717
8552
  span?.end();
8718
8553
  } catch (error) {
8719
8554
  span?.endWithError(error instanceof Error ? error : new Error(String(error)));
8720
- logger14.error("Error processing speech result", { error });
8555
+ logger16.error("Error processing speech result", { error });
8721
8556
  }
8722
8557
  };
8723
8558
  this.recognition.onerror = (event) => {
8724
8559
  const error = new Error(`Speech recognition error: ${event.error} - ${event.message}`);
8725
- logger14.error("Speech recognition error", { error: event.error, message: event.message });
8560
+ logger16.error("Speech recognition error", { error: event.error, message: event.message });
8726
8561
  this.emitError(error);
8727
8562
  if (this.stopRejecter) {
8728
8563
  this.stopRejecter(error);
@@ -8732,7 +8567,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
8732
8567
  };
8733
8568
  this.recognition.onend = () => {
8734
8569
  this.isListening = false;
8735
- logger14.info("Speech recognition ended", {
8570
+ logger16.info("Speech recognition ended", {
8736
8571
  totalText: this.accumulatedText.length,
8737
8572
  durationMs: performance.now() - this.startTime
8738
8573
  });
@@ -8749,13 +8584,13 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
8749
8584
  }
8750
8585
  };
8751
8586
  this.recognition.onstart = () => {
8752
- logger14.debug("Speech recognition started by browser");
8587
+ logger16.debug("Speech recognition started by browser");
8753
8588
  };
8754
8589
  this.recognition.onspeechstart = () => {
8755
- logger14.debug("Speech detected");
8590
+ logger16.debug("Speech detected");
8756
8591
  };
8757
8592
  this.recognition.onspeechend = () => {
8758
- logger14.debug("Speech ended");
8593
+ logger16.debug("Speech ended");
8759
8594
  };
8760
8595
  }
8761
8596
  /**
@@ -8766,7 +8601,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
8766
8601
  try {
8767
8602
  callback(result);
8768
8603
  } catch (error) {
8769
- logger14.error("Error in result callback", { error });
8604
+ logger16.error("Error in result callback", { error });
8770
8605
  }
8771
8606
  }
8772
8607
  }
@@ -8778,7 +8613,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
8778
8613
  try {
8779
8614
  callback(error);
8780
8615
  } catch (callbackError) {
8781
- logger14.error("Error in error callback", { error: callbackError });
8616
+ logger16.error("Error in error callback", { error: callbackError });
8782
8617
  }
8783
8618
  }
8784
8619
  }
@@ -9191,13 +9026,14 @@ var AgentCoreAdapter = class extends EventEmitter {
9191
9026
  if (!this.lam) {
9192
9027
  throw new Error("LAM must be initialized before pipeline");
9193
9028
  }
9194
- this.pipeline = new SyncedAudioPipeline({
9029
+ this.pipeline = new FullFacePipeline({
9195
9030
  lam: this.lam,
9196
9031
  sampleRate: 16e3,
9197
9032
  chunkTargetMs: 200
9198
9033
  });
9199
9034
  await this.pipeline.initialize();
9200
- this.pipeline.on("frame_ready", (frame) => {
9035
+ this.pipeline.on("full_frame_ready", (fullFrame) => {
9036
+ const frame = fullFrame.blendshapes;
9201
9037
  this.emit("animation", {
9202
9038
  blendshapes: frame,
9203
9039
  get: (name) => {
@@ -9376,9 +9212,9 @@ var AgentCoreAdapter = class extends EventEmitter {
9376
9212
  });
9377
9213
  }
9378
9214
  }
9379
- // REMOVED: processAudioForAnimation() - now handled by SyncedAudioPipeline
9215
+ // REMOVED: processAudioForAnimation() - now handled by FullFacePipeline
9380
9216
  // The pipeline manages audio scheduling, LAM inference, and frame synchronization
9381
- // Frames are emitted via pipeline.on('frame_ready') event (see initPipeline())
9217
+ // Frames are emitted via pipeline.on('full_frame_ready') event (see initPipeline())
9382
9218
  /**
9383
9219
  * Detect voice activity using Silero VAD
9384
9220
  * Falls back to simple RMS if VAD not available
@@ -11189,6 +11025,8 @@ function isProtocolEvent(obj) {
11189
11025
  return typeof obj === "object" && obj !== null && "v" in obj && "type" in obj && "ts" in obj;
11190
11026
  }
11191
11027
  export {
11028
+ A2EOrchestrator,
11029
+ A2EProcessor,
11192
11030
  ARKIT_BLENDSHAPES,
11193
11031
  AgentCoreAdapter,
11194
11032
  AnimationGraph,
@@ -11196,23 +11034,22 @@ export {
11196
11034
  AudioEnergyAnalyzer,
11197
11035
  AudioScheduler,
11198
11036
  AudioSyncManager,
11037
+ BLENDSHAPE_TO_GROUP,
11038
+ BlendshapeSmoother,
11199
11039
  CTC_VOCAB,
11200
11040
  ConsoleExporter,
11201
11041
  ConversationOrchestrator,
11202
11042
  DEFAULT_ANIMATION_CONFIG,
11203
11043
  DEFAULT_LOGGING_CONFIG,
11204
- EMOTION_ARKIT_MAP,
11205
11044
  EMOTION_NAMES,
11206
11045
  EMOTION_VECTOR_SIZE,
11207
11046
  EmotionController,
11208
11047
  EmotionPresets,
11209
- EmotionToBlendshapeMapper,
11210
11048
  EmphasisDetector,
11211
11049
  EventEmitter,
11212
11050
  FullFacePipeline,
11213
11051
  INFERENCE_LATENCY_BUCKETS,
11214
11052
  InterruptionHandler,
11215
- LAMPipeline,
11216
11053
  LAM_BLENDSHAPES,
11217
11054
  LOG_LEVEL_PRIORITY,
11218
11055
  MODEL_LOAD_TIME_BUCKETS,
@@ -11231,73 +11068,54 @@ export {
11231
11068
  SileroVADInference,
11232
11069
  SileroVADUnifiedAdapter,
11233
11070
  SileroVADWorker,
11234
- SyncedAudioPipeline,
11235
11071
  TenantManager,
11236
- UPPER_FACE_BLENDSHAPES,
11237
11072
  UnifiedInferenceWorker,
11238
- WAV2ARKIT_BLENDSHAPES,
11239
11073
  Wav2ArkitCpuInference,
11240
11074
  Wav2ArkitCpuUnifiedAdapter,
11241
11075
  Wav2ArkitCpuWorker,
11242
11076
  Wav2Vec2Inference,
11243
- applyCMVN,
11244
- applyLFR,
11245
11077
  blendEmotions,
11246
11078
  calculatePeak,
11247
11079
  calculateRMS,
11248
- computeKaldiFbank,
11249
11080
  configureCacheLimit,
11250
11081
  configureLogging,
11251
11082
  configureTelemetry,
11083
+ createA2E,
11252
11084
  createEmotionVector,
11253
- createLipSync,
11254
11085
  createLogger,
11255
11086
  createSenseVoice,
11256
- createSessionWithFallback,
11257
11087
  createSileroVAD,
11258
- ctcGreedyDecode,
11259
11088
  fetchWithCache,
11260
11089
  formatBytes,
11261
11090
  getCacheConfig,
11262
11091
  getCacheKey,
11263
11092
  getEmotionPreset,
11264
- getLoadedBackend,
11265
11093
  getLoggingConfig,
11266
11094
  getModelCache,
11267
- getOnnxRuntime,
11268
- getOnnxRuntimeForPreference,
11269
11095
  getOptimalWasmThreads,
11270
11096
  getRecommendedBackend,
11271
- getSessionOptions,
11272
11097
  getTelemetry,
11273
11098
  hasWebGPUApi,
11274
11099
  isAndroid,
11275
11100
  isIOS,
11276
11101
  isIOSSafari,
11277
11102
  isMobile,
11278
- isOnnxRuntimeLoaded,
11279
11103
  isProtocolEvent,
11280
11104
  isSafari,
11281
11105
  isSpeechRecognitionAvailable,
11282
11106
  isWebGPUAvailable,
11107
+ lerpBlendshapes,
11283
11108
  lerpEmotion,
11284
11109
  noopLogger,
11285
- parseCMVNFromMetadata,
11286
- parseTokensFile,
11287
11110
  preloadModels,
11288
- preloadOnnxRuntime,
11289
- remapWav2ArkitToLam,
11290
11111
  resetLoggingConfig,
11291
11112
  resolveBackend,
11292
- resolveLanguageId,
11293
- resolveTextNormId,
11294
11113
  setLogLevel,
11295
11114
  setLoggingEnabled,
11296
11115
  shouldEnableWasmProxy,
11297
- shouldUseCpuLipSync,
11116
+ shouldUseCpuA2E,
11298
11117
  shouldUseNativeASR,
11299
- shouldUseServerLipSync,
11300
- supportsVADWorker,
11301
- symmetrizeBlendshapes
11118
+ shouldUseServerA2E,
11119
+ supportsVADWorker
11302
11120
  };
11303
11121
  //# sourceMappingURL=index.mjs.map