@omote/core 0.4.7 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -485,730 +485,353 @@ var AudioChunkCoalescer = class {
485
485
  }
486
486
  };
487
487
 
488
- // src/audio/LAMPipeline.ts
489
- var LAMPipeline = class {
490
- constructor(options = {}) {
491
- this.options = options;
492
- this.REQUIRED_SAMPLES = 16e3;
493
- // 1.0s at 16kHz (LAM requirement)
494
- this.FRAME_RATE = 30;
495
- // LAM outputs 30fps
496
- this.buffer = new Float32Array(0);
488
+ // src/inference/A2EProcessor.ts
489
+ var logger = createLogger("A2EProcessor");
490
+ var FRAME_RATE = 30;
491
+ var DRIP_INTERVAL_MS = 33;
492
+ var A2EProcessor = class {
493
+ constructor(config) {
494
+ this.writeOffset = 0;
497
495
  this.bufferStartTime = 0;
498
- this.frameQueue = [];
499
- /**
500
- * Last successfully retrieved frame
501
- * Used as fallback when no new frame is available to prevent avatar freezing
502
- */
503
- this.lastFrame = null;
504
- }
505
- /**
506
- * Push audio samples into the pipeline
496
+ // Frame queues (timestamped for pull mode, plain for drip mode)
497
+ this.timestampedQueue = [];
498
+ this.plainQueue = [];
499
+ // Push mode state
500
+ this._latestFrame = null;
501
+ this.dripInterval = null;
502
+ // Last-frame-hold for pull mode (prevents avatar freezing between frames)
503
+ this.lastPulledFrame = null;
504
+ // Inference serialization
505
+ this.inferenceRunning = false;
506
+ this.pendingChunks = [];
507
+ // Diagnostic: track getFrameForTime calls
508
+ this.getFrameCallCount = 0;
509
+ this.disposed = false;
510
+ this.backend = config.backend;
511
+ this.sampleRate = config.sampleRate ?? 16e3;
512
+ this.chunkSize = config.chunkSize ?? config.backend.chunkSize ?? 16e3;
513
+ this.onFrame = config.onFrame;
514
+ this.onError = config.onError;
515
+ this.bufferCapacity = this.chunkSize * 2;
516
+ this.buffer = new Float32Array(this.bufferCapacity);
517
+ }
518
+ // ═══════════════════════════════════════════════════════════════════════
519
+ // Audio Input
520
+ // ═══════════════════════════════════════════════════════════════════════
521
+ /**
522
+ * Push audio samples for inference (any source: mic, TTS, file).
507
523
  *
508
- * Accumulates samples and triggers LAM inference when buffer is full.
509
- * Multiple calls may be needed to accumulate enough samples.
524
+ * - With `timestamp`: frames stored with timestamps (pull mode)
525
+ * - Without `timestamp`: frames stored in plain queue (drip/push mode)
510
526
  *
511
- * @param samples - Float32Array of audio samples
512
- * @param timestamp - AudioContext time when these samples start playing
513
- * @param lam - LAM inference engine
527
+ * Fire-and-forget: returns immediately, inference runs async.
514
528
  */
515
- async push(samples, timestamp, lam) {
516
- if (this.buffer.length === 0) {
529
+ pushAudio(samples, timestamp) {
530
+ if (this.disposed) return;
531
+ if (this.writeOffset === 0 && timestamp !== void 0) {
517
532
  this.bufferStartTime = timestamp;
518
533
  }
519
- const newBuffer = new Float32Array(this.buffer.length + samples.length);
520
- newBuffer.set(this.buffer, 0);
521
- newBuffer.set(samples, this.buffer.length);
522
- this.buffer = newBuffer;
523
- while (this.buffer.length >= this.REQUIRED_SAMPLES) {
524
- await this.processBuffer(lam);
525
- if (this.buffer.length >= this.REQUIRED_SAMPLES) {
526
- await new Promise((r) => setTimeout(r, 0));
534
+ if (this.writeOffset + samples.length > this.bufferCapacity) {
535
+ this.bufferCapacity = (this.writeOffset + samples.length) * 2;
536
+ const grown = new Float32Array(this.bufferCapacity);
537
+ grown.set(this.buffer.subarray(0, this.writeOffset));
538
+ this.buffer = grown;
539
+ }
540
+ this.buffer.set(samples, this.writeOffset);
541
+ this.writeOffset += samples.length;
542
+ logger.debug("pushAudio", {
543
+ samplesIn: samples.length,
544
+ writeOffset: this.writeOffset,
545
+ chunkSize: this.chunkSize,
546
+ willExtract: this.writeOffset >= this.chunkSize,
547
+ inferenceRunning: this.inferenceRunning,
548
+ pendingChunks: this.pendingChunks.length,
549
+ queuedFrames: this.timestampedQueue.length + this.plainQueue.length
550
+ });
551
+ while (this.writeOffset >= this.chunkSize) {
552
+ const chunk = this.buffer.slice(0, this.chunkSize);
553
+ this.buffer.copyWithin(0, this.chunkSize, this.writeOffset);
554
+ this.writeOffset -= this.chunkSize;
555
+ const chunkTimestamp = timestamp !== void 0 ? this.bufferStartTime : void 0;
556
+ this.pendingChunks.push({ chunk, timestamp: chunkTimestamp });
557
+ logger.info("Chunk queued for inference", {
558
+ chunkSize: chunk.length,
559
+ chunkTimestamp,
560
+ pendingChunks: this.pendingChunks.length,
561
+ remainderOffset: this.writeOffset
562
+ });
563
+ if (timestamp !== void 0) {
564
+ this.bufferStartTime += this.chunkSize / this.sampleRate;
527
565
  }
528
566
  }
567
+ this.drainPendingChunks();
529
568
  }
530
569
  /**
531
- * Process accumulated buffer through LAM inference
532
- */
533
- async processBuffer(lam) {
534
- try {
535
- const toProcess = this.buffer.slice(0, this.REQUIRED_SAMPLES);
536
- const processedStartTime = this.bufferStartTime;
537
- this.buffer = this.buffer.slice(this.REQUIRED_SAMPLES);
538
- const processedDuration = this.REQUIRED_SAMPLES / (this.options.sampleRate ?? 16e3);
539
- this.bufferStartTime = processedStartTime + processedDuration;
540
- const result = await lam.infer(toProcess);
541
- const frameDuration = 1 / this.FRAME_RATE;
542
- for (let i = 0; i < result.blendshapes.length; i++) {
543
- const frame = result.blendshapes[i];
544
- const timestamp = processedStartTime + i * frameDuration;
545
- this.frameQueue.push({ frame, timestamp });
546
- }
547
- this.options.onInference?.(result.blendshapes.length);
548
- } catch (error) {
549
- this.options.onError?.(error);
550
- this.buffer = new Float32Array(0);
551
- this.bufferStartTime = 0;
552
- }
553
- }
554
- /**
555
- * Get the frame that should be displayed at the current time
556
- *
557
- * Automatically removes frames that have already been displayed.
558
- * This prevents memory leaks from accumulating old frames.
559
- *
560
- * Discard Window (prevents premature frame discarding):
561
- * - WebGPU: 0.5s (LAM inference 20-100ms + RAF jitter + React stalls)
562
- * - WASM: 1.0s (LAM inference 50-500ms + higher variability)
563
- *
564
- * Last-Frame-Hold: Returns last valid frame instead of null to prevent
565
- * avatar freezing when between frames (RAF at 60fps vs LAM at 30fps).
570
+ * Flush remaining buffered audio (pads to chunkSize).
571
+ * Call at end of stream to process final partial chunk.
566
572
  *
567
- * @param currentTime - Current AudioContext time
568
- * @param lam - LAM inference engine (optional, for backend detection)
569
- * @returns Current frame, or last frame as fallback, or null if no frames yet
570
- */
571
- getFrameForTime(currentTime, lam) {
572
- const discardWindow = lam?.backend === "wasm" ? 1 : 0.5;
573
- let discardedCount = 0;
574
- while (this.frameQueue.length > 0 && this.frameQueue[0].timestamp < currentTime - discardWindow) {
575
- const discarded = this.frameQueue.shift();
576
- discardedCount++;
577
- if (discardedCount === 1) {
578
- const ageMs = ((currentTime - discarded.timestamp) * 1e3).toFixed(0);
579
- console.warn("[LAM] Frame(s) discarded as too old", {
580
- ageMs,
581
- discardWindowMs: discardWindow * 1e3,
582
- queueLength: this.frameQueue.length,
583
- backend: lam?.backend ?? "unknown"
584
- });
585
- }
586
- }
587
- if (this.frameQueue.length > 0 && this.frameQueue[0].timestamp <= currentTime) {
588
- const { frame } = this.frameQueue.shift();
589
- this.lastFrame = frame;
590
- return frame;
591
- }
592
- return this.lastFrame;
593
- }
594
- /**
595
- * Get all frames in the queue (for debugging/monitoring)
573
+ * Routes through the serialized pendingChunks pipeline to maintain
574
+ * correct frame ordering. Without this, flush() could push frames
575
+ * with the latest timestamp to the queue before drainPendingChunks()
576
+ * finishes pushing frames with earlier timestamps — causing
577
+ * getFrameForTime() to see out-of-order timestamps and stall.
596
578
  */
597
- getQueuedFrames() {
598
- return [...this.frameQueue];
599
- }
600
- /**
601
- * Get current buffer fill level (0-1)
602
- */
603
- get fillLevel() {
604
- return Math.min(1, this.buffer.length / this.REQUIRED_SAMPLES);
605
- }
606
- /**
607
- * Get number of frames queued
608
- */
609
- get queuedFrameCount() {
610
- return this.frameQueue.length;
611
- }
612
- /**
613
- * Get buffered audio duration in seconds
614
- */
615
- get bufferedDuration() {
616
- return this.buffer.length / (this.options.sampleRate ?? 16e3);
617
- }
618
- /**
619
- * Flush remaining buffered audio
620
- *
621
- * Processes any remaining audio in the buffer, even if less than REQUIRED_SAMPLES.
622
- * This ensures the final audio chunk generates blendshape frames.
623
- *
624
- * Should be called when audio stream ends to prevent losing the last 0-1 seconds.
625
- *
626
- * @param lam - LAM inference engine
627
- */
628
- async flush(lam) {
629
- if (this.buffer.length === 0) {
630
- return;
631
- }
632
- const padded = new Float32Array(this.REQUIRED_SAMPLES);
633
- padded.set(this.buffer, 0);
634
- const processedStartTime = this.bufferStartTime;
635
- try {
636
- const result = await lam.infer(padded);
637
- const actualDuration = this.buffer.length / (this.options.sampleRate ?? 16e3);
638
- const frameDuration = 1 / this.FRAME_RATE;
639
- const actualFrameCount = Math.ceil(actualDuration * this.FRAME_RATE);
640
- for (let i = 0; i < Math.min(actualFrameCount, result.blendshapes.length); i++) {
641
- const frame = result.blendshapes[i];
642
- const timestamp = processedStartTime + i * frameDuration;
643
- this.frameQueue.push({ frame, timestamp });
644
- }
645
- this.buffer = new Float32Array(0);
646
- this.bufferStartTime = 0;
647
- this.options.onInference?.(Math.min(actualFrameCount, result.blendshapes.length));
648
- } catch (error) {
649
- this.options.onError?.(error);
650
- this.buffer = new Float32Array(0);
651
- this.bufferStartTime = 0;
652
- }
653
- }
654
- /**
655
- * Adjust all queued frame timestamps by an offset
656
- *
657
- * Used for synchronization when audio scheduling time differs from
658
- * the estimated time used during LAM processing.
659
- *
660
- * @param offset - Time offset in seconds to add to all timestamps
661
- */
662
- adjustTimestamps(offset) {
663
- for (const frame of this.frameQueue) {
664
- frame.timestamp += offset;
665
- }
579
+ async flush() {
580
+ if (this.disposed || this.writeOffset === 0) return;
581
+ const padded = new Float32Array(this.chunkSize);
582
+ padded.set(this.buffer.subarray(0, this.writeOffset), 0);
583
+ const chunkTimestamp = this.bufferStartTime > 0 ? this.bufferStartTime : void 0;
584
+ logger.info("flush: routing through drain pipeline", {
585
+ actualSamples: this.writeOffset,
586
+ chunkTimestamp: chunkTimestamp?.toFixed(3),
587
+ pendingChunks: this.pendingChunks.length,
588
+ inferenceRunning: this.inferenceRunning
589
+ });
590
+ this.writeOffset = 0;
591
+ this.bufferStartTime = 0;
592
+ this.pendingChunks.push({ chunk: padded, timestamp: chunkTimestamp });
593
+ this.drainPendingChunks();
666
594
  }
667
595
  /**
668
- * Reset the pipeline
596
+ * Reset buffer and frame queues
669
597
  */
670
598
  reset() {
671
- this.buffer = new Float32Array(0);
599
+ this.writeOffset = 0;
672
600
  this.bufferStartTime = 0;
673
- this.frameQueue = [];
674
- this.lastFrame = null;
675
- }
676
- };
677
-
678
- // src/audio/audioUtils.ts
679
- function pcm16ToFloat32(buffer) {
680
- const byteLen = buffer.byteLength & ~1;
681
- const int16 = byteLen === buffer.byteLength ? new Int16Array(buffer) : new Int16Array(buffer, 0, byteLen / 2);
682
- const float32 = new Float32Array(int16.length);
683
- for (let i = 0; i < int16.length; i++) {
684
- float32[i] = int16[i] / 32768;
685
- }
686
- return float32;
687
- }
688
- function int16ToFloat32(int16) {
689
- const float32 = new Float32Array(int16.length);
690
- for (let i = 0; i < int16.length; i++) {
691
- float32[i] = int16[i] / 32768;
692
- }
693
- return float32;
694
- }
695
-
696
- // src/audio/SyncedAudioPipeline.ts
697
- var SyncedAudioPipeline = class extends EventEmitter {
698
- constructor(options) {
699
- super();
700
- this.options = options;
701
- this.playbackStarted = false;
702
- this.monitorInterval = null;
703
- this.frameAnimationId = null;
704
- const sampleRate = options.sampleRate ?? 16e3;
705
- const autoDelay = options.lam.modelId === "wav2arkit_cpu" ? 750 : options.lam.backend === "wasm" ? 350 : 50;
706
- const audioDelayMs = options.audioDelayMs ?? autoDelay;
707
- this.scheduler = new AudioScheduler({
708
- sampleRate,
709
- initialLookaheadSec: audioDelayMs / 1e3
710
- });
711
- this.coalescer = new AudioChunkCoalescer({
712
- sampleRate,
713
- targetDurationMs: options.chunkTargetMs ?? 200
714
- });
715
- this.lamPipeline = new LAMPipeline({
716
- sampleRate,
717
- onError: (error) => {
718
- this.emit("error", error);
719
- }
720
- });
721
- }
722
- /**
723
- * Initialize the pipeline
724
- */
725
- async initialize() {
726
- await this.scheduler.initialize();
727
- }
728
- /**
729
- * Start a new playback session
601
+ this.timestampedQueue = [];
602
+ this.plainQueue = [];
603
+ this._latestFrame = null;
604
+ this.lastPulledFrame = null;
605
+ this.pendingChunks = [];
606
+ this.inferenceRunning = false;
607
+ this.getFrameCallCount = 0;
608
+ }
609
+ // ═══════════════════════════════════════════════════════════════════════
610
+ // Frame Output Pull Mode (TTS playback)
611
+ // ═══════════════════════════════════════════════════════════════════════
612
+ /**
613
+ * Get frame synced to external clock (e.g. AudioContext.currentTime).
730
614
  *
731
- * Resets all state and prepares for incoming audio chunks.
732
- * Audio will be scheduled immediately as chunks arrive (no buffering).
733
- */
734
- start() {
735
- this.stopMonitoring();
736
- this.scheduler.reset();
737
- this.coalescer.reset();
738
- this.lamPipeline.reset();
739
- this.playbackStarted = false;
740
- this.scheduler.warmup();
741
- this.startFrameLoop();
742
- this.startMonitoring();
743
- }
744
- /**
745
- * Receive audio chunk from network
746
- *
747
- * Audio-first design: schedules audio immediately, LAM runs in background.
748
- * This prevents LAM inference (50-300ms) from blocking audio scheduling,
749
- * which caused audible stuttering with continuous audio streams.
615
+ * Discards frames that are too old, returns the current frame,
616
+ * or holds last frame as fallback to prevent avatar freezing.
750
617
  *
751
- * @param chunk - Uint8Array containing Int16 PCM audio
752
- */
753
- async onAudioChunk(chunk) {
754
- const combined = this.coalescer.add(chunk);
755
- if (!combined) {
756
- return;
618
+ * @param currentTime - Current playback time (seconds)
619
+ * @returns Blendshape frame, or null if no frames yet
620
+ */
621
+ getFrameForTime(currentTime) {
622
+ this.getFrameCallCount++;
623
+ const discardWindow = this.backend.backend === "wasm" ? 1 : 0.5;
624
+ let discardCount = 0;
625
+ while (this.timestampedQueue.length > 0 && this.timestampedQueue[0].timestamp < currentTime - discardWindow) {
626
+ this.timestampedQueue.shift();
627
+ discardCount++;
628
+ }
629
+ if (discardCount > 0) {
630
+ logger.warn("getFrameForTime DISCARDED stale frames", {
631
+ discardCount,
632
+ currentTime: currentTime.toFixed(3),
633
+ discardWindow,
634
+ remainingFrames: this.timestampedQueue.length,
635
+ nextFrameTs: this.timestampedQueue.length > 0 ? this.timestampedQueue[0].timestamp.toFixed(3) : "none"
636
+ });
757
637
  }
758
- const float32 = pcm16ToFloat32(combined);
759
- const scheduleTime = await this.scheduler.schedule(float32);
760
- if (!this.playbackStarted) {
761
- this.playbackStarted = true;
762
- this.emit("playback_start", scheduleTime);
638
+ if (this.timestampedQueue.length > 0 && this.timestampedQueue[0].timestamp <= currentTime) {
639
+ const { frame } = this.timestampedQueue.shift();
640
+ this.lastPulledFrame = frame;
641
+ return frame;
763
642
  }
764
- this.lamPipeline.push(float32, scheduleTime, this.options.lam).catch((err) => {
765
- this.emit("error", err);
766
- });
767
- }
768
- /**
769
- * End of audio stream
770
- *
771
- * Flushes any remaining buffered data.
772
- */
773
- async end() {
774
- const remaining = this.coalescer.flush();
775
- if (remaining) {
776
- const chunk = new Uint8Array(remaining);
777
- await this.onAudioChunk(chunk);
643
+ if (this.timestampedQueue.length > 0 && this.getFrameCallCount % 60 === 0) {
644
+ logger.warn("getFrameForTime: frames in queue but NOT consumable", {
645
+ queueLen: this.timestampedQueue.length,
646
+ frontTimestamp: this.timestampedQueue[0].timestamp.toFixed(4),
647
+ currentTime: currentTime.toFixed(4),
648
+ delta: (this.timestampedQueue[0].timestamp - currentTime).toFixed(4),
649
+ callCount: this.getFrameCallCount
650
+ });
778
651
  }
779
- await this.lamPipeline.flush(this.options.lam);
652
+ return this.lastPulledFrame;
780
653
  }
781
- /**
782
- * Stop playback immediately with smooth fade-out
783
- *
784
- * Gracefully cancels all audio playback and LAM processing:
785
- * - Fades out audio over specified duration (default: 50ms)
786
- * - Cancels pending LAM inferences
787
- * - Clears all buffers and queues
788
- * - Emits 'playback_complete' event
789
- *
790
- * Use this for interruptions (e.g., user barge-in during AI speech).
791
- *
792
- * @param fadeOutMs - Fade-out duration in milliseconds (default: 50ms)
793
- * @returns Promise that resolves when fade-out completes
794
- */
795
- async stop(fadeOutMs = 50) {
796
- this.stopMonitoring();
797
- await this.scheduler.cancelAll(fadeOutMs);
798
- this.coalescer.reset();
799
- this.lamPipeline.reset();
800
- this.playbackStarted = false;
801
- this.emit("playback_complete", void 0);
654
+ // ═══════════════════════════════════════════════════════════════════════
655
+ // Frame Output Push Mode (live mic, game loop)
656
+ // ═══════════════════════════════════════════════════════════════════════
657
+ /** Latest frame from drip-feed (live mic, game loop) */
658
+ get latestFrame() {
659
+ return this._latestFrame;
802
660
  }
803
- /**
804
- * Start frame animation loop
805
- *
806
- * Uses requestAnimationFrame to check for new LAM frames.
807
- * Synchronized to AudioContext clock (not visual refresh rate).
808
- *
809
- * Frame Emission Strategy:
810
- * - LAMPipeline uses last-frame-hold to prevent null returns
811
- * - Always emit frames (even repeated frames) to maintain smooth animation
812
- * - Renderer is responsible for detecting duplicate frames if needed
813
- */
814
- startFrameLoop() {
815
- const updateFrame = () => {
816
- const currentTime = this.scheduler.getCurrentTime();
817
- const frame = this.lamPipeline.getFrameForTime(currentTime, this.options.lam);
661
+ /** Start 30fps drip-feed timer (push mode) */
662
+ startDrip() {
663
+ if (this.dripInterval) return;
664
+ this.dripInterval = setInterval(() => {
665
+ const frame = this.plainQueue.shift();
818
666
  if (frame) {
819
- this.emit("frame_ready", frame);
667
+ this._latestFrame = frame;
668
+ this.onFrame?.(frame);
820
669
  }
821
- this.frameAnimationId = requestAnimationFrame(updateFrame);
822
- };
823
- this.frameAnimationId = requestAnimationFrame(updateFrame);
670
+ }, DRIP_INTERVAL_MS);
824
671
  }
825
- /**
826
- * Start monitoring for playback completion
827
- */
828
- startMonitoring() {
829
- if (this.monitorInterval) {
830
- clearInterval(this.monitorInterval);
672
+ /** Stop drip-feed timer */
673
+ stopDrip() {
674
+ if (this.dripInterval) {
675
+ clearInterval(this.dripInterval);
676
+ this.dripInterval = null;
831
677
  }
832
- this.monitorInterval = window.setInterval(() => {
833
- if (this.scheduler.isComplete() && this.lamPipeline.queuedFrameCount === 0) {
834
- this.emit("playback_complete", void 0);
835
- this.stopMonitoring();
836
- }
837
- }, 100);
838
678
  }
839
- /**
840
- * Stop monitoring
841
- */
842
- stopMonitoring() {
843
- if (this.monitorInterval) {
844
- clearInterval(this.monitorInterval);
845
- this.monitorInterval = null;
846
- }
847
- if (this.frameAnimationId) {
848
- cancelAnimationFrame(this.frameAnimationId);
849
- this.frameAnimationId = null;
850
- }
679
+ // ═══════════════════════════════════════════════════════════════════════
680
+ // State
681
+ // ═══════════════════════════════════════════════════════════════════════
682
+ /** Number of frames waiting in queue (both modes combined) */
683
+ get queuedFrameCount() {
684
+ return this.timestampedQueue.length + this.plainQueue.length;
851
685
  }
852
- /**
853
- * Get current pipeline state (for debugging/monitoring)
854
- */
855
- getState() {
856
- return {
857
- playbackStarted: this.playbackStarted,
858
- coalescerFill: this.coalescer.fillLevel,
859
- lamFill: this.lamPipeline.fillLevel,
860
- queuedFrames: this.lamPipeline.queuedFrameCount,
861
- currentTime: this.scheduler.getCurrentTime(),
862
- playbackEndTime: this.scheduler.getPlaybackEndTime()
863
- };
686
+ /** Buffer fill level as fraction of chunkSize (0-1) */
687
+ get fillLevel() {
688
+ return Math.min(1, this.writeOffset / this.chunkSize);
864
689
  }
865
- /**
866
- * Cleanup resources
867
- */
690
+ /** Dispose resources */
868
691
  dispose() {
869
- this.stopMonitoring();
870
- this.scheduler.dispose();
871
- this.coalescer.reset();
872
- this.lamPipeline.reset();
873
- }
874
- };
875
-
876
- // src/animation/EmotionToBlendshapeMapper.ts
877
- var UPPER_FACE_BLENDSHAPES = [
878
- // Brows (5)
879
- "browDownLeft",
880
- "browDownRight",
881
- "browInnerUp",
882
- "browOuterUpLeft",
883
- "browOuterUpRight",
884
- // Eyes (4)
885
- "eyeSquintLeft",
886
- "eyeSquintRight",
887
- "eyeWideLeft",
888
- "eyeWideRight",
889
- // Cheeks (2)
890
- "cheekSquintLeft",
891
- "cheekSquintRight"
892
- ];
893
- var EMOTION_ARKIT_MAP = {
894
- happy: {
895
- // AU6 - Cheek raiser (primary Duchenne smile marker)
896
- cheekSquintLeft: 0.5,
897
- cheekSquintRight: 0.5,
898
- // Slight eye squint from genuine smile (orbicularis oculi activation)
899
- eyeSquintLeft: 0.2,
900
- eyeSquintRight: 0.2
901
- },
902
- angry: {
903
- // AU4 - Brow lowerer (intense, primary anger marker)
904
- browDownLeft: 0.7,
905
- browDownRight: 0.7,
906
- // AU5 - Upper lid raiser (wide eyes, part of the "glare")
907
- eyeWideLeft: 0.4,
908
- eyeWideRight: 0.4,
909
- // AU7 - Lid tightener (tense stare, combines with AU5 for angry glare)
910
- eyeSquintLeft: 0.3,
911
- eyeSquintRight: 0.3
912
- },
913
- sad: {
914
- // AU1 - Inner brow raiser (primary sadness marker)
915
- browInnerUp: 0.6,
916
- // AU4 - Brow lowerer (brows drawn together)
917
- browDownLeft: 0.3,
918
- browDownRight: 0.3
919
- },
920
- neutral: {}
921
- // All zeros - no expression overlay
922
- };
923
- var DEFAULT_CONFIG = {
924
- smoothingFactor: 0.15,
925
- confidenceThreshold: 0.3,
926
- intensity: 1,
927
- blendMode: "dominant",
928
- minBlendProbability: 0.1,
929
- energyModulation: false,
930
- minEnergyScale: 0.3,
931
- maxEnergyScale: 1
932
- };
933
- function createZeroBlendshapes() {
934
- const result = {};
935
- for (const name of UPPER_FACE_BLENDSHAPES) {
936
- result[name] = 0;
937
- }
938
- return result;
939
- }
940
- function clamp01(value) {
941
- return Math.max(0, Math.min(1, value));
942
- }
943
- var EmotionToBlendshapeMapper = class {
944
- /**
945
- * Create a new EmotionToBlendshapeMapper
946
- *
947
- * @param config - Optional configuration
948
- */
949
- constructor(config) {
950
- this.currentEnergy = 1;
951
- this.config = {
952
- ...DEFAULT_CONFIG,
953
- ...config
954
- };
955
- this.targetBlendshapes = createZeroBlendshapes();
956
- this.currentBlendshapes = createZeroBlendshapes();
957
- }
958
- /**
959
- * Map an emotion frame to target blendshapes
960
- *
961
- * This sets the target values that the mapper will smoothly interpolate
962
- * towards. Call update() each frame to apply smoothing.
963
- *
964
- * @param frame - Emotion frame from Emotion2VecInference
965
- * @param audioEnergy - Optional audio energy (0-1) for energy modulation
966
- * @returns Target upper face blendshapes (before smoothing)
967
- */
968
- mapFrame(frame, audioEnergy) {
969
- this.targetBlendshapes = createZeroBlendshapes();
970
- if (audioEnergy !== void 0) {
971
- this.currentEnergy = clamp01(audioEnergy);
972
- }
973
- if (!frame) {
974
- return { ...this.targetBlendshapes };
975
- }
976
- if (this.config.blendMode === "weighted") {
977
- this.mapFrameWeighted(frame);
978
- } else {
979
- this.mapFrameDominant(frame);
980
- }
981
- if (this.config.energyModulation) {
982
- this.applyEnergyModulation();
983
- }
984
- return { ...this.targetBlendshapes };
692
+ if (this.disposed) return;
693
+ this.disposed = true;
694
+ this.stopDrip();
695
+ this.reset();
985
696
  }
697
+ // ═══════════════════════════════════════════════════════════════════════
698
+ // Private
699
+ // ═══════════════════════════════════════════════════════════════════════
986
700
  /**
987
- * Map using dominant emotion only (original behavior)
701
+ * Process pending chunks sequentially.
702
+ * Fire-and-forget — called from pushAudio() without awaiting.
988
703
  */
989
- mapFrameDominant(frame) {
990
- if (frame.confidence < this.config.confidenceThreshold) {
991
- return;
992
- }
993
- const emotion = frame.emotion;
994
- const mapping = EMOTION_ARKIT_MAP[emotion];
995
- if (!mapping) {
996
- return;
997
- }
998
- const scale = this.config.intensity * frame.confidence;
999
- for (const [name, value] of Object.entries(mapping)) {
1000
- const blendshapeName = name;
1001
- if (value !== void 0) {
1002
- this.targetBlendshapes[blendshapeName] = clamp01(value * scale);
704
+ drainPendingChunks() {
705
+ if (this.inferenceRunning || this.pendingChunks.length === 0) {
706
+ if (this.inferenceRunning && this.pendingChunks.length > 0) {
707
+ logger.debug("drainPendingChunks skipped (inference running)", {
708
+ pendingChunks: this.pendingChunks.length
709
+ });
1003
710
  }
1004
- }
1005
- }
1006
- /**
1007
- * Map using weighted blend of all emotions by probability
1008
- * Creates more nuanced expressions (e.g., bittersweet = happy + sad)
1009
- */
1010
- mapFrameWeighted(frame) {
1011
- if (!frame.probabilities) {
1012
- this.mapFrameDominant(frame);
1013
711
  return;
1014
712
  }
1015
- for (const [emotion, probability] of Object.entries(frame.probabilities)) {
1016
- if (probability < this.config.minBlendProbability) {
1017
- continue;
1018
- }
1019
- const mapping = EMOTION_ARKIT_MAP[emotion];
1020
- if (!mapping) {
1021
- continue;
1022
- }
1023
- const scale = this.config.intensity * probability;
1024
- for (const [name, value] of Object.entries(mapping)) {
1025
- const blendshapeName = name;
1026
- if (value !== void 0) {
1027
- this.targetBlendshapes[blendshapeName] += value * scale;
713
+ this.inferenceRunning = true;
714
+ logger.info("drainPendingChunks starting", { pendingChunks: this.pendingChunks.length });
715
+ const processNext = async () => {
716
+ while (this.pendingChunks.length > 0 && !this.disposed) {
717
+ const { chunk, timestamp } = this.pendingChunks.shift();
718
+ try {
719
+ const t0 = performance.now();
720
+ const result = await this.backend.infer(chunk);
721
+ const inferMs = Math.round(performance.now() - t0);
722
+ const actualDuration = chunk.length / this.sampleRate;
723
+ const actualFrameCount = Math.ceil(actualDuration * FRAME_RATE);
724
+ const framesToQueue = Math.min(actualFrameCount, result.blendshapes.length);
725
+ logger.info("Inference complete", {
726
+ inferMs,
727
+ modelFrames: result.blendshapes.length,
728
+ framesToQueue,
729
+ timestamp,
730
+ totalQueued: this.timestampedQueue.length + framesToQueue,
731
+ remainingPending: this.pendingChunks.length
732
+ });
733
+ for (let i = 0; i < framesToQueue; i++) {
734
+ if (timestamp !== void 0) {
735
+ this.timestampedQueue.push({
736
+ frame: result.blendshapes[i],
737
+ timestamp: timestamp + i / FRAME_RATE
738
+ });
739
+ } else {
740
+ this.plainQueue.push(result.blendshapes[i]);
741
+ }
742
+ }
743
+ } catch (err) {
744
+ this.handleError(err);
745
+ }
746
+ if (this.pendingChunks.length > 0) {
747
+ await new Promise((r) => setTimeout(r, 0));
1028
748
  }
1029
749
  }
1030
- }
1031
- for (const name of UPPER_FACE_BLENDSHAPES) {
1032
- this.targetBlendshapes[name] = clamp01(this.targetBlendshapes[name]);
1033
- }
1034
- }
1035
- /**
1036
- * Apply energy modulation to scale emotion intensity by audio energy
1037
- * Louder speech = stronger expressions
1038
- */
1039
- applyEnergyModulation() {
1040
- const { minEnergyScale, maxEnergyScale } = this.config;
1041
- const energyScale = minEnergyScale + this.currentEnergy * (maxEnergyScale - minEnergyScale);
1042
- for (const name of UPPER_FACE_BLENDSHAPES) {
1043
- this.targetBlendshapes[name] = clamp01(this.targetBlendshapes[name] * energyScale);
1044
- }
1045
- }
1046
- /**
1047
- * Apply smoothing to interpolate current values towards target
1048
- *
1049
- * Uses exponential moving average:
1050
- * current = current + smoothingFactor * (target - current)
1051
- *
1052
- * @param _deltaMs - Delta time in milliseconds (reserved for future time-based smoothing)
1053
- */
1054
- update(_deltaMs) {
1055
- const factor = this.config.smoothingFactor;
1056
- for (const name of UPPER_FACE_BLENDSHAPES) {
1057
- const target = this.targetBlendshapes[name];
1058
- const current = this.currentBlendshapes[name];
1059
- this.currentBlendshapes[name] = clamp01(current + factor * (target - current));
1060
- }
1061
- }
1062
- /**
1063
- * Get current smoothed blendshape values
1064
- *
1065
- * @returns Current upper face blendshapes (after smoothing)
1066
- */
1067
- getCurrentBlendshapes() {
1068
- return { ...this.currentBlendshapes };
1069
- }
1070
- /**
1071
- * Reset mapper to neutral state
1072
- *
1073
- * Sets both target and current blendshapes to zero.
1074
- */
1075
- reset() {
1076
- this.targetBlendshapes = createZeroBlendshapes();
1077
- this.currentBlendshapes = createZeroBlendshapes();
1078
- this.currentEnergy = 1;
1079
- }
1080
- /**
1081
- * Get current configuration
1082
- */
1083
- getConfig() {
1084
- return { ...this.config };
1085
- }
1086
- /**
1087
- * Update configuration
1088
- *
1089
- * @param config - Partial configuration to update
1090
- */
1091
- setConfig(config) {
1092
- this.config = {
1093
- ...this.config,
1094
- ...config
750
+ this.inferenceRunning = false;
751
+ if (this.pendingChunks.length > 0) {
752
+ this.drainPendingChunks();
753
+ }
1095
754
  };
755
+ processNext().catch((err) => this.handleError(err));
756
+ }
757
+ handleError(err) {
758
+ const error = err instanceof Error ? err : new Error(String(err));
759
+ logger.warn("A2EProcessor inference error", { error: error.message });
760
+ this.onError?.(error);
1096
761
  }
1097
762
  };
1098
763
 
1099
- // src/animation/audioEnergy.ts
1100
- function calculateRMS(samples) {
1101
- if (samples.length === 0) return 0;
1102
- let sumSquares = 0;
1103
- for (let i = 0; i < samples.length; i++) {
1104
- sumSquares += samples[i] * samples[i];
764
+ // src/inference/BlendshapeSmoother.ts
765
+ var NUM_BLENDSHAPES = 52;
766
+ var BlendshapeSmoother = class {
767
+ constructor(config) {
768
+ /** Whether any target has been set */
769
+ this._hasTarget = false;
770
+ this.halflife = config?.halflife ?? 0.06;
771
+ this.values = new Float32Array(NUM_BLENDSHAPES);
772
+ this.velocities = new Float32Array(NUM_BLENDSHAPES);
773
+ this.targets = new Float32Array(NUM_BLENDSHAPES);
1105
774
  }
1106
- return Math.sqrt(sumSquares / samples.length);
1107
- }
1108
- function calculatePeak(samples) {
1109
- let peak = 0;
1110
- for (let i = 0; i < samples.length; i++) {
1111
- const abs = Math.abs(samples[i]);
1112
- if (abs > peak) peak = abs;
775
+ /** Whether a target frame has been set (false until first setTarget call) */
776
+ get hasTarget() {
777
+ return this._hasTarget;
1113
778
  }
1114
- return peak;
1115
- }
1116
- var AudioEnergyAnalyzer = class {
1117
779
  /**
1118
- * @param smoothingFactor How much to smooth (0 = no smoothing, 1 = infinite smoothing). Default 0.85
1119
- * @param noiseFloor Minimum energy threshold to consider as signal. Default 0.01
780
+ * Set new target frame from inference output.
781
+ * Springs will converge toward these values on subsequent update() calls.
1120
782
  */
1121
- constructor(smoothingFactor = 0.85, noiseFloor = 0.01) {
1122
- this.smoothedRMS = 0;
1123
- this.smoothedPeak = 0;
1124
- this.smoothingFactor = Math.max(0, Math.min(0.99, smoothingFactor));
1125
- this.noiseFloor = noiseFloor;
783
+ setTarget(frame) {
784
+ this.targets.set(frame);
785
+ this._hasTarget = true;
1126
786
  }
1127
787
  /**
1128
- * Process audio samples and return smoothed energy values
1129
- * @param samples Audio samples (Float32Array)
1130
- * @returns Object with rms and peak values
788
+ * Advance all 52 springs by `dt` seconds and return the smoothed frame.
789
+ *
790
+ * Call this every render frame (e.g., inside requestAnimationFrame).
791
+ * Returns the internal values buffer — do NOT mutate the returned array.
792
+ *
793
+ * @param dt - Time step in seconds (e.g., 1/60 for 60fps)
794
+ * @returns Smoothed blendshape values (Float32Array of 52)
1131
795
  */
1132
- process(samples) {
1133
- const instantRMS = calculateRMS(samples);
1134
- const instantPeak = calculatePeak(samples);
1135
- const gatedRMS = instantRMS > this.noiseFloor ? instantRMS : 0;
1136
- const gatedPeak = instantPeak > this.noiseFloor ? instantPeak : 0;
1137
- if (gatedRMS > this.smoothedRMS) {
1138
- this.smoothedRMS = this.smoothedRMS * 0.5 + gatedRMS * 0.5;
1139
- } else {
1140
- this.smoothedRMS = this.smoothedRMS * this.smoothingFactor + gatedRMS * (1 - this.smoothingFactor);
796
+ update(dt) {
797
+ if (!this._hasTarget) {
798
+ return this.values;
1141
799
  }
1142
- if (gatedPeak > this.smoothedPeak) {
1143
- this.smoothedPeak = this.smoothedPeak * 0.3 + gatedPeak * 0.7;
1144
- } else {
1145
- this.smoothedPeak = this.smoothedPeak * this.smoothingFactor + gatedPeak * (1 - this.smoothingFactor);
800
+ if (this.halflife <= 0) {
801
+ this.values.set(this.targets);
802
+ this.velocities.fill(0);
803
+ return this.values;
1146
804
  }
1147
- const energy = this.smoothedRMS * 0.7 + this.smoothedPeak * 0.3;
1148
- return {
1149
- rms: this.smoothedRMS,
1150
- peak: this.smoothedPeak,
1151
- energy: Math.min(1, energy * 2)
1152
- // Scale up and clamp
1153
- };
1154
- }
1155
- /**
1156
- * Reset analyzer state
1157
- */
1158
- reset() {
1159
- this.smoothedRMS = 0;
1160
- this.smoothedPeak = 0;
1161
- }
1162
- /**
1163
- * Get current smoothed RMS value
1164
- */
1165
- get rms() {
1166
- return this.smoothedRMS;
1167
- }
1168
- /**
1169
- * Get current smoothed peak value
1170
- */
1171
- get peak() {
1172
- return this.smoothedPeak;
1173
- }
1174
- };
1175
- var EmphasisDetector = class {
1176
- /**
1177
- * @param historySize Number of frames to track. Default 10
1178
- * @param emphasisThreshold Minimum energy increase to count as emphasis. Default 0.15
1179
- */
1180
- constructor(historySize = 10, emphasisThreshold = 0.15) {
1181
- this.energyHistory = [];
1182
- this.historySize = historySize;
1183
- this.emphasisThreshold = emphasisThreshold;
805
+ const damping = Math.LN2 / this.halflife;
806
+ const eydt = Math.exp(-damping * dt);
807
+ for (let i = 0; i < NUM_BLENDSHAPES; i++) {
808
+ const j0 = this.values[i] - this.targets[i];
809
+ const j1 = this.velocities[i] + j0 * damping;
810
+ this.values[i] = eydt * (j0 + j1 * dt) + this.targets[i];
811
+ this.velocities[i] = eydt * (this.velocities[i] - j1 * damping * dt);
812
+ this.values[i] = Math.max(0, Math.min(1, this.values[i]));
813
+ }
814
+ return this.values;
1184
815
  }
1185
816
  /**
1186
- * Process energy value and detect emphasis
1187
- * @param energy Current energy value (0-1)
1188
- * @returns Object with isEmphasis flag and emphasisStrength
817
+ * Decay all spring targets to neutral (0).
818
+ *
819
+ * Call when inference stalls (no new frames for threshold duration).
820
+ * The springs will smoothly close the mouth / relax the face over
821
+ * the halflife period rather than freezing.
1189
822
  */
1190
- process(energy) {
1191
- this.energyHistory.push(energy);
1192
- if (this.energyHistory.length > this.historySize) {
1193
- this.energyHistory.shift();
1194
- }
1195
- if (this.energyHistory.length < 3) {
1196
- return { isEmphasis: false, emphasisStrength: 0 };
1197
- }
1198
- const prevFrames = this.energyHistory.slice(0, -1);
1199
- const avgPrev = prevFrames.reduce((a, b) => a + b, 0) / prevFrames.length;
1200
- const increase = energy - avgPrev;
1201
- const isEmphasis = increase > this.emphasisThreshold;
1202
- return {
1203
- isEmphasis,
1204
- emphasisStrength: isEmphasis ? Math.min(1, increase / 0.3) : 0
1205
- };
823
+ decayToNeutral() {
824
+ this.targets.fill(0);
1206
825
  }
1207
826
  /**
1208
- * Reset detector state
827
+ * Reset all state (values, velocities, targets).
828
+ * Call when starting a new playback session.
1209
829
  */
1210
830
  reset() {
1211
- this.energyHistory = [];
831
+ this.values.fill(0);
832
+ this.velocities.fill(0);
833
+ this.targets.fill(0);
834
+ this._hasTarget = false;
1212
835
  }
1213
836
  };
1214
837
 
@@ -2461,7 +2084,7 @@ function isSafari() {
2461
2084
  const ua = navigator.userAgent.toLowerCase();
2462
2085
  return /safari/.test(ua) && !/chrome|crios|fxios|chromium|edg/.test(ua);
2463
2086
  }
2464
- function shouldUseCpuLipSync() {
2087
+ function shouldUseCpuA2E() {
2465
2088
  return isSafari() || isIOS();
2466
2089
  }
2467
2090
  function isSpeechRecognitionAvailable() {
@@ -2471,22 +2094,22 @@ function isSpeechRecognitionAvailable() {
2471
2094
  function shouldUseNativeASR() {
2472
2095
  return (isIOS() || isSafari()) && isSpeechRecognitionAvailable();
2473
2096
  }
2474
- function shouldUseServerLipSync() {
2097
+ function shouldUseServerA2E() {
2475
2098
  return isIOS();
2476
2099
  }
2477
2100
 
2478
2101
  // src/inference/onnxLoader.ts
2479
- var logger = createLogger("OnnxLoader");
2102
+ var logger2 = createLogger("OnnxLoader");
2480
2103
  var ortInstance = null;
2481
2104
  var loadedBackend = null;
2482
2105
  var WASM_CDN_PATH = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
2483
2106
  async function isWebGPUAvailable() {
2484
2107
  if (isIOS()) {
2485
- logger.debug("WebGPU check: disabled on iOS (asyncify bundle crashes WebKit)");
2108
+ logger2.debug("WebGPU check: disabled on iOS (asyncify bundle crashes WebKit)");
2486
2109
  return false;
2487
2110
  }
2488
2111
  if (!hasWebGPUApi()) {
2489
- logger.debug("WebGPU check: navigator.gpu not available", {
2112
+ logger2.debug("WebGPU check: navigator.gpu not available", {
2490
2113
  isSecureContext: typeof window !== "undefined" ? window.isSecureContext : "N/A"
2491
2114
  });
2492
2115
  return false;
@@ -2494,19 +2117,19 @@ async function isWebGPUAvailable() {
2494
2117
  try {
2495
2118
  const adapter = await navigator.gpu.requestAdapter();
2496
2119
  if (!adapter) {
2497
- logger.debug("WebGPU check: No adapter available");
2120
+ logger2.debug("WebGPU check: No adapter available");
2498
2121
  return false;
2499
2122
  }
2500
2123
  const device = await adapter.requestDevice();
2501
2124
  if (!device) {
2502
- logger.debug("WebGPU check: Could not create device");
2125
+ logger2.debug("WebGPU check: Could not create device");
2503
2126
  return false;
2504
2127
  }
2505
2128
  device.destroy();
2506
- logger.debug("WebGPU check: Available and working");
2129
+ logger2.debug("WebGPU check: Available and working");
2507
2130
  return true;
2508
2131
  } catch (err) {
2509
- logger.debug("WebGPU check: Error during availability check", { error: err });
2132
+ logger2.debug("WebGPU check: Error during availability check", { error: err });
2510
2133
  return false;
2511
2134
  }
2512
2135
  }
@@ -2516,11 +2139,11 @@ function applyIOSWasmMemoryPatch() {
2516
2139
  iosWasmPatched = true;
2517
2140
  const OrigMemory = WebAssembly.Memory;
2518
2141
  const MAX_IOS_PAGES = 32768;
2519
- logger.info("Applying iOS WASM memory patch (max\u21922GB, shared preserved)");
2142
+ logger2.info("Applying iOS WASM memory patch (max\u21922GB, shared preserved)");
2520
2143
  WebAssembly.Memory = function IOSPatchedMemory(descriptor) {
2521
2144
  const patched = { ...descriptor };
2522
2145
  if (patched.maximum !== void 0 && patched.maximum > MAX_IOS_PAGES) {
2523
- logger.info("iOS memory patch: capping maximum", {
2146
+ logger2.info("iOS memory patch: capping maximum", {
2524
2147
  original: patched.maximum,
2525
2148
  capped: MAX_IOS_PAGES,
2526
2149
  shared: patched.shared,
@@ -2539,7 +2162,7 @@ function configureWasm(ort) {
2539
2162
  ort.env.wasm.numThreads = numThreads;
2540
2163
  ort.env.wasm.simd = true;
2541
2164
  ort.env.wasm.proxy = enableProxy;
2542
- logger.info("WASM configured", {
2165
+ logger2.info("WASM configured", {
2543
2166
  numThreads,
2544
2167
  simd: true,
2545
2168
  proxy: enableProxy,
@@ -2551,12 +2174,12 @@ async function getOnnxRuntime(backend) {
2551
2174
  return ortInstance;
2552
2175
  }
2553
2176
  if (ortInstance && loadedBackend !== backend) {
2554
- logger.warn(
2177
+ logger2.warn(
2555
2178
  `ONNX Runtime already loaded with ${loadedBackend} backend. Cannot switch to ${backend}. Returning existing instance.`
2556
2179
  );
2557
2180
  return ortInstance;
2558
2181
  }
2559
- logger.info(`Loading ONNX Runtime with ${backend} backend...`);
2182
+ logger2.info(`Loading ONNX Runtime with ${backend} backend...`);
2560
2183
  applyIOSWasmMemoryPatch();
2561
2184
  try {
2562
2185
  if (backend === "wasm" && (isIOS() || isSafari())) {
@@ -2571,10 +2194,10 @@ async function getOnnxRuntime(backend) {
2571
2194
  }
2572
2195
  loadedBackend = backend;
2573
2196
  configureWasm(ortInstance);
2574
- logger.info(`ONNX Runtime loaded successfully`, { backend });
2197
+ logger2.info(`ONNX Runtime loaded successfully`, { backend });
2575
2198
  return ortInstance;
2576
2199
  } catch (err) {
2577
- logger.error(`Failed to load ONNX Runtime with ${backend} backend`, {
2200
+ logger2.error(`Failed to load ONNX Runtime with ${backend} backend`, {
2578
2201
  error: err
2579
2202
  });
2580
2203
  throw new Error(
@@ -2585,7 +2208,7 @@ async function getOnnxRuntime(backend) {
2585
2208
  async function getOnnxRuntimeForPreference(preference = "auto") {
2586
2209
  const webgpuAvailable = await isWebGPUAvailable();
2587
2210
  const backend = resolveBackend(preference, webgpuAvailable);
2588
- logger.info("Resolved backend preference", {
2211
+ logger2.info("Resolved backend preference", {
2589
2212
  preference,
2590
2213
  webgpuAvailable,
2591
2214
  resolvedBackend: backend
@@ -2619,42 +2242,6 @@ function getSessionOptions(backend) {
2619
2242
  graphOptimizationLevel: "all"
2620
2243
  };
2621
2244
  }
2622
- async function createSessionWithFallback(modelBuffer, preferredBackend) {
2623
- const ort = await getOnnxRuntime(preferredBackend);
2624
- const modelData = new Uint8Array(modelBuffer);
2625
- if (preferredBackend === "webgpu") {
2626
- try {
2627
- const options2 = getSessionOptions("webgpu");
2628
- const session2 = await ort.InferenceSession.create(modelData, options2);
2629
- logger.info("Session created with WebGPU backend");
2630
- return { session: session2, backend: "webgpu" };
2631
- } catch (err) {
2632
- logger.warn("WebGPU session creation failed, falling back to WASM", {
2633
- error: err instanceof Error ? err.message : String(err)
2634
- });
2635
- }
2636
- }
2637
- const options = getSessionOptions("wasm");
2638
- const session = await ort.InferenceSession.create(modelData, options);
2639
- logger.info("Session created with WASM backend");
2640
- return { session, backend: "wasm" };
2641
- }
2642
- function getLoadedBackend() {
2643
- return loadedBackend;
2644
- }
2645
- function isOnnxRuntimeLoaded() {
2646
- return ortInstance !== null;
2647
- }
2648
- async function preloadOnnxRuntime(preference = "auto") {
2649
- if (ortInstance) {
2650
- logger.info("ONNX Runtime already preloaded", { backend: loadedBackend });
2651
- return loadedBackend;
2652
- }
2653
- logger.info("Preloading ONNX Runtime...", { preference });
2654
- const { backend } = await getOnnxRuntimeForPreference(preference);
2655
- logger.info("ONNX Runtime preloaded", { backend });
2656
- return backend;
2657
- }
2658
2245
 
2659
2246
  // src/inference/blendshapeUtils.ts
2660
2247
  var LAM_BLENDSHAPES = [
@@ -2804,16 +2391,19 @@ var WAV2ARKIT_BLENDSHAPES = [
2804
2391
  var REMAP_WAV2ARKIT_TO_LAM = WAV2ARKIT_BLENDSHAPES.map(
2805
2392
  (name) => LAM_BLENDSHAPES.indexOf(name)
2806
2393
  );
2807
- function remapWav2ArkitToLam(frame) {
2808
- const result = new Float32Array(52);
2809
- for (let i = 0; i < 52; i++) {
2810
- result[REMAP_WAV2ARKIT_TO_LAM[i]] = frame[i];
2394
+ function lerpBlendshapes(current, target, factor = 0.3) {
2395
+ const len = Math.max(current.length, target.length);
2396
+ const result = new Array(len);
2397
+ for (let i = 0; i < len; i++) {
2398
+ const c = current[i] ?? 0;
2399
+ const t = target[i] ?? 0;
2400
+ result[i] = c + (t - c) * factor;
2811
2401
  }
2812
2402
  return result;
2813
2403
  }
2814
2404
 
2815
2405
  // src/inference/Wav2Vec2Inference.ts
2816
- var logger2 = createLogger("Wav2Vec2");
2406
+ var logger3 = createLogger("Wav2Vec2");
2817
2407
  var CTC_VOCAB = [
2818
2408
  "<pad>",
2819
2409
  "<s>",
@@ -2863,6 +2453,7 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
2863
2453
  this.poisoned = false;
2864
2454
  this.config = config;
2865
2455
  this.numIdentityClasses = config.numIdentityClasses ?? 12;
2456
+ this.chunkSize = config.chunkSize ?? 16e3;
2866
2457
  }
2867
2458
  get backend() {
2868
2459
  return this.session ? this._backend : null;
@@ -2892,30 +2483,30 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
2892
2483
  "model.backend_requested": this.config.backend || "auto"
2893
2484
  });
2894
2485
  try {
2895
- logger2.info("Loading ONNX Runtime...", { preference: this.config.backend || "auto" });
2486
+ logger3.info("Loading ONNX Runtime...", { preference: this.config.backend || "auto" });
2896
2487
  const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend || "auto");
2897
2488
  this.ort = ort;
2898
2489
  this._backend = backend;
2899
- logger2.info("ONNX Runtime loaded", { backend: this._backend });
2490
+ logger3.info("ONNX Runtime loaded", { backend: this._backend });
2900
2491
  const modelUrl = this.config.modelUrl;
2901
2492
  const dataUrl = this.config.externalDataUrl !== false ? typeof this.config.externalDataUrl === "string" ? this.config.externalDataUrl : `${modelUrl}.data` : null;
2902
2493
  const sessionOptions = getSessionOptions(this._backend);
2903
2494
  let isCached = false;
2904
2495
  if (isIOS()) {
2905
- logger2.info("iOS: passing model URLs directly to ORT (low-memory path)", {
2496
+ logger3.info("iOS: passing model URLs directly to ORT (low-memory path)", {
2906
2497
  modelUrl,
2907
2498
  dataUrl
2908
2499
  });
2909
2500
  if (dataUrl) {
2910
2501
  const dataFilename = dataUrl.split("/").pop();
2911
- logger2.info("iOS: setting externalData", { dataFilename, dataUrl });
2502
+ logger3.info("iOS: setting externalData", { dataFilename, dataUrl });
2912
2503
  sessionOptions.externalData = [{
2913
2504
  path: dataFilename,
2914
2505
  data: dataUrl
2915
2506
  // URL string — ORT fetches directly into WASM
2916
2507
  }];
2917
2508
  }
2918
- logger2.info("iOS: calling InferenceSession.create() with URL string", {
2509
+ logger3.info("iOS: calling InferenceSession.create() with URL string", {
2919
2510
  modelUrl,
2920
2511
  sessionOptions: JSON.stringify(
2921
2512
  sessionOptions,
@@ -2925,14 +2516,14 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
2925
2516
  try {
2926
2517
  this.session = await this.ort.InferenceSession.create(modelUrl, sessionOptions);
2927
2518
  } catch (sessionErr) {
2928
- logger2.error("iOS: InferenceSession.create() failed", {
2519
+ logger3.error("iOS: InferenceSession.create() failed", {
2929
2520
  error: sessionErr instanceof Error ? sessionErr.message : String(sessionErr),
2930
2521
  errorType: sessionErr?.constructor?.name,
2931
2522
  stack: sessionErr instanceof Error ? sessionErr.stack : void 0
2932
2523
  });
2933
2524
  throw sessionErr;
2934
2525
  }
2935
- logger2.info("iOS: session created successfully", {
2526
+ logger3.info("iOS: session created successfully", {
2936
2527
  inputNames: this.session.inputNames,
2937
2528
  outputNames: this.session.outputNames
2938
2529
  });
@@ -2941,15 +2532,15 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
2941
2532
  isCached = await cache.has(modelUrl);
2942
2533
  let modelBuffer;
2943
2534
  if (isCached) {
2944
- logger2.debug("Loading model from cache", { modelUrl });
2535
+ logger3.debug("Loading model from cache", { modelUrl });
2945
2536
  modelBuffer = await cache.get(modelUrl);
2946
2537
  if (!modelBuffer) {
2947
- logger2.warn("Cache corruption detected, clearing and retrying", { modelUrl });
2538
+ logger3.warn("Cache corruption detected, clearing and retrying", { modelUrl });
2948
2539
  await cache.delete(modelUrl);
2949
2540
  modelBuffer = await fetchWithCache(modelUrl);
2950
2541
  }
2951
2542
  } else {
2952
- logger2.debug("Fetching and caching model", { modelUrl });
2543
+ logger3.debug("Fetching and caching model", { modelUrl });
2953
2544
  modelBuffer = await fetchWithCache(modelUrl);
2954
2545
  }
2955
2546
  if (!modelBuffer) {
@@ -2960,31 +2551,31 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
2960
2551
  try {
2961
2552
  const isDataCached = await cache.has(dataUrl);
2962
2553
  if (isDataCached) {
2963
- logger2.debug("Loading external data from cache", { dataUrl });
2554
+ logger3.debug("Loading external data from cache", { dataUrl });
2964
2555
  externalDataBuffer = await cache.get(dataUrl);
2965
2556
  if (!externalDataBuffer) {
2966
- logger2.warn("Cache corruption for external data, retrying", { dataUrl });
2557
+ logger3.warn("Cache corruption for external data, retrying", { dataUrl });
2967
2558
  await cache.delete(dataUrl);
2968
2559
  externalDataBuffer = await fetchWithCache(dataUrl);
2969
2560
  }
2970
2561
  } else {
2971
- logger2.info("Fetching external model data", {
2562
+ logger3.info("Fetching external model data", {
2972
2563
  dataUrl,
2973
2564
  note: "This may be a large download (383MB+)"
2974
2565
  });
2975
2566
  externalDataBuffer = await fetchWithCache(dataUrl);
2976
2567
  }
2977
- logger2.info("External data loaded", {
2568
+ logger3.info("External data loaded", {
2978
2569
  size: formatBytes(externalDataBuffer.byteLength)
2979
2570
  });
2980
2571
  } catch (err) {
2981
- logger2.debug("No external data file found (single-file model)", {
2572
+ logger3.debug("No external data file found (single-file model)", {
2982
2573
  dataUrl,
2983
2574
  error: err.message
2984
2575
  });
2985
2576
  }
2986
2577
  }
2987
- logger2.debug("Creating ONNX session", {
2578
+ logger3.debug("Creating ONNX session", {
2988
2579
  graphSize: formatBytes(modelBuffer.byteLength),
2989
2580
  externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
2990
2581
  backend: this._backend
@@ -2999,12 +2590,12 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
2999
2590
  const modelData = new Uint8Array(modelBuffer);
3000
2591
  this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
3001
2592
  }
3002
- logger2.info("ONNX session created successfully", {
2593
+ logger3.info("ONNX session created successfully", {
3003
2594
  executionProvider: this._backend,
3004
2595
  backend: this._backend
3005
2596
  });
3006
2597
  const loadTimeMs = performance.now() - startTime;
3007
- logger2.info("Model loaded successfully", {
2598
+ logger3.info("Model loaded successfully", {
3008
2599
  backend: this._backend,
3009
2600
  loadTimeMs: Math.round(loadTimeMs),
3010
2601
  inputs: this.session.inputNames,
@@ -3020,13 +2611,13 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
3020
2611
  model: "wav2vec2",
3021
2612
  backend: this._backend
3022
2613
  });
3023
- logger2.debug("Running warmup inference to initialize GPU context");
2614
+ logger3.debug("Running warmup inference to initialize GPU context");
3024
2615
  const warmupStart = performance.now();
3025
- const warmupAudio = new Float32Array(16e3);
2616
+ const warmupAudio = new Float32Array(this.chunkSize);
3026
2617
  const warmupIdentity = new Float32Array(this.numIdentityClasses);
3027
2618
  warmupIdentity[0] = 1;
3028
2619
  const warmupFeeds = {
3029
- "audio": new this.ort.Tensor("float32", warmupAudio, [1, 16e3]),
2620
+ "audio": new this.ort.Tensor("float32", warmupAudio, [1, this.chunkSize]),
3030
2621
  "identity": new this.ort.Tensor("float32", warmupIdentity, [1, this.numIdentityClasses])
3031
2622
  };
3032
2623
  const WARMUP_TIMEOUT_MS = 15e3;
@@ -3036,12 +2627,12 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
3036
2627
  ]);
3037
2628
  const warmupTimeMs = performance.now() - warmupStart;
3038
2629
  if (warmupResult === "timeout") {
3039
- logger2.warn("Warmup inference timed out \u2014 GPU may be unresponsive. Continuing without warmup.", {
2630
+ logger3.warn("Warmup inference timed out \u2014 GPU may be unresponsive. Continuing without warmup.", {
3040
2631
  timeoutMs: WARMUP_TIMEOUT_MS,
3041
2632
  backend: this._backend
3042
2633
  });
3043
2634
  } else {
3044
- logger2.info("Warmup inference complete", {
2635
+ logger3.info("Warmup inference complete", {
3045
2636
  warmupTimeMs: Math.round(warmupTimeMs),
3046
2637
  backend: this._backend
3047
2638
  });
@@ -3069,11 +2660,10 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
3069
2660
  }
3070
2661
  /**
3071
2662
  * Run inference on raw audio
3072
- * @param audioSamples - Float32Array of raw audio at 16kHz (16000 samples = 1 second)
2663
+ * @param audioSamples - Float32Array of raw audio at 16kHz
3073
2664
  * @param identityIndex - Optional identity index (0-11, default 0 = neutral)
3074
2665
  *
3075
- * Note: Model expects 1-second chunks (16000 samples) for optimal performance.
3076
- * Audio will be zero-padded or truncated to 16000 samples.
2666
+ * Audio will be zero-padded or truncated to chunkSize samples.
3077
2667
  */
3078
2668
  async infer(audioSamples, identityIndex = 0) {
3079
2669
  if (!this.session) {
@@ -3084,20 +2674,20 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
3084
2674
  }
3085
2675
  const audioSamplesCopy = new Float32Array(audioSamples);
3086
2676
  let audio;
3087
- if (audioSamplesCopy.length === 16e3) {
2677
+ if (audioSamplesCopy.length === this.chunkSize) {
3088
2678
  audio = audioSamplesCopy;
3089
- } else if (audioSamplesCopy.length < 16e3) {
3090
- audio = new Float32Array(16e3);
2679
+ } else if (audioSamplesCopy.length < this.chunkSize) {
2680
+ audio = new Float32Array(this.chunkSize);
3091
2681
  audio.set(audioSamplesCopy, 0);
3092
2682
  } else {
3093
- audio = audioSamplesCopy.slice(0, 16e3);
2683
+ audio = audioSamplesCopy.slice(0, this.chunkSize);
3094
2684
  }
3095
2685
  const identity = new Float32Array(this.numIdentityClasses);
3096
2686
  identity[Math.max(0, Math.min(identityIndex, this.numIdentityClasses - 1))] = 1;
3097
2687
  const audioCopy = new Float32Array(audio);
3098
2688
  const identityCopy = new Float32Array(identity);
3099
2689
  const feeds = {
3100
- "audio": new this.ort.Tensor("float32", audioCopy, [1, 16e3]),
2690
+ "audio": new this.ort.Tensor("float32", audioCopy, [1, this.chunkSize]),
3101
2691
  "identity": new this.ort.Tensor("float32", identityCopy, [1, this.numIdentityClasses])
3102
2692
  };
3103
2693
  return this.queueInference(feeds);
@@ -3133,7 +2723,7 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
3133
2723
  const telemetry = getTelemetry();
3134
2724
  const span = telemetry?.startSpan("Wav2Vec2.infer", {
3135
2725
  "inference.backend": this._backend,
3136
- "inference.input_samples": 16e3
2726
+ "inference.input_samples": this.chunkSize
3137
2727
  });
3138
2728
  try {
3139
2729
  const startTime = performance.now();
@@ -3172,7 +2762,7 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
3172
2762
  blendshapes.push(symmetrizeBlendshapes(rawFrame));
3173
2763
  }
3174
2764
  const text = this.decodeCTC(asrLogits);
3175
- logger2.trace("Inference completed", {
2765
+ logger3.trace("Inference completed", {
3176
2766
  inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
3177
2767
  numA2EFrames,
3178
2768
  numASRFrames,
@@ -3206,12 +2796,12 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
3206
2796
  const errMsg = err instanceof Error ? err.message : String(err);
3207
2797
  if (errMsg.includes("timed out")) {
3208
2798
  this.poisoned = true;
3209
- logger2.error("CRITICAL: Inference session timed out \u2014 LAM is dead. Page reload required.", {
2799
+ logger3.error("CRITICAL: Inference session timed out \u2014 LAM is dead. Page reload required.", {
3210
2800
  backend: this._backend,
3211
2801
  timeoutMs: _Wav2Vec2Inference.INFERENCE_TIMEOUT_MS
3212
2802
  });
3213
2803
  } else {
3214
- logger2.error("Inference failed", { error: errMsg, backend: this._backend });
2804
+ logger3.error("Inference failed", { error: errMsg, backend: this._backend });
3215
2805
  }
3216
2806
  span?.endWithError(err instanceof Error ? err : new Error(String(err)));
3217
2807
  telemetry?.incrementCounter("omote.inference.total", 1, {
@@ -3252,56 +2842,79 @@ _Wav2Vec2Inference.INFERENCE_TIMEOUT_MS = 5e3;
3252
2842
  _Wav2Vec2Inference.isWebGPUAvailable = isWebGPUAvailable;
3253
2843
  var Wav2Vec2Inference = _Wav2Vec2Inference;
3254
2844
 
2845
+ // src/audio/audioUtils.ts
2846
+ function pcm16ToFloat32(buffer) {
2847
+ const byteLen = buffer.byteLength & ~1;
2848
+ const int16 = byteLen === buffer.byteLength ? new Int16Array(buffer) : new Int16Array(buffer, 0, byteLen / 2);
2849
+ const float32 = new Float32Array(int16.length);
2850
+ for (let i = 0; i < int16.length; i++) {
2851
+ float32[i] = int16[i] / 32768;
2852
+ }
2853
+ return float32;
2854
+ }
2855
+ function int16ToFloat32(int16) {
2856
+ const float32 = new Float32Array(int16.length);
2857
+ for (let i = 0; i < int16.length; i++) {
2858
+ float32[i] = int16[i] / 32768;
2859
+ }
2860
+ return float32;
2861
+ }
2862
+
3255
2863
  // src/audio/FullFacePipeline.ts
3256
- var logger3 = createLogger("FullFacePipeline");
3257
- var BLENDSHAPE_INDEX_MAP = /* @__PURE__ */ new Map();
3258
- LAM_BLENDSHAPES.forEach((name, index) => {
3259
- BLENDSHAPE_INDEX_MAP.set(name, index);
3260
- });
3261
- var UPPER_FACE_SET = new Set(UPPER_FACE_BLENDSHAPES);
3262
- var EMOTION_LABEL_MAP = {
3263
- // Direct labels
3264
- happy: "happy",
3265
- sad: "sad",
3266
- angry: "angry",
3267
- neutral: "neutral",
3268
- // Natural language synonyms
3269
- excited: "happy",
3270
- joyful: "happy",
3271
- cheerful: "happy",
3272
- delighted: "happy",
3273
- amused: "happy",
3274
- melancholic: "sad",
3275
- sorrowful: "sad",
3276
- disappointed: "sad",
3277
- frustrated: "angry",
3278
- irritated: "angry",
3279
- furious: "angry",
3280
- annoyed: "angry",
3281
- // SenseVoice labels
3282
- fearful: "sad",
3283
- disgusted: "angry",
3284
- surprised: "happy"
3285
- };
3286
- var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
2864
+ var logger4 = createLogger("FullFacePipeline");
2865
+ var BLENDSHAPE_TO_GROUP = /* @__PURE__ */ new Map();
2866
+ for (const name of LAM_BLENDSHAPES) {
2867
+ if (name.startsWith("eye")) {
2868
+ BLENDSHAPE_TO_GROUP.set(name, "eyes");
2869
+ } else if (name.startsWith("brow")) {
2870
+ BLENDSHAPE_TO_GROUP.set(name, "brows");
2871
+ } else if (name.startsWith("jaw")) {
2872
+ BLENDSHAPE_TO_GROUP.set(name, "jaw");
2873
+ } else if (name.startsWith("mouth")) {
2874
+ BLENDSHAPE_TO_GROUP.set(name, "mouth");
2875
+ } else if (name.startsWith("cheek")) {
2876
+ BLENDSHAPE_TO_GROUP.set(name, "cheeks");
2877
+ } else if (name.startsWith("nose")) {
2878
+ BLENDSHAPE_TO_GROUP.set(name, "nose");
2879
+ } else if (name.startsWith("tongue")) {
2880
+ BLENDSHAPE_TO_GROUP.set(name, "tongue");
2881
+ }
2882
+ }
2883
+ var FullFacePipeline = class extends EventEmitter {
3287
2884
  constructor(options) {
3288
2885
  super();
3289
2886
  this.options = options;
3290
2887
  this.playbackStarted = false;
3291
2888
  this.monitorInterval = null;
3292
2889
  this.frameAnimationId = null;
3293
- // Emotion state
3294
- this.lastEmotionFrame = null;
3295
- this.currentAudioEnergy = 0;
3296
2890
  // Stale frame detection
3297
2891
  this.lastNewFrameTime = 0;
3298
2892
  this.lastKnownLamFrame = null;
3299
2893
  this.staleWarningEmitted = false;
2894
+ // Frame loop timing (for dt calculation)
2895
+ this.lastFrameLoopTime = 0;
2896
+ // Diagnostic logging counter
2897
+ this.frameLoopCount = 0;
3300
2898
  const sampleRate = options.sampleRate ?? 16e3;
3301
- this.emotionBlendFactor = options.emotionBlendFactor ?? 0.8;
3302
- this.lamBlendFactor = options.lamBlendFactor ?? 0.2;
3303
- const autoDelay = options.lam.modelId === "wav2arkit_cpu" ? 750 : options.lam.backend === "wasm" ? 350 : 50;
2899
+ this.profile = options.profile ?? {};
2900
+ this.staleThresholdMs = options.staleThresholdMs ?? 2e3;
2901
+ this.smoother = new BlendshapeSmoother({
2902
+ halflife: options.smoothingHalflife ?? 0.06
2903
+ });
2904
+ const isCpuModel = options.lam.modelId === "wav2arkit_cpu";
2905
+ const chunkSize = options.chunkSize ?? options.lam.chunkSize ?? 16e3;
2906
+ const chunkAccumulationMs = chunkSize / sampleRate * 1e3;
2907
+ const inferenceEstimateMs = isCpuModel ? 300 : options.lam.backend === "wasm" ? 250 : 80;
2908
+ const marginMs = 100;
2909
+ const autoDelay = Math.ceil(chunkAccumulationMs + inferenceEstimateMs + marginMs);
3304
2910
  const audioDelayMs = options.audioDelayMs ?? autoDelay;
2911
+ logger4.info("FullFacePipeline config", {
2912
+ chunkSize,
2913
+ audioDelayMs,
2914
+ autoDelay,
2915
+ backend: options.lam.backend,
2916
+ modelId: options.lam.modelId
2917
+ });
3305
2918
  this.scheduler = new AudioScheduler({
3306
2919
  sampleRate,
3307
2920
  initialLookaheadSec: audioDelayMs / 1e3
@@ -3310,20 +2923,15 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
3310
2923
  sampleRate,
3311
2924
  targetDurationMs: options.chunkTargetMs ?? 200
3312
2925
  });
3313
- this.lamPipeline = new LAMPipeline({
2926
+ this.processor = new A2EProcessor({
2927
+ backend: options.lam,
3314
2928
  sampleRate,
2929
+ chunkSize,
3315
2930
  onError: (error) => {
3316
- logger3.error("LAM inference error", { message: error.message, stack: error.stack });
2931
+ logger4.error("A2E inference error", { message: error.message, stack: error.stack });
3317
2932
  this.emit("error", error);
3318
2933
  }
3319
2934
  });
3320
- this.emotionMapper = new EmotionToBlendshapeMapper({
3321
- smoothingFactor: 0.15,
3322
- confidenceThreshold: 0.3,
3323
- intensity: 1,
3324
- energyModulation: true
3325
- });
3326
- this.energyAnalyzer = new AudioEnergyAnalyzer();
3327
2935
  }
3328
2936
  /**
3329
2937
  * Initialize the pipeline
@@ -3332,40 +2940,33 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
3332
2940
  await this.scheduler.initialize();
3333
2941
  }
3334
2942
  /**
3335
- * Set emotion label from backend (e.g., LLM response emotion).
3336
- *
3337
- * Converts a natural language emotion label into an EmotionFrame
3338
- * that drives upper face blendshapes for the duration of the utterance.
3339
- *
3340
- * Supported labels: happy, excited, joyful, sad, melancholic, angry,
3341
- * frustrated, neutral, etc.
3342
- *
3343
- * @param label - Emotion label string (case-insensitive)
3344
- */
3345
- setEmotionLabel(label) {
3346
- const normalized = label.toLowerCase();
3347
- const mapped = EMOTION_LABEL_MAP[normalized] ?? "neutral";
3348
- const probabilities = {
3349
- neutral: 0.1,
3350
- happy: 0.1,
3351
- angry: 0.1,
3352
- sad: 0.1
3353
- };
3354
- probabilities[mapped] = 0.7;
3355
- const frame = {
3356
- emotion: mapped,
3357
- confidence: 0.7,
3358
- probabilities
3359
- };
3360
- this.lastEmotionFrame = frame;
3361
- logger3.info("Emotion label set", { label, mapped });
2943
+ * Update the ExpressionProfile at runtime (e.g., character switch).
2944
+ */
2945
+ setProfile(profile) {
2946
+ this.profile = profile;
3362
2947
  }
3363
2948
  /**
3364
- * Clear any set emotion label.
3365
- * Falls back to prosody-only upper face animation.
2949
+ * Apply ExpressionProfile scaling to raw A2E blendshapes.
2950
+ *
2951
+ * For each blendshape:
2952
+ * 1. If an override exists for the blendshape name, use override as scaler
2953
+ * 2. Otherwise, use the group scaler (default 1.0)
2954
+ * 3. Clamp result to [0, 1]
3366
2955
  */
3367
- clearEmotionLabel() {
3368
- this.lastEmotionFrame = null;
2956
+ applyProfile(raw) {
2957
+ const scaled = new Float32Array(52);
2958
+ for (let i = 0; i < 52; i++) {
2959
+ const name = LAM_BLENDSHAPES[i];
2960
+ let scaler;
2961
+ if (this.profile.overrides && this.profile.overrides[name] !== void 0) {
2962
+ scaler = this.profile.overrides[name];
2963
+ } else {
2964
+ const group = BLENDSHAPE_TO_GROUP.get(name);
2965
+ scaler = group ? this.profile[group] ?? 1 : 1;
2966
+ }
2967
+ scaled[i] = Math.min(1, Math.max(0, raw[i] * scaler));
2968
+ }
2969
+ return scaled;
3369
2970
  }
3370
2971
  /**
3371
2972
  * Start a new playback session
@@ -3377,15 +2978,14 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
3377
2978
  this.stopMonitoring();
3378
2979
  this.scheduler.reset();
3379
2980
  this.coalescer.reset();
3380
- this.lamPipeline.reset();
2981
+ this.processor.reset();
3381
2982
  this.playbackStarted = false;
3382
- this.lastEmotionFrame = null;
3383
- this.currentAudioEnergy = 0;
3384
- this.emotionMapper.reset();
3385
- this.energyAnalyzer.reset();
3386
2983
  this.lastNewFrameTime = 0;
3387
2984
  this.lastKnownLamFrame = null;
3388
2985
  this.staleWarningEmitted = false;
2986
+ this.lastFrameLoopTime = 0;
2987
+ this.frameLoopCount = 0;
2988
+ this.smoother.reset();
3389
2989
  this.scheduler.warmup();
3390
2990
  this.startFrameLoop();
3391
2991
  this.startMonitoring();
@@ -3393,8 +2993,8 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
3393
2993
  /**
3394
2994
  * Receive audio chunk from network
3395
2995
  *
3396
- * Audio-first design: schedules audio immediately, LAM runs in background.
3397
- * This prevents LAM inference (50-300ms) from blocking audio scheduling.
2996
+ * Audio-first design: schedules audio immediately, A2E runs in background.
2997
+ * This prevents A2E inference (50-300ms) from blocking audio scheduling.
3398
2998
  *
3399
2999
  * @param chunk - Uint8Array containing Int16 PCM audio
3400
3000
  */
@@ -3409,100 +3009,77 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
3409
3009
  this.playbackStarted = true;
3410
3010
  this.emit("playback_start", scheduleTime);
3411
3011
  }
3412
- const { energy } = this.energyAnalyzer.process(float32);
3413
- this.currentAudioEnergy = energy;
3414
- this.lamPipeline.push(float32, scheduleTime, this.options.lam).catch((err) => {
3415
- this.emit("error", err);
3012
+ logger4.info("onAudioChunk \u2192 pushAudio", {
3013
+ float32Samples: float32.length,
3014
+ scheduleTime: scheduleTime.toFixed(3),
3015
+ currentTime: this.scheduler.getCurrentTime().toFixed(3),
3016
+ deltaToPlayback: (scheduleTime - this.scheduler.getCurrentTime()).toFixed(3)
3416
3017
  });
3417
- }
3418
- /**
3419
- * Get emotion frame for current animation.
3420
- *
3421
- * Priority:
3422
- * 1. Explicit emotion label from setEmotionLabel()
3423
- * 2. Prosody fallback: subtle brow movement from audio energy
3424
- */
3425
- getEmotionFrame() {
3426
- if (this.lastEmotionFrame) {
3427
- return { frame: this.lastEmotionFrame, energy: this.currentAudioEnergy };
3428
- }
3429
- return { frame: null, energy: this.currentAudioEnergy };
3430
- }
3431
- /**
3432
- * Merge LAM blendshapes with emotion upper face blendshapes
3433
- */
3434
- mergeBlendshapes(lamFrame, emotionFrame, audioEnergy) {
3435
- const merged = new Float32Array(52);
3436
- let emotionBlendshapes;
3437
- if (emotionFrame) {
3438
- this.emotionMapper.mapFrame(emotionFrame, audioEnergy);
3439
- this.emotionMapper.update(33);
3440
- emotionBlendshapes = this.emotionMapper.getCurrentBlendshapes();
3441
- } else {
3442
- emotionBlendshapes = {};
3443
- for (const name of UPPER_FACE_BLENDSHAPES) {
3444
- emotionBlendshapes[name] = 0;
3445
- }
3446
- }
3447
- for (let i = 0; i < 52; i++) {
3448
- const name = LAM_BLENDSHAPES[i];
3449
- if (UPPER_FACE_SET.has(name)) {
3450
- const emotionValue = emotionBlendshapes[name] ?? 0;
3451
- const lamValue = lamFrame[i];
3452
- merged[i] = emotionValue * this.emotionBlendFactor + lamValue * this.lamBlendFactor;
3453
- } else {
3454
- merged[i] = lamFrame[i];
3455
- }
3456
- }
3457
- return { merged, emotionBlendshapes };
3018
+ this.processor.pushAudio(float32, scheduleTime);
3458
3019
  }
3459
3020
  /**
3460
3021
  * Start frame animation loop
3022
+ *
3023
+ * Uses critically damped spring smoother to produce continuous output
3024
+ * at render rate (60fps), even between inference batches (~30fps bursts).
3025
+ * Springs interpolate toward the latest inference target, and decay
3026
+ * to neutral when inference stalls.
3461
3027
  */
3462
3028
  startFrameLoop() {
3029
+ this.lastFrameLoopTime = 0;
3463
3030
  const updateFrame = () => {
3031
+ const now = performance.now() / 1e3;
3032
+ const dt = this.lastFrameLoopTime > 0 ? now - this.lastFrameLoopTime : 1 / 60;
3033
+ this.lastFrameLoopTime = now;
3034
+ this.frameLoopCount++;
3464
3035
  const currentTime = this.scheduler.getCurrentTime();
3465
- const lamFrame = this.lamPipeline.getFrameForTime(currentTime, this.options.lam);
3466
- if (lamFrame) {
3467
- if (lamFrame !== this.lastKnownLamFrame) {
3468
- this.lastNewFrameTime = performance.now();
3469
- this.lastKnownLamFrame = lamFrame;
3470
- this.staleWarningEmitted = false;
3036
+ const lamFrame = this.processor.getFrameForTime(currentTime);
3037
+ if (lamFrame && lamFrame !== this.lastKnownLamFrame) {
3038
+ this.smoother.setTarget(lamFrame);
3039
+ this.lastNewFrameTime = performance.now();
3040
+ this.lastKnownLamFrame = lamFrame;
3041
+ this.staleWarningEmitted = false;
3042
+ logger4.info("New A2E frame", {
3043
+ jawOpen: lamFrame[24]?.toFixed(3),
3044
+ mouthClose: lamFrame[26]?.toFixed(3),
3045
+ browInnerUp: lamFrame[2]?.toFixed(3),
3046
+ browDownL: lamFrame[0]?.toFixed(3),
3047
+ browOuterUpL: lamFrame[3]?.toFixed(3),
3048
+ currentTime: currentTime.toFixed(3),
3049
+ queuedFrames: this.processor.queuedFrameCount
3050
+ });
3051
+ }
3052
+ if (this.frameLoopCount % 60 === 0) {
3053
+ logger4.info("Frame loop heartbeat", {
3054
+ frameLoopCount: this.frameLoopCount,
3055
+ currentTime: currentTime.toFixed(3),
3056
+ playbackEndTime: this.scheduler.getPlaybackEndTime().toFixed(3),
3057
+ queuedFrames: this.processor.queuedFrameCount,
3058
+ hasTarget: this.smoother.hasTarget,
3059
+ playbackStarted: this.playbackStarted,
3060
+ msSinceNewFrame: this.lastNewFrameTime > 0 ? Math.round(performance.now() - this.lastNewFrameTime) : -1,
3061
+ processorFill: this.processor.fillLevel.toFixed(2)
3062
+ });
3063
+ }
3064
+ if (this.playbackStarted && this.lastNewFrameTime > 0 && performance.now() - this.lastNewFrameTime > this.staleThresholdMs) {
3065
+ this.smoother.decayToNeutral();
3066
+ if (!this.staleWarningEmitted) {
3067
+ this.staleWarningEmitted = true;
3068
+ logger4.warn("A2E stalled \u2014 decaying to neutral", {
3069
+ staleDurationMs: Math.round(performance.now() - this.lastNewFrameTime),
3070
+ queuedFrames: this.processor.queuedFrameCount
3071
+ });
3471
3072
  }
3472
- const { frame: emotionFrame, energy } = this.getEmotionFrame();
3473
- const { merged, emotionBlendshapes } = this.mergeBlendshapes(lamFrame, emotionFrame, energy);
3073
+ }
3074
+ if (lamFrame) {
3075
+ const scaled = this.applyProfile(lamFrame);
3474
3076
  const fullFrame = {
3475
- blendshapes: merged,
3476
- lamBlendshapes: lamFrame,
3477
- emotionBlendshapes,
3478
- emotion: emotionFrame,
3077
+ blendshapes: scaled,
3078
+ rawBlendshapes: lamFrame,
3479
3079
  timestamp: currentTime
3480
3080
  };
3481
3081
  this.emit("full_frame_ready", fullFrame);
3482
3082
  this.emit("lam_frame_ready", lamFrame);
3483
- if (emotionFrame) {
3484
- this.emit("emotion_frame_ready", emotionFrame);
3485
- }
3486
- } else if (this.playbackStarted && !this.lastKnownLamFrame) {
3487
- const { frame: emotionFrame, energy } = this.getEmotionFrame();
3488
- if (emotionFrame && energy > 0.05) {
3489
- const startupFrame = new Float32Array(52);
3490
- const { merged, emotionBlendshapes } = this.mergeBlendshapes(startupFrame, emotionFrame, energy);
3491
- this.emit("full_frame_ready", {
3492
- blendshapes: merged,
3493
- lamBlendshapes: startupFrame,
3494
- emotionBlendshapes,
3495
- emotion: emotionFrame,
3496
- timestamp: currentTime
3497
- });
3498
- }
3499
- }
3500
- if (this.playbackStarted && this.lastNewFrameTime > 0 && !this.staleWarningEmitted && performance.now() - this.lastNewFrameTime > _FullFacePipeline.STALE_FRAME_THRESHOLD_MS) {
3501
- this.staleWarningEmitted = true;
3502
- logger3.warn("LAM appears stalled \u2014 no new frames for 3+ seconds during playback", {
3503
- staleDurationMs: Math.round(performance.now() - this.lastNewFrameTime),
3504
- queuedFrames: this.lamPipeline.queuedFrameCount
3505
- });
3506
3083
  }
3507
3084
  this.frameAnimationId = requestAnimationFrame(updateFrame);
3508
3085
  };
@@ -3517,7 +3094,7 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
3517
3094
  const chunk = new Uint8Array(remaining);
3518
3095
  await this.onAudioChunk(chunk);
3519
3096
  }
3520
- await this.lamPipeline.flush(this.options.lam);
3097
+ await this.processor.flush();
3521
3098
  }
3522
3099
  /**
3523
3100
  * Stop playback immediately with smooth fade-out
@@ -3526,15 +3103,13 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
3526
3103
  this.stopMonitoring();
3527
3104
  await this.scheduler.cancelAll(fadeOutMs);
3528
3105
  this.coalescer.reset();
3529
- this.lamPipeline.reset();
3106
+ this.processor.reset();
3107
+ this.smoother.reset();
3530
3108
  this.playbackStarted = false;
3531
- this.lastEmotionFrame = null;
3532
- this.currentAudioEnergy = 0;
3533
- this.emotionMapper.reset();
3534
- this.energyAnalyzer.reset();
3535
3109
  this.lastNewFrameTime = 0;
3536
3110
  this.lastKnownLamFrame = null;
3537
3111
  this.staleWarningEmitted = false;
3112
+ this.lastFrameLoopTime = 0;
3538
3113
  this.emit("playback_complete", void 0);
3539
3114
  }
3540
3115
  /**
@@ -3545,7 +3120,7 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
3545
3120
  clearInterval(this.monitorInterval);
3546
3121
  }
3547
3122
  this.monitorInterval = setInterval(() => {
3548
- if (this.scheduler.isComplete() && this.lamPipeline.queuedFrameCount === 0) {
3123
+ if (this.scheduler.isComplete() && this.processor.queuedFrameCount === 0) {
3549
3124
  this.emit("playback_complete", void 0);
3550
3125
  this.stopMonitoring();
3551
3126
  }
@@ -3571,20 +3146,12 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
3571
3146
  return {
3572
3147
  playbackStarted: this.playbackStarted,
3573
3148
  coalescerFill: this.coalescer.fillLevel,
3574
- lamFill: this.lamPipeline.fillLevel,
3575
- queuedLAMFrames: this.lamPipeline.queuedFrameCount,
3576
- emotionLabel: this.lastEmotionFrame?.emotion ?? null,
3577
- currentAudioEnergy: this.currentAudioEnergy,
3149
+ processorFill: this.processor.fillLevel,
3150
+ queuedFrames: this.processor.queuedFrameCount,
3578
3151
  currentTime: this.scheduler.getCurrentTime(),
3579
3152
  playbackEndTime: this.scheduler.getPlaybackEndTime()
3580
3153
  };
3581
3154
  }
3582
- /**
3583
- * Check if an explicit emotion label is currently set
3584
- */
3585
- get hasEmotionLabel() {
3586
- return this.lastEmotionFrame !== null;
3587
- }
3588
3155
  /**
3589
3156
  * Cleanup resources
3590
3157
  */
@@ -3592,13 +3159,9 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
3592
3159
  this.stopMonitoring();
3593
3160
  this.scheduler.dispose();
3594
3161
  this.coalescer.reset();
3595
- this.lamPipeline.reset();
3596
- this.lastEmotionFrame = null;
3597
- this.currentAudioEnergy = 0;
3162
+ this.processor.dispose();
3598
3163
  }
3599
3164
  };
3600
- _FullFacePipeline.STALE_FRAME_THRESHOLD_MS = 3e3;
3601
- var FullFacePipeline = _FullFacePipeline;
3602
3165
 
3603
3166
  // src/inference/kaldiFbank.ts
3604
3167
  function fft(re, im) {
@@ -3885,7 +3448,7 @@ function ctcGreedyDecode(logits, seqLen, vocabSize, tokenMap) {
3885
3448
  }
3886
3449
 
3887
3450
  // src/inference/SenseVoiceInference.ts
3888
- var logger4 = createLogger("SenseVoice");
3451
+ var logger5 = createLogger("SenseVoice");
3889
3452
  var _SenseVoiceInference = class _SenseVoiceInference {
3890
3453
  constructor(config) {
3891
3454
  this.session = null;
@@ -3938,26 +3501,26 @@ var _SenseVoiceInference = class _SenseVoiceInference {
3938
3501
  "model.backend_requested": this.config.backend
3939
3502
  });
3940
3503
  try {
3941
- logger4.info("Loading ONNX Runtime...", { preference: this.config.backend });
3504
+ logger5.info("Loading ONNX Runtime...", { preference: this.config.backend });
3942
3505
  const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
3943
3506
  this.ort = ort;
3944
3507
  this._backend = backend;
3945
- logger4.info("ONNX Runtime loaded", { backend: this._backend });
3946
- logger4.debug("Fetching tokens vocabulary", { tokensUrl: this.config.tokensUrl });
3508
+ logger5.info("ONNX Runtime loaded", { backend: this._backend });
3509
+ logger5.debug("Fetching tokens vocabulary", { tokensUrl: this.config.tokensUrl });
3947
3510
  const tokensResponse = await fetch(this.config.tokensUrl);
3948
3511
  if (!tokensResponse.ok) {
3949
3512
  throw new Error(`Failed to fetch tokens.txt: ${tokensResponse.status} ${tokensResponse.statusText}`);
3950
3513
  }
3951
3514
  const tokensText = await tokensResponse.text();
3952
3515
  this.tokenMap = parseTokensFile(tokensText);
3953
- logger4.debug("Tokens loaded", { vocabSize: this.tokenMap.size });
3516
+ logger5.debug("Tokens loaded", { vocabSize: this.tokenMap.size });
3954
3517
  const sessionOptions = getSessionOptions(this._backend);
3955
3518
  if (this._backend === "webgpu") {
3956
3519
  sessionOptions.graphOptimizationLevel = "basic";
3957
3520
  }
3958
3521
  let isCached = false;
3959
3522
  if (isIOS()) {
3960
- logger4.info("iOS: passing model URL directly to ORT (low-memory path)", {
3523
+ logger5.info("iOS: passing model URL directly to ORT (low-memory path)", {
3961
3524
  modelUrl: this.config.modelUrl
3962
3525
  });
3963
3526
  this.session = await this.ort.InferenceSession.create(
@@ -3969,14 +3532,14 @@ var _SenseVoiceInference = class _SenseVoiceInference {
3969
3532
  isCached = await cache.has(this.config.modelUrl);
3970
3533
  let modelBuffer;
3971
3534
  if (isCached) {
3972
- logger4.debug("Loading model from cache", { modelUrl: this.config.modelUrl });
3535
+ logger5.debug("Loading model from cache", { modelUrl: this.config.modelUrl });
3973
3536
  modelBuffer = await cache.get(this.config.modelUrl);
3974
3537
  onProgress?.(modelBuffer.byteLength, modelBuffer.byteLength);
3975
3538
  } else {
3976
- logger4.debug("Fetching and caching model", { modelUrl: this.config.modelUrl });
3539
+ logger5.debug("Fetching and caching model", { modelUrl: this.config.modelUrl });
3977
3540
  modelBuffer = await fetchWithCache(this.config.modelUrl, onProgress);
3978
3541
  }
3979
- logger4.debug("Creating ONNX session", {
3542
+ logger5.debug("Creating ONNX session", {
3980
3543
  size: formatBytes(modelBuffer.byteLength),
3981
3544
  backend: this._backend
3982
3545
  });
@@ -3989,15 +3552,15 @@ var _SenseVoiceInference = class _SenseVoiceInference {
3989
3552
  const cmvn = parseCMVNFromMetadata(metadata.neg_mean, metadata.inv_stddev);
3990
3553
  this.negMean = cmvn.negMean;
3991
3554
  this.invStddev = cmvn.invStddev;
3992
- logger4.debug("CMVN loaded from model metadata", { dim: this.negMean.length });
3555
+ logger5.debug("CMVN loaded from model metadata", { dim: this.negMean.length });
3993
3556
  } else {
3994
- logger4.warn("CMVN not found in model metadata \u2014 features will not be normalized");
3557
+ logger5.warn("CMVN not found in model metadata \u2014 features will not be normalized");
3995
3558
  }
3996
3559
  } catch (cmvnErr) {
3997
- logger4.warn("Failed to read CMVN from model metadata", { error: cmvnErr });
3560
+ logger5.warn("Failed to read CMVN from model metadata", { error: cmvnErr });
3998
3561
  }
3999
3562
  const loadTimeMs = performance.now() - startTime;
4000
- logger4.info("SenseVoice model loaded", {
3563
+ logger5.info("SenseVoice model loaded", {
4001
3564
  backend: this._backend,
4002
3565
  loadTimeMs: Math.round(loadTimeMs),
4003
3566
  vocabSize: this.tokenMap.size,
@@ -4108,7 +3671,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
4108
3671
  const vocabSize = logitsDims[2];
4109
3672
  const decoded = ctcGreedyDecode(logitsData, seqLen, vocabSize, this.tokenMap);
4110
3673
  const inferenceTimeMs = performance.now() - startTime;
4111
- logger4.trace("Transcription complete", {
3674
+ logger5.trace("Transcription complete", {
4112
3675
  text: decoded.text.substring(0, 50),
4113
3676
  language: decoded.language,
4114
3677
  emotion: decoded.emotion,
@@ -4146,7 +3709,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
4146
3709
  const errMsg = err instanceof Error ? err.message : String(err);
4147
3710
  if (errMsg.includes("timed out")) {
4148
3711
  this.poisoned = true;
4149
- logger4.error("CRITICAL: Inference session timed out \u2014 SenseVoice is dead. Page reload required.", {
3712
+ logger5.error("CRITICAL: Inference session timed out \u2014 SenseVoice is dead. Page reload required.", {
4150
3713
  backend: this._backend,
4151
3714
  timeoutMs: _SenseVoiceInference.INFERENCE_TIMEOUT_MS
4152
3715
  });
@@ -4154,7 +3717,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
4154
3717
  const oomError = new Error(
4155
3718
  `SenseVoice inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
4156
3719
  );
4157
- logger4.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
3720
+ logger5.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
4158
3721
  pointer: `0x${err.toString(16)}`,
4159
3722
  backend: this._backend
4160
3723
  });
@@ -4167,7 +3730,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
4167
3730
  reject(oomError);
4168
3731
  return;
4169
3732
  } else {
4170
- logger4.error("Inference failed", { error: errMsg, backend: this._backend });
3733
+ logger5.error("Inference failed", { error: errMsg, backend: this._backend });
4171
3734
  }
4172
3735
  span?.endWithError(err instanceof Error ? err : new Error(String(err)));
4173
3736
  telemetry?.incrementCounter("omote.inference.total", 1, {
@@ -4196,7 +3759,7 @@ _SenseVoiceInference.INFERENCE_TIMEOUT_MS = 1e4;
4196
3759
  var SenseVoiceInference = _SenseVoiceInference;
4197
3760
 
4198
3761
  // src/inference/SenseVoiceWorker.ts
4199
- var logger5 = createLogger("SenseVoiceWorker");
3762
+ var logger6 = createLogger("SenseVoiceWorker");
4200
3763
  var WASM_CDN_PATH2 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
4201
3764
  var LOAD_TIMEOUT_MS = 3e4;
4202
3765
  var INFERENCE_TIMEOUT_MS = 1e4;
@@ -4929,7 +4492,7 @@ var SenseVoiceWorker = class {
4929
4492
  this.handleWorkerMessage(event.data);
4930
4493
  };
4931
4494
  worker.onerror = (error) => {
4932
- logger5.error("Worker error", { error: error.message });
4495
+ logger6.error("Worker error", { error: error.message });
4933
4496
  for (const [, resolver] of this.pendingResolvers) {
4934
4497
  resolver.reject(new Error(`Worker error: ${error.message}`));
4935
4498
  }
@@ -5009,9 +4572,9 @@ var SenseVoiceWorker = class {
5009
4572
  "model.language": this.config.language
5010
4573
  });
5011
4574
  try {
5012
- logger5.info("Creating SenseVoice worker...");
4575
+ logger6.info("Creating SenseVoice worker...");
5013
4576
  this.worker = this.createWorker();
5014
- logger5.info("Loading model in worker...", {
4577
+ logger6.info("Loading model in worker...", {
5015
4578
  modelUrl: this.config.modelUrl,
5016
4579
  tokensUrl: this.config.tokensUrl,
5017
4580
  language: this.config.language,
@@ -5033,7 +4596,7 @@ var SenseVoiceWorker = class {
5033
4596
  this._isLoaded = true;
5034
4597
  const loadTimeMs = performance.now() - startTime;
5035
4598
  onProgress?.(1, 1);
5036
- logger5.info("SenseVoice worker loaded successfully", {
4599
+ logger6.info("SenseVoice worker loaded successfully", {
5037
4600
  backend: "wasm",
5038
4601
  loadTimeMs: Math.round(loadTimeMs),
5039
4602
  workerLoadTimeMs: Math.round(result.loadTimeMs),
@@ -5112,7 +4675,7 @@ var SenseVoiceWorker = class {
5112
4675
  INFERENCE_TIMEOUT_MS
5113
4676
  );
5114
4677
  const totalTimeMs = performance.now() - startTime;
5115
- logger5.trace("Worker transcription complete", {
4678
+ logger6.trace("Worker transcription complete", {
5116
4679
  text: result.text.substring(0, 50),
5117
4680
  language: result.language,
5118
4681
  emotion: result.emotion,
@@ -5148,11 +4711,11 @@ var SenseVoiceWorker = class {
5148
4711
  } catch (err) {
5149
4712
  const errMsg = err instanceof Error ? err.message : String(err);
5150
4713
  if (errMsg.includes("timed out")) {
5151
- logger5.error("CRITICAL: Worker inference timed out \u2014 SenseVoice worker is dead. Page reload required.", {
4714
+ logger6.error("CRITICAL: Worker inference timed out \u2014 SenseVoice worker is dead. Page reload required.", {
5152
4715
  timeoutMs: INFERENCE_TIMEOUT_MS
5153
4716
  });
5154
4717
  } else {
5155
- logger5.error("Worker inference failed", { error: errMsg });
4718
+ logger6.error("Worker inference failed", { error: errMsg });
5156
4719
  }
5157
4720
  span?.endWithError(err instanceof Error ? err : new Error(String(err)));
5158
4721
  telemetry?.incrementCounter("omote.inference.total", 1, {
@@ -5190,7 +4753,7 @@ var SenseVoiceWorker = class {
5190
4753
  };
5191
4754
 
5192
4755
  // src/inference/UnifiedInferenceWorker.ts
5193
- var logger6 = createLogger("UnifiedInferenceWorker");
4756
+ var logger7 = createLogger("UnifiedInferenceWorker");
5194
4757
  var WASM_CDN_PATH3 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
5195
4758
  var INIT_TIMEOUT_MS = 15e3;
5196
4759
  var SV_LOAD_TIMEOUT_MS = 3e4;
@@ -5886,7 +5449,7 @@ var UnifiedInferenceWorker = class {
5886
5449
  const telemetry = getTelemetry();
5887
5450
  const span = telemetry?.startSpan("UnifiedInferenceWorker.init");
5888
5451
  try {
5889
- logger6.info("Creating unified inference worker...");
5452
+ logger7.info("Creating unified inference worker...");
5890
5453
  this.worker = this.createWorker();
5891
5454
  await this.sendMessage(
5892
5455
  { type: "init", wasmPaths: WASM_CDN_PATH3, isIOS: isIOS() },
@@ -5895,7 +5458,7 @@ var UnifiedInferenceWorker = class {
5895
5458
  );
5896
5459
  this.initialized = true;
5897
5460
  const loadTimeMs = performance.now() - startTime;
5898
- logger6.info("Unified worker initialized", { loadTimeMs: Math.round(loadTimeMs) });
5461
+ logger7.info("Unified worker initialized", { loadTimeMs: Math.round(loadTimeMs) });
5899
5462
  span?.setAttributes({ "worker.init_time_ms": loadTimeMs });
5900
5463
  span?.end();
5901
5464
  } catch (error) {
@@ -5949,8 +5512,8 @@ var UnifiedInferenceWorker = class {
5949
5512
  if (!this.worker) return;
5950
5513
  await this.sendMessage({ type: "sv:dispose" }, "sv:disposed", DISPOSE_TIMEOUT_MS);
5951
5514
  }
5952
- // ── Wav2ArkitCpu (Lip Sync) ──────────────────────────────────────────
5953
- async loadLipSync(config) {
5515
+ // ── Wav2ArkitCpu (A2E) ──────────────────────────────────────────────
5516
+ async loadA2E(config) {
5954
5517
  this.assertReady();
5955
5518
  const startTime = performance.now();
5956
5519
  const result = await this.sendMessage(
@@ -5971,7 +5534,7 @@ var UnifiedInferenceWorker = class {
5971
5534
  outputNames: result.outputNames
5972
5535
  };
5973
5536
  }
5974
- async inferLipSync(audio) {
5537
+ async inferA2E(audio) {
5975
5538
  this.assertReady();
5976
5539
  return this.sendMessage(
5977
5540
  { type: "cpu:infer", audio },
@@ -5979,7 +5542,7 @@ var UnifiedInferenceWorker = class {
5979
5542
  CPU_INFER_TIMEOUT_MS
5980
5543
  );
5981
5544
  }
5982
- async disposeLipSync() {
5545
+ async disposeA2E() {
5983
5546
  if (!this.worker) return;
5984
5547
  await this.sendMessage({ type: "cpu:dispose" }, "cpu:disposed", DISPOSE_TIMEOUT_MS);
5985
5548
  }
@@ -6069,7 +5632,7 @@ var UnifiedInferenceWorker = class {
6069
5632
  this.handleWorkerMessage(event.data);
6070
5633
  };
6071
5634
  worker.onerror = (error) => {
6072
- logger6.error("Unified worker error", { error: error.message });
5635
+ logger7.error("Unified worker error", { error: error.message });
6073
5636
  this.rejectAllPending(`Worker error: ${error.message}`);
6074
5637
  };
6075
5638
  return worker;
@@ -6083,7 +5646,7 @@ var UnifiedInferenceWorker = class {
6083
5646
  this.pendingRequests.delete(requestId);
6084
5647
  pending.reject(new Error(data.error));
6085
5648
  } else {
6086
- logger6.error("Worker broadcast error", { error: data.error });
5649
+ logger7.error("Worker broadcast error", { error: data.error });
6087
5650
  this.rejectAllPending(data.error);
6088
5651
  }
6089
5652
  return;
@@ -6105,7 +5668,7 @@ var UnifiedInferenceWorker = class {
6105
5668
  const timeout = setTimeout(() => {
6106
5669
  this.pendingRequests.delete(requestId);
6107
5670
  this.poisoned = true;
6108
- logger6.error("CRITICAL: Worker operation timed out \u2014 worker is dead", {
5671
+ logger7.error("CRITICAL: Worker operation timed out \u2014 worker is dead", {
6109
5672
  type: message.type,
6110
5673
  timeoutMs
6111
5674
  });
@@ -6171,7 +5734,7 @@ var SenseVoiceUnifiedAdapter = class {
6171
5734
  });
6172
5735
  this._isLoaded = true;
6173
5736
  onProgress?.(1, 1);
6174
- logger6.info("SenseVoice loaded via unified worker", {
5737
+ logger7.info("SenseVoice loaded via unified worker", {
6175
5738
  backend: "wasm",
6176
5739
  loadTimeMs: Math.round(result.loadTimeMs),
6177
5740
  vocabSize: result.vocabSize
@@ -6212,6 +5775,7 @@ var SenseVoiceUnifiedAdapter = class {
6212
5775
  var Wav2ArkitCpuUnifiedAdapter = class {
6213
5776
  constructor(worker, config) {
6214
5777
  this.modelId = "wav2arkit_cpu";
5778
+ this.chunkSize = 16e3;
6215
5779
  this._isLoaded = false;
6216
5780
  this.inferenceQueue = Promise.resolve();
6217
5781
  this.worker = worker;
@@ -6230,12 +5794,12 @@ var Wav2ArkitCpuUnifiedAdapter = class {
6230
5794
  });
6231
5795
  try {
6232
5796
  const externalDataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${this.config.modelUrl}.data` : null;
6233
- const result = await this.worker.loadLipSync({
5797
+ const result = await this.worker.loadA2E({
6234
5798
  modelUrl: this.config.modelUrl,
6235
5799
  externalDataUrl: externalDataUrl || null
6236
5800
  });
6237
5801
  this._isLoaded = true;
6238
- logger6.info("Wav2ArkitCpu loaded via unified worker", {
5802
+ logger7.info("Wav2ArkitCpu loaded via unified worker", {
6239
5803
  backend: "wasm",
6240
5804
  loadTimeMs: Math.round(result.loadTimeMs)
6241
5805
  });
@@ -6262,7 +5826,7 @@ var Wav2ArkitCpuUnifiedAdapter = class {
6262
5826
  });
6263
5827
  try {
6264
5828
  const startTime = performance.now();
6265
- const result = await this.worker.inferLipSync(audioCopy);
5829
+ const result = await this.worker.inferA2E(audioCopy);
6266
5830
  const inferenceTimeMs = performance.now() - startTime;
6267
5831
  const flatBuffer = result.blendshapes;
6268
5832
  const { numFrames, numBlendshapes } = result;
@@ -6285,7 +5849,7 @@ var Wav2ArkitCpuUnifiedAdapter = class {
6285
5849
  }
6286
5850
  async dispose() {
6287
5851
  if (this._isLoaded) {
6288
- await this.worker.disposeLipSync();
5852
+ await this.worker.disposeA2E();
6289
5853
  this._isLoaded = false;
6290
5854
  }
6291
5855
  }
@@ -6341,7 +5905,7 @@ var SileroVADUnifiedAdapter = class {
6341
5905
  sampleRate: this.config.sampleRate
6342
5906
  });
6343
5907
  this._isLoaded = true;
6344
- logger6.info("SileroVAD loaded via unified worker", {
5908
+ logger7.info("SileroVAD loaded via unified worker", {
6345
5909
  backend: "wasm",
6346
5910
  loadTimeMs: Math.round(result.loadTimeMs),
6347
5911
  sampleRate: this.config.sampleRate,
@@ -6422,10 +5986,10 @@ var SileroVADUnifiedAdapter = class {
6422
5986
  };
6423
5987
 
6424
5988
  // src/inference/createSenseVoice.ts
6425
- var logger7 = createLogger("createSenseVoice");
5989
+ var logger8 = createLogger("createSenseVoice");
6426
5990
  function createSenseVoice(config) {
6427
5991
  if (config.unifiedWorker) {
6428
- logger7.info("Creating SenseVoiceUnifiedAdapter (shared unified worker)");
5992
+ logger8.info("Creating SenseVoiceUnifiedAdapter (shared unified worker)");
6429
5993
  return new SenseVoiceUnifiedAdapter(config.unifiedWorker, {
6430
5994
  modelUrl: config.modelUrl,
6431
5995
  tokensUrl: config.tokensUrl,
@@ -6438,7 +6002,7 @@ function createSenseVoice(config) {
6438
6002
  if (!SenseVoiceWorker.isSupported()) {
6439
6003
  throw new Error("Web Workers are not supported in this environment");
6440
6004
  }
6441
- logger7.info("Creating SenseVoiceWorker (off-main-thread)");
6005
+ logger8.info("Creating SenseVoiceWorker (off-main-thread)");
6442
6006
  return new SenseVoiceWorker({
6443
6007
  modelUrl: config.modelUrl,
6444
6008
  tokensUrl: config.tokensUrl,
@@ -6447,7 +6011,7 @@ function createSenseVoice(config) {
6447
6011
  });
6448
6012
  }
6449
6013
  if (useWorker === false) {
6450
- logger7.info("Creating SenseVoiceInference (main thread)");
6014
+ logger8.info("Creating SenseVoiceInference (main thread)");
6451
6015
  return new SenseVoiceInference({
6452
6016
  modelUrl: config.modelUrl,
6453
6017
  tokensUrl: config.tokensUrl,
@@ -6456,7 +6020,7 @@ function createSenseVoice(config) {
6456
6020
  });
6457
6021
  }
6458
6022
  if (SenseVoiceWorker.isSupported() && !isIOS()) {
6459
- logger7.info("Auto-detected: creating SenseVoiceWorker (off-main-thread)");
6023
+ logger8.info("Auto-detected: creating SenseVoiceWorker (off-main-thread)");
6460
6024
  return new SenseVoiceWorker({
6461
6025
  modelUrl: config.modelUrl,
6462
6026
  tokensUrl: config.tokensUrl,
@@ -6464,7 +6028,7 @@ function createSenseVoice(config) {
6464
6028
  textNorm: config.textNorm
6465
6029
  });
6466
6030
  }
6467
- logger7.info("Auto-detected: creating SenseVoiceInference (main thread)", {
6031
+ logger8.info("Auto-detected: creating SenseVoiceInference (main thread)", {
6468
6032
  reason: isIOS() ? "iOS (shared ORT instance)" : "Worker unsupported"
6469
6033
  });
6470
6034
  return new SenseVoiceInference({
@@ -6476,10 +6040,11 @@ function createSenseVoice(config) {
6476
6040
  }
6477
6041
 
6478
6042
  // src/inference/Wav2ArkitCpuInference.ts
6479
- var logger8 = createLogger("Wav2ArkitCpu");
6043
+ var logger9 = createLogger("Wav2ArkitCpu");
6480
6044
  var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
6481
6045
  constructor(config) {
6482
6046
  this.modelId = "wav2arkit_cpu";
6047
+ this.chunkSize = 16e3;
6483
6048
  this.session = null;
6484
6049
  this.ort = null;
6485
6050
  this._backend = "wasm";
@@ -6517,16 +6082,16 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
6517
6082
  });
6518
6083
  try {
6519
6084
  const preference = this.config.backend || "wasm";
6520
- logger8.info("Loading ONNX Runtime...", { preference });
6085
+ logger9.info("Loading ONNX Runtime...", { preference });
6521
6086
  const { ort, backend } = await getOnnxRuntimeForPreference(preference);
6522
6087
  this.ort = ort;
6523
6088
  this._backend = backend;
6524
- logger8.info("ONNX Runtime loaded", { backend: this._backend });
6089
+ logger9.info("ONNX Runtime loaded", { backend: this._backend });
6525
6090
  const modelUrl = this.config.modelUrl;
6526
6091
  const dataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${modelUrl}.data` : null;
6527
6092
  const sessionOptions = getSessionOptions(this._backend);
6528
6093
  if (isIOS()) {
6529
- logger8.info("iOS: passing model URLs directly to ORT (low-memory path)", {
6094
+ logger9.info("iOS: passing model URLs directly to ORT (low-memory path)", {
6530
6095
  modelUrl,
6531
6096
  dataUrl
6532
6097
  });
@@ -6544,15 +6109,15 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
6544
6109
  const isCached = await cache.has(modelUrl);
6545
6110
  let modelBuffer;
6546
6111
  if (isCached) {
6547
- logger8.debug("Loading model from cache", { modelUrl });
6112
+ logger9.debug("Loading model from cache", { modelUrl });
6548
6113
  modelBuffer = await cache.get(modelUrl);
6549
6114
  if (!modelBuffer) {
6550
- logger8.warn("Cache corruption detected, clearing and retrying", { modelUrl });
6115
+ logger9.warn("Cache corruption detected, clearing and retrying", { modelUrl });
6551
6116
  await cache.delete(modelUrl);
6552
6117
  modelBuffer = await fetchWithCache(modelUrl);
6553
6118
  }
6554
6119
  } else {
6555
- logger8.debug("Fetching and caching model graph", { modelUrl });
6120
+ logger9.debug("Fetching and caching model graph", { modelUrl });
6556
6121
  modelBuffer = await fetchWithCache(modelUrl);
6557
6122
  }
6558
6123
  if (!modelBuffer) {
@@ -6563,31 +6128,31 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
6563
6128
  try {
6564
6129
  const isDataCached = await cache.has(dataUrl);
6565
6130
  if (isDataCached) {
6566
- logger8.debug("Loading external data from cache", { dataUrl });
6131
+ logger9.debug("Loading external data from cache", { dataUrl });
6567
6132
  externalDataBuffer = await cache.get(dataUrl);
6568
6133
  if (!externalDataBuffer) {
6569
- logger8.warn("Cache corruption for external data, retrying", { dataUrl });
6134
+ logger9.warn("Cache corruption for external data, retrying", { dataUrl });
6570
6135
  await cache.delete(dataUrl);
6571
6136
  externalDataBuffer = await fetchWithCache(dataUrl);
6572
6137
  }
6573
6138
  } else {
6574
- logger8.info("Fetching external model data", {
6139
+ logger9.info("Fetching external model data", {
6575
6140
  dataUrl,
6576
6141
  note: "This may be a large download (400MB+)"
6577
6142
  });
6578
6143
  externalDataBuffer = await fetchWithCache(dataUrl);
6579
6144
  }
6580
- logger8.info("External data loaded", {
6145
+ logger9.info("External data loaded", {
6581
6146
  size: formatBytes(externalDataBuffer.byteLength)
6582
6147
  });
6583
6148
  } catch (err) {
6584
- logger8.debug("No external data file found (single-file model)", {
6149
+ logger9.debug("No external data file found (single-file model)", {
6585
6150
  dataUrl,
6586
6151
  error: err.message
6587
6152
  });
6588
6153
  }
6589
6154
  }
6590
- logger8.debug("Creating ONNX session", {
6155
+ logger9.debug("Creating ONNX session", {
6591
6156
  graphSize: formatBytes(modelBuffer.byteLength),
6592
6157
  externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
6593
6158
  backend: this._backend
@@ -6603,7 +6168,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
6603
6168
  this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
6604
6169
  }
6605
6170
  const loadTimeMs = performance.now() - startTime;
6606
- logger8.info("Model loaded successfully", {
6171
+ logger9.info("Model loaded successfully", {
6607
6172
  backend: this._backend,
6608
6173
  loadTimeMs: Math.round(loadTimeMs),
6609
6174
  inputs: this.session.inputNames,
@@ -6619,12 +6184,12 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
6619
6184
  model: "wav2arkit_cpu",
6620
6185
  backend: this._backend
6621
6186
  });
6622
- logger8.debug("Running warmup inference");
6187
+ logger9.debug("Running warmup inference");
6623
6188
  const warmupStart = performance.now();
6624
6189
  const silentAudio = new Float32Array(16e3);
6625
6190
  await this.infer(silentAudio);
6626
6191
  const warmupTimeMs = performance.now() - warmupStart;
6627
- logger8.info("Warmup inference complete", {
6192
+ logger9.info("Warmup inference complete", {
6628
6193
  warmupTimeMs: Math.round(warmupTimeMs),
6629
6194
  backend: this._backend
6630
6195
  });
@@ -6711,7 +6276,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
6711
6276
  const symmetrized = symmetrizeBlendshapes(rawFrame);
6712
6277
  blendshapes.push(symmetrized);
6713
6278
  }
6714
- logger8.trace("Inference completed", {
6279
+ logger9.trace("Inference completed", {
6715
6280
  inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
6716
6281
  numFrames,
6717
6282
  inputSamples
@@ -6739,7 +6304,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
6739
6304
  const errMsg = err instanceof Error ? err.message : String(err);
6740
6305
  if (errMsg.includes("timed out")) {
6741
6306
  this.poisoned = true;
6742
- logger8.error("CRITICAL: Inference session timed out \u2014 Wav2ArkitCpu is dead. Page reload required.", {
6307
+ logger9.error("CRITICAL: Inference session timed out \u2014 Wav2ArkitCpu is dead. Page reload required.", {
6743
6308
  backend: this._backend,
6744
6309
  timeoutMs: _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS
6745
6310
  });
@@ -6747,7 +6312,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
6747
6312
  const oomError = new Error(
6748
6313
  `Wav2ArkitCpu inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
6749
6314
  );
6750
- logger8.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
6315
+ logger9.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
6751
6316
  pointer: `0x${err.toString(16)}`,
6752
6317
  backend: this._backend
6753
6318
  });
@@ -6760,7 +6325,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
6760
6325
  reject(oomError);
6761
6326
  return;
6762
6327
  } else {
6763
- logger8.error("Inference failed", { error: errMsg, backend: this._backend });
6328
+ logger9.error("Inference failed", { error: errMsg, backend: this._backend });
6764
6329
  }
6765
6330
  span?.endWithError(err instanceof Error ? err : new Error(String(err)));
6766
6331
  telemetry?.incrementCounter("omote.inference.total", 1, {
@@ -6787,7 +6352,7 @@ _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS = 5e3;
6787
6352
  var Wav2ArkitCpuInference = _Wav2ArkitCpuInference;
6788
6353
 
6789
6354
  // src/inference/Wav2ArkitCpuWorker.ts
6790
- var logger9 = createLogger("Wav2ArkitCpuWorker");
6355
+ var logger10 = createLogger("Wav2ArkitCpuWorker");
6791
6356
  var WASM_CDN_PATH4 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
6792
6357
  var LOAD_TIMEOUT_MS2 = 6e4;
6793
6358
  var INFERENCE_TIMEOUT_MS2 = 5e3;
@@ -7033,6 +6598,7 @@ self.onerror = function(err) {
7033
6598
  var Wav2ArkitCpuWorker = class {
7034
6599
  constructor(config) {
7035
6600
  this.modelId = "wav2arkit_cpu";
6601
+ this.chunkSize = 16e3;
7036
6602
  this.worker = null;
7037
6603
  this.isLoading = false;
7038
6604
  this._isLoaded = false;
@@ -7067,7 +6633,7 @@ var Wav2ArkitCpuWorker = class {
7067
6633
  this.handleWorkerMessage(event.data);
7068
6634
  };
7069
6635
  worker.onerror = (error) => {
7070
- logger9.error("Worker error", { error: error.message });
6636
+ logger10.error("Worker error", { error: error.message });
7071
6637
  for (const [, resolver] of this.pendingResolvers) {
7072
6638
  resolver.reject(new Error(`Worker error: ${error.message}`));
7073
6639
  }
@@ -7143,10 +6709,10 @@ var Wav2ArkitCpuWorker = class {
7143
6709
  "model.backend_requested": "wasm"
7144
6710
  });
7145
6711
  try {
7146
- logger9.info("Creating wav2arkit_cpu worker...");
6712
+ logger10.info("Creating wav2arkit_cpu worker...");
7147
6713
  this.worker = this.createWorker();
7148
6714
  const externalDataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${this.config.modelUrl}.data` : null;
7149
- logger9.info("Loading model in worker...", {
6715
+ logger10.info("Loading model in worker...", {
7150
6716
  modelUrl: this.config.modelUrl,
7151
6717
  externalDataUrl,
7152
6718
  isIOS: isIOS()
@@ -7164,7 +6730,7 @@ var Wav2ArkitCpuWorker = class {
7164
6730
  );
7165
6731
  this._isLoaded = true;
7166
6732
  const loadTimeMs = performance.now() - startTime;
7167
- logger9.info("Wav2ArkitCpu worker loaded successfully", {
6733
+ logger10.info("Wav2ArkitCpu worker loaded successfully", {
7168
6734
  backend: "wasm",
7169
6735
  loadTimeMs: Math.round(loadTimeMs),
7170
6736
  workerLoadTimeMs: Math.round(result.loadTimeMs),
@@ -7249,7 +6815,7 @@ var Wav2ArkitCpuWorker = class {
7249
6815
  for (let f = 0; f < numFrames; f++) {
7250
6816
  blendshapes.push(flatBuffer.slice(f * numBlendshapes, (f + 1) * numBlendshapes));
7251
6817
  }
7252
- logger9.trace("Worker inference completed", {
6818
+ logger10.trace("Worker inference completed", {
7253
6819
  inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
7254
6820
  workerTimeMs: Math.round(result.inferenceTimeMs * 100) / 100,
7255
6821
  numFrames,
@@ -7279,12 +6845,12 @@ var Wav2ArkitCpuWorker = class {
7279
6845
  const errMsg = err instanceof Error ? err.message : String(err);
7280
6846
  if (errMsg.includes("timed out")) {
7281
6847
  this.poisoned = true;
7282
- logger9.error("CRITICAL: Worker inference timed out \u2014 Wav2ArkitCpu worker is dead. Page reload required.", {
6848
+ logger10.error("CRITICAL: Worker inference timed out \u2014 Wav2ArkitCpu worker is dead. Page reload required.", {
7283
6849
  backend: "wasm",
7284
6850
  timeoutMs: INFERENCE_TIMEOUT_MS2
7285
6851
  });
7286
6852
  } else {
7287
- logger9.error("Worker inference failed", { error: errMsg, backend: "wasm" });
6853
+ logger10.error("Worker inference failed", { error: errMsg, backend: "wasm" });
7288
6854
  }
7289
6855
  span?.endWithError(err instanceof Error ? err : new Error(String(err)));
7290
6856
  telemetry?.incrementCounter("omote.inference.total", 1, {
@@ -7321,39 +6887,39 @@ var Wav2ArkitCpuWorker = class {
7321
6887
  }
7322
6888
  };
7323
6889
 
7324
- // src/inference/createLipSync.ts
7325
- var logger10 = createLogger("createLipSync");
7326
- function createLipSync(config) {
6890
+ // src/inference/createA2E.ts
6891
+ var logger11 = createLogger("createA2E");
6892
+ function createA2E(config) {
7327
6893
  const mode = config.mode ?? "auto";
7328
6894
  const fallbackOnError = config.fallbackOnError ?? true;
7329
6895
  let useCpu;
7330
6896
  if (mode === "cpu") {
7331
6897
  useCpu = true;
7332
- logger10.info("Forcing CPU lip sync model (wav2arkit_cpu)");
6898
+ logger11.info("Forcing CPU A2E model (wav2arkit_cpu)");
7333
6899
  } else if (mode === "gpu") {
7334
6900
  useCpu = false;
7335
- logger10.info("Forcing GPU lip sync model (Wav2Vec2)");
6901
+ logger11.info("Forcing GPU A2E model (Wav2Vec2)");
7336
6902
  } else {
7337
- useCpu = shouldUseCpuLipSync();
7338
- logger10.info("Auto-detected lip sync model", {
6903
+ useCpu = shouldUseCpuA2E();
6904
+ logger11.info("Auto-detected A2E model", {
7339
6905
  useCpu,
7340
6906
  isSafari: isSafari()
7341
6907
  });
7342
6908
  }
7343
6909
  if (useCpu) {
7344
6910
  if (config.unifiedWorker) {
7345
- logger10.info("Creating Wav2ArkitCpuUnifiedAdapter (404MB, WASM, shared unified worker)");
6911
+ logger11.info("Creating Wav2ArkitCpuUnifiedAdapter (404MB, WASM, shared unified worker)");
7346
6912
  return new Wav2ArkitCpuUnifiedAdapter(config.unifiedWorker, {
7347
6913
  modelUrl: config.cpuModelUrl
7348
6914
  });
7349
6915
  }
7350
6916
  if (config.useWorker && Wav2ArkitCpuWorker.isSupported() && !isIOS()) {
7351
- logger10.info("Creating Wav2ArkitCpuWorker (404MB, WASM, off-main-thread)");
6917
+ logger11.info("Creating Wav2ArkitCpuWorker (404MB, WASM, off-main-thread)");
7352
6918
  return new Wav2ArkitCpuWorker({
7353
6919
  modelUrl: config.cpuModelUrl
7354
6920
  });
7355
6921
  }
7356
- logger10.info("Creating Wav2ArkitCpuInference (404MB, WASM)");
6922
+ logger11.info("Creating Wav2ArkitCpuInference (404MB, WASM)");
7357
6923
  return new Wav2ArkitCpuInference({
7358
6924
  modelUrl: config.cpuModelUrl
7359
6925
  });
@@ -7365,13 +6931,13 @@ function createLipSync(config) {
7365
6931
  numIdentityClasses: config.numIdentityClasses
7366
6932
  });
7367
6933
  if (fallbackOnError) {
7368
- logger10.info("Creating Wav2Vec2Inference with CPU fallback");
7369
- return new LipSyncWithFallback(gpuInstance, config);
6934
+ logger11.info("Creating Wav2Vec2Inference with CPU fallback");
6935
+ return new A2EWithFallback(gpuInstance, config);
7370
6936
  }
7371
- logger10.info("Creating Wav2Vec2Inference (no fallback)");
6937
+ logger11.info("Creating Wav2Vec2Inference (no fallback)");
7372
6938
  return gpuInstance;
7373
6939
  }
7374
- var LipSyncWithFallback = class {
6940
+ var A2EWithFallback = class {
7375
6941
  constructor(gpuInstance, config) {
7376
6942
  this.hasFallenBack = false;
7377
6943
  this.implementation = gpuInstance;
@@ -7380,6 +6946,9 @@ var LipSyncWithFallback = class {
7380
6946
  get modelId() {
7381
6947
  return this.implementation.modelId;
7382
6948
  }
6949
+ get chunkSize() {
6950
+ return this.implementation.chunkSize;
6951
+ }
7383
6952
  get backend() {
7384
6953
  return this.implementation.backend;
7385
6954
  }
@@ -7394,7 +6963,7 @@ var LipSyncWithFallback = class {
7394
6963
  }
7395
6964
  }
7396
6965
  async fallbackToCpu(reason) {
7397
- logger10.warn("GPU model load failed, falling back to CPU model", { reason });
6966
+ logger11.warn("GPU model load failed, falling back to CPU model", { reason });
7398
6967
  try {
7399
6968
  await this.implementation.dispose();
7400
6969
  } catch {
@@ -7403,17 +6972,17 @@ var LipSyncWithFallback = class {
7403
6972
  this.implementation = new Wav2ArkitCpuUnifiedAdapter(this.config.unifiedWorker, {
7404
6973
  modelUrl: this.config.cpuModelUrl
7405
6974
  });
7406
- logger10.info("Fallback to Wav2ArkitCpuUnifiedAdapter successful");
6975
+ logger11.info("Fallback to Wav2ArkitCpuUnifiedAdapter successful");
7407
6976
  } else if (this.config.useWorker && Wav2ArkitCpuWorker.isSupported() && !isIOS()) {
7408
6977
  this.implementation = new Wav2ArkitCpuWorker({
7409
6978
  modelUrl: this.config.cpuModelUrl
7410
6979
  });
7411
- logger10.info("Fallback to Wav2ArkitCpuWorker successful");
6980
+ logger11.info("Fallback to Wav2ArkitCpuWorker successful");
7412
6981
  } else {
7413
6982
  this.implementation = new Wav2ArkitCpuInference({
7414
6983
  modelUrl: this.config.cpuModelUrl
7415
6984
  });
7416
- logger10.info("Fallback to Wav2ArkitCpuInference successful");
6985
+ logger11.info("Fallback to Wav2ArkitCpuInference successful");
7417
6986
  }
7418
6987
  this.hasFallenBack = true;
7419
6988
  return await this.implementation.load();
@@ -7426,8 +6995,124 @@ var LipSyncWithFallback = class {
7426
6995
  }
7427
6996
  };
7428
6997
 
6998
+ // src/animation/audioEnergy.ts
6999
+ function calculateRMS(samples) {
7000
+ if (samples.length === 0) return 0;
7001
+ let sumSquares = 0;
7002
+ for (let i = 0; i < samples.length; i++) {
7003
+ sumSquares += samples[i] * samples[i];
7004
+ }
7005
+ return Math.sqrt(sumSquares / samples.length);
7006
+ }
7007
+ function calculatePeak(samples) {
7008
+ let peak = 0;
7009
+ for (let i = 0; i < samples.length; i++) {
7010
+ const abs = Math.abs(samples[i]);
7011
+ if (abs > peak) peak = abs;
7012
+ }
7013
+ return peak;
7014
+ }
7015
+ var AudioEnergyAnalyzer = class {
7016
+ /**
7017
+ * @param smoothingFactor How much to smooth (0 = no smoothing, 1 = infinite smoothing). Default 0.85
7018
+ * @param noiseFloor Minimum energy threshold to consider as signal. Default 0.01
7019
+ */
7020
+ constructor(smoothingFactor = 0.85, noiseFloor = 0.01) {
7021
+ this.smoothedRMS = 0;
7022
+ this.smoothedPeak = 0;
7023
+ this.smoothingFactor = Math.max(0, Math.min(0.99, smoothingFactor));
7024
+ this.noiseFloor = noiseFloor;
7025
+ }
7026
+ /**
7027
+ * Process audio samples and return smoothed energy values
7028
+ * @param samples Audio samples (Float32Array)
7029
+ * @returns Object with rms and peak values
7030
+ */
7031
+ process(samples) {
7032
+ const instantRMS = calculateRMS(samples);
7033
+ const instantPeak = calculatePeak(samples);
7034
+ const gatedRMS = instantRMS > this.noiseFloor ? instantRMS : 0;
7035
+ const gatedPeak = instantPeak > this.noiseFloor ? instantPeak : 0;
7036
+ if (gatedRMS > this.smoothedRMS) {
7037
+ this.smoothedRMS = this.smoothedRMS * 0.5 + gatedRMS * 0.5;
7038
+ } else {
7039
+ this.smoothedRMS = this.smoothedRMS * this.smoothingFactor + gatedRMS * (1 - this.smoothingFactor);
7040
+ }
7041
+ if (gatedPeak > this.smoothedPeak) {
7042
+ this.smoothedPeak = this.smoothedPeak * 0.3 + gatedPeak * 0.7;
7043
+ } else {
7044
+ this.smoothedPeak = this.smoothedPeak * this.smoothingFactor + gatedPeak * (1 - this.smoothingFactor);
7045
+ }
7046
+ const energy = this.smoothedRMS * 0.7 + this.smoothedPeak * 0.3;
7047
+ return {
7048
+ rms: this.smoothedRMS,
7049
+ peak: this.smoothedPeak,
7050
+ energy: Math.min(1, energy * 2)
7051
+ // Scale up and clamp
7052
+ };
7053
+ }
7054
+ /**
7055
+ * Reset analyzer state
7056
+ */
7057
+ reset() {
7058
+ this.smoothedRMS = 0;
7059
+ this.smoothedPeak = 0;
7060
+ }
7061
+ /**
7062
+ * Get current smoothed RMS value
7063
+ */
7064
+ get rms() {
7065
+ return this.smoothedRMS;
7066
+ }
7067
+ /**
7068
+ * Get current smoothed peak value
7069
+ */
7070
+ get peak() {
7071
+ return this.smoothedPeak;
7072
+ }
7073
+ };
7074
+ var EmphasisDetector = class {
7075
+ /**
7076
+ * @param historySize Number of frames to track. Default 10
7077
+ * @param emphasisThreshold Minimum energy increase to count as emphasis. Default 0.15
7078
+ */
7079
+ constructor(historySize = 10, emphasisThreshold = 0.15) {
7080
+ this.energyHistory = [];
7081
+ this.historySize = historySize;
7082
+ this.emphasisThreshold = emphasisThreshold;
7083
+ }
7084
+ /**
7085
+ * Process energy value and detect emphasis
7086
+ * @param energy Current energy value (0-1)
7087
+ * @returns Object with isEmphasis flag and emphasisStrength
7088
+ */
7089
+ process(energy) {
7090
+ this.energyHistory.push(energy);
7091
+ if (this.energyHistory.length > this.historySize) {
7092
+ this.energyHistory.shift();
7093
+ }
7094
+ if (this.energyHistory.length < 3) {
7095
+ return { isEmphasis: false, emphasisStrength: 0 };
7096
+ }
7097
+ const prevFrames = this.energyHistory.slice(0, -1);
7098
+ const avgPrev = prevFrames.reduce((a, b) => a + b, 0) / prevFrames.length;
7099
+ const increase = energy - avgPrev;
7100
+ const isEmphasis = increase > this.emphasisThreshold;
7101
+ return {
7102
+ isEmphasis,
7103
+ emphasisStrength: isEmphasis ? Math.min(1, increase / 0.3) : 0
7104
+ };
7105
+ }
7106
+ /**
7107
+ * Reset detector state
7108
+ */
7109
+ reset() {
7110
+ this.energyHistory = [];
7111
+ }
7112
+ };
7113
+
7429
7114
  // src/inference/SileroVADInference.ts
7430
- var logger11 = createLogger("SileroVAD");
7115
+ var logger12 = createLogger("SileroVAD");
7431
7116
  var SileroVADInference = class {
7432
7117
  constructor(config) {
7433
7118
  this.session = null;
@@ -7501,23 +7186,23 @@ var SileroVADInference = class {
7501
7186
  "model.sample_rate": this.config.sampleRate
7502
7187
  });
7503
7188
  try {
7504
- logger11.info("Loading ONNX Runtime...", { preference: this.config.backend });
7189
+ logger12.info("Loading ONNX Runtime...", { preference: this.config.backend });
7505
7190
  const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
7506
7191
  this.ort = ort;
7507
7192
  this._backend = backend;
7508
- logger11.info("ONNX Runtime loaded", { backend: this._backend });
7193
+ logger12.info("ONNX Runtime loaded", { backend: this._backend });
7509
7194
  const cache = getModelCache();
7510
7195
  const modelUrl = this.config.modelUrl;
7511
7196
  const isCached = await cache.has(modelUrl);
7512
7197
  let modelBuffer;
7513
7198
  if (isCached) {
7514
- logger11.debug("Loading model from cache", { modelUrl });
7199
+ logger12.debug("Loading model from cache", { modelUrl });
7515
7200
  modelBuffer = await cache.get(modelUrl);
7516
7201
  } else {
7517
- logger11.debug("Fetching and caching model", { modelUrl });
7202
+ logger12.debug("Fetching and caching model", { modelUrl });
7518
7203
  modelBuffer = await fetchWithCache(modelUrl);
7519
7204
  }
7520
- logger11.debug("Creating ONNX session", {
7205
+ logger12.debug("Creating ONNX session", {
7521
7206
  size: formatBytes(modelBuffer.byteLength),
7522
7207
  backend: this._backend
7523
7208
  });
@@ -7526,7 +7211,7 @@ var SileroVADInference = class {
7526
7211
  this.session = await ort.InferenceSession.create(modelData, sessionOptions);
7527
7212
  this.reset();
7528
7213
  const loadTimeMs = performance.now() - startTime;
7529
- logger11.info("Model loaded successfully", {
7214
+ logger12.info("Model loaded successfully", {
7530
7215
  backend: this._backend,
7531
7216
  loadTimeMs: Math.round(loadTimeMs),
7532
7217
  sampleRate: this.config.sampleRate,
@@ -7581,7 +7266,7 @@ var SileroVADInference = class {
7581
7266
  []
7582
7267
  );
7583
7268
  } catch (e) {
7584
- logger11.warn("BigInt64Array not available, using bigint array fallback", {
7269
+ logger12.warn("BigInt64Array not available, using bigint array fallback", {
7585
7270
  error: e instanceof Error ? e.message : String(e)
7586
7271
  });
7587
7272
  this.srTensor = new this.ort.Tensor(
@@ -7687,7 +7372,7 @@ var SileroVADInference = class {
7687
7372
  this.preSpeechBuffer.shift();
7688
7373
  }
7689
7374
  }
7690
- logger11.trace("Skipping VAD inference - audio too quiet", {
7375
+ logger12.trace("Skipping VAD inference - audio too quiet", {
7691
7376
  rms: Math.round(rms * 1e4) / 1e4,
7692
7377
  threshold: MIN_ENERGY_THRESHOLD
7693
7378
  });
@@ -7741,7 +7426,7 @@ var SileroVADInference = class {
7741
7426
  if (isSpeech && !this.wasSpeaking) {
7742
7427
  preSpeechChunks = [...this.preSpeechBuffer];
7743
7428
  this.preSpeechBuffer = [];
7744
- logger11.debug("Speech started with pre-speech buffer", {
7429
+ logger12.debug("Speech started with pre-speech buffer", {
7745
7430
  preSpeechChunks: preSpeechChunks.length,
7746
7431
  durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
7747
7432
  });
@@ -7754,7 +7439,7 @@ var SileroVADInference = class {
7754
7439
  this.preSpeechBuffer = [];
7755
7440
  }
7756
7441
  this.wasSpeaking = isSpeech;
7757
- logger11.trace("VAD inference completed", {
7442
+ logger12.trace("VAD inference completed", {
7758
7443
  probability: Math.round(probability * 1e3) / 1e3,
7759
7444
  isSpeech,
7760
7445
  inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100
@@ -7785,7 +7470,7 @@ var SileroVADInference = class {
7785
7470
  const oomError = new Error(
7786
7471
  `SileroVAD inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reducing concurrent model sessions or reloading the page.`
7787
7472
  );
7788
- logger11.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
7473
+ logger12.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
7789
7474
  pointer: `0x${err.toString(16)}`,
7790
7475
  backend: this._backend
7791
7476
  });
@@ -7828,7 +7513,7 @@ var SileroVADInference = class {
7828
7513
  SileroVADInference.isWebGPUAvailable = isWebGPUAvailable;
7829
7514
 
7830
7515
  // src/inference/SileroVADWorker.ts
7831
- var logger12 = createLogger("SileroVADWorker");
7516
+ var logger13 = createLogger("SileroVADWorker");
7832
7517
  var WASM_CDN_PATH5 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
7833
7518
  var LOAD_TIMEOUT_MS3 = 1e4;
7834
7519
  var INFERENCE_TIMEOUT_MS3 = 1e3;
@@ -8106,7 +7791,7 @@ var SileroVADWorker = class {
8106
7791
  this.handleWorkerMessage(event.data);
8107
7792
  };
8108
7793
  worker.onerror = (error) => {
8109
- logger12.error("Worker error", { error: error.message });
7794
+ logger13.error("Worker error", { error: error.message });
8110
7795
  for (const [, resolver] of this.pendingResolvers) {
8111
7796
  resolver.reject(new Error(`Worker error: ${error.message}`));
8112
7797
  }
@@ -8182,9 +7867,9 @@ var SileroVADWorker = class {
8182
7867
  "model.sample_rate": this.config.sampleRate
8183
7868
  });
8184
7869
  try {
8185
- logger12.info("Creating VAD worker...");
7870
+ logger13.info("Creating VAD worker...");
8186
7871
  this.worker = this.createWorker();
8187
- logger12.info("Loading model in worker...", {
7872
+ logger13.info("Loading model in worker...", {
8188
7873
  modelUrl: this.config.modelUrl,
8189
7874
  sampleRate: this.config.sampleRate
8190
7875
  });
@@ -8200,7 +7885,7 @@ var SileroVADWorker = class {
8200
7885
  );
8201
7886
  this._isLoaded = true;
8202
7887
  const loadTimeMs = performance.now() - startTime;
8203
- logger12.info("VAD worker loaded successfully", {
7888
+ logger13.info("VAD worker loaded successfully", {
8204
7889
  backend: "wasm",
8205
7890
  loadTimeMs: Math.round(loadTimeMs),
8206
7891
  workerLoadTimeMs: Math.round(result.loadTimeMs),
@@ -8307,7 +7992,7 @@ var SileroVADWorker = class {
8307
7992
  if (isSpeech && !this.wasSpeaking) {
8308
7993
  preSpeechChunks = [...this.preSpeechBuffer];
8309
7994
  this.preSpeechBuffer = [];
8310
- logger12.debug("Speech started with pre-speech buffer", {
7995
+ logger13.debug("Speech started with pre-speech buffer", {
8311
7996
  preSpeechChunks: preSpeechChunks.length,
8312
7997
  durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
8313
7998
  });
@@ -8320,7 +8005,7 @@ var SileroVADWorker = class {
8320
8005
  this.preSpeechBuffer = [];
8321
8006
  }
8322
8007
  this.wasSpeaking = isSpeech;
8323
- logger12.trace("VAD worker inference completed", {
8008
+ logger13.trace("VAD worker inference completed", {
8324
8009
  probability: Math.round(result.probability * 1e3) / 1e3,
8325
8010
  isSpeech,
8326
8011
  inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
@@ -8388,44 +8073,44 @@ var SileroVADWorker = class {
8388
8073
  };
8389
8074
 
8390
8075
  // src/inference/createSileroVAD.ts
8391
- var logger13 = createLogger("createSileroVAD");
8076
+ var logger14 = createLogger("createSileroVAD");
8392
8077
  function supportsVADWorker() {
8393
8078
  if (typeof Worker === "undefined") {
8394
- logger13.debug("Worker not supported: Worker constructor undefined");
8079
+ logger14.debug("Worker not supported: Worker constructor undefined");
8395
8080
  return false;
8396
8081
  }
8397
8082
  if (typeof URL === "undefined" || typeof URL.createObjectURL === "undefined") {
8398
- logger13.debug("Worker not supported: URL.createObjectURL unavailable");
8083
+ logger14.debug("Worker not supported: URL.createObjectURL unavailable");
8399
8084
  return false;
8400
8085
  }
8401
8086
  if (typeof Blob === "undefined") {
8402
- logger13.debug("Worker not supported: Blob constructor unavailable");
8087
+ logger14.debug("Worker not supported: Blob constructor unavailable");
8403
8088
  return false;
8404
8089
  }
8405
8090
  return true;
8406
8091
  }
8407
8092
  function createSileroVAD(config) {
8408
8093
  if (config.unifiedWorker) {
8409
- logger13.info("Creating SileroVADUnifiedAdapter (shared unified worker)");
8094
+ logger14.info("Creating SileroVADUnifiedAdapter (shared unified worker)");
8410
8095
  return new SileroVADUnifiedAdapter(config.unifiedWorker, config);
8411
8096
  }
8412
8097
  const fallbackOnError = config.fallbackOnError ?? true;
8413
8098
  let useWorker;
8414
8099
  if (config.useWorker !== void 0) {
8415
8100
  useWorker = config.useWorker;
8416
- logger13.debug("Worker preference explicitly set", { useWorker });
8101
+ logger14.debug("Worker preference explicitly set", { useWorker });
8417
8102
  } else {
8418
8103
  const workerSupported = supportsVADWorker();
8419
8104
  const onMobile = isMobile();
8420
8105
  useWorker = workerSupported && !onMobile;
8421
- logger13.debug("Auto-detected Worker preference", {
8106
+ logger14.debug("Auto-detected Worker preference", {
8422
8107
  useWorker,
8423
8108
  workerSupported,
8424
8109
  onMobile
8425
8110
  });
8426
8111
  }
8427
8112
  if (useWorker) {
8428
- logger13.info("Creating SileroVADWorker (off-main-thread)");
8113
+ logger14.info("Creating SileroVADWorker (off-main-thread)");
8429
8114
  const worker = new SileroVADWorker({
8430
8115
  modelUrl: config.modelUrl,
8431
8116
  sampleRate: config.sampleRate,
@@ -8437,7 +8122,7 @@ function createSileroVAD(config) {
8437
8122
  }
8438
8123
  return worker;
8439
8124
  }
8440
- logger13.info("Creating SileroVADInference (main thread)");
8125
+ logger14.info("Creating SileroVADInference (main thread)");
8441
8126
  return new SileroVADInference(config);
8442
8127
  }
8443
8128
  var VADWorkerWithFallback = class {
@@ -8463,7 +8148,7 @@ var VADWorkerWithFallback = class {
8463
8148
  try {
8464
8149
  return await this.implementation.load();
8465
8150
  } catch (error) {
8466
- logger13.warn("Worker load failed, falling back to main thread", {
8151
+ logger14.warn("Worker load failed, falling back to main thread", {
8467
8152
  error: error instanceof Error ? error.message : String(error)
8468
8153
  });
8469
8154
  try {
@@ -8472,7 +8157,7 @@ var VADWorkerWithFallback = class {
8472
8157
  }
8473
8158
  this.implementation = new SileroVADInference(this.config);
8474
8159
  this.hasFallenBack = true;
8475
- logger13.info("Fallback to SileroVADInference successful");
8160
+ logger14.info("Fallback to SileroVADInference successful");
8476
8161
  return await this.implementation.load();
8477
8162
  }
8478
8163
  }
@@ -8493,8 +8178,175 @@ var VADWorkerWithFallback = class {
8493
8178
  }
8494
8179
  };
8495
8180
 
8181
+ // src/inference/A2EOrchestrator.ts
8182
+ var logger15 = createLogger("A2EOrchestrator");
8183
+ var A2EOrchestrator = class {
8184
+ constructor(config) {
8185
+ this.a2e = null;
8186
+ this.processor = null;
8187
+ // Mic capture state (lightweight — no dependency on MicrophoneCapture class
8188
+ // which requires an external EventEmitter. We do raw Web Audio here.)
8189
+ this.stream = null;
8190
+ this.audioContext = null;
8191
+ this.scriptProcessor = null;
8192
+ this.nativeSampleRate = 0;
8193
+ this._isReady = false;
8194
+ this._isStreaming = false;
8195
+ this._backend = null;
8196
+ this.disposed = false;
8197
+ this.config = {
8198
+ sampleRate: 16e3,
8199
+ ...config
8200
+ };
8201
+ }
8202
+ /** Latest blendshape weights from inference (null if none yet) */
8203
+ get latestWeights() {
8204
+ return this.processor?.latestFrame ?? null;
8205
+ }
8206
+ /** Whether the model is loaded and ready for inference */
8207
+ get isReady() {
8208
+ return this._isReady;
8209
+ }
8210
+ /** Whether mic is active and inference loop is running */
8211
+ get isStreaming() {
8212
+ return this._isStreaming;
8213
+ }
8214
+ /** Current backend type (webgpu, wasm, or null) */
8215
+ get backend() {
8216
+ return this._backend;
8217
+ }
8218
+ /**
8219
+ * Load the A2E model and create the processor
8220
+ */
8221
+ async load() {
8222
+ if (this.disposed) throw new Error("A2EOrchestrator has been disposed");
8223
+ logger15.info("Loading A2E model...");
8224
+ this.a2e = createA2E({
8225
+ gpuModelUrl: this.config.gpuModelUrl,
8226
+ gpuExternalDataUrl: this.config.gpuExternalDataUrl,
8227
+ cpuModelUrl: this.config.cpuModelUrl ?? this.config.gpuModelUrl,
8228
+ ...this.config.a2eConfig
8229
+ });
8230
+ const info = await this.a2e.load();
8231
+ this._backend = info.backend;
8232
+ this.processor = new A2EProcessor({
8233
+ backend: this.a2e,
8234
+ sampleRate: this.config.sampleRate,
8235
+ chunkSize: this.config.chunkSize,
8236
+ onFrame: this.config.onFrame,
8237
+ onError: this.config.onError
8238
+ });
8239
+ this._isReady = true;
8240
+ logger15.info("A2E model loaded", {
8241
+ backend: info.backend,
8242
+ loadTimeMs: info.loadTimeMs,
8243
+ modelId: this.a2e.modelId
8244
+ });
8245
+ this.config.onReady?.();
8246
+ }
8247
+ /**
8248
+ * Start mic capture and inference loop
8249
+ */
8250
+ async start() {
8251
+ if (this.disposed) throw new Error("A2EOrchestrator has been disposed");
8252
+ if (!this._isReady || !this.processor) throw new Error("Model not loaded. Call load() first.");
8253
+ if (this._isStreaming) return;
8254
+ try {
8255
+ this.stream = await navigator.mediaDevices.getUserMedia({
8256
+ audio: {
8257
+ sampleRate: { ideal: this.config.sampleRate },
8258
+ channelCount: 1,
8259
+ echoCancellation: true,
8260
+ noiseSuppression: true,
8261
+ autoGainControl: true
8262
+ }
8263
+ });
8264
+ this.audioContext = new AudioContext({ sampleRate: this.config.sampleRate });
8265
+ if (this.audioContext.state === "suspended") {
8266
+ await this.audioContext.resume();
8267
+ }
8268
+ this.nativeSampleRate = this.audioContext.sampleRate;
8269
+ const source = this.audioContext.createMediaStreamSource(this.stream);
8270
+ this.scriptProcessor = this.audioContext.createScriptProcessor(4096, 1, 1);
8271
+ this.scriptProcessor.onaudioprocess = (e) => {
8272
+ if (!this._isStreaming || !this.processor) return;
8273
+ const input = e.inputBuffer.getChannelData(0);
8274
+ let samples;
8275
+ if (this.nativeSampleRate !== this.config.sampleRate) {
8276
+ const ratio = this.config.sampleRate / this.nativeSampleRate;
8277
+ const newLen = Math.round(input.length * ratio);
8278
+ samples = new Float32Array(newLen);
8279
+ for (let i = 0; i < newLen; i++) {
8280
+ const srcIdx = i / ratio;
8281
+ const lo = Math.floor(srcIdx);
8282
+ const hi = Math.min(lo + 1, input.length - 1);
8283
+ const frac = srcIdx - lo;
8284
+ samples[i] = input[lo] * (1 - frac) + input[hi] * frac;
8285
+ }
8286
+ } else {
8287
+ samples = new Float32Array(input);
8288
+ }
8289
+ this.processor.pushAudio(samples);
8290
+ };
8291
+ source.connect(this.scriptProcessor);
8292
+ this.scriptProcessor.connect(this.audioContext.destination);
8293
+ this._isStreaming = true;
8294
+ this.processor.startDrip();
8295
+ logger15.info("Mic capture started", { sampleRate: this.nativeSampleRate });
8296
+ } catch (err) {
8297
+ const error = err instanceof Error ? err : new Error(String(err));
8298
+ logger15.error("Failed to start mic capture", { error: error.message });
8299
+ this.config.onError?.(error);
8300
+ throw error;
8301
+ }
8302
+ }
8303
+ /**
8304
+ * Stop mic capture and inference loop
8305
+ */
8306
+ stop() {
8307
+ this._isStreaming = false;
8308
+ if (this.processor) {
8309
+ this.processor.stopDrip();
8310
+ this.processor.reset();
8311
+ }
8312
+ if (this.scriptProcessor) {
8313
+ this.scriptProcessor.disconnect();
8314
+ this.scriptProcessor.onaudioprocess = null;
8315
+ this.scriptProcessor = null;
8316
+ }
8317
+ if (this.stream) {
8318
+ this.stream.getTracks().forEach((t) => t.stop());
8319
+ this.stream = null;
8320
+ }
8321
+ if (this.audioContext) {
8322
+ this.audioContext.close().catch(() => {
8323
+ });
8324
+ this.audioContext = null;
8325
+ }
8326
+ logger15.info("Mic capture stopped");
8327
+ }
8328
+ /**
8329
+ * Dispose of all resources
8330
+ */
8331
+ async dispose() {
8332
+ if (this.disposed) return;
8333
+ this.disposed = true;
8334
+ this.stop();
8335
+ if (this.processor) {
8336
+ this.processor.dispose();
8337
+ this.processor = null;
8338
+ }
8339
+ if (this.a2e) {
8340
+ await this.a2e.dispose();
8341
+ this.a2e = null;
8342
+ }
8343
+ this._isReady = false;
8344
+ this._backend = null;
8345
+ }
8346
+ };
8347
+
8496
8348
  // src/inference/SafariSpeechRecognition.ts
8497
- var logger14 = createLogger("SafariSpeech");
8349
+ var logger16 = createLogger("SafariSpeech");
8498
8350
  var SafariSpeechRecognition = class _SafariSpeechRecognition {
8499
8351
  constructor(config = {}) {
8500
8352
  this.recognition = null;
@@ -8513,7 +8365,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
8513
8365
  interimResults: config.interimResults ?? true,
8514
8366
  maxAlternatives: config.maxAlternatives ?? 1
8515
8367
  };
8516
- logger14.debug("SafariSpeechRecognition created", {
8368
+ logger16.debug("SafariSpeechRecognition created", {
8517
8369
  language: this.config.language,
8518
8370
  continuous: this.config.continuous
8519
8371
  });
@@ -8574,7 +8426,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
8574
8426
  */
8575
8427
  async start() {
8576
8428
  if (this.isListening) {
8577
- logger14.warn("Already listening");
8429
+ logger16.warn("Already listening");
8578
8430
  return;
8579
8431
  }
8580
8432
  if (!_SafariSpeechRecognition.isAvailable()) {
@@ -8604,7 +8456,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
8604
8456
  this.isListening = true;
8605
8457
  this.startTime = performance.now();
8606
8458
  this.accumulatedText = "";
8607
- logger14.info("Speech recognition started", {
8459
+ logger16.info("Speech recognition started", {
8608
8460
  language: this.config.language
8609
8461
  });
8610
8462
  span?.end();
@@ -8619,7 +8471,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
8619
8471
  */
8620
8472
  async stop() {
8621
8473
  if (!this.isListening || !this.recognition) {
8622
- logger14.warn("Not currently listening");
8474
+ logger16.warn("Not currently listening");
8623
8475
  return {
8624
8476
  text: this.accumulatedText,
8625
8477
  language: this.config.language,
@@ -8648,7 +8500,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
8648
8500
  if (this.recognition && this.isListening) {
8649
8501
  this.recognition.abort();
8650
8502
  this.isListening = false;
8651
- logger14.info("Speech recognition aborted");
8503
+ logger16.info("Speech recognition aborted");
8652
8504
  }
8653
8505
  }
8654
8506
  /**
@@ -8679,7 +8531,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
8679
8531
  this.isListening = false;
8680
8532
  this.resultCallbacks = [];
8681
8533
  this.errorCallbacks = [];
8682
- logger14.debug("SafariSpeechRecognition disposed");
8534
+ logger16.debug("SafariSpeechRecognition disposed");
8683
8535
  }
8684
8536
  /**
8685
8537
  * Set up event handlers for the recognition instance
@@ -8707,7 +8559,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
8707
8559
  confidence: alternative.confidence
8708
8560
  };
8709
8561
  this.emitResult(speechResult);
8710
- logger14.trace("Speech result", {
8562
+ logger16.trace("Speech result", {
8711
8563
  text: text.substring(0, 50),
8712
8564
  isFinal,
8713
8565
  confidence: alternative.confidence
@@ -8717,12 +8569,12 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
8717
8569
  span?.end();
8718
8570
  } catch (error) {
8719
8571
  span?.endWithError(error instanceof Error ? error : new Error(String(error)));
8720
- logger14.error("Error processing speech result", { error });
8572
+ logger16.error("Error processing speech result", { error });
8721
8573
  }
8722
8574
  };
8723
8575
  this.recognition.onerror = (event) => {
8724
8576
  const error = new Error(`Speech recognition error: ${event.error} - ${event.message}`);
8725
- logger14.error("Speech recognition error", { error: event.error, message: event.message });
8577
+ logger16.error("Speech recognition error", { error: event.error, message: event.message });
8726
8578
  this.emitError(error);
8727
8579
  if (this.stopRejecter) {
8728
8580
  this.stopRejecter(error);
@@ -8732,7 +8584,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
8732
8584
  };
8733
8585
  this.recognition.onend = () => {
8734
8586
  this.isListening = false;
8735
- logger14.info("Speech recognition ended", {
8587
+ logger16.info("Speech recognition ended", {
8736
8588
  totalText: this.accumulatedText.length,
8737
8589
  durationMs: performance.now() - this.startTime
8738
8590
  });
@@ -8749,13 +8601,13 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
8749
8601
  }
8750
8602
  };
8751
8603
  this.recognition.onstart = () => {
8752
- logger14.debug("Speech recognition started by browser");
8604
+ logger16.debug("Speech recognition started by browser");
8753
8605
  };
8754
8606
  this.recognition.onspeechstart = () => {
8755
- logger14.debug("Speech detected");
8607
+ logger16.debug("Speech detected");
8756
8608
  };
8757
8609
  this.recognition.onspeechend = () => {
8758
- logger14.debug("Speech ended");
8610
+ logger16.debug("Speech ended");
8759
8611
  };
8760
8612
  }
8761
8613
  /**
@@ -8766,7 +8618,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
8766
8618
  try {
8767
8619
  callback(result);
8768
8620
  } catch (error) {
8769
- logger14.error("Error in result callback", { error });
8621
+ logger16.error("Error in result callback", { error });
8770
8622
  }
8771
8623
  }
8772
8624
  }
@@ -8778,7 +8630,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
8778
8630
  try {
8779
8631
  callback(error);
8780
8632
  } catch (callbackError) {
8781
- logger14.error("Error in error callback", { error: callbackError });
8633
+ logger16.error("Error in error callback", { error: callbackError });
8782
8634
  }
8783
8635
  }
8784
8636
  }
@@ -9191,13 +9043,14 @@ var AgentCoreAdapter = class extends EventEmitter {
9191
9043
  if (!this.lam) {
9192
9044
  throw new Error("LAM must be initialized before pipeline");
9193
9045
  }
9194
- this.pipeline = new SyncedAudioPipeline({
9046
+ this.pipeline = new FullFacePipeline({
9195
9047
  lam: this.lam,
9196
9048
  sampleRate: 16e3,
9197
9049
  chunkTargetMs: 200
9198
9050
  });
9199
9051
  await this.pipeline.initialize();
9200
- this.pipeline.on("frame_ready", (frame) => {
9052
+ this.pipeline.on("full_frame_ready", (fullFrame) => {
9053
+ const frame = fullFrame.blendshapes;
9201
9054
  this.emit("animation", {
9202
9055
  blendshapes: frame,
9203
9056
  get: (name) => {
@@ -9376,9 +9229,9 @@ var AgentCoreAdapter = class extends EventEmitter {
9376
9229
  });
9377
9230
  }
9378
9231
  }
9379
- // REMOVED: processAudioForAnimation() - now handled by SyncedAudioPipeline
9232
+ // REMOVED: processAudioForAnimation() - now handled by FullFacePipeline
9380
9233
  // The pipeline manages audio scheduling, LAM inference, and frame synchronization
9381
- // Frames are emitted via pipeline.on('frame_ready') event (see initPipeline())
9234
+ // Frames are emitted via pipeline.on('full_frame_ready') event (see initPipeline())
9382
9235
  /**
9383
9236
  * Detect voice activity using Silero VAD
9384
9237
  * Falls back to simple RMS if VAD not available
@@ -11189,6 +11042,8 @@ function isProtocolEvent(obj) {
11189
11042
  return typeof obj === "object" && obj !== null && "v" in obj && "type" in obj && "ts" in obj;
11190
11043
  }
11191
11044
  export {
11045
+ A2EOrchestrator,
11046
+ A2EProcessor,
11192
11047
  ARKIT_BLENDSHAPES,
11193
11048
  AgentCoreAdapter,
11194
11049
  AnimationGraph,
@@ -11196,23 +11051,22 @@ export {
11196
11051
  AudioEnergyAnalyzer,
11197
11052
  AudioScheduler,
11198
11053
  AudioSyncManager,
11054
+ BLENDSHAPE_TO_GROUP,
11055
+ BlendshapeSmoother,
11199
11056
  CTC_VOCAB,
11200
11057
  ConsoleExporter,
11201
11058
  ConversationOrchestrator,
11202
11059
  DEFAULT_ANIMATION_CONFIG,
11203
11060
  DEFAULT_LOGGING_CONFIG,
11204
- EMOTION_ARKIT_MAP,
11205
11061
  EMOTION_NAMES,
11206
11062
  EMOTION_VECTOR_SIZE,
11207
11063
  EmotionController,
11208
11064
  EmotionPresets,
11209
- EmotionToBlendshapeMapper,
11210
11065
  EmphasisDetector,
11211
11066
  EventEmitter,
11212
11067
  FullFacePipeline,
11213
11068
  INFERENCE_LATENCY_BUCKETS,
11214
11069
  InterruptionHandler,
11215
- LAMPipeline,
11216
11070
  LAM_BLENDSHAPES,
11217
11071
  LOG_LEVEL_PRIORITY,
11218
11072
  MODEL_LOAD_TIME_BUCKETS,
@@ -11231,73 +11085,54 @@ export {
11231
11085
  SileroVADInference,
11232
11086
  SileroVADUnifiedAdapter,
11233
11087
  SileroVADWorker,
11234
- SyncedAudioPipeline,
11235
11088
  TenantManager,
11236
- UPPER_FACE_BLENDSHAPES,
11237
11089
  UnifiedInferenceWorker,
11238
- WAV2ARKIT_BLENDSHAPES,
11239
11090
  Wav2ArkitCpuInference,
11240
11091
  Wav2ArkitCpuUnifiedAdapter,
11241
11092
  Wav2ArkitCpuWorker,
11242
11093
  Wav2Vec2Inference,
11243
- applyCMVN,
11244
- applyLFR,
11245
11094
  blendEmotions,
11246
11095
  calculatePeak,
11247
11096
  calculateRMS,
11248
- computeKaldiFbank,
11249
11097
  configureCacheLimit,
11250
11098
  configureLogging,
11251
11099
  configureTelemetry,
11100
+ createA2E,
11252
11101
  createEmotionVector,
11253
- createLipSync,
11254
11102
  createLogger,
11255
11103
  createSenseVoice,
11256
- createSessionWithFallback,
11257
11104
  createSileroVAD,
11258
- ctcGreedyDecode,
11259
11105
  fetchWithCache,
11260
11106
  formatBytes,
11261
11107
  getCacheConfig,
11262
11108
  getCacheKey,
11263
11109
  getEmotionPreset,
11264
- getLoadedBackend,
11265
11110
  getLoggingConfig,
11266
11111
  getModelCache,
11267
- getOnnxRuntime,
11268
- getOnnxRuntimeForPreference,
11269
11112
  getOptimalWasmThreads,
11270
11113
  getRecommendedBackend,
11271
- getSessionOptions,
11272
11114
  getTelemetry,
11273
11115
  hasWebGPUApi,
11274
11116
  isAndroid,
11275
11117
  isIOS,
11276
11118
  isIOSSafari,
11277
11119
  isMobile,
11278
- isOnnxRuntimeLoaded,
11279
11120
  isProtocolEvent,
11280
11121
  isSafari,
11281
11122
  isSpeechRecognitionAvailable,
11282
11123
  isWebGPUAvailable,
11124
+ lerpBlendshapes,
11283
11125
  lerpEmotion,
11284
11126
  noopLogger,
11285
- parseCMVNFromMetadata,
11286
- parseTokensFile,
11287
11127
  preloadModels,
11288
- preloadOnnxRuntime,
11289
- remapWav2ArkitToLam,
11290
11128
  resetLoggingConfig,
11291
11129
  resolveBackend,
11292
- resolveLanguageId,
11293
- resolveTextNormId,
11294
11130
  setLogLevel,
11295
11131
  setLoggingEnabled,
11296
11132
  shouldEnableWasmProxy,
11297
- shouldUseCpuLipSync,
11133
+ shouldUseCpuA2E,
11298
11134
  shouldUseNativeASR,
11299
- shouldUseServerLipSync,
11300
- supportsVADWorker,
11301
- symmetrizeBlendshapes
11135
+ shouldUseServerA2E,
11136
+ supportsVADWorker
11302
11137
  };
11303
11138
  //# sourceMappingURL=index.mjs.map