@omote/core 0.4.6 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +484 -867
- package/dist/index.d.ts +484 -867
- package/dist/index.js +1419 -1598
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +972 -1151
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.mjs
CHANGED
|
@@ -485,730 +485,353 @@ var AudioChunkCoalescer = class {
|
|
|
485
485
|
}
|
|
486
486
|
};
|
|
487
487
|
|
|
488
|
-
// src/
|
|
489
|
-
var
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
this.
|
|
495
|
-
// LAM outputs 30fps
|
|
496
|
-
this.buffer = new Float32Array(0);
|
|
488
|
+
// src/inference/A2EProcessor.ts
|
|
489
|
+
var logger = createLogger("A2EProcessor");
|
|
490
|
+
var FRAME_RATE = 30;
|
|
491
|
+
var DRIP_INTERVAL_MS = 33;
|
|
492
|
+
var A2EProcessor = class {
|
|
493
|
+
constructor(config) {
|
|
494
|
+
this.writeOffset = 0;
|
|
497
495
|
this.bufferStartTime = 0;
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
this.
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
496
|
+
// Frame queues (timestamped for pull mode, plain for drip mode)
|
|
497
|
+
this.timestampedQueue = [];
|
|
498
|
+
this.plainQueue = [];
|
|
499
|
+
// Push mode state
|
|
500
|
+
this._latestFrame = null;
|
|
501
|
+
this.dripInterval = null;
|
|
502
|
+
// Last-frame-hold for pull mode (prevents avatar freezing between frames)
|
|
503
|
+
this.lastPulledFrame = null;
|
|
504
|
+
// Inference serialization
|
|
505
|
+
this.inferenceRunning = false;
|
|
506
|
+
this.pendingChunks = [];
|
|
507
|
+
// Diagnostic: track getFrameForTime calls
|
|
508
|
+
this.getFrameCallCount = 0;
|
|
509
|
+
this.disposed = false;
|
|
510
|
+
this.backend = config.backend;
|
|
511
|
+
this.sampleRate = config.sampleRate ?? 16e3;
|
|
512
|
+
this.chunkSize = config.chunkSize ?? config.backend.chunkSize ?? 16e3;
|
|
513
|
+
this.onFrame = config.onFrame;
|
|
514
|
+
this.onError = config.onError;
|
|
515
|
+
this.bufferCapacity = this.chunkSize * 2;
|
|
516
|
+
this.buffer = new Float32Array(this.bufferCapacity);
|
|
517
|
+
}
|
|
518
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
519
|
+
// Audio Input
|
|
520
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
521
|
+
/**
|
|
522
|
+
* Push audio samples for inference (any source: mic, TTS, file).
|
|
507
523
|
*
|
|
508
|
-
*
|
|
509
|
-
*
|
|
524
|
+
* - With `timestamp`: frames stored with timestamps (pull mode)
|
|
525
|
+
* - Without `timestamp`: frames stored in plain queue (drip/push mode)
|
|
510
526
|
*
|
|
511
|
-
*
|
|
512
|
-
* @param timestamp - AudioContext time when these samples start playing
|
|
513
|
-
* @param lam - LAM inference engine
|
|
527
|
+
* Fire-and-forget: returns immediately, inference runs async.
|
|
514
528
|
*/
|
|
515
|
-
|
|
516
|
-
if (this.
|
|
529
|
+
pushAudio(samples, timestamp) {
|
|
530
|
+
if (this.disposed) return;
|
|
531
|
+
if (this.writeOffset === 0 && timestamp !== void 0) {
|
|
517
532
|
this.bufferStartTime = timestamp;
|
|
518
533
|
}
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
this.
|
|
540
|
-
const
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
this.
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
this.buffer = new Float32Array(0);
|
|
551
|
-
this.bufferStartTime = 0;
|
|
552
|
-
}
|
|
553
|
-
}
|
|
554
|
-
/**
|
|
555
|
-
* Get the frame that should be displayed at the current time
|
|
556
|
-
*
|
|
557
|
-
* Automatically removes frames that have already been displayed.
|
|
558
|
-
* This prevents memory leaks from accumulating old frames.
|
|
559
|
-
*
|
|
560
|
-
* Discard Window (prevents premature frame discarding):
|
|
561
|
-
* - WebGPU: 0.5s (LAM inference 20-100ms + RAF jitter + React stalls)
|
|
562
|
-
* - WASM: 1.0s (LAM inference 50-500ms + higher variability)
|
|
563
|
-
*
|
|
564
|
-
* Last-Frame-Hold: Returns last valid frame instead of null to prevent
|
|
565
|
-
* avatar freezing when between frames (RAF at 60fps vs LAM at 30fps).
|
|
566
|
-
*
|
|
567
|
-
* @param currentTime - Current AudioContext time
|
|
568
|
-
* @param lam - LAM inference engine (optional, for backend detection)
|
|
569
|
-
* @returns Current frame, or last frame as fallback, or null if no frames yet
|
|
570
|
-
*/
|
|
571
|
-
getFrameForTime(currentTime, lam) {
|
|
572
|
-
const discardWindow = lam?.backend === "wasm" ? 1 : 0.5;
|
|
573
|
-
let discardedCount = 0;
|
|
574
|
-
while (this.frameQueue.length > 0 && this.frameQueue[0].timestamp < currentTime - discardWindow) {
|
|
575
|
-
const discarded = this.frameQueue.shift();
|
|
576
|
-
discardedCount++;
|
|
577
|
-
if (discardedCount === 1) {
|
|
578
|
-
const ageMs = ((currentTime - discarded.timestamp) * 1e3).toFixed(0);
|
|
579
|
-
console.warn("[LAM] Frame(s) discarded as too old", {
|
|
580
|
-
ageMs,
|
|
581
|
-
discardWindowMs: discardWindow * 1e3,
|
|
582
|
-
queueLength: this.frameQueue.length,
|
|
583
|
-
backend: lam?.backend ?? "unknown"
|
|
584
|
-
});
|
|
585
|
-
}
|
|
586
|
-
}
|
|
587
|
-
if (this.frameQueue.length > 0 && this.frameQueue[0].timestamp <= currentTime) {
|
|
588
|
-
const { frame } = this.frameQueue.shift();
|
|
589
|
-
this.lastFrame = frame;
|
|
590
|
-
return frame;
|
|
591
|
-
}
|
|
592
|
-
return this.lastFrame;
|
|
593
|
-
}
|
|
594
|
-
/**
|
|
595
|
-
* Get all frames in the queue (for debugging/monitoring)
|
|
596
|
-
*/
|
|
597
|
-
getQueuedFrames() {
|
|
598
|
-
return [...this.frameQueue];
|
|
599
|
-
}
|
|
600
|
-
/**
|
|
601
|
-
* Get current buffer fill level (0-1)
|
|
602
|
-
*/
|
|
603
|
-
get fillLevel() {
|
|
604
|
-
return Math.min(1, this.buffer.length / this.REQUIRED_SAMPLES);
|
|
605
|
-
}
|
|
606
|
-
/**
|
|
607
|
-
* Get number of frames queued
|
|
608
|
-
*/
|
|
609
|
-
get queuedFrameCount() {
|
|
610
|
-
return this.frameQueue.length;
|
|
611
|
-
}
|
|
612
|
-
/**
|
|
613
|
-
* Get buffered audio duration in seconds
|
|
614
|
-
*/
|
|
615
|
-
get bufferedDuration() {
|
|
616
|
-
return this.buffer.length / (this.options.sampleRate ?? 16e3);
|
|
617
|
-
}
|
|
618
|
-
/**
|
|
619
|
-
* Flush remaining buffered audio
|
|
620
|
-
*
|
|
621
|
-
* Processes any remaining audio in the buffer, even if less than REQUIRED_SAMPLES.
|
|
622
|
-
* This ensures the final audio chunk generates blendshape frames.
|
|
623
|
-
*
|
|
624
|
-
* Should be called when audio stream ends to prevent losing the last 0-1 seconds.
|
|
625
|
-
*
|
|
626
|
-
* @param lam - LAM inference engine
|
|
627
|
-
*/
|
|
628
|
-
async flush(lam) {
|
|
629
|
-
if (this.buffer.length === 0) {
|
|
630
|
-
return;
|
|
631
|
-
}
|
|
632
|
-
const padded = new Float32Array(this.REQUIRED_SAMPLES);
|
|
633
|
-
padded.set(this.buffer, 0);
|
|
634
|
-
const processedStartTime = this.bufferStartTime;
|
|
635
|
-
try {
|
|
636
|
-
const result = await lam.infer(padded);
|
|
637
|
-
const actualDuration = this.buffer.length / (this.options.sampleRate ?? 16e3);
|
|
638
|
-
const frameDuration = 1 / this.FRAME_RATE;
|
|
639
|
-
const actualFrameCount = Math.ceil(actualDuration * this.FRAME_RATE);
|
|
640
|
-
for (let i = 0; i < Math.min(actualFrameCount, result.blendshapes.length); i++) {
|
|
641
|
-
const frame = result.blendshapes[i];
|
|
642
|
-
const timestamp = processedStartTime + i * frameDuration;
|
|
643
|
-
this.frameQueue.push({ frame, timestamp });
|
|
534
|
+
if (this.writeOffset + samples.length > this.bufferCapacity) {
|
|
535
|
+
this.bufferCapacity = (this.writeOffset + samples.length) * 2;
|
|
536
|
+
const grown = new Float32Array(this.bufferCapacity);
|
|
537
|
+
grown.set(this.buffer.subarray(0, this.writeOffset));
|
|
538
|
+
this.buffer = grown;
|
|
539
|
+
}
|
|
540
|
+
this.buffer.set(samples, this.writeOffset);
|
|
541
|
+
this.writeOffset += samples.length;
|
|
542
|
+
logger.debug("pushAudio", {
|
|
543
|
+
samplesIn: samples.length,
|
|
544
|
+
writeOffset: this.writeOffset,
|
|
545
|
+
chunkSize: this.chunkSize,
|
|
546
|
+
willExtract: this.writeOffset >= this.chunkSize,
|
|
547
|
+
inferenceRunning: this.inferenceRunning,
|
|
548
|
+
pendingChunks: this.pendingChunks.length,
|
|
549
|
+
queuedFrames: this.timestampedQueue.length + this.plainQueue.length
|
|
550
|
+
});
|
|
551
|
+
while (this.writeOffset >= this.chunkSize) {
|
|
552
|
+
const chunk = this.buffer.slice(0, this.chunkSize);
|
|
553
|
+
this.buffer.copyWithin(0, this.chunkSize, this.writeOffset);
|
|
554
|
+
this.writeOffset -= this.chunkSize;
|
|
555
|
+
const chunkTimestamp = timestamp !== void 0 ? this.bufferStartTime : void 0;
|
|
556
|
+
this.pendingChunks.push({ chunk, timestamp: chunkTimestamp });
|
|
557
|
+
logger.info("Chunk queued for inference", {
|
|
558
|
+
chunkSize: chunk.length,
|
|
559
|
+
chunkTimestamp,
|
|
560
|
+
pendingChunks: this.pendingChunks.length,
|
|
561
|
+
remainderOffset: this.writeOffset
|
|
562
|
+
});
|
|
563
|
+
if (timestamp !== void 0) {
|
|
564
|
+
this.bufferStartTime += this.chunkSize / this.sampleRate;
|
|
644
565
|
}
|
|
645
|
-
this.buffer = new Float32Array(0);
|
|
646
|
-
this.bufferStartTime = 0;
|
|
647
|
-
this.options.onInference?.(Math.min(actualFrameCount, result.blendshapes.length));
|
|
648
|
-
} catch (error) {
|
|
649
|
-
this.options.onError?.(error);
|
|
650
|
-
this.buffer = new Float32Array(0);
|
|
651
|
-
this.bufferStartTime = 0;
|
|
652
566
|
}
|
|
567
|
+
this.drainPendingChunks();
|
|
653
568
|
}
|
|
654
569
|
/**
|
|
655
|
-
*
|
|
570
|
+
* Flush remaining buffered audio (pads to chunkSize).
|
|
571
|
+
* Call at end of stream to process final partial chunk.
|
|
656
572
|
*
|
|
657
|
-
*
|
|
658
|
-
*
|
|
659
|
-
*
|
|
660
|
-
*
|
|
573
|
+
* Routes through the serialized pendingChunks pipeline to maintain
|
|
574
|
+
* correct frame ordering. Without this, flush() could push frames
|
|
575
|
+
* with the latest timestamp to the queue before drainPendingChunks()
|
|
576
|
+
* finishes pushing frames with earlier timestamps — causing
|
|
577
|
+
* getFrameForTime() to see out-of-order timestamps and stall.
|
|
661
578
|
*/
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
579
|
+
async flush() {
|
|
580
|
+
if (this.disposed || this.writeOffset === 0) return;
|
|
581
|
+
const padded = new Float32Array(this.chunkSize);
|
|
582
|
+
padded.set(this.buffer.subarray(0, this.writeOffset), 0);
|
|
583
|
+
const chunkTimestamp = this.bufferStartTime > 0 ? this.bufferStartTime : void 0;
|
|
584
|
+
logger.info("flush: routing through drain pipeline", {
|
|
585
|
+
actualSamples: this.writeOffset,
|
|
586
|
+
chunkTimestamp: chunkTimestamp?.toFixed(3),
|
|
587
|
+
pendingChunks: this.pendingChunks.length,
|
|
588
|
+
inferenceRunning: this.inferenceRunning
|
|
589
|
+
});
|
|
590
|
+
this.writeOffset = 0;
|
|
591
|
+
this.bufferStartTime = 0;
|
|
592
|
+
this.pendingChunks.push({ chunk: padded, timestamp: chunkTimestamp });
|
|
593
|
+
this.drainPendingChunks();
|
|
666
594
|
}
|
|
667
595
|
/**
|
|
668
|
-
* Reset
|
|
596
|
+
* Reset buffer and frame queues
|
|
669
597
|
*/
|
|
670
598
|
reset() {
|
|
671
|
-
this.
|
|
599
|
+
this.writeOffset = 0;
|
|
672
600
|
this.bufferStartTime = 0;
|
|
673
|
-
this.
|
|
674
|
-
this.
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
return float32;
|
|
687
|
-
}
|
|
688
|
-
function int16ToFloat32(int16) {
|
|
689
|
-
const float32 = new Float32Array(int16.length);
|
|
690
|
-
for (let i = 0; i < int16.length; i++) {
|
|
691
|
-
float32[i] = int16[i] / 32768;
|
|
692
|
-
}
|
|
693
|
-
return float32;
|
|
694
|
-
}
|
|
695
|
-
|
|
696
|
-
// src/audio/SyncedAudioPipeline.ts
|
|
697
|
-
var SyncedAudioPipeline = class extends EventEmitter {
|
|
698
|
-
constructor(options) {
|
|
699
|
-
super();
|
|
700
|
-
this.options = options;
|
|
701
|
-
this.playbackStarted = false;
|
|
702
|
-
this.monitorInterval = null;
|
|
703
|
-
this.frameAnimationId = null;
|
|
704
|
-
const sampleRate = options.sampleRate ?? 16e3;
|
|
705
|
-
const autoDelay = options.lam.modelId === "wav2arkit_cpu" ? 750 : options.lam.backend === "wasm" ? 350 : 50;
|
|
706
|
-
const audioDelayMs = options.audioDelayMs ?? autoDelay;
|
|
707
|
-
this.scheduler = new AudioScheduler({
|
|
708
|
-
sampleRate,
|
|
709
|
-
initialLookaheadSec: audioDelayMs / 1e3
|
|
710
|
-
});
|
|
711
|
-
this.coalescer = new AudioChunkCoalescer({
|
|
712
|
-
sampleRate,
|
|
713
|
-
targetDurationMs: options.chunkTargetMs ?? 200
|
|
714
|
-
});
|
|
715
|
-
this.lamPipeline = new LAMPipeline({
|
|
716
|
-
sampleRate,
|
|
717
|
-
onError: (error) => {
|
|
718
|
-
this.emit("error", error);
|
|
719
|
-
}
|
|
720
|
-
});
|
|
721
|
-
}
|
|
722
|
-
/**
|
|
723
|
-
* Initialize the pipeline
|
|
724
|
-
*/
|
|
725
|
-
async initialize() {
|
|
726
|
-
await this.scheduler.initialize();
|
|
727
|
-
}
|
|
728
|
-
/**
|
|
729
|
-
* Start a new playback session
|
|
601
|
+
this.timestampedQueue = [];
|
|
602
|
+
this.plainQueue = [];
|
|
603
|
+
this._latestFrame = null;
|
|
604
|
+
this.lastPulledFrame = null;
|
|
605
|
+
this.pendingChunks = [];
|
|
606
|
+
this.inferenceRunning = false;
|
|
607
|
+
this.getFrameCallCount = 0;
|
|
608
|
+
}
|
|
609
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
610
|
+
// Frame Output — Pull Mode (TTS playback)
|
|
611
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
612
|
+
/**
|
|
613
|
+
* Get frame synced to external clock (e.g. AudioContext.currentTime).
|
|
730
614
|
*
|
|
731
|
-
*
|
|
732
|
-
*
|
|
733
|
-
*/
|
|
734
|
-
start() {
|
|
735
|
-
this.stopMonitoring();
|
|
736
|
-
this.scheduler.reset();
|
|
737
|
-
this.coalescer.reset();
|
|
738
|
-
this.lamPipeline.reset();
|
|
739
|
-
this.playbackStarted = false;
|
|
740
|
-
this.scheduler.warmup();
|
|
741
|
-
this.startFrameLoop();
|
|
742
|
-
this.startMonitoring();
|
|
743
|
-
}
|
|
744
|
-
/**
|
|
745
|
-
* Receive audio chunk from network
|
|
746
|
-
*
|
|
747
|
-
* Audio-first design: schedules audio immediately, LAM runs in background.
|
|
748
|
-
* This prevents LAM inference (50-300ms) from blocking audio scheduling,
|
|
749
|
-
* which caused audible stuttering with continuous audio streams.
|
|
615
|
+
* Discards frames that are too old, returns the current frame,
|
|
616
|
+
* or holds last frame as fallback to prevent avatar freezing.
|
|
750
617
|
*
|
|
751
|
-
* @param
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
618
|
+
* @param currentTime - Current playback time (seconds)
|
|
619
|
+
* @returns Blendshape frame, or null if no frames yet
|
|
620
|
+
*/
|
|
621
|
+
getFrameForTime(currentTime) {
|
|
622
|
+
this.getFrameCallCount++;
|
|
623
|
+
const discardWindow = this.backend.backend === "wasm" ? 1 : 0.5;
|
|
624
|
+
let discardCount = 0;
|
|
625
|
+
while (this.timestampedQueue.length > 0 && this.timestampedQueue[0].timestamp < currentTime - discardWindow) {
|
|
626
|
+
this.timestampedQueue.shift();
|
|
627
|
+
discardCount++;
|
|
628
|
+
}
|
|
629
|
+
if (discardCount > 0) {
|
|
630
|
+
logger.warn("getFrameForTime DISCARDED stale frames", {
|
|
631
|
+
discardCount,
|
|
632
|
+
currentTime: currentTime.toFixed(3),
|
|
633
|
+
discardWindow,
|
|
634
|
+
remainingFrames: this.timestampedQueue.length,
|
|
635
|
+
nextFrameTs: this.timestampedQueue.length > 0 ? this.timestampedQueue[0].timestamp.toFixed(3) : "none"
|
|
636
|
+
});
|
|
757
637
|
}
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
this.emit("playback_start", scheduleTime);
|
|
638
|
+
if (this.timestampedQueue.length > 0 && this.timestampedQueue[0].timestamp <= currentTime) {
|
|
639
|
+
const { frame } = this.timestampedQueue.shift();
|
|
640
|
+
this.lastPulledFrame = frame;
|
|
641
|
+
return frame;
|
|
763
642
|
}
|
|
764
|
-
this.
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
*/
|
|
773
|
-
async end() {
|
|
774
|
-
const remaining = this.coalescer.flush();
|
|
775
|
-
if (remaining) {
|
|
776
|
-
const chunk = new Uint8Array(remaining);
|
|
777
|
-
await this.onAudioChunk(chunk);
|
|
643
|
+
if (this.timestampedQueue.length > 0 && this.getFrameCallCount % 60 === 0) {
|
|
644
|
+
logger.warn("getFrameForTime: frames in queue but NOT consumable", {
|
|
645
|
+
queueLen: this.timestampedQueue.length,
|
|
646
|
+
frontTimestamp: this.timestampedQueue[0].timestamp.toFixed(4),
|
|
647
|
+
currentTime: currentTime.toFixed(4),
|
|
648
|
+
delta: (this.timestampedQueue[0].timestamp - currentTime).toFixed(4),
|
|
649
|
+
callCount: this.getFrameCallCount
|
|
650
|
+
});
|
|
778
651
|
}
|
|
779
|
-
|
|
652
|
+
return this.lastPulledFrame;
|
|
780
653
|
}
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
* - Clears all buffers and queues
|
|
788
|
-
* - Emits 'playback_complete' event
|
|
789
|
-
*
|
|
790
|
-
* Use this for interruptions (e.g., user barge-in during AI speech).
|
|
791
|
-
*
|
|
792
|
-
* @param fadeOutMs - Fade-out duration in milliseconds (default: 50ms)
|
|
793
|
-
* @returns Promise that resolves when fade-out completes
|
|
794
|
-
*/
|
|
795
|
-
async stop(fadeOutMs = 50) {
|
|
796
|
-
this.stopMonitoring();
|
|
797
|
-
await this.scheduler.cancelAll(fadeOutMs);
|
|
798
|
-
this.coalescer.reset();
|
|
799
|
-
this.lamPipeline.reset();
|
|
800
|
-
this.playbackStarted = false;
|
|
801
|
-
this.emit("playback_complete", void 0);
|
|
654
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
655
|
+
// Frame Output — Push Mode (live mic, game loop)
|
|
656
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
657
|
+
/** Latest frame from drip-feed (live mic, game loop) */
|
|
658
|
+
get latestFrame() {
|
|
659
|
+
return this._latestFrame;
|
|
802
660
|
}
|
|
803
|
-
/**
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
*
|
|
809
|
-
* Frame Emission Strategy:
|
|
810
|
-
* - LAMPipeline uses last-frame-hold to prevent null returns
|
|
811
|
-
* - Always emit frames (even repeated frames) to maintain smooth animation
|
|
812
|
-
* - Renderer is responsible for detecting duplicate frames if needed
|
|
813
|
-
*/
|
|
814
|
-
startFrameLoop() {
|
|
815
|
-
const updateFrame = () => {
|
|
816
|
-
const currentTime = this.scheduler.getCurrentTime();
|
|
817
|
-
const frame = this.lamPipeline.getFrameForTime(currentTime, this.options.lam);
|
|
661
|
+
/** Start 30fps drip-feed timer (push mode) */
|
|
662
|
+
startDrip() {
|
|
663
|
+
if (this.dripInterval) return;
|
|
664
|
+
this.dripInterval = setInterval(() => {
|
|
665
|
+
const frame = this.plainQueue.shift();
|
|
818
666
|
if (frame) {
|
|
819
|
-
this.
|
|
667
|
+
this._latestFrame = frame;
|
|
668
|
+
this.onFrame?.(frame);
|
|
820
669
|
}
|
|
821
|
-
|
|
822
|
-
};
|
|
823
|
-
this.frameAnimationId = requestAnimationFrame(updateFrame);
|
|
670
|
+
}, DRIP_INTERVAL_MS);
|
|
824
671
|
}
|
|
825
|
-
/**
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
clearInterval(this.monitorInterval);
|
|
672
|
+
/** Stop drip-feed timer */
|
|
673
|
+
stopDrip() {
|
|
674
|
+
if (this.dripInterval) {
|
|
675
|
+
clearInterval(this.dripInterval);
|
|
676
|
+
this.dripInterval = null;
|
|
831
677
|
}
|
|
832
|
-
this.monitorInterval = window.setInterval(() => {
|
|
833
|
-
if (this.scheduler.isComplete() && this.lamPipeline.queuedFrameCount === 0) {
|
|
834
|
-
this.emit("playback_complete", void 0);
|
|
835
|
-
this.stopMonitoring();
|
|
836
|
-
}
|
|
837
|
-
}, 100);
|
|
838
678
|
}
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
this.monitorInterval = null;
|
|
846
|
-
}
|
|
847
|
-
if (this.frameAnimationId) {
|
|
848
|
-
cancelAnimationFrame(this.frameAnimationId);
|
|
849
|
-
this.frameAnimationId = null;
|
|
850
|
-
}
|
|
679
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
680
|
+
// State
|
|
681
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
682
|
+
/** Number of frames waiting in queue (both modes combined) */
|
|
683
|
+
get queuedFrameCount() {
|
|
684
|
+
return this.timestampedQueue.length + this.plainQueue.length;
|
|
851
685
|
}
|
|
852
|
-
/**
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
getState() {
|
|
856
|
-
return {
|
|
857
|
-
playbackStarted: this.playbackStarted,
|
|
858
|
-
coalescerFill: this.coalescer.fillLevel,
|
|
859
|
-
lamFill: this.lamPipeline.fillLevel,
|
|
860
|
-
queuedFrames: this.lamPipeline.queuedFrameCount,
|
|
861
|
-
currentTime: this.scheduler.getCurrentTime(),
|
|
862
|
-
playbackEndTime: this.scheduler.getPlaybackEndTime()
|
|
863
|
-
};
|
|
686
|
+
/** Buffer fill level as fraction of chunkSize (0-1) */
|
|
687
|
+
get fillLevel() {
|
|
688
|
+
return Math.min(1, this.writeOffset / this.chunkSize);
|
|
864
689
|
}
|
|
865
|
-
/**
|
|
866
|
-
* Cleanup resources
|
|
867
|
-
*/
|
|
690
|
+
/** Dispose resources */
|
|
868
691
|
dispose() {
|
|
869
|
-
this.
|
|
870
|
-
this.
|
|
871
|
-
this.
|
|
872
|
-
this.
|
|
873
|
-
}
|
|
874
|
-
};
|
|
875
|
-
|
|
876
|
-
// src/animation/EmotionToBlendshapeMapper.ts
|
|
877
|
-
var UPPER_FACE_BLENDSHAPES = [
|
|
878
|
-
// Brows (5)
|
|
879
|
-
"browDownLeft",
|
|
880
|
-
"browDownRight",
|
|
881
|
-
"browInnerUp",
|
|
882
|
-
"browOuterUpLeft",
|
|
883
|
-
"browOuterUpRight",
|
|
884
|
-
// Eyes (4)
|
|
885
|
-
"eyeSquintLeft",
|
|
886
|
-
"eyeSquintRight",
|
|
887
|
-
"eyeWideLeft",
|
|
888
|
-
"eyeWideRight",
|
|
889
|
-
// Cheeks (2)
|
|
890
|
-
"cheekSquintLeft",
|
|
891
|
-
"cheekSquintRight"
|
|
892
|
-
];
|
|
893
|
-
var EMOTION_ARKIT_MAP = {
|
|
894
|
-
happy: {
|
|
895
|
-
// AU6 - Cheek raiser (primary Duchenne smile marker)
|
|
896
|
-
cheekSquintLeft: 0.5,
|
|
897
|
-
cheekSquintRight: 0.5,
|
|
898
|
-
// Slight eye squint from genuine smile (orbicularis oculi activation)
|
|
899
|
-
eyeSquintLeft: 0.2,
|
|
900
|
-
eyeSquintRight: 0.2
|
|
901
|
-
},
|
|
902
|
-
angry: {
|
|
903
|
-
// AU4 - Brow lowerer (intense, primary anger marker)
|
|
904
|
-
browDownLeft: 0.7,
|
|
905
|
-
browDownRight: 0.7,
|
|
906
|
-
// AU5 - Upper lid raiser (wide eyes, part of the "glare")
|
|
907
|
-
eyeWideLeft: 0.4,
|
|
908
|
-
eyeWideRight: 0.4,
|
|
909
|
-
// AU7 - Lid tightener (tense stare, combines with AU5 for angry glare)
|
|
910
|
-
eyeSquintLeft: 0.3,
|
|
911
|
-
eyeSquintRight: 0.3
|
|
912
|
-
},
|
|
913
|
-
sad: {
|
|
914
|
-
// AU1 - Inner brow raiser (primary sadness marker)
|
|
915
|
-
browInnerUp: 0.6,
|
|
916
|
-
// AU4 - Brow lowerer (brows drawn together)
|
|
917
|
-
browDownLeft: 0.3,
|
|
918
|
-
browDownRight: 0.3
|
|
919
|
-
},
|
|
920
|
-
neutral: {}
|
|
921
|
-
// All zeros - no expression overlay
|
|
922
|
-
};
|
|
923
|
-
var DEFAULT_CONFIG = {
|
|
924
|
-
smoothingFactor: 0.15,
|
|
925
|
-
confidenceThreshold: 0.3,
|
|
926
|
-
intensity: 1,
|
|
927
|
-
blendMode: "dominant",
|
|
928
|
-
minBlendProbability: 0.1,
|
|
929
|
-
energyModulation: false,
|
|
930
|
-
minEnergyScale: 0.3,
|
|
931
|
-
maxEnergyScale: 1
|
|
932
|
-
};
|
|
933
|
-
function createZeroBlendshapes() {
|
|
934
|
-
const result = {};
|
|
935
|
-
for (const name of UPPER_FACE_BLENDSHAPES) {
|
|
936
|
-
result[name] = 0;
|
|
937
|
-
}
|
|
938
|
-
return result;
|
|
939
|
-
}
|
|
940
|
-
function clamp01(value) {
|
|
941
|
-
return Math.max(0, Math.min(1, value));
|
|
942
|
-
}
|
|
943
|
-
var EmotionToBlendshapeMapper = class {
|
|
944
|
-
/**
|
|
945
|
-
* Create a new EmotionToBlendshapeMapper
|
|
946
|
-
*
|
|
947
|
-
* @param config - Optional configuration
|
|
948
|
-
*/
|
|
949
|
-
constructor(config) {
|
|
950
|
-
this.currentEnergy = 1;
|
|
951
|
-
this.config = {
|
|
952
|
-
...DEFAULT_CONFIG,
|
|
953
|
-
...config
|
|
954
|
-
};
|
|
955
|
-
this.targetBlendshapes = createZeroBlendshapes();
|
|
956
|
-
this.currentBlendshapes = createZeroBlendshapes();
|
|
957
|
-
}
|
|
958
|
-
/**
|
|
959
|
-
* Map an emotion frame to target blendshapes
|
|
960
|
-
*
|
|
961
|
-
* This sets the target values that the mapper will smoothly interpolate
|
|
962
|
-
* towards. Call update() each frame to apply smoothing.
|
|
963
|
-
*
|
|
964
|
-
* @param frame - Emotion frame from Emotion2VecInference
|
|
965
|
-
* @param audioEnergy - Optional audio energy (0-1) for energy modulation
|
|
966
|
-
* @returns Target upper face blendshapes (before smoothing)
|
|
967
|
-
*/
|
|
968
|
-
mapFrame(frame, audioEnergy) {
|
|
969
|
-
this.targetBlendshapes = createZeroBlendshapes();
|
|
970
|
-
if (audioEnergy !== void 0) {
|
|
971
|
-
this.currentEnergy = clamp01(audioEnergy);
|
|
972
|
-
}
|
|
973
|
-
if (!frame) {
|
|
974
|
-
return { ...this.targetBlendshapes };
|
|
975
|
-
}
|
|
976
|
-
if (this.config.blendMode === "weighted") {
|
|
977
|
-
this.mapFrameWeighted(frame);
|
|
978
|
-
} else {
|
|
979
|
-
this.mapFrameDominant(frame);
|
|
980
|
-
}
|
|
981
|
-
if (this.config.energyModulation) {
|
|
982
|
-
this.applyEnergyModulation();
|
|
983
|
-
}
|
|
984
|
-
return { ...this.targetBlendshapes };
|
|
692
|
+
if (this.disposed) return;
|
|
693
|
+
this.disposed = true;
|
|
694
|
+
this.stopDrip();
|
|
695
|
+
this.reset();
|
|
985
696
|
}
|
|
697
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
698
|
+
// Private
|
|
699
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
986
700
|
/**
|
|
987
|
-
*
|
|
701
|
+
* Process pending chunks sequentially.
|
|
702
|
+
* Fire-and-forget — called from pushAudio() without awaiting.
|
|
988
703
|
*/
|
|
989
|
-
|
|
990
|
-
if (
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
|
|
995
|
-
if (!mapping) {
|
|
996
|
-
return;
|
|
997
|
-
}
|
|
998
|
-
const scale = this.config.intensity * frame.confidence;
|
|
999
|
-
for (const [name, value] of Object.entries(mapping)) {
|
|
1000
|
-
const blendshapeName = name;
|
|
1001
|
-
if (value !== void 0) {
|
|
1002
|
-
this.targetBlendshapes[blendshapeName] = clamp01(value * scale);
|
|
704
|
+
drainPendingChunks() {
|
|
705
|
+
if (this.inferenceRunning || this.pendingChunks.length === 0) {
|
|
706
|
+
if (this.inferenceRunning && this.pendingChunks.length > 0) {
|
|
707
|
+
logger.debug("drainPendingChunks skipped (inference running)", {
|
|
708
|
+
pendingChunks: this.pendingChunks.length
|
|
709
|
+
});
|
|
1003
710
|
}
|
|
1004
|
-
}
|
|
1005
|
-
}
|
|
1006
|
-
/**
|
|
1007
|
-
* Map using weighted blend of all emotions by probability
|
|
1008
|
-
* Creates more nuanced expressions (e.g., bittersweet = happy + sad)
|
|
1009
|
-
*/
|
|
1010
|
-
mapFrameWeighted(frame) {
|
|
1011
|
-
if (!frame.probabilities) {
|
|
1012
|
-
this.mapFrameDominant(frame);
|
|
1013
711
|
return;
|
|
1014
712
|
}
|
|
1015
|
-
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
713
|
+
this.inferenceRunning = true;
|
|
714
|
+
logger.info("drainPendingChunks starting", { pendingChunks: this.pendingChunks.length });
|
|
715
|
+
const processNext = async () => {
|
|
716
|
+
while (this.pendingChunks.length > 0 && !this.disposed) {
|
|
717
|
+
const { chunk, timestamp } = this.pendingChunks.shift();
|
|
718
|
+
try {
|
|
719
|
+
const t0 = performance.now();
|
|
720
|
+
const result = await this.backend.infer(chunk);
|
|
721
|
+
const inferMs = Math.round(performance.now() - t0);
|
|
722
|
+
const actualDuration = chunk.length / this.sampleRate;
|
|
723
|
+
const actualFrameCount = Math.ceil(actualDuration * FRAME_RATE);
|
|
724
|
+
const framesToQueue = Math.min(actualFrameCount, result.blendshapes.length);
|
|
725
|
+
logger.info("Inference complete", {
|
|
726
|
+
inferMs,
|
|
727
|
+
modelFrames: result.blendshapes.length,
|
|
728
|
+
framesToQueue,
|
|
729
|
+
timestamp,
|
|
730
|
+
totalQueued: this.timestampedQueue.length + framesToQueue,
|
|
731
|
+
remainingPending: this.pendingChunks.length
|
|
732
|
+
});
|
|
733
|
+
for (let i = 0; i < framesToQueue; i++) {
|
|
734
|
+
if (timestamp !== void 0) {
|
|
735
|
+
this.timestampedQueue.push({
|
|
736
|
+
frame: result.blendshapes[i],
|
|
737
|
+
timestamp: timestamp + i / FRAME_RATE
|
|
738
|
+
});
|
|
739
|
+
} else {
|
|
740
|
+
this.plainQueue.push(result.blendshapes[i]);
|
|
741
|
+
}
|
|
742
|
+
}
|
|
743
|
+
} catch (err) {
|
|
744
|
+
this.handleError(err);
|
|
745
|
+
}
|
|
746
|
+
if (this.pendingChunks.length > 0) {
|
|
747
|
+
await new Promise((r) => setTimeout(r, 0));
|
|
1028
748
|
}
|
|
1029
749
|
}
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
}
|
|
1035
|
-
/**
|
|
1036
|
-
* Apply energy modulation to scale emotion intensity by audio energy
|
|
1037
|
-
* Louder speech = stronger expressions
|
|
1038
|
-
*/
|
|
1039
|
-
applyEnergyModulation() {
|
|
1040
|
-
const { minEnergyScale, maxEnergyScale } = this.config;
|
|
1041
|
-
const energyScale = minEnergyScale + this.currentEnergy * (maxEnergyScale - minEnergyScale);
|
|
1042
|
-
for (const name of UPPER_FACE_BLENDSHAPES) {
|
|
1043
|
-
this.targetBlendshapes[name] = clamp01(this.targetBlendshapes[name] * energyScale);
|
|
1044
|
-
}
|
|
1045
|
-
}
|
|
1046
|
-
/**
|
|
1047
|
-
* Apply smoothing to interpolate current values towards target
|
|
1048
|
-
*
|
|
1049
|
-
* Uses exponential moving average:
|
|
1050
|
-
* current = current + smoothingFactor * (target - current)
|
|
1051
|
-
*
|
|
1052
|
-
* @param _deltaMs - Delta time in milliseconds (reserved for future time-based smoothing)
|
|
1053
|
-
*/
|
|
1054
|
-
update(_deltaMs) {
|
|
1055
|
-
const factor = this.config.smoothingFactor;
|
|
1056
|
-
for (const name of UPPER_FACE_BLENDSHAPES) {
|
|
1057
|
-
const target = this.targetBlendshapes[name];
|
|
1058
|
-
const current = this.currentBlendshapes[name];
|
|
1059
|
-
this.currentBlendshapes[name] = clamp01(current + factor * (target - current));
|
|
1060
|
-
}
|
|
1061
|
-
}
|
|
1062
|
-
/**
|
|
1063
|
-
* Get current smoothed blendshape values
|
|
1064
|
-
*
|
|
1065
|
-
* @returns Current upper face blendshapes (after smoothing)
|
|
1066
|
-
*/
|
|
1067
|
-
getCurrentBlendshapes() {
|
|
1068
|
-
return { ...this.currentBlendshapes };
|
|
1069
|
-
}
|
|
1070
|
-
/**
|
|
1071
|
-
* Reset mapper to neutral state
|
|
1072
|
-
*
|
|
1073
|
-
* Sets both target and current blendshapes to zero.
|
|
1074
|
-
*/
|
|
1075
|
-
reset() {
|
|
1076
|
-
this.targetBlendshapes = createZeroBlendshapes();
|
|
1077
|
-
this.currentBlendshapes = createZeroBlendshapes();
|
|
1078
|
-
this.currentEnergy = 1;
|
|
1079
|
-
}
|
|
1080
|
-
/**
|
|
1081
|
-
* Get current configuration
|
|
1082
|
-
*/
|
|
1083
|
-
getConfig() {
|
|
1084
|
-
return { ...this.config };
|
|
1085
|
-
}
|
|
1086
|
-
/**
|
|
1087
|
-
* Update configuration
|
|
1088
|
-
*
|
|
1089
|
-
* @param config - Partial configuration to update
|
|
1090
|
-
*/
|
|
1091
|
-
setConfig(config) {
|
|
1092
|
-
this.config = {
|
|
1093
|
-
...this.config,
|
|
1094
|
-
...config
|
|
750
|
+
this.inferenceRunning = false;
|
|
751
|
+
if (this.pendingChunks.length > 0) {
|
|
752
|
+
this.drainPendingChunks();
|
|
753
|
+
}
|
|
1095
754
|
};
|
|
755
|
+
processNext().catch((err) => this.handleError(err));
|
|
756
|
+
}
|
|
757
|
+
handleError(err) {
|
|
758
|
+
const error = err instanceof Error ? err : new Error(String(err));
|
|
759
|
+
logger.warn("A2EProcessor inference error", { error: error.message });
|
|
760
|
+
this.onError?.(error);
|
|
1096
761
|
}
|
|
1097
762
|
};
|
|
1098
763
|
|
|
1099
|
-
// src/
|
|
1100
|
-
|
|
1101
|
-
|
|
1102
|
-
|
|
1103
|
-
|
|
1104
|
-
|
|
764
|
+
// src/inference/BlendshapeSmoother.ts
|
|
765
|
+
var NUM_BLENDSHAPES = 52;
|
|
766
|
+
var BlendshapeSmoother = class {
|
|
767
|
+
constructor(config) {
|
|
768
|
+
/** Whether any target has been set */
|
|
769
|
+
this._hasTarget = false;
|
|
770
|
+
this.halflife = config?.halflife ?? 0.06;
|
|
771
|
+
this.values = new Float32Array(NUM_BLENDSHAPES);
|
|
772
|
+
this.velocities = new Float32Array(NUM_BLENDSHAPES);
|
|
773
|
+
this.targets = new Float32Array(NUM_BLENDSHAPES);
|
|
1105
774
|
}
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
|
|
1109
|
-
let peak = 0;
|
|
1110
|
-
for (let i = 0; i < samples.length; i++) {
|
|
1111
|
-
const abs = Math.abs(samples[i]);
|
|
1112
|
-
if (abs > peak) peak = abs;
|
|
775
|
+
/** Whether a target frame has been set (false until first setTarget call) */
|
|
776
|
+
get hasTarget() {
|
|
777
|
+
return this._hasTarget;
|
|
1113
778
|
}
|
|
1114
|
-
return peak;
|
|
1115
|
-
}
|
|
1116
|
-
var AudioEnergyAnalyzer = class {
|
|
1117
779
|
/**
|
|
1118
|
-
*
|
|
1119
|
-
*
|
|
780
|
+
* Set new target frame from inference output.
|
|
781
|
+
* Springs will converge toward these values on subsequent update() calls.
|
|
1120
782
|
*/
|
|
1121
|
-
|
|
1122
|
-
this.
|
|
1123
|
-
this.
|
|
1124
|
-
this.smoothingFactor = Math.max(0, Math.min(0.99, smoothingFactor));
|
|
1125
|
-
this.noiseFloor = noiseFloor;
|
|
783
|
+
setTarget(frame) {
|
|
784
|
+
this.targets.set(frame);
|
|
785
|
+
this._hasTarget = true;
|
|
1126
786
|
}
|
|
1127
787
|
/**
|
|
1128
|
-
*
|
|
1129
|
-
*
|
|
1130
|
-
*
|
|
788
|
+
* Advance all 52 springs by `dt` seconds and return the smoothed frame.
|
|
789
|
+
*
|
|
790
|
+
* Call this every render frame (e.g., inside requestAnimationFrame).
|
|
791
|
+
* Returns the internal values buffer — do NOT mutate the returned array.
|
|
792
|
+
*
|
|
793
|
+
* @param dt - Time step in seconds (e.g., 1/60 for 60fps)
|
|
794
|
+
* @returns Smoothed blendshape values (Float32Array of 52)
|
|
1131
795
|
*/
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
const gatedRMS = instantRMS > this.noiseFloor ? instantRMS : 0;
|
|
1136
|
-
const gatedPeak = instantPeak > this.noiseFloor ? instantPeak : 0;
|
|
1137
|
-
if (gatedRMS > this.smoothedRMS) {
|
|
1138
|
-
this.smoothedRMS = this.smoothedRMS * 0.5 + gatedRMS * 0.5;
|
|
1139
|
-
} else {
|
|
1140
|
-
this.smoothedRMS = this.smoothedRMS * this.smoothingFactor + gatedRMS * (1 - this.smoothingFactor);
|
|
796
|
+
update(dt) {
|
|
797
|
+
if (!this._hasTarget) {
|
|
798
|
+
return this.values;
|
|
1141
799
|
}
|
|
1142
|
-
if (
|
|
1143
|
-
this.
|
|
1144
|
-
|
|
1145
|
-
|
|
800
|
+
if (this.halflife <= 0) {
|
|
801
|
+
this.values.set(this.targets);
|
|
802
|
+
this.velocities.fill(0);
|
|
803
|
+
return this.values;
|
|
1146
804
|
}
|
|
1147
|
-
const
|
|
1148
|
-
|
|
1149
|
-
|
|
1150
|
-
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
*/
|
|
1158
|
-
reset() {
|
|
1159
|
-
this.smoothedRMS = 0;
|
|
1160
|
-
this.smoothedPeak = 0;
|
|
1161
|
-
}
|
|
1162
|
-
/**
|
|
1163
|
-
* Get current smoothed RMS value
|
|
1164
|
-
*/
|
|
1165
|
-
get rms() {
|
|
1166
|
-
return this.smoothedRMS;
|
|
1167
|
-
}
|
|
1168
|
-
/**
|
|
1169
|
-
* Get current smoothed peak value
|
|
1170
|
-
*/
|
|
1171
|
-
get peak() {
|
|
1172
|
-
return this.smoothedPeak;
|
|
1173
|
-
}
|
|
1174
|
-
};
|
|
1175
|
-
var EmphasisDetector = class {
|
|
1176
|
-
/**
|
|
1177
|
-
* @param historySize Number of frames to track. Default 10
|
|
1178
|
-
* @param emphasisThreshold Minimum energy increase to count as emphasis. Default 0.15
|
|
1179
|
-
*/
|
|
1180
|
-
constructor(historySize = 10, emphasisThreshold = 0.15) {
|
|
1181
|
-
this.energyHistory = [];
|
|
1182
|
-
this.historySize = historySize;
|
|
1183
|
-
this.emphasisThreshold = emphasisThreshold;
|
|
805
|
+
const damping = Math.LN2 / this.halflife;
|
|
806
|
+
const eydt = Math.exp(-damping * dt);
|
|
807
|
+
for (let i = 0; i < NUM_BLENDSHAPES; i++) {
|
|
808
|
+
const j0 = this.values[i] - this.targets[i];
|
|
809
|
+
const j1 = this.velocities[i] + j0 * damping;
|
|
810
|
+
this.values[i] = eydt * (j0 + j1 * dt) + this.targets[i];
|
|
811
|
+
this.velocities[i] = eydt * (this.velocities[i] - j1 * damping * dt);
|
|
812
|
+
this.values[i] = Math.max(0, Math.min(1, this.values[i]));
|
|
813
|
+
}
|
|
814
|
+
return this.values;
|
|
1184
815
|
}
|
|
1185
816
|
/**
|
|
1186
|
-
*
|
|
1187
|
-
*
|
|
1188
|
-
*
|
|
817
|
+
* Decay all spring targets to neutral (0).
|
|
818
|
+
*
|
|
819
|
+
* Call when inference stalls (no new frames for threshold duration).
|
|
820
|
+
* The springs will smoothly close the mouth / relax the face over
|
|
821
|
+
* the halflife period rather than freezing.
|
|
1189
822
|
*/
|
|
1190
|
-
|
|
1191
|
-
this.
|
|
1192
|
-
if (this.energyHistory.length > this.historySize) {
|
|
1193
|
-
this.energyHistory.shift();
|
|
1194
|
-
}
|
|
1195
|
-
if (this.energyHistory.length < 3) {
|
|
1196
|
-
return { isEmphasis: false, emphasisStrength: 0 };
|
|
1197
|
-
}
|
|
1198
|
-
const prevFrames = this.energyHistory.slice(0, -1);
|
|
1199
|
-
const avgPrev = prevFrames.reduce((a, b) => a + b, 0) / prevFrames.length;
|
|
1200
|
-
const increase = energy - avgPrev;
|
|
1201
|
-
const isEmphasis = increase > this.emphasisThreshold;
|
|
1202
|
-
return {
|
|
1203
|
-
isEmphasis,
|
|
1204
|
-
emphasisStrength: isEmphasis ? Math.min(1, increase / 0.3) : 0
|
|
1205
|
-
};
|
|
823
|
+
decayToNeutral() {
|
|
824
|
+
this.targets.fill(0);
|
|
1206
825
|
}
|
|
1207
826
|
/**
|
|
1208
|
-
* Reset
|
|
827
|
+
* Reset all state (values, velocities, targets).
|
|
828
|
+
* Call when starting a new playback session.
|
|
1209
829
|
*/
|
|
1210
830
|
reset() {
|
|
1211
|
-
this.
|
|
831
|
+
this.values.fill(0);
|
|
832
|
+
this.velocities.fill(0);
|
|
833
|
+
this.targets.fill(0);
|
|
834
|
+
this._hasTarget = false;
|
|
1212
835
|
}
|
|
1213
836
|
};
|
|
1214
837
|
|
|
@@ -2461,7 +2084,7 @@ function isSafari() {
|
|
|
2461
2084
|
const ua = navigator.userAgent.toLowerCase();
|
|
2462
2085
|
return /safari/.test(ua) && !/chrome|crios|fxios|chromium|edg/.test(ua);
|
|
2463
2086
|
}
|
|
2464
|
-
function
|
|
2087
|
+
function shouldUseCpuA2E() {
|
|
2465
2088
|
return isSafari() || isIOS();
|
|
2466
2089
|
}
|
|
2467
2090
|
function isSpeechRecognitionAvailable() {
|
|
@@ -2471,22 +2094,22 @@ function isSpeechRecognitionAvailable() {
|
|
|
2471
2094
|
function shouldUseNativeASR() {
|
|
2472
2095
|
return (isIOS() || isSafari()) && isSpeechRecognitionAvailable();
|
|
2473
2096
|
}
|
|
2474
|
-
function
|
|
2097
|
+
function shouldUseServerA2E() {
|
|
2475
2098
|
return isIOS();
|
|
2476
2099
|
}
|
|
2477
2100
|
|
|
2478
2101
|
// src/inference/onnxLoader.ts
|
|
2479
|
-
var
|
|
2102
|
+
var logger2 = createLogger("OnnxLoader");
|
|
2480
2103
|
var ortInstance = null;
|
|
2481
2104
|
var loadedBackend = null;
|
|
2482
2105
|
var WASM_CDN_PATH = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
2483
2106
|
async function isWebGPUAvailable() {
|
|
2484
2107
|
if (isIOS()) {
|
|
2485
|
-
|
|
2108
|
+
logger2.debug("WebGPU check: disabled on iOS (asyncify bundle crashes WebKit)");
|
|
2486
2109
|
return false;
|
|
2487
2110
|
}
|
|
2488
2111
|
if (!hasWebGPUApi()) {
|
|
2489
|
-
|
|
2112
|
+
logger2.debug("WebGPU check: navigator.gpu not available", {
|
|
2490
2113
|
isSecureContext: typeof window !== "undefined" ? window.isSecureContext : "N/A"
|
|
2491
2114
|
});
|
|
2492
2115
|
return false;
|
|
@@ -2494,19 +2117,19 @@ async function isWebGPUAvailable() {
|
|
|
2494
2117
|
try {
|
|
2495
2118
|
const adapter = await navigator.gpu.requestAdapter();
|
|
2496
2119
|
if (!adapter) {
|
|
2497
|
-
|
|
2120
|
+
logger2.debug("WebGPU check: No adapter available");
|
|
2498
2121
|
return false;
|
|
2499
2122
|
}
|
|
2500
2123
|
const device = await adapter.requestDevice();
|
|
2501
2124
|
if (!device) {
|
|
2502
|
-
|
|
2125
|
+
logger2.debug("WebGPU check: Could not create device");
|
|
2503
2126
|
return false;
|
|
2504
2127
|
}
|
|
2505
2128
|
device.destroy();
|
|
2506
|
-
|
|
2129
|
+
logger2.debug("WebGPU check: Available and working");
|
|
2507
2130
|
return true;
|
|
2508
2131
|
} catch (err) {
|
|
2509
|
-
|
|
2132
|
+
logger2.debug("WebGPU check: Error during availability check", { error: err });
|
|
2510
2133
|
return false;
|
|
2511
2134
|
}
|
|
2512
2135
|
}
|
|
@@ -2516,11 +2139,11 @@ function applyIOSWasmMemoryPatch() {
|
|
|
2516
2139
|
iosWasmPatched = true;
|
|
2517
2140
|
const OrigMemory = WebAssembly.Memory;
|
|
2518
2141
|
const MAX_IOS_PAGES = 32768;
|
|
2519
|
-
|
|
2142
|
+
logger2.info("Applying iOS WASM memory patch (max\u21922GB, shared preserved)");
|
|
2520
2143
|
WebAssembly.Memory = function IOSPatchedMemory(descriptor) {
|
|
2521
2144
|
const patched = { ...descriptor };
|
|
2522
2145
|
if (patched.maximum !== void 0 && patched.maximum > MAX_IOS_PAGES) {
|
|
2523
|
-
|
|
2146
|
+
logger2.info("iOS memory patch: capping maximum", {
|
|
2524
2147
|
original: patched.maximum,
|
|
2525
2148
|
capped: MAX_IOS_PAGES,
|
|
2526
2149
|
shared: patched.shared,
|
|
@@ -2539,7 +2162,7 @@ function configureWasm(ort) {
|
|
|
2539
2162
|
ort.env.wasm.numThreads = numThreads;
|
|
2540
2163
|
ort.env.wasm.simd = true;
|
|
2541
2164
|
ort.env.wasm.proxy = enableProxy;
|
|
2542
|
-
|
|
2165
|
+
logger2.info("WASM configured", {
|
|
2543
2166
|
numThreads,
|
|
2544
2167
|
simd: true,
|
|
2545
2168
|
proxy: enableProxy,
|
|
@@ -2551,12 +2174,12 @@ async function getOnnxRuntime(backend) {
|
|
|
2551
2174
|
return ortInstance;
|
|
2552
2175
|
}
|
|
2553
2176
|
if (ortInstance && loadedBackend !== backend) {
|
|
2554
|
-
|
|
2177
|
+
logger2.warn(
|
|
2555
2178
|
`ONNX Runtime already loaded with ${loadedBackend} backend. Cannot switch to ${backend}. Returning existing instance.`
|
|
2556
2179
|
);
|
|
2557
2180
|
return ortInstance;
|
|
2558
2181
|
}
|
|
2559
|
-
|
|
2182
|
+
logger2.info(`Loading ONNX Runtime with ${backend} backend...`);
|
|
2560
2183
|
applyIOSWasmMemoryPatch();
|
|
2561
2184
|
try {
|
|
2562
2185
|
if (backend === "wasm" && (isIOS() || isSafari())) {
|
|
@@ -2571,10 +2194,10 @@ async function getOnnxRuntime(backend) {
|
|
|
2571
2194
|
}
|
|
2572
2195
|
loadedBackend = backend;
|
|
2573
2196
|
configureWasm(ortInstance);
|
|
2574
|
-
|
|
2197
|
+
logger2.info(`ONNX Runtime loaded successfully`, { backend });
|
|
2575
2198
|
return ortInstance;
|
|
2576
2199
|
} catch (err) {
|
|
2577
|
-
|
|
2200
|
+
logger2.error(`Failed to load ONNX Runtime with ${backend} backend`, {
|
|
2578
2201
|
error: err
|
|
2579
2202
|
});
|
|
2580
2203
|
throw new Error(
|
|
@@ -2585,7 +2208,7 @@ async function getOnnxRuntime(backend) {
|
|
|
2585
2208
|
async function getOnnxRuntimeForPreference(preference = "auto") {
|
|
2586
2209
|
const webgpuAvailable = await isWebGPUAvailable();
|
|
2587
2210
|
const backend = resolveBackend(preference, webgpuAvailable);
|
|
2588
|
-
|
|
2211
|
+
logger2.info("Resolved backend preference", {
|
|
2589
2212
|
preference,
|
|
2590
2213
|
webgpuAvailable,
|
|
2591
2214
|
resolvedBackend: backend
|
|
@@ -2619,42 +2242,6 @@ function getSessionOptions(backend) {
|
|
|
2619
2242
|
graphOptimizationLevel: "all"
|
|
2620
2243
|
};
|
|
2621
2244
|
}
|
|
2622
|
-
async function createSessionWithFallback(modelBuffer, preferredBackend) {
|
|
2623
|
-
const ort = await getOnnxRuntime(preferredBackend);
|
|
2624
|
-
const modelData = new Uint8Array(modelBuffer);
|
|
2625
|
-
if (preferredBackend === "webgpu") {
|
|
2626
|
-
try {
|
|
2627
|
-
const options2 = getSessionOptions("webgpu");
|
|
2628
|
-
const session2 = await ort.InferenceSession.create(modelData, options2);
|
|
2629
|
-
logger.info("Session created with WebGPU backend");
|
|
2630
|
-
return { session: session2, backend: "webgpu" };
|
|
2631
|
-
} catch (err) {
|
|
2632
|
-
logger.warn("WebGPU session creation failed, falling back to WASM", {
|
|
2633
|
-
error: err instanceof Error ? err.message : String(err)
|
|
2634
|
-
});
|
|
2635
|
-
}
|
|
2636
|
-
}
|
|
2637
|
-
const options = getSessionOptions("wasm");
|
|
2638
|
-
const session = await ort.InferenceSession.create(modelData, options);
|
|
2639
|
-
logger.info("Session created with WASM backend");
|
|
2640
|
-
return { session, backend: "wasm" };
|
|
2641
|
-
}
|
|
2642
|
-
function getLoadedBackend() {
|
|
2643
|
-
return loadedBackend;
|
|
2644
|
-
}
|
|
2645
|
-
function isOnnxRuntimeLoaded() {
|
|
2646
|
-
return ortInstance !== null;
|
|
2647
|
-
}
|
|
2648
|
-
async function preloadOnnxRuntime(preference = "auto") {
|
|
2649
|
-
if (ortInstance) {
|
|
2650
|
-
logger.info("ONNX Runtime already preloaded", { backend: loadedBackend });
|
|
2651
|
-
return loadedBackend;
|
|
2652
|
-
}
|
|
2653
|
-
logger.info("Preloading ONNX Runtime...", { preference });
|
|
2654
|
-
const { backend } = await getOnnxRuntimeForPreference(preference);
|
|
2655
|
-
logger.info("ONNX Runtime preloaded", { backend });
|
|
2656
|
-
return backend;
|
|
2657
|
-
}
|
|
2658
2245
|
|
|
2659
2246
|
// src/inference/blendshapeUtils.ts
|
|
2660
2247
|
var LAM_BLENDSHAPES = [
|
|
@@ -2804,16 +2391,19 @@ var WAV2ARKIT_BLENDSHAPES = [
|
|
|
2804
2391
|
var REMAP_WAV2ARKIT_TO_LAM = WAV2ARKIT_BLENDSHAPES.map(
|
|
2805
2392
|
(name) => LAM_BLENDSHAPES.indexOf(name)
|
|
2806
2393
|
);
|
|
2807
|
-
function
|
|
2808
|
-
const
|
|
2809
|
-
|
|
2810
|
-
|
|
2394
|
+
function lerpBlendshapes(current, target, factor = 0.3) {
|
|
2395
|
+
const len = Math.max(current.length, target.length);
|
|
2396
|
+
const result = new Array(len);
|
|
2397
|
+
for (let i = 0; i < len; i++) {
|
|
2398
|
+
const c = current[i] ?? 0;
|
|
2399
|
+
const t = target[i] ?? 0;
|
|
2400
|
+
result[i] = c + (t - c) * factor;
|
|
2811
2401
|
}
|
|
2812
2402
|
return result;
|
|
2813
2403
|
}
|
|
2814
2404
|
|
|
2815
2405
|
// src/inference/Wav2Vec2Inference.ts
|
|
2816
|
-
var
|
|
2406
|
+
var logger3 = createLogger("Wav2Vec2");
|
|
2817
2407
|
var CTC_VOCAB = [
|
|
2818
2408
|
"<pad>",
|
|
2819
2409
|
"<s>",
|
|
@@ -2863,6 +2453,7 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
2863
2453
|
this.poisoned = false;
|
|
2864
2454
|
this.config = config;
|
|
2865
2455
|
this.numIdentityClasses = config.numIdentityClasses ?? 12;
|
|
2456
|
+
this.chunkSize = config.chunkSize ?? 16e3;
|
|
2866
2457
|
}
|
|
2867
2458
|
get backend() {
|
|
2868
2459
|
return this.session ? this._backend : null;
|
|
@@ -2892,30 +2483,30 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
2892
2483
|
"model.backend_requested": this.config.backend || "auto"
|
|
2893
2484
|
});
|
|
2894
2485
|
try {
|
|
2895
|
-
|
|
2486
|
+
logger3.info("Loading ONNX Runtime...", { preference: this.config.backend || "auto" });
|
|
2896
2487
|
const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend || "auto");
|
|
2897
2488
|
this.ort = ort;
|
|
2898
2489
|
this._backend = backend;
|
|
2899
|
-
|
|
2490
|
+
logger3.info("ONNX Runtime loaded", { backend: this._backend });
|
|
2900
2491
|
const modelUrl = this.config.modelUrl;
|
|
2901
2492
|
const dataUrl = this.config.externalDataUrl !== false ? typeof this.config.externalDataUrl === "string" ? this.config.externalDataUrl : `${modelUrl}.data` : null;
|
|
2902
2493
|
const sessionOptions = getSessionOptions(this._backend);
|
|
2903
2494
|
let isCached = false;
|
|
2904
2495
|
if (isIOS()) {
|
|
2905
|
-
|
|
2496
|
+
logger3.info("iOS: passing model URLs directly to ORT (low-memory path)", {
|
|
2906
2497
|
modelUrl,
|
|
2907
2498
|
dataUrl
|
|
2908
2499
|
});
|
|
2909
2500
|
if (dataUrl) {
|
|
2910
2501
|
const dataFilename = dataUrl.split("/").pop();
|
|
2911
|
-
|
|
2502
|
+
logger3.info("iOS: setting externalData", { dataFilename, dataUrl });
|
|
2912
2503
|
sessionOptions.externalData = [{
|
|
2913
2504
|
path: dataFilename,
|
|
2914
2505
|
data: dataUrl
|
|
2915
2506
|
// URL string — ORT fetches directly into WASM
|
|
2916
2507
|
}];
|
|
2917
2508
|
}
|
|
2918
|
-
|
|
2509
|
+
logger3.info("iOS: calling InferenceSession.create() with URL string", {
|
|
2919
2510
|
modelUrl,
|
|
2920
2511
|
sessionOptions: JSON.stringify(
|
|
2921
2512
|
sessionOptions,
|
|
@@ -2925,14 +2516,14 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
2925
2516
|
try {
|
|
2926
2517
|
this.session = await this.ort.InferenceSession.create(modelUrl, sessionOptions);
|
|
2927
2518
|
} catch (sessionErr) {
|
|
2928
|
-
|
|
2519
|
+
logger3.error("iOS: InferenceSession.create() failed", {
|
|
2929
2520
|
error: sessionErr instanceof Error ? sessionErr.message : String(sessionErr),
|
|
2930
2521
|
errorType: sessionErr?.constructor?.name,
|
|
2931
2522
|
stack: sessionErr instanceof Error ? sessionErr.stack : void 0
|
|
2932
2523
|
});
|
|
2933
2524
|
throw sessionErr;
|
|
2934
2525
|
}
|
|
2935
|
-
|
|
2526
|
+
logger3.info("iOS: session created successfully", {
|
|
2936
2527
|
inputNames: this.session.inputNames,
|
|
2937
2528
|
outputNames: this.session.outputNames
|
|
2938
2529
|
});
|
|
@@ -2941,15 +2532,15 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
2941
2532
|
isCached = await cache.has(modelUrl);
|
|
2942
2533
|
let modelBuffer;
|
|
2943
2534
|
if (isCached) {
|
|
2944
|
-
|
|
2535
|
+
logger3.debug("Loading model from cache", { modelUrl });
|
|
2945
2536
|
modelBuffer = await cache.get(modelUrl);
|
|
2946
2537
|
if (!modelBuffer) {
|
|
2947
|
-
|
|
2538
|
+
logger3.warn("Cache corruption detected, clearing and retrying", { modelUrl });
|
|
2948
2539
|
await cache.delete(modelUrl);
|
|
2949
2540
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
2950
2541
|
}
|
|
2951
2542
|
} else {
|
|
2952
|
-
|
|
2543
|
+
logger3.debug("Fetching and caching model", { modelUrl });
|
|
2953
2544
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
2954
2545
|
}
|
|
2955
2546
|
if (!modelBuffer) {
|
|
@@ -2960,31 +2551,31 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
2960
2551
|
try {
|
|
2961
2552
|
const isDataCached = await cache.has(dataUrl);
|
|
2962
2553
|
if (isDataCached) {
|
|
2963
|
-
|
|
2554
|
+
logger3.debug("Loading external data from cache", { dataUrl });
|
|
2964
2555
|
externalDataBuffer = await cache.get(dataUrl);
|
|
2965
2556
|
if (!externalDataBuffer) {
|
|
2966
|
-
|
|
2557
|
+
logger3.warn("Cache corruption for external data, retrying", { dataUrl });
|
|
2967
2558
|
await cache.delete(dataUrl);
|
|
2968
2559
|
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
2969
2560
|
}
|
|
2970
2561
|
} else {
|
|
2971
|
-
|
|
2562
|
+
logger3.info("Fetching external model data", {
|
|
2972
2563
|
dataUrl,
|
|
2973
2564
|
note: "This may be a large download (383MB+)"
|
|
2974
2565
|
});
|
|
2975
2566
|
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
2976
2567
|
}
|
|
2977
|
-
|
|
2568
|
+
logger3.info("External data loaded", {
|
|
2978
2569
|
size: formatBytes(externalDataBuffer.byteLength)
|
|
2979
2570
|
});
|
|
2980
2571
|
} catch (err) {
|
|
2981
|
-
|
|
2572
|
+
logger3.debug("No external data file found (single-file model)", {
|
|
2982
2573
|
dataUrl,
|
|
2983
2574
|
error: err.message
|
|
2984
2575
|
});
|
|
2985
2576
|
}
|
|
2986
2577
|
}
|
|
2987
|
-
|
|
2578
|
+
logger3.debug("Creating ONNX session", {
|
|
2988
2579
|
graphSize: formatBytes(modelBuffer.byteLength),
|
|
2989
2580
|
externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
|
|
2990
2581
|
backend: this._backend
|
|
@@ -2999,12 +2590,12 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
2999
2590
|
const modelData = new Uint8Array(modelBuffer);
|
|
3000
2591
|
this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
|
|
3001
2592
|
}
|
|
3002
|
-
|
|
2593
|
+
logger3.info("ONNX session created successfully", {
|
|
3003
2594
|
executionProvider: this._backend,
|
|
3004
2595
|
backend: this._backend
|
|
3005
2596
|
});
|
|
3006
2597
|
const loadTimeMs = performance.now() - startTime;
|
|
3007
|
-
|
|
2598
|
+
logger3.info("Model loaded successfully", {
|
|
3008
2599
|
backend: this._backend,
|
|
3009
2600
|
loadTimeMs: Math.round(loadTimeMs),
|
|
3010
2601
|
inputs: this.session.inputNames,
|
|
@@ -3020,13 +2611,13 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3020
2611
|
model: "wav2vec2",
|
|
3021
2612
|
backend: this._backend
|
|
3022
2613
|
});
|
|
3023
|
-
|
|
2614
|
+
logger3.debug("Running warmup inference to initialize GPU context");
|
|
3024
2615
|
const warmupStart = performance.now();
|
|
3025
|
-
const warmupAudio = new Float32Array(
|
|
2616
|
+
const warmupAudio = new Float32Array(this.chunkSize);
|
|
3026
2617
|
const warmupIdentity = new Float32Array(this.numIdentityClasses);
|
|
3027
2618
|
warmupIdentity[0] = 1;
|
|
3028
2619
|
const warmupFeeds = {
|
|
3029
|
-
"audio": new this.ort.Tensor("float32", warmupAudio, [1,
|
|
2620
|
+
"audio": new this.ort.Tensor("float32", warmupAudio, [1, this.chunkSize]),
|
|
3030
2621
|
"identity": new this.ort.Tensor("float32", warmupIdentity, [1, this.numIdentityClasses])
|
|
3031
2622
|
};
|
|
3032
2623
|
const WARMUP_TIMEOUT_MS = 15e3;
|
|
@@ -3036,12 +2627,12 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3036
2627
|
]);
|
|
3037
2628
|
const warmupTimeMs = performance.now() - warmupStart;
|
|
3038
2629
|
if (warmupResult === "timeout") {
|
|
3039
|
-
|
|
2630
|
+
logger3.warn("Warmup inference timed out \u2014 GPU may be unresponsive. Continuing without warmup.", {
|
|
3040
2631
|
timeoutMs: WARMUP_TIMEOUT_MS,
|
|
3041
2632
|
backend: this._backend
|
|
3042
2633
|
});
|
|
3043
2634
|
} else {
|
|
3044
|
-
|
|
2635
|
+
logger3.info("Warmup inference complete", {
|
|
3045
2636
|
warmupTimeMs: Math.round(warmupTimeMs),
|
|
3046
2637
|
backend: this._backend
|
|
3047
2638
|
});
|
|
@@ -3069,11 +2660,10 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3069
2660
|
}
|
|
3070
2661
|
/**
|
|
3071
2662
|
* Run inference on raw audio
|
|
3072
|
-
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
2663
|
+
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
3073
2664
|
* @param identityIndex - Optional identity index (0-11, default 0 = neutral)
|
|
3074
2665
|
*
|
|
3075
|
-
*
|
|
3076
|
-
* Audio will be zero-padded or truncated to 16000 samples.
|
|
2666
|
+
* Audio will be zero-padded or truncated to chunkSize samples.
|
|
3077
2667
|
*/
|
|
3078
2668
|
async infer(audioSamples, identityIndex = 0) {
|
|
3079
2669
|
if (!this.session) {
|
|
@@ -3084,20 +2674,20 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3084
2674
|
}
|
|
3085
2675
|
const audioSamplesCopy = new Float32Array(audioSamples);
|
|
3086
2676
|
let audio;
|
|
3087
|
-
if (audioSamplesCopy.length ===
|
|
2677
|
+
if (audioSamplesCopy.length === this.chunkSize) {
|
|
3088
2678
|
audio = audioSamplesCopy;
|
|
3089
|
-
} else if (audioSamplesCopy.length <
|
|
3090
|
-
audio = new Float32Array(
|
|
2679
|
+
} else if (audioSamplesCopy.length < this.chunkSize) {
|
|
2680
|
+
audio = new Float32Array(this.chunkSize);
|
|
3091
2681
|
audio.set(audioSamplesCopy, 0);
|
|
3092
2682
|
} else {
|
|
3093
|
-
audio = audioSamplesCopy.slice(0,
|
|
2683
|
+
audio = audioSamplesCopy.slice(0, this.chunkSize);
|
|
3094
2684
|
}
|
|
3095
2685
|
const identity = new Float32Array(this.numIdentityClasses);
|
|
3096
|
-
identity[Math.min(identityIndex, this.numIdentityClasses - 1)] = 1;
|
|
2686
|
+
identity[Math.max(0, Math.min(identityIndex, this.numIdentityClasses - 1))] = 1;
|
|
3097
2687
|
const audioCopy = new Float32Array(audio);
|
|
3098
2688
|
const identityCopy = new Float32Array(identity);
|
|
3099
2689
|
const feeds = {
|
|
3100
|
-
"audio": new this.ort.Tensor("float32", audioCopy, [1,
|
|
2690
|
+
"audio": new this.ort.Tensor("float32", audioCopy, [1, this.chunkSize]),
|
|
3101
2691
|
"identity": new this.ort.Tensor("float32", identityCopy, [1, this.numIdentityClasses])
|
|
3102
2692
|
};
|
|
3103
2693
|
return this.queueInference(feeds);
|
|
@@ -3133,7 +2723,7 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3133
2723
|
const telemetry = getTelemetry();
|
|
3134
2724
|
const span = telemetry?.startSpan("Wav2Vec2.infer", {
|
|
3135
2725
|
"inference.backend": this._backend,
|
|
3136
|
-
"inference.input_samples":
|
|
2726
|
+
"inference.input_samples": this.chunkSize
|
|
3137
2727
|
});
|
|
3138
2728
|
try {
|
|
3139
2729
|
const startTime = performance.now();
|
|
@@ -3172,7 +2762,7 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3172
2762
|
blendshapes.push(symmetrizeBlendshapes(rawFrame));
|
|
3173
2763
|
}
|
|
3174
2764
|
const text = this.decodeCTC(asrLogits);
|
|
3175
|
-
|
|
2765
|
+
logger3.trace("Inference completed", {
|
|
3176
2766
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
3177
2767
|
numA2EFrames,
|
|
3178
2768
|
numASRFrames,
|
|
@@ -3206,12 +2796,12 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3206
2796
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
3207
2797
|
if (errMsg.includes("timed out")) {
|
|
3208
2798
|
this.poisoned = true;
|
|
3209
|
-
|
|
2799
|
+
logger3.error("CRITICAL: Inference session timed out \u2014 LAM is dead. Page reload required.", {
|
|
3210
2800
|
backend: this._backend,
|
|
3211
2801
|
timeoutMs: _Wav2Vec2Inference.INFERENCE_TIMEOUT_MS
|
|
3212
2802
|
});
|
|
3213
2803
|
} else {
|
|
3214
|
-
|
|
2804
|
+
logger3.error("Inference failed", { error: errMsg, backend: this._backend });
|
|
3215
2805
|
}
|
|
3216
2806
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
3217
2807
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -3252,56 +2842,79 @@ _Wav2Vec2Inference.INFERENCE_TIMEOUT_MS = 5e3;
|
|
|
3252
2842
|
_Wav2Vec2Inference.isWebGPUAvailable = isWebGPUAvailable;
|
|
3253
2843
|
var Wav2Vec2Inference = _Wav2Vec2Inference;
|
|
3254
2844
|
|
|
2845
|
+
// src/audio/audioUtils.ts
|
|
2846
|
+
function pcm16ToFloat32(buffer) {
|
|
2847
|
+
const byteLen = buffer.byteLength & ~1;
|
|
2848
|
+
const int16 = byteLen === buffer.byteLength ? new Int16Array(buffer) : new Int16Array(buffer, 0, byteLen / 2);
|
|
2849
|
+
const float32 = new Float32Array(int16.length);
|
|
2850
|
+
for (let i = 0; i < int16.length; i++) {
|
|
2851
|
+
float32[i] = int16[i] / 32768;
|
|
2852
|
+
}
|
|
2853
|
+
return float32;
|
|
2854
|
+
}
|
|
2855
|
+
function int16ToFloat32(int16) {
|
|
2856
|
+
const float32 = new Float32Array(int16.length);
|
|
2857
|
+
for (let i = 0; i < int16.length; i++) {
|
|
2858
|
+
float32[i] = int16[i] / 32768;
|
|
2859
|
+
}
|
|
2860
|
+
return float32;
|
|
2861
|
+
}
|
|
2862
|
+
|
|
3255
2863
|
// src/audio/FullFacePipeline.ts
|
|
3256
|
-
var
|
|
3257
|
-
var
|
|
3258
|
-
|
|
3259
|
-
|
|
3260
|
-
|
|
3261
|
-
|
|
3262
|
-
|
|
3263
|
-
|
|
3264
|
-
|
|
3265
|
-
|
|
3266
|
-
|
|
3267
|
-
|
|
3268
|
-
|
|
3269
|
-
|
|
3270
|
-
|
|
3271
|
-
|
|
3272
|
-
|
|
3273
|
-
|
|
3274
|
-
|
|
3275
|
-
|
|
3276
|
-
disappointed: "sad",
|
|
3277
|
-
frustrated: "angry",
|
|
3278
|
-
irritated: "angry",
|
|
3279
|
-
furious: "angry",
|
|
3280
|
-
annoyed: "angry",
|
|
3281
|
-
// SenseVoice labels
|
|
3282
|
-
fearful: "sad",
|
|
3283
|
-
disgusted: "angry",
|
|
3284
|
-
surprised: "happy"
|
|
3285
|
-
};
|
|
3286
|
-
var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
2864
|
+
var logger4 = createLogger("FullFacePipeline");
|
|
2865
|
+
var BLENDSHAPE_TO_GROUP = /* @__PURE__ */ new Map();
|
|
2866
|
+
for (const name of LAM_BLENDSHAPES) {
|
|
2867
|
+
if (name.startsWith("eye")) {
|
|
2868
|
+
BLENDSHAPE_TO_GROUP.set(name, "eyes");
|
|
2869
|
+
} else if (name.startsWith("brow")) {
|
|
2870
|
+
BLENDSHAPE_TO_GROUP.set(name, "brows");
|
|
2871
|
+
} else if (name.startsWith("jaw")) {
|
|
2872
|
+
BLENDSHAPE_TO_GROUP.set(name, "jaw");
|
|
2873
|
+
} else if (name.startsWith("mouth")) {
|
|
2874
|
+
BLENDSHAPE_TO_GROUP.set(name, "mouth");
|
|
2875
|
+
} else if (name.startsWith("cheek")) {
|
|
2876
|
+
BLENDSHAPE_TO_GROUP.set(name, "cheeks");
|
|
2877
|
+
} else if (name.startsWith("nose")) {
|
|
2878
|
+
BLENDSHAPE_TO_GROUP.set(name, "nose");
|
|
2879
|
+
} else if (name.startsWith("tongue")) {
|
|
2880
|
+
BLENDSHAPE_TO_GROUP.set(name, "tongue");
|
|
2881
|
+
}
|
|
2882
|
+
}
|
|
2883
|
+
var FullFacePipeline = class extends EventEmitter {
|
|
3287
2884
|
constructor(options) {
|
|
3288
2885
|
super();
|
|
3289
2886
|
this.options = options;
|
|
3290
2887
|
this.playbackStarted = false;
|
|
3291
2888
|
this.monitorInterval = null;
|
|
3292
2889
|
this.frameAnimationId = null;
|
|
3293
|
-
// Emotion state
|
|
3294
|
-
this.lastEmotionFrame = null;
|
|
3295
|
-
this.currentAudioEnergy = 0;
|
|
3296
2890
|
// Stale frame detection
|
|
3297
2891
|
this.lastNewFrameTime = 0;
|
|
3298
2892
|
this.lastKnownLamFrame = null;
|
|
3299
2893
|
this.staleWarningEmitted = false;
|
|
2894
|
+
// Frame loop timing (for dt calculation)
|
|
2895
|
+
this.lastFrameLoopTime = 0;
|
|
2896
|
+
// Diagnostic logging counter
|
|
2897
|
+
this.frameLoopCount = 0;
|
|
3300
2898
|
const sampleRate = options.sampleRate ?? 16e3;
|
|
3301
|
-
this.
|
|
3302
|
-
this.
|
|
3303
|
-
|
|
2899
|
+
this.profile = options.profile ?? {};
|
|
2900
|
+
this.staleThresholdMs = options.staleThresholdMs ?? 2e3;
|
|
2901
|
+
this.smoother = new BlendshapeSmoother({
|
|
2902
|
+
halflife: options.smoothingHalflife ?? 0.06
|
|
2903
|
+
});
|
|
2904
|
+
const isCpuModel = options.lam.modelId === "wav2arkit_cpu";
|
|
2905
|
+
const chunkSize = options.chunkSize ?? options.lam.chunkSize ?? 16e3;
|
|
2906
|
+
const chunkAccumulationMs = chunkSize / sampleRate * 1e3;
|
|
2907
|
+
const inferenceEstimateMs = isCpuModel ? 300 : options.lam.backend === "wasm" ? 250 : 80;
|
|
2908
|
+
const marginMs = 100;
|
|
2909
|
+
const autoDelay = Math.ceil(chunkAccumulationMs + inferenceEstimateMs + marginMs);
|
|
3304
2910
|
const audioDelayMs = options.audioDelayMs ?? autoDelay;
|
|
2911
|
+
logger4.info("FullFacePipeline config", {
|
|
2912
|
+
chunkSize,
|
|
2913
|
+
audioDelayMs,
|
|
2914
|
+
autoDelay,
|
|
2915
|
+
backend: options.lam.backend,
|
|
2916
|
+
modelId: options.lam.modelId
|
|
2917
|
+
});
|
|
3305
2918
|
this.scheduler = new AudioScheduler({
|
|
3306
2919
|
sampleRate,
|
|
3307
2920
|
initialLookaheadSec: audioDelayMs / 1e3
|
|
@@ -3310,20 +2923,15 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3310
2923
|
sampleRate,
|
|
3311
2924
|
targetDurationMs: options.chunkTargetMs ?? 200
|
|
3312
2925
|
});
|
|
3313
|
-
this.
|
|
2926
|
+
this.processor = new A2EProcessor({
|
|
2927
|
+
backend: options.lam,
|
|
3314
2928
|
sampleRate,
|
|
2929
|
+
chunkSize,
|
|
3315
2930
|
onError: (error) => {
|
|
3316
|
-
|
|
2931
|
+
logger4.error("A2E inference error", { message: error.message, stack: error.stack });
|
|
3317
2932
|
this.emit("error", error);
|
|
3318
2933
|
}
|
|
3319
2934
|
});
|
|
3320
|
-
this.emotionMapper = new EmotionToBlendshapeMapper({
|
|
3321
|
-
smoothingFactor: 0.15,
|
|
3322
|
-
confidenceThreshold: 0.3,
|
|
3323
|
-
intensity: 1,
|
|
3324
|
-
energyModulation: true
|
|
3325
|
-
});
|
|
3326
|
-
this.energyAnalyzer = new AudioEnergyAnalyzer();
|
|
3327
2935
|
}
|
|
3328
2936
|
/**
|
|
3329
2937
|
* Initialize the pipeline
|
|
@@ -3332,40 +2940,33 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3332
2940
|
await this.scheduler.initialize();
|
|
3333
2941
|
}
|
|
3334
2942
|
/**
|
|
3335
|
-
*
|
|
3336
|
-
|
|
3337
|
-
|
|
3338
|
-
|
|
3339
|
-
*
|
|
3340
|
-
* Supported labels: happy, excited, joyful, sad, melancholic, angry,
|
|
3341
|
-
* frustrated, neutral, etc.
|
|
3342
|
-
*
|
|
3343
|
-
* @param label - Emotion label string (case-insensitive)
|
|
3344
|
-
*/
|
|
3345
|
-
setEmotionLabel(label) {
|
|
3346
|
-
const normalized = label.toLowerCase();
|
|
3347
|
-
const mapped = EMOTION_LABEL_MAP[normalized] ?? "neutral";
|
|
3348
|
-
const probabilities = {
|
|
3349
|
-
neutral: 0.1,
|
|
3350
|
-
happy: 0.1,
|
|
3351
|
-
angry: 0.1,
|
|
3352
|
-
sad: 0.1
|
|
3353
|
-
};
|
|
3354
|
-
probabilities[mapped] = 0.7;
|
|
3355
|
-
const frame = {
|
|
3356
|
-
emotion: mapped,
|
|
3357
|
-
confidence: 0.7,
|
|
3358
|
-
probabilities
|
|
3359
|
-
};
|
|
3360
|
-
this.lastEmotionFrame = frame;
|
|
3361
|
-
logger3.info("Emotion label set", { label, mapped });
|
|
2943
|
+
* Update the ExpressionProfile at runtime (e.g., character switch).
|
|
2944
|
+
*/
|
|
2945
|
+
setProfile(profile) {
|
|
2946
|
+
this.profile = profile;
|
|
3362
2947
|
}
|
|
3363
2948
|
/**
|
|
3364
|
-
*
|
|
3365
|
-
*
|
|
2949
|
+
* Apply ExpressionProfile scaling to raw A2E blendshapes.
|
|
2950
|
+
*
|
|
2951
|
+
* For each blendshape:
|
|
2952
|
+
* 1. If an override exists for the blendshape name, use override as scaler
|
|
2953
|
+
* 2. Otherwise, use the group scaler (default 1.0)
|
|
2954
|
+
* 3. Clamp result to [0, 1]
|
|
3366
2955
|
*/
|
|
3367
|
-
|
|
3368
|
-
|
|
2956
|
+
applyProfile(raw) {
|
|
2957
|
+
const scaled = new Float32Array(52);
|
|
2958
|
+
for (let i = 0; i < 52; i++) {
|
|
2959
|
+
const name = LAM_BLENDSHAPES[i];
|
|
2960
|
+
let scaler;
|
|
2961
|
+
if (this.profile.overrides && this.profile.overrides[name] !== void 0) {
|
|
2962
|
+
scaler = this.profile.overrides[name];
|
|
2963
|
+
} else {
|
|
2964
|
+
const group = BLENDSHAPE_TO_GROUP.get(name);
|
|
2965
|
+
scaler = group ? this.profile[group] ?? 1 : 1;
|
|
2966
|
+
}
|
|
2967
|
+
scaled[i] = Math.min(1, Math.max(0, raw[i] * scaler));
|
|
2968
|
+
}
|
|
2969
|
+
return scaled;
|
|
3369
2970
|
}
|
|
3370
2971
|
/**
|
|
3371
2972
|
* Start a new playback session
|
|
@@ -3377,15 +2978,14 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3377
2978
|
this.stopMonitoring();
|
|
3378
2979
|
this.scheduler.reset();
|
|
3379
2980
|
this.coalescer.reset();
|
|
3380
|
-
this.
|
|
2981
|
+
this.processor.reset();
|
|
3381
2982
|
this.playbackStarted = false;
|
|
3382
|
-
this.lastEmotionFrame = null;
|
|
3383
|
-
this.currentAudioEnergy = 0;
|
|
3384
|
-
this.emotionMapper.reset();
|
|
3385
|
-
this.energyAnalyzer.reset();
|
|
3386
2983
|
this.lastNewFrameTime = 0;
|
|
3387
2984
|
this.lastKnownLamFrame = null;
|
|
3388
2985
|
this.staleWarningEmitted = false;
|
|
2986
|
+
this.lastFrameLoopTime = 0;
|
|
2987
|
+
this.frameLoopCount = 0;
|
|
2988
|
+
this.smoother.reset();
|
|
3389
2989
|
this.scheduler.warmup();
|
|
3390
2990
|
this.startFrameLoop();
|
|
3391
2991
|
this.startMonitoring();
|
|
@@ -3393,8 +2993,8 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3393
2993
|
/**
|
|
3394
2994
|
* Receive audio chunk from network
|
|
3395
2995
|
*
|
|
3396
|
-
* Audio-first design: schedules audio immediately,
|
|
3397
|
-
* This prevents
|
|
2996
|
+
* Audio-first design: schedules audio immediately, A2E runs in background.
|
|
2997
|
+
* This prevents A2E inference (50-300ms) from blocking audio scheduling.
|
|
3398
2998
|
*
|
|
3399
2999
|
* @param chunk - Uint8Array containing Int16 PCM audio
|
|
3400
3000
|
*/
|
|
@@ -3409,100 +3009,77 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3409
3009
|
this.playbackStarted = true;
|
|
3410
3010
|
this.emit("playback_start", scheduleTime);
|
|
3411
3011
|
}
|
|
3412
|
-
|
|
3413
|
-
|
|
3414
|
-
|
|
3415
|
-
this.
|
|
3012
|
+
logger4.info("onAudioChunk \u2192 pushAudio", {
|
|
3013
|
+
float32Samples: float32.length,
|
|
3014
|
+
scheduleTime: scheduleTime.toFixed(3),
|
|
3015
|
+
currentTime: this.scheduler.getCurrentTime().toFixed(3),
|
|
3016
|
+
deltaToPlayback: (scheduleTime - this.scheduler.getCurrentTime()).toFixed(3)
|
|
3416
3017
|
});
|
|
3417
|
-
|
|
3418
|
-
/**
|
|
3419
|
-
* Get emotion frame for current animation.
|
|
3420
|
-
*
|
|
3421
|
-
* Priority:
|
|
3422
|
-
* 1. Explicit emotion label from setEmotionLabel()
|
|
3423
|
-
* 2. Prosody fallback: subtle brow movement from audio energy
|
|
3424
|
-
*/
|
|
3425
|
-
getEmotionFrame() {
|
|
3426
|
-
if (this.lastEmotionFrame) {
|
|
3427
|
-
return { frame: this.lastEmotionFrame, energy: this.currentAudioEnergy };
|
|
3428
|
-
}
|
|
3429
|
-
return { frame: null, energy: this.currentAudioEnergy };
|
|
3430
|
-
}
|
|
3431
|
-
/**
|
|
3432
|
-
* Merge LAM blendshapes with emotion upper face blendshapes
|
|
3433
|
-
*/
|
|
3434
|
-
mergeBlendshapes(lamFrame, emotionFrame, audioEnergy) {
|
|
3435
|
-
const merged = new Float32Array(52);
|
|
3436
|
-
let emotionBlendshapes;
|
|
3437
|
-
if (emotionFrame) {
|
|
3438
|
-
this.emotionMapper.mapFrame(emotionFrame, audioEnergy);
|
|
3439
|
-
this.emotionMapper.update(33);
|
|
3440
|
-
emotionBlendshapes = this.emotionMapper.getCurrentBlendshapes();
|
|
3441
|
-
} else {
|
|
3442
|
-
emotionBlendshapes = {};
|
|
3443
|
-
for (const name of UPPER_FACE_BLENDSHAPES) {
|
|
3444
|
-
emotionBlendshapes[name] = 0;
|
|
3445
|
-
}
|
|
3446
|
-
}
|
|
3447
|
-
for (let i = 0; i < 52; i++) {
|
|
3448
|
-
const name = LAM_BLENDSHAPES[i];
|
|
3449
|
-
if (UPPER_FACE_SET.has(name)) {
|
|
3450
|
-
const emotionValue = emotionBlendshapes[name] ?? 0;
|
|
3451
|
-
const lamValue = lamFrame[i];
|
|
3452
|
-
merged[i] = emotionValue * this.emotionBlendFactor + lamValue * this.lamBlendFactor;
|
|
3453
|
-
} else {
|
|
3454
|
-
merged[i] = lamFrame[i];
|
|
3455
|
-
}
|
|
3456
|
-
}
|
|
3457
|
-
return { merged, emotionBlendshapes };
|
|
3018
|
+
this.processor.pushAudio(float32, scheduleTime);
|
|
3458
3019
|
}
|
|
3459
3020
|
/**
|
|
3460
3021
|
* Start frame animation loop
|
|
3022
|
+
*
|
|
3023
|
+
* Uses critically damped spring smoother to produce continuous output
|
|
3024
|
+
* at render rate (60fps), even between inference batches (~30fps bursts).
|
|
3025
|
+
* Springs interpolate toward the latest inference target, and decay
|
|
3026
|
+
* to neutral when inference stalls.
|
|
3461
3027
|
*/
|
|
3462
3028
|
startFrameLoop() {
|
|
3029
|
+
this.lastFrameLoopTime = 0;
|
|
3463
3030
|
const updateFrame = () => {
|
|
3031
|
+
const now = performance.now() / 1e3;
|
|
3032
|
+
const dt = this.lastFrameLoopTime > 0 ? now - this.lastFrameLoopTime : 1 / 60;
|
|
3033
|
+
this.lastFrameLoopTime = now;
|
|
3034
|
+
this.frameLoopCount++;
|
|
3464
3035
|
const currentTime = this.scheduler.getCurrentTime();
|
|
3465
|
-
const lamFrame = this.
|
|
3466
|
-
if (lamFrame) {
|
|
3467
|
-
|
|
3468
|
-
|
|
3469
|
-
|
|
3470
|
-
|
|
3036
|
+
const lamFrame = this.processor.getFrameForTime(currentTime);
|
|
3037
|
+
if (lamFrame && lamFrame !== this.lastKnownLamFrame) {
|
|
3038
|
+
this.smoother.setTarget(lamFrame);
|
|
3039
|
+
this.lastNewFrameTime = performance.now();
|
|
3040
|
+
this.lastKnownLamFrame = lamFrame;
|
|
3041
|
+
this.staleWarningEmitted = false;
|
|
3042
|
+
logger4.info("New A2E frame", {
|
|
3043
|
+
jawOpen: lamFrame[24]?.toFixed(3),
|
|
3044
|
+
mouthClose: lamFrame[26]?.toFixed(3),
|
|
3045
|
+
browInnerUp: lamFrame[2]?.toFixed(3),
|
|
3046
|
+
browDownL: lamFrame[0]?.toFixed(3),
|
|
3047
|
+
browOuterUpL: lamFrame[3]?.toFixed(3),
|
|
3048
|
+
currentTime: currentTime.toFixed(3),
|
|
3049
|
+
queuedFrames: this.processor.queuedFrameCount
|
|
3050
|
+
});
|
|
3051
|
+
}
|
|
3052
|
+
if (this.frameLoopCount % 60 === 0) {
|
|
3053
|
+
logger4.info("Frame loop heartbeat", {
|
|
3054
|
+
frameLoopCount: this.frameLoopCount,
|
|
3055
|
+
currentTime: currentTime.toFixed(3),
|
|
3056
|
+
playbackEndTime: this.scheduler.getPlaybackEndTime().toFixed(3),
|
|
3057
|
+
queuedFrames: this.processor.queuedFrameCount,
|
|
3058
|
+
hasTarget: this.smoother.hasTarget,
|
|
3059
|
+
playbackStarted: this.playbackStarted,
|
|
3060
|
+
msSinceNewFrame: this.lastNewFrameTime > 0 ? Math.round(performance.now() - this.lastNewFrameTime) : -1,
|
|
3061
|
+
processorFill: this.processor.fillLevel.toFixed(2)
|
|
3062
|
+
});
|
|
3063
|
+
}
|
|
3064
|
+
if (this.playbackStarted && this.lastNewFrameTime > 0 && performance.now() - this.lastNewFrameTime > this.staleThresholdMs) {
|
|
3065
|
+
this.smoother.decayToNeutral();
|
|
3066
|
+
if (!this.staleWarningEmitted) {
|
|
3067
|
+
this.staleWarningEmitted = true;
|
|
3068
|
+
logger4.warn("A2E stalled \u2014 decaying to neutral", {
|
|
3069
|
+
staleDurationMs: Math.round(performance.now() - this.lastNewFrameTime),
|
|
3070
|
+
queuedFrames: this.processor.queuedFrameCount
|
|
3071
|
+
});
|
|
3471
3072
|
}
|
|
3472
|
-
|
|
3473
|
-
|
|
3073
|
+
}
|
|
3074
|
+
if (lamFrame) {
|
|
3075
|
+
const scaled = this.applyProfile(lamFrame);
|
|
3474
3076
|
const fullFrame = {
|
|
3475
|
-
blendshapes:
|
|
3476
|
-
|
|
3477
|
-
emotionBlendshapes,
|
|
3478
|
-
emotion: emotionFrame,
|
|
3077
|
+
blendshapes: scaled,
|
|
3078
|
+
rawBlendshapes: lamFrame,
|
|
3479
3079
|
timestamp: currentTime
|
|
3480
3080
|
};
|
|
3481
3081
|
this.emit("full_frame_ready", fullFrame);
|
|
3482
3082
|
this.emit("lam_frame_ready", lamFrame);
|
|
3483
|
-
if (emotionFrame) {
|
|
3484
|
-
this.emit("emotion_frame_ready", emotionFrame);
|
|
3485
|
-
}
|
|
3486
|
-
} else if (this.playbackStarted && !this.lastKnownLamFrame) {
|
|
3487
|
-
const { frame: emotionFrame, energy } = this.getEmotionFrame();
|
|
3488
|
-
if (emotionFrame && energy > 0.05) {
|
|
3489
|
-
const startupFrame = new Float32Array(52);
|
|
3490
|
-
const { merged, emotionBlendshapes } = this.mergeBlendshapes(startupFrame, emotionFrame, energy);
|
|
3491
|
-
this.emit("full_frame_ready", {
|
|
3492
|
-
blendshapes: merged,
|
|
3493
|
-
lamBlendshapes: startupFrame,
|
|
3494
|
-
emotionBlendshapes,
|
|
3495
|
-
emotion: emotionFrame,
|
|
3496
|
-
timestamp: currentTime
|
|
3497
|
-
});
|
|
3498
|
-
}
|
|
3499
|
-
}
|
|
3500
|
-
if (this.playbackStarted && this.lastNewFrameTime > 0 && !this.staleWarningEmitted && performance.now() - this.lastNewFrameTime > _FullFacePipeline.STALE_FRAME_THRESHOLD_MS) {
|
|
3501
|
-
this.staleWarningEmitted = true;
|
|
3502
|
-
logger3.warn("LAM appears stalled \u2014 no new frames for 3+ seconds during playback", {
|
|
3503
|
-
staleDurationMs: Math.round(performance.now() - this.lastNewFrameTime),
|
|
3504
|
-
queuedFrames: this.lamPipeline.queuedFrameCount
|
|
3505
|
-
});
|
|
3506
3083
|
}
|
|
3507
3084
|
this.frameAnimationId = requestAnimationFrame(updateFrame);
|
|
3508
3085
|
};
|
|
@@ -3517,7 +3094,7 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3517
3094
|
const chunk = new Uint8Array(remaining);
|
|
3518
3095
|
await this.onAudioChunk(chunk);
|
|
3519
3096
|
}
|
|
3520
|
-
await this.
|
|
3097
|
+
await this.processor.flush();
|
|
3521
3098
|
}
|
|
3522
3099
|
/**
|
|
3523
3100
|
* Stop playback immediately with smooth fade-out
|
|
@@ -3526,15 +3103,13 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3526
3103
|
this.stopMonitoring();
|
|
3527
3104
|
await this.scheduler.cancelAll(fadeOutMs);
|
|
3528
3105
|
this.coalescer.reset();
|
|
3529
|
-
this.
|
|
3106
|
+
this.processor.reset();
|
|
3107
|
+
this.smoother.reset();
|
|
3530
3108
|
this.playbackStarted = false;
|
|
3531
|
-
this.lastEmotionFrame = null;
|
|
3532
|
-
this.currentAudioEnergy = 0;
|
|
3533
|
-
this.emotionMapper.reset();
|
|
3534
|
-
this.energyAnalyzer.reset();
|
|
3535
3109
|
this.lastNewFrameTime = 0;
|
|
3536
3110
|
this.lastKnownLamFrame = null;
|
|
3537
3111
|
this.staleWarningEmitted = false;
|
|
3112
|
+
this.lastFrameLoopTime = 0;
|
|
3538
3113
|
this.emit("playback_complete", void 0);
|
|
3539
3114
|
}
|
|
3540
3115
|
/**
|
|
@@ -3545,7 +3120,7 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3545
3120
|
clearInterval(this.monitorInterval);
|
|
3546
3121
|
}
|
|
3547
3122
|
this.monitorInterval = setInterval(() => {
|
|
3548
|
-
if (this.scheduler.isComplete() && this.
|
|
3123
|
+
if (this.scheduler.isComplete() && this.processor.queuedFrameCount === 0) {
|
|
3549
3124
|
this.emit("playback_complete", void 0);
|
|
3550
3125
|
this.stopMonitoring();
|
|
3551
3126
|
}
|
|
@@ -3571,20 +3146,12 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3571
3146
|
return {
|
|
3572
3147
|
playbackStarted: this.playbackStarted,
|
|
3573
3148
|
coalescerFill: this.coalescer.fillLevel,
|
|
3574
|
-
|
|
3575
|
-
|
|
3576
|
-
emotionLabel: this.lastEmotionFrame?.emotion ?? null,
|
|
3577
|
-
currentAudioEnergy: this.currentAudioEnergy,
|
|
3149
|
+
processorFill: this.processor.fillLevel,
|
|
3150
|
+
queuedFrames: this.processor.queuedFrameCount,
|
|
3578
3151
|
currentTime: this.scheduler.getCurrentTime(),
|
|
3579
3152
|
playbackEndTime: this.scheduler.getPlaybackEndTime()
|
|
3580
3153
|
};
|
|
3581
3154
|
}
|
|
3582
|
-
/**
|
|
3583
|
-
* Check if an explicit emotion label is currently set
|
|
3584
|
-
*/
|
|
3585
|
-
get hasEmotionLabel() {
|
|
3586
|
-
return this.lastEmotionFrame !== null;
|
|
3587
|
-
}
|
|
3588
3155
|
/**
|
|
3589
3156
|
* Cleanup resources
|
|
3590
3157
|
*/
|
|
@@ -3592,13 +3159,9 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3592
3159
|
this.stopMonitoring();
|
|
3593
3160
|
this.scheduler.dispose();
|
|
3594
3161
|
this.coalescer.reset();
|
|
3595
|
-
this.
|
|
3596
|
-
this.lastEmotionFrame = null;
|
|
3597
|
-
this.currentAudioEnergy = 0;
|
|
3162
|
+
this.processor.dispose();
|
|
3598
3163
|
}
|
|
3599
3164
|
};
|
|
3600
|
-
_FullFacePipeline.STALE_FRAME_THRESHOLD_MS = 3e3;
|
|
3601
|
-
var FullFacePipeline = _FullFacePipeline;
|
|
3602
3165
|
|
|
3603
3166
|
// src/inference/kaldiFbank.ts
|
|
3604
3167
|
function fft(re, im) {
|
|
@@ -3885,7 +3448,7 @@ function ctcGreedyDecode(logits, seqLen, vocabSize, tokenMap) {
|
|
|
3885
3448
|
}
|
|
3886
3449
|
|
|
3887
3450
|
// src/inference/SenseVoiceInference.ts
|
|
3888
|
-
var
|
|
3451
|
+
var logger5 = createLogger("SenseVoice");
|
|
3889
3452
|
var _SenseVoiceInference = class _SenseVoiceInference {
|
|
3890
3453
|
constructor(config) {
|
|
3891
3454
|
this.session = null;
|
|
@@ -3938,26 +3501,26 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
3938
3501
|
"model.backend_requested": this.config.backend
|
|
3939
3502
|
});
|
|
3940
3503
|
try {
|
|
3941
|
-
|
|
3504
|
+
logger5.info("Loading ONNX Runtime...", { preference: this.config.backend });
|
|
3942
3505
|
const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
|
|
3943
3506
|
this.ort = ort;
|
|
3944
3507
|
this._backend = backend;
|
|
3945
|
-
|
|
3946
|
-
|
|
3508
|
+
logger5.info("ONNX Runtime loaded", { backend: this._backend });
|
|
3509
|
+
logger5.debug("Fetching tokens vocabulary", { tokensUrl: this.config.tokensUrl });
|
|
3947
3510
|
const tokensResponse = await fetch(this.config.tokensUrl);
|
|
3948
3511
|
if (!tokensResponse.ok) {
|
|
3949
3512
|
throw new Error(`Failed to fetch tokens.txt: ${tokensResponse.status} ${tokensResponse.statusText}`);
|
|
3950
3513
|
}
|
|
3951
3514
|
const tokensText = await tokensResponse.text();
|
|
3952
3515
|
this.tokenMap = parseTokensFile(tokensText);
|
|
3953
|
-
|
|
3516
|
+
logger5.debug("Tokens loaded", { vocabSize: this.tokenMap.size });
|
|
3954
3517
|
const sessionOptions = getSessionOptions(this._backend);
|
|
3955
3518
|
if (this._backend === "webgpu") {
|
|
3956
3519
|
sessionOptions.graphOptimizationLevel = "basic";
|
|
3957
3520
|
}
|
|
3958
3521
|
let isCached = false;
|
|
3959
3522
|
if (isIOS()) {
|
|
3960
|
-
|
|
3523
|
+
logger5.info("iOS: passing model URL directly to ORT (low-memory path)", {
|
|
3961
3524
|
modelUrl: this.config.modelUrl
|
|
3962
3525
|
});
|
|
3963
3526
|
this.session = await this.ort.InferenceSession.create(
|
|
@@ -3969,14 +3532,14 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
3969
3532
|
isCached = await cache.has(this.config.modelUrl);
|
|
3970
3533
|
let modelBuffer;
|
|
3971
3534
|
if (isCached) {
|
|
3972
|
-
|
|
3535
|
+
logger5.debug("Loading model from cache", { modelUrl: this.config.modelUrl });
|
|
3973
3536
|
modelBuffer = await cache.get(this.config.modelUrl);
|
|
3974
3537
|
onProgress?.(modelBuffer.byteLength, modelBuffer.byteLength);
|
|
3975
3538
|
} else {
|
|
3976
|
-
|
|
3539
|
+
logger5.debug("Fetching and caching model", { modelUrl: this.config.modelUrl });
|
|
3977
3540
|
modelBuffer = await fetchWithCache(this.config.modelUrl, onProgress);
|
|
3978
3541
|
}
|
|
3979
|
-
|
|
3542
|
+
logger5.debug("Creating ONNX session", {
|
|
3980
3543
|
size: formatBytes(modelBuffer.byteLength),
|
|
3981
3544
|
backend: this._backend
|
|
3982
3545
|
});
|
|
@@ -3989,15 +3552,15 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
3989
3552
|
const cmvn = parseCMVNFromMetadata(metadata.neg_mean, metadata.inv_stddev);
|
|
3990
3553
|
this.negMean = cmvn.negMean;
|
|
3991
3554
|
this.invStddev = cmvn.invStddev;
|
|
3992
|
-
|
|
3555
|
+
logger5.debug("CMVN loaded from model metadata", { dim: this.negMean.length });
|
|
3993
3556
|
} else {
|
|
3994
|
-
|
|
3557
|
+
logger5.warn("CMVN not found in model metadata \u2014 features will not be normalized");
|
|
3995
3558
|
}
|
|
3996
3559
|
} catch (cmvnErr) {
|
|
3997
|
-
|
|
3560
|
+
logger5.warn("Failed to read CMVN from model metadata", { error: cmvnErr });
|
|
3998
3561
|
}
|
|
3999
3562
|
const loadTimeMs = performance.now() - startTime;
|
|
4000
|
-
|
|
3563
|
+
logger5.info("SenseVoice model loaded", {
|
|
4001
3564
|
backend: this._backend,
|
|
4002
3565
|
loadTimeMs: Math.round(loadTimeMs),
|
|
4003
3566
|
vocabSize: this.tokenMap.size,
|
|
@@ -4108,7 +3671,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
4108
3671
|
const vocabSize = logitsDims[2];
|
|
4109
3672
|
const decoded = ctcGreedyDecode(logitsData, seqLen, vocabSize, this.tokenMap);
|
|
4110
3673
|
const inferenceTimeMs = performance.now() - startTime;
|
|
4111
|
-
|
|
3674
|
+
logger5.trace("Transcription complete", {
|
|
4112
3675
|
text: decoded.text.substring(0, 50),
|
|
4113
3676
|
language: decoded.language,
|
|
4114
3677
|
emotion: decoded.emotion,
|
|
@@ -4146,7 +3709,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
4146
3709
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
4147
3710
|
if (errMsg.includes("timed out")) {
|
|
4148
3711
|
this.poisoned = true;
|
|
4149
|
-
|
|
3712
|
+
logger5.error("CRITICAL: Inference session timed out \u2014 SenseVoice is dead. Page reload required.", {
|
|
4150
3713
|
backend: this._backend,
|
|
4151
3714
|
timeoutMs: _SenseVoiceInference.INFERENCE_TIMEOUT_MS
|
|
4152
3715
|
});
|
|
@@ -4154,7 +3717,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
4154
3717
|
const oomError = new Error(
|
|
4155
3718
|
`SenseVoice inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
|
|
4156
3719
|
);
|
|
4157
|
-
|
|
3720
|
+
logger5.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
4158
3721
|
pointer: `0x${err.toString(16)}`,
|
|
4159
3722
|
backend: this._backend
|
|
4160
3723
|
});
|
|
@@ -4167,7 +3730,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
4167
3730
|
reject(oomError);
|
|
4168
3731
|
return;
|
|
4169
3732
|
} else {
|
|
4170
|
-
|
|
3733
|
+
logger5.error("Inference failed", { error: errMsg, backend: this._backend });
|
|
4171
3734
|
}
|
|
4172
3735
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
4173
3736
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -4196,7 +3759,7 @@ _SenseVoiceInference.INFERENCE_TIMEOUT_MS = 1e4;
|
|
|
4196
3759
|
var SenseVoiceInference = _SenseVoiceInference;
|
|
4197
3760
|
|
|
4198
3761
|
// src/inference/SenseVoiceWorker.ts
|
|
4199
|
-
var
|
|
3762
|
+
var logger6 = createLogger("SenseVoiceWorker");
|
|
4200
3763
|
var WASM_CDN_PATH2 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
4201
3764
|
var LOAD_TIMEOUT_MS = 3e4;
|
|
4202
3765
|
var INFERENCE_TIMEOUT_MS = 1e4;
|
|
@@ -4929,7 +4492,7 @@ var SenseVoiceWorker = class {
|
|
|
4929
4492
|
this.handleWorkerMessage(event.data);
|
|
4930
4493
|
};
|
|
4931
4494
|
worker.onerror = (error) => {
|
|
4932
|
-
|
|
4495
|
+
logger6.error("Worker error", { error: error.message });
|
|
4933
4496
|
for (const [, resolver] of this.pendingResolvers) {
|
|
4934
4497
|
resolver.reject(new Error(`Worker error: ${error.message}`));
|
|
4935
4498
|
}
|
|
@@ -5009,9 +4572,9 @@ var SenseVoiceWorker = class {
|
|
|
5009
4572
|
"model.language": this.config.language
|
|
5010
4573
|
});
|
|
5011
4574
|
try {
|
|
5012
|
-
|
|
4575
|
+
logger6.info("Creating SenseVoice worker...");
|
|
5013
4576
|
this.worker = this.createWorker();
|
|
5014
|
-
|
|
4577
|
+
logger6.info("Loading model in worker...", {
|
|
5015
4578
|
modelUrl: this.config.modelUrl,
|
|
5016
4579
|
tokensUrl: this.config.tokensUrl,
|
|
5017
4580
|
language: this.config.language,
|
|
@@ -5033,7 +4596,7 @@ var SenseVoiceWorker = class {
|
|
|
5033
4596
|
this._isLoaded = true;
|
|
5034
4597
|
const loadTimeMs = performance.now() - startTime;
|
|
5035
4598
|
onProgress?.(1, 1);
|
|
5036
|
-
|
|
4599
|
+
logger6.info("SenseVoice worker loaded successfully", {
|
|
5037
4600
|
backend: "wasm",
|
|
5038
4601
|
loadTimeMs: Math.round(loadTimeMs),
|
|
5039
4602
|
workerLoadTimeMs: Math.round(result.loadTimeMs),
|
|
@@ -5112,7 +4675,7 @@ var SenseVoiceWorker = class {
|
|
|
5112
4675
|
INFERENCE_TIMEOUT_MS
|
|
5113
4676
|
);
|
|
5114
4677
|
const totalTimeMs = performance.now() - startTime;
|
|
5115
|
-
|
|
4678
|
+
logger6.trace("Worker transcription complete", {
|
|
5116
4679
|
text: result.text.substring(0, 50),
|
|
5117
4680
|
language: result.language,
|
|
5118
4681
|
emotion: result.emotion,
|
|
@@ -5148,11 +4711,11 @@ var SenseVoiceWorker = class {
|
|
|
5148
4711
|
} catch (err) {
|
|
5149
4712
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
5150
4713
|
if (errMsg.includes("timed out")) {
|
|
5151
|
-
|
|
4714
|
+
logger6.error("CRITICAL: Worker inference timed out \u2014 SenseVoice worker is dead. Page reload required.", {
|
|
5152
4715
|
timeoutMs: INFERENCE_TIMEOUT_MS
|
|
5153
4716
|
});
|
|
5154
4717
|
} else {
|
|
5155
|
-
|
|
4718
|
+
logger6.error("Worker inference failed", { error: errMsg });
|
|
5156
4719
|
}
|
|
5157
4720
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
5158
4721
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -5190,7 +4753,7 @@ var SenseVoiceWorker = class {
|
|
|
5190
4753
|
};
|
|
5191
4754
|
|
|
5192
4755
|
// src/inference/UnifiedInferenceWorker.ts
|
|
5193
|
-
var
|
|
4756
|
+
var logger7 = createLogger("UnifiedInferenceWorker");
|
|
5194
4757
|
var WASM_CDN_PATH3 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
5195
4758
|
var INIT_TIMEOUT_MS = 15e3;
|
|
5196
4759
|
var SV_LOAD_TIMEOUT_MS = 3e4;
|
|
@@ -5886,7 +5449,7 @@ var UnifiedInferenceWorker = class {
|
|
|
5886
5449
|
const telemetry = getTelemetry();
|
|
5887
5450
|
const span = telemetry?.startSpan("UnifiedInferenceWorker.init");
|
|
5888
5451
|
try {
|
|
5889
|
-
|
|
5452
|
+
logger7.info("Creating unified inference worker...");
|
|
5890
5453
|
this.worker = this.createWorker();
|
|
5891
5454
|
await this.sendMessage(
|
|
5892
5455
|
{ type: "init", wasmPaths: WASM_CDN_PATH3, isIOS: isIOS() },
|
|
@@ -5895,7 +5458,7 @@ var UnifiedInferenceWorker = class {
|
|
|
5895
5458
|
);
|
|
5896
5459
|
this.initialized = true;
|
|
5897
5460
|
const loadTimeMs = performance.now() - startTime;
|
|
5898
|
-
|
|
5461
|
+
logger7.info("Unified worker initialized", { loadTimeMs: Math.round(loadTimeMs) });
|
|
5899
5462
|
span?.setAttributes({ "worker.init_time_ms": loadTimeMs });
|
|
5900
5463
|
span?.end();
|
|
5901
5464
|
} catch (error) {
|
|
@@ -5949,8 +5512,8 @@ var UnifiedInferenceWorker = class {
|
|
|
5949
5512
|
if (!this.worker) return;
|
|
5950
5513
|
await this.sendMessage({ type: "sv:dispose" }, "sv:disposed", DISPOSE_TIMEOUT_MS);
|
|
5951
5514
|
}
|
|
5952
|
-
// ── Wav2ArkitCpu (
|
|
5953
|
-
async
|
|
5515
|
+
// ── Wav2ArkitCpu (A2E) ──────────────────────────────────────────────
|
|
5516
|
+
async loadA2E(config) {
|
|
5954
5517
|
this.assertReady();
|
|
5955
5518
|
const startTime = performance.now();
|
|
5956
5519
|
const result = await this.sendMessage(
|
|
@@ -5971,7 +5534,7 @@ var UnifiedInferenceWorker = class {
|
|
|
5971
5534
|
outputNames: result.outputNames
|
|
5972
5535
|
};
|
|
5973
5536
|
}
|
|
5974
|
-
async
|
|
5537
|
+
async inferA2E(audio) {
|
|
5975
5538
|
this.assertReady();
|
|
5976
5539
|
return this.sendMessage(
|
|
5977
5540
|
{ type: "cpu:infer", audio },
|
|
@@ -5979,7 +5542,7 @@ var UnifiedInferenceWorker = class {
|
|
|
5979
5542
|
CPU_INFER_TIMEOUT_MS
|
|
5980
5543
|
);
|
|
5981
5544
|
}
|
|
5982
|
-
async
|
|
5545
|
+
async disposeA2E() {
|
|
5983
5546
|
if (!this.worker) return;
|
|
5984
5547
|
await this.sendMessage({ type: "cpu:dispose" }, "cpu:disposed", DISPOSE_TIMEOUT_MS);
|
|
5985
5548
|
}
|
|
@@ -6069,7 +5632,7 @@ var UnifiedInferenceWorker = class {
|
|
|
6069
5632
|
this.handleWorkerMessage(event.data);
|
|
6070
5633
|
};
|
|
6071
5634
|
worker.onerror = (error) => {
|
|
6072
|
-
|
|
5635
|
+
logger7.error("Unified worker error", { error: error.message });
|
|
6073
5636
|
this.rejectAllPending(`Worker error: ${error.message}`);
|
|
6074
5637
|
};
|
|
6075
5638
|
return worker;
|
|
@@ -6083,7 +5646,7 @@ var UnifiedInferenceWorker = class {
|
|
|
6083
5646
|
this.pendingRequests.delete(requestId);
|
|
6084
5647
|
pending.reject(new Error(data.error));
|
|
6085
5648
|
} else {
|
|
6086
|
-
|
|
5649
|
+
logger7.error("Worker broadcast error", { error: data.error });
|
|
6087
5650
|
this.rejectAllPending(data.error);
|
|
6088
5651
|
}
|
|
6089
5652
|
return;
|
|
@@ -6105,7 +5668,7 @@ var UnifiedInferenceWorker = class {
|
|
|
6105
5668
|
const timeout = setTimeout(() => {
|
|
6106
5669
|
this.pendingRequests.delete(requestId);
|
|
6107
5670
|
this.poisoned = true;
|
|
6108
|
-
|
|
5671
|
+
logger7.error("CRITICAL: Worker operation timed out \u2014 worker is dead", {
|
|
6109
5672
|
type: message.type,
|
|
6110
5673
|
timeoutMs
|
|
6111
5674
|
});
|
|
@@ -6171,7 +5734,7 @@ var SenseVoiceUnifiedAdapter = class {
|
|
|
6171
5734
|
});
|
|
6172
5735
|
this._isLoaded = true;
|
|
6173
5736
|
onProgress?.(1, 1);
|
|
6174
|
-
|
|
5737
|
+
logger7.info("SenseVoice loaded via unified worker", {
|
|
6175
5738
|
backend: "wasm",
|
|
6176
5739
|
loadTimeMs: Math.round(result.loadTimeMs),
|
|
6177
5740
|
vocabSize: result.vocabSize
|
|
@@ -6212,6 +5775,7 @@ var SenseVoiceUnifiedAdapter = class {
|
|
|
6212
5775
|
var Wav2ArkitCpuUnifiedAdapter = class {
|
|
6213
5776
|
constructor(worker, config) {
|
|
6214
5777
|
this.modelId = "wav2arkit_cpu";
|
|
5778
|
+
this.chunkSize = 16e3;
|
|
6215
5779
|
this._isLoaded = false;
|
|
6216
5780
|
this.inferenceQueue = Promise.resolve();
|
|
6217
5781
|
this.worker = worker;
|
|
@@ -6230,12 +5794,12 @@ var Wav2ArkitCpuUnifiedAdapter = class {
|
|
|
6230
5794
|
});
|
|
6231
5795
|
try {
|
|
6232
5796
|
const externalDataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${this.config.modelUrl}.data` : null;
|
|
6233
|
-
const result = await this.worker.
|
|
5797
|
+
const result = await this.worker.loadA2E({
|
|
6234
5798
|
modelUrl: this.config.modelUrl,
|
|
6235
5799
|
externalDataUrl: externalDataUrl || null
|
|
6236
5800
|
});
|
|
6237
5801
|
this._isLoaded = true;
|
|
6238
|
-
|
|
5802
|
+
logger7.info("Wav2ArkitCpu loaded via unified worker", {
|
|
6239
5803
|
backend: "wasm",
|
|
6240
5804
|
loadTimeMs: Math.round(result.loadTimeMs)
|
|
6241
5805
|
});
|
|
@@ -6262,7 +5826,7 @@ var Wav2ArkitCpuUnifiedAdapter = class {
|
|
|
6262
5826
|
});
|
|
6263
5827
|
try {
|
|
6264
5828
|
const startTime = performance.now();
|
|
6265
|
-
const result = await this.worker.
|
|
5829
|
+
const result = await this.worker.inferA2E(audioCopy);
|
|
6266
5830
|
const inferenceTimeMs = performance.now() - startTime;
|
|
6267
5831
|
const flatBuffer = result.blendshapes;
|
|
6268
5832
|
const { numFrames, numBlendshapes } = result;
|
|
@@ -6285,7 +5849,7 @@ var Wav2ArkitCpuUnifiedAdapter = class {
|
|
|
6285
5849
|
}
|
|
6286
5850
|
async dispose() {
|
|
6287
5851
|
if (this._isLoaded) {
|
|
6288
|
-
await this.worker.
|
|
5852
|
+
await this.worker.disposeA2E();
|
|
6289
5853
|
this._isLoaded = false;
|
|
6290
5854
|
}
|
|
6291
5855
|
}
|
|
@@ -6341,7 +5905,7 @@ var SileroVADUnifiedAdapter = class {
|
|
|
6341
5905
|
sampleRate: this.config.sampleRate
|
|
6342
5906
|
});
|
|
6343
5907
|
this._isLoaded = true;
|
|
6344
|
-
|
|
5908
|
+
logger7.info("SileroVAD loaded via unified worker", {
|
|
6345
5909
|
backend: "wasm",
|
|
6346
5910
|
loadTimeMs: Math.round(result.loadTimeMs),
|
|
6347
5911
|
sampleRate: this.config.sampleRate,
|
|
@@ -6422,10 +5986,10 @@ var SileroVADUnifiedAdapter = class {
|
|
|
6422
5986
|
};
|
|
6423
5987
|
|
|
6424
5988
|
// src/inference/createSenseVoice.ts
|
|
6425
|
-
var
|
|
5989
|
+
var logger8 = createLogger("createSenseVoice");
|
|
6426
5990
|
function createSenseVoice(config) {
|
|
6427
5991
|
if (config.unifiedWorker) {
|
|
6428
|
-
|
|
5992
|
+
logger8.info("Creating SenseVoiceUnifiedAdapter (shared unified worker)");
|
|
6429
5993
|
return new SenseVoiceUnifiedAdapter(config.unifiedWorker, {
|
|
6430
5994
|
modelUrl: config.modelUrl,
|
|
6431
5995
|
tokensUrl: config.tokensUrl,
|
|
@@ -6438,7 +6002,7 @@ function createSenseVoice(config) {
|
|
|
6438
6002
|
if (!SenseVoiceWorker.isSupported()) {
|
|
6439
6003
|
throw new Error("Web Workers are not supported in this environment");
|
|
6440
6004
|
}
|
|
6441
|
-
|
|
6005
|
+
logger8.info("Creating SenseVoiceWorker (off-main-thread)");
|
|
6442
6006
|
return new SenseVoiceWorker({
|
|
6443
6007
|
modelUrl: config.modelUrl,
|
|
6444
6008
|
tokensUrl: config.tokensUrl,
|
|
@@ -6447,7 +6011,7 @@ function createSenseVoice(config) {
|
|
|
6447
6011
|
});
|
|
6448
6012
|
}
|
|
6449
6013
|
if (useWorker === false) {
|
|
6450
|
-
|
|
6014
|
+
logger8.info("Creating SenseVoiceInference (main thread)");
|
|
6451
6015
|
return new SenseVoiceInference({
|
|
6452
6016
|
modelUrl: config.modelUrl,
|
|
6453
6017
|
tokensUrl: config.tokensUrl,
|
|
@@ -6456,7 +6020,7 @@ function createSenseVoice(config) {
|
|
|
6456
6020
|
});
|
|
6457
6021
|
}
|
|
6458
6022
|
if (SenseVoiceWorker.isSupported() && !isIOS()) {
|
|
6459
|
-
|
|
6023
|
+
logger8.info("Auto-detected: creating SenseVoiceWorker (off-main-thread)");
|
|
6460
6024
|
return new SenseVoiceWorker({
|
|
6461
6025
|
modelUrl: config.modelUrl,
|
|
6462
6026
|
tokensUrl: config.tokensUrl,
|
|
@@ -6464,7 +6028,7 @@ function createSenseVoice(config) {
|
|
|
6464
6028
|
textNorm: config.textNorm
|
|
6465
6029
|
});
|
|
6466
6030
|
}
|
|
6467
|
-
|
|
6031
|
+
logger8.info("Auto-detected: creating SenseVoiceInference (main thread)", {
|
|
6468
6032
|
reason: isIOS() ? "iOS (shared ORT instance)" : "Worker unsupported"
|
|
6469
6033
|
});
|
|
6470
6034
|
return new SenseVoiceInference({
|
|
@@ -6476,10 +6040,11 @@ function createSenseVoice(config) {
|
|
|
6476
6040
|
}
|
|
6477
6041
|
|
|
6478
6042
|
// src/inference/Wav2ArkitCpuInference.ts
|
|
6479
|
-
var
|
|
6043
|
+
var logger9 = createLogger("Wav2ArkitCpu");
|
|
6480
6044
|
var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
6481
6045
|
constructor(config) {
|
|
6482
6046
|
this.modelId = "wav2arkit_cpu";
|
|
6047
|
+
this.chunkSize = 16e3;
|
|
6483
6048
|
this.session = null;
|
|
6484
6049
|
this.ort = null;
|
|
6485
6050
|
this._backend = "wasm";
|
|
@@ -6517,16 +6082,16 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6517
6082
|
});
|
|
6518
6083
|
try {
|
|
6519
6084
|
const preference = this.config.backend || "wasm";
|
|
6520
|
-
|
|
6085
|
+
logger9.info("Loading ONNX Runtime...", { preference });
|
|
6521
6086
|
const { ort, backend } = await getOnnxRuntimeForPreference(preference);
|
|
6522
6087
|
this.ort = ort;
|
|
6523
6088
|
this._backend = backend;
|
|
6524
|
-
|
|
6089
|
+
logger9.info("ONNX Runtime loaded", { backend: this._backend });
|
|
6525
6090
|
const modelUrl = this.config.modelUrl;
|
|
6526
6091
|
const dataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${modelUrl}.data` : null;
|
|
6527
6092
|
const sessionOptions = getSessionOptions(this._backend);
|
|
6528
6093
|
if (isIOS()) {
|
|
6529
|
-
|
|
6094
|
+
logger9.info("iOS: passing model URLs directly to ORT (low-memory path)", {
|
|
6530
6095
|
modelUrl,
|
|
6531
6096
|
dataUrl
|
|
6532
6097
|
});
|
|
@@ -6544,15 +6109,15 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6544
6109
|
const isCached = await cache.has(modelUrl);
|
|
6545
6110
|
let modelBuffer;
|
|
6546
6111
|
if (isCached) {
|
|
6547
|
-
|
|
6112
|
+
logger9.debug("Loading model from cache", { modelUrl });
|
|
6548
6113
|
modelBuffer = await cache.get(modelUrl);
|
|
6549
6114
|
if (!modelBuffer) {
|
|
6550
|
-
|
|
6115
|
+
logger9.warn("Cache corruption detected, clearing and retrying", { modelUrl });
|
|
6551
6116
|
await cache.delete(modelUrl);
|
|
6552
6117
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
6553
6118
|
}
|
|
6554
6119
|
} else {
|
|
6555
|
-
|
|
6120
|
+
logger9.debug("Fetching and caching model graph", { modelUrl });
|
|
6556
6121
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
6557
6122
|
}
|
|
6558
6123
|
if (!modelBuffer) {
|
|
@@ -6563,31 +6128,31 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6563
6128
|
try {
|
|
6564
6129
|
const isDataCached = await cache.has(dataUrl);
|
|
6565
6130
|
if (isDataCached) {
|
|
6566
|
-
|
|
6131
|
+
logger9.debug("Loading external data from cache", { dataUrl });
|
|
6567
6132
|
externalDataBuffer = await cache.get(dataUrl);
|
|
6568
6133
|
if (!externalDataBuffer) {
|
|
6569
|
-
|
|
6134
|
+
logger9.warn("Cache corruption for external data, retrying", { dataUrl });
|
|
6570
6135
|
await cache.delete(dataUrl);
|
|
6571
6136
|
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
6572
6137
|
}
|
|
6573
6138
|
} else {
|
|
6574
|
-
|
|
6139
|
+
logger9.info("Fetching external model data", {
|
|
6575
6140
|
dataUrl,
|
|
6576
6141
|
note: "This may be a large download (400MB+)"
|
|
6577
6142
|
});
|
|
6578
6143
|
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
6579
6144
|
}
|
|
6580
|
-
|
|
6145
|
+
logger9.info("External data loaded", {
|
|
6581
6146
|
size: formatBytes(externalDataBuffer.byteLength)
|
|
6582
6147
|
});
|
|
6583
6148
|
} catch (err) {
|
|
6584
|
-
|
|
6149
|
+
logger9.debug("No external data file found (single-file model)", {
|
|
6585
6150
|
dataUrl,
|
|
6586
6151
|
error: err.message
|
|
6587
6152
|
});
|
|
6588
6153
|
}
|
|
6589
6154
|
}
|
|
6590
|
-
|
|
6155
|
+
logger9.debug("Creating ONNX session", {
|
|
6591
6156
|
graphSize: formatBytes(modelBuffer.byteLength),
|
|
6592
6157
|
externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
|
|
6593
6158
|
backend: this._backend
|
|
@@ -6603,7 +6168,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6603
6168
|
this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
|
|
6604
6169
|
}
|
|
6605
6170
|
const loadTimeMs = performance.now() - startTime;
|
|
6606
|
-
|
|
6171
|
+
logger9.info("Model loaded successfully", {
|
|
6607
6172
|
backend: this._backend,
|
|
6608
6173
|
loadTimeMs: Math.round(loadTimeMs),
|
|
6609
6174
|
inputs: this.session.inputNames,
|
|
@@ -6619,12 +6184,12 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6619
6184
|
model: "wav2arkit_cpu",
|
|
6620
6185
|
backend: this._backend
|
|
6621
6186
|
});
|
|
6622
|
-
|
|
6187
|
+
logger9.debug("Running warmup inference");
|
|
6623
6188
|
const warmupStart = performance.now();
|
|
6624
6189
|
const silentAudio = new Float32Array(16e3);
|
|
6625
6190
|
await this.infer(silentAudio);
|
|
6626
6191
|
const warmupTimeMs = performance.now() - warmupStart;
|
|
6627
|
-
|
|
6192
|
+
logger9.info("Warmup inference complete", {
|
|
6628
6193
|
warmupTimeMs: Math.round(warmupTimeMs),
|
|
6629
6194
|
backend: this._backend
|
|
6630
6195
|
});
|
|
@@ -6711,7 +6276,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6711
6276
|
const symmetrized = symmetrizeBlendshapes(rawFrame);
|
|
6712
6277
|
blendshapes.push(symmetrized);
|
|
6713
6278
|
}
|
|
6714
|
-
|
|
6279
|
+
logger9.trace("Inference completed", {
|
|
6715
6280
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
6716
6281
|
numFrames,
|
|
6717
6282
|
inputSamples
|
|
@@ -6739,7 +6304,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6739
6304
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
6740
6305
|
if (errMsg.includes("timed out")) {
|
|
6741
6306
|
this.poisoned = true;
|
|
6742
|
-
|
|
6307
|
+
logger9.error("CRITICAL: Inference session timed out \u2014 Wav2ArkitCpu is dead. Page reload required.", {
|
|
6743
6308
|
backend: this._backend,
|
|
6744
6309
|
timeoutMs: _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS
|
|
6745
6310
|
});
|
|
@@ -6747,7 +6312,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6747
6312
|
const oomError = new Error(
|
|
6748
6313
|
`Wav2ArkitCpu inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
|
|
6749
6314
|
);
|
|
6750
|
-
|
|
6315
|
+
logger9.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
6751
6316
|
pointer: `0x${err.toString(16)}`,
|
|
6752
6317
|
backend: this._backend
|
|
6753
6318
|
});
|
|
@@ -6760,7 +6325,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6760
6325
|
reject(oomError);
|
|
6761
6326
|
return;
|
|
6762
6327
|
} else {
|
|
6763
|
-
|
|
6328
|
+
logger9.error("Inference failed", { error: errMsg, backend: this._backend });
|
|
6764
6329
|
}
|
|
6765
6330
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
6766
6331
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -6787,7 +6352,7 @@ _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS = 5e3;
|
|
|
6787
6352
|
var Wav2ArkitCpuInference = _Wav2ArkitCpuInference;
|
|
6788
6353
|
|
|
6789
6354
|
// src/inference/Wav2ArkitCpuWorker.ts
|
|
6790
|
-
var
|
|
6355
|
+
var logger10 = createLogger("Wav2ArkitCpuWorker");
|
|
6791
6356
|
var WASM_CDN_PATH4 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
6792
6357
|
var LOAD_TIMEOUT_MS2 = 6e4;
|
|
6793
6358
|
var INFERENCE_TIMEOUT_MS2 = 5e3;
|
|
@@ -7033,6 +6598,7 @@ self.onerror = function(err) {
|
|
|
7033
6598
|
var Wav2ArkitCpuWorker = class {
|
|
7034
6599
|
constructor(config) {
|
|
7035
6600
|
this.modelId = "wav2arkit_cpu";
|
|
6601
|
+
this.chunkSize = 16e3;
|
|
7036
6602
|
this.worker = null;
|
|
7037
6603
|
this.isLoading = false;
|
|
7038
6604
|
this._isLoaded = false;
|
|
@@ -7067,7 +6633,7 @@ var Wav2ArkitCpuWorker = class {
|
|
|
7067
6633
|
this.handleWorkerMessage(event.data);
|
|
7068
6634
|
};
|
|
7069
6635
|
worker.onerror = (error) => {
|
|
7070
|
-
|
|
6636
|
+
logger10.error("Worker error", { error: error.message });
|
|
7071
6637
|
for (const [, resolver] of this.pendingResolvers) {
|
|
7072
6638
|
resolver.reject(new Error(`Worker error: ${error.message}`));
|
|
7073
6639
|
}
|
|
@@ -7143,10 +6709,10 @@ var Wav2ArkitCpuWorker = class {
|
|
|
7143
6709
|
"model.backend_requested": "wasm"
|
|
7144
6710
|
});
|
|
7145
6711
|
try {
|
|
7146
|
-
|
|
6712
|
+
logger10.info("Creating wav2arkit_cpu worker...");
|
|
7147
6713
|
this.worker = this.createWorker();
|
|
7148
6714
|
const externalDataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${this.config.modelUrl}.data` : null;
|
|
7149
|
-
|
|
6715
|
+
logger10.info("Loading model in worker...", {
|
|
7150
6716
|
modelUrl: this.config.modelUrl,
|
|
7151
6717
|
externalDataUrl,
|
|
7152
6718
|
isIOS: isIOS()
|
|
@@ -7164,7 +6730,7 @@ var Wav2ArkitCpuWorker = class {
|
|
|
7164
6730
|
);
|
|
7165
6731
|
this._isLoaded = true;
|
|
7166
6732
|
const loadTimeMs = performance.now() - startTime;
|
|
7167
|
-
|
|
6733
|
+
logger10.info("Wav2ArkitCpu worker loaded successfully", {
|
|
7168
6734
|
backend: "wasm",
|
|
7169
6735
|
loadTimeMs: Math.round(loadTimeMs),
|
|
7170
6736
|
workerLoadTimeMs: Math.round(result.loadTimeMs),
|
|
@@ -7249,7 +6815,7 @@ var Wav2ArkitCpuWorker = class {
|
|
|
7249
6815
|
for (let f = 0; f < numFrames; f++) {
|
|
7250
6816
|
blendshapes.push(flatBuffer.slice(f * numBlendshapes, (f + 1) * numBlendshapes));
|
|
7251
6817
|
}
|
|
7252
|
-
|
|
6818
|
+
logger10.trace("Worker inference completed", {
|
|
7253
6819
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
7254
6820
|
workerTimeMs: Math.round(result.inferenceTimeMs * 100) / 100,
|
|
7255
6821
|
numFrames,
|
|
@@ -7279,12 +6845,12 @@ var Wav2ArkitCpuWorker = class {
|
|
|
7279
6845
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
7280
6846
|
if (errMsg.includes("timed out")) {
|
|
7281
6847
|
this.poisoned = true;
|
|
7282
|
-
|
|
6848
|
+
logger10.error("CRITICAL: Worker inference timed out \u2014 Wav2ArkitCpu worker is dead. Page reload required.", {
|
|
7283
6849
|
backend: "wasm",
|
|
7284
6850
|
timeoutMs: INFERENCE_TIMEOUT_MS2
|
|
7285
6851
|
});
|
|
7286
6852
|
} else {
|
|
7287
|
-
|
|
6853
|
+
logger10.error("Worker inference failed", { error: errMsg, backend: "wasm" });
|
|
7288
6854
|
}
|
|
7289
6855
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
7290
6856
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -7321,39 +6887,39 @@ var Wav2ArkitCpuWorker = class {
|
|
|
7321
6887
|
}
|
|
7322
6888
|
};
|
|
7323
6889
|
|
|
7324
|
-
// src/inference/
|
|
7325
|
-
var
|
|
7326
|
-
function
|
|
6890
|
+
// src/inference/createA2E.ts
|
|
6891
|
+
var logger11 = createLogger("createA2E");
|
|
6892
|
+
function createA2E(config) {
|
|
7327
6893
|
const mode = config.mode ?? "auto";
|
|
7328
6894
|
const fallbackOnError = config.fallbackOnError ?? true;
|
|
7329
6895
|
let useCpu;
|
|
7330
6896
|
if (mode === "cpu") {
|
|
7331
6897
|
useCpu = true;
|
|
7332
|
-
|
|
6898
|
+
logger11.info("Forcing CPU A2E model (wav2arkit_cpu)");
|
|
7333
6899
|
} else if (mode === "gpu") {
|
|
7334
6900
|
useCpu = false;
|
|
7335
|
-
|
|
6901
|
+
logger11.info("Forcing GPU A2E model (Wav2Vec2)");
|
|
7336
6902
|
} else {
|
|
7337
|
-
useCpu =
|
|
7338
|
-
|
|
6903
|
+
useCpu = shouldUseCpuA2E();
|
|
6904
|
+
logger11.info("Auto-detected A2E model", {
|
|
7339
6905
|
useCpu,
|
|
7340
6906
|
isSafari: isSafari()
|
|
7341
6907
|
});
|
|
7342
6908
|
}
|
|
7343
6909
|
if (useCpu) {
|
|
7344
6910
|
if (config.unifiedWorker) {
|
|
7345
|
-
|
|
6911
|
+
logger11.info("Creating Wav2ArkitCpuUnifiedAdapter (404MB, WASM, shared unified worker)");
|
|
7346
6912
|
return new Wav2ArkitCpuUnifiedAdapter(config.unifiedWorker, {
|
|
7347
6913
|
modelUrl: config.cpuModelUrl
|
|
7348
6914
|
});
|
|
7349
6915
|
}
|
|
7350
6916
|
if (config.useWorker && Wav2ArkitCpuWorker.isSupported() && !isIOS()) {
|
|
7351
|
-
|
|
6917
|
+
logger11.info("Creating Wav2ArkitCpuWorker (404MB, WASM, off-main-thread)");
|
|
7352
6918
|
return new Wav2ArkitCpuWorker({
|
|
7353
6919
|
modelUrl: config.cpuModelUrl
|
|
7354
6920
|
});
|
|
7355
6921
|
}
|
|
7356
|
-
|
|
6922
|
+
logger11.info("Creating Wav2ArkitCpuInference (404MB, WASM)");
|
|
7357
6923
|
return new Wav2ArkitCpuInference({
|
|
7358
6924
|
modelUrl: config.cpuModelUrl
|
|
7359
6925
|
});
|
|
@@ -7365,13 +6931,13 @@ function createLipSync(config) {
|
|
|
7365
6931
|
numIdentityClasses: config.numIdentityClasses
|
|
7366
6932
|
});
|
|
7367
6933
|
if (fallbackOnError) {
|
|
7368
|
-
|
|
7369
|
-
return new
|
|
6934
|
+
logger11.info("Creating Wav2Vec2Inference with CPU fallback");
|
|
6935
|
+
return new A2EWithFallback(gpuInstance, config);
|
|
7370
6936
|
}
|
|
7371
|
-
|
|
6937
|
+
logger11.info("Creating Wav2Vec2Inference (no fallback)");
|
|
7372
6938
|
return gpuInstance;
|
|
7373
6939
|
}
|
|
7374
|
-
var
|
|
6940
|
+
var A2EWithFallback = class {
|
|
7375
6941
|
constructor(gpuInstance, config) {
|
|
7376
6942
|
this.hasFallenBack = false;
|
|
7377
6943
|
this.implementation = gpuInstance;
|
|
@@ -7380,6 +6946,9 @@ var LipSyncWithFallback = class {
|
|
|
7380
6946
|
get modelId() {
|
|
7381
6947
|
return this.implementation.modelId;
|
|
7382
6948
|
}
|
|
6949
|
+
get chunkSize() {
|
|
6950
|
+
return this.implementation.chunkSize;
|
|
6951
|
+
}
|
|
7383
6952
|
get backend() {
|
|
7384
6953
|
return this.implementation.backend;
|
|
7385
6954
|
}
|
|
@@ -7394,7 +6963,7 @@ var LipSyncWithFallback = class {
|
|
|
7394
6963
|
}
|
|
7395
6964
|
}
|
|
7396
6965
|
async fallbackToCpu(reason) {
|
|
7397
|
-
|
|
6966
|
+
logger11.warn("GPU model load failed, falling back to CPU model", { reason });
|
|
7398
6967
|
try {
|
|
7399
6968
|
await this.implementation.dispose();
|
|
7400
6969
|
} catch {
|
|
@@ -7403,17 +6972,17 @@ var LipSyncWithFallback = class {
|
|
|
7403
6972
|
this.implementation = new Wav2ArkitCpuUnifiedAdapter(this.config.unifiedWorker, {
|
|
7404
6973
|
modelUrl: this.config.cpuModelUrl
|
|
7405
6974
|
});
|
|
7406
|
-
|
|
6975
|
+
logger11.info("Fallback to Wav2ArkitCpuUnifiedAdapter successful");
|
|
7407
6976
|
} else if (this.config.useWorker && Wav2ArkitCpuWorker.isSupported() && !isIOS()) {
|
|
7408
6977
|
this.implementation = new Wav2ArkitCpuWorker({
|
|
7409
6978
|
modelUrl: this.config.cpuModelUrl
|
|
7410
6979
|
});
|
|
7411
|
-
|
|
6980
|
+
logger11.info("Fallback to Wav2ArkitCpuWorker successful");
|
|
7412
6981
|
} else {
|
|
7413
6982
|
this.implementation = new Wav2ArkitCpuInference({
|
|
7414
6983
|
modelUrl: this.config.cpuModelUrl
|
|
7415
6984
|
});
|
|
7416
|
-
|
|
6985
|
+
logger11.info("Fallback to Wav2ArkitCpuInference successful");
|
|
7417
6986
|
}
|
|
7418
6987
|
this.hasFallenBack = true;
|
|
7419
6988
|
return await this.implementation.load();
|
|
@@ -7426,8 +6995,124 @@ var LipSyncWithFallback = class {
|
|
|
7426
6995
|
}
|
|
7427
6996
|
};
|
|
7428
6997
|
|
|
6998
|
+
// src/animation/audioEnergy.ts
|
|
6999
|
+
/**
 * Compute the root-mean-square amplitude of a block of audio samples.
 *
 * @param samples Indexable sample container (e.g. Float32Array)
 * @returns RMS of the samples, or 0 when the container is empty
 */
function calculateRMS(samples) {
  const count = samples.length;
  // Guard the empty case explicitly so we never divide by zero below.
  if (count === 0) {
    return 0;
  }
  let squaredTotal = 0;
  let idx = 0;
  // Accumulate in ascending index order.
  while (idx < count) {
    const value = samples[idx];
    squaredTotal += value * value;
    idx += 1;
  }
  return Math.sqrt(squaredTotal / count);
}
|
|
7007
|
+
/**
 * Find the largest absolute sample value in a block of audio.
 *
 * @param samples Indexable sample container (e.g. Float32Array)
 * @returns Maximum of |sample| over the block, or 0 when the block is empty
 */
function calculatePeak(samples) {
  let loudest = 0;
  for (let idx = 0, count = samples.length; idx < count; idx += 1) {
    const magnitude = Math.abs(samples[idx]);
    // Strict comparison (not Math.max) so NaN magnitudes are simply skipped.
    loudest = magnitude > loudest ? magnitude : loudest;
  }
  return loudest;
}
|
|
7015
|
+
var AudioEnergyAnalyzer = class {
  /**
   * Tracks a smoothed RMS / peak envelope over successive audio blocks.
   *
   * @param smoothingFactor How much to smooth when the level is falling
   *   (0 = no smoothing). Clamped to the range [0, 0.99]; a NaN value falls
   *   back to the default 0.85 instead of poisoning every later result.
   * @param noiseFloor Minimum level to consider as signal; anything at or
   *   below it is treated as silence. Default 0.01. A NaN value falls back to
   *   the default, because comparisons against NaN would gate all audio to 0.
   */
  constructor(smoothingFactor = 0.85, noiseFloor = 0.01) {
    this.smoothedRMS = 0;
    this.smoothedPeak = 0;
    // Math.max/Math.min propagate NaN, so check the clamped result explicitly.
    const clampedFactor = Math.max(0, Math.min(0.99, smoothingFactor));
    this.smoothingFactor = Number.isNaN(clampedFactor) ? 0.85 : clampedFactor;
    this.noiseFloor = Number.isNaN(noiseFloor) ? 0.01 : noiseFloor;
  }
  /**
   * Process audio samples and return smoothed energy values
   * @param samples Audio samples (Float32Array)
   * @returns Object with smoothed `rms` and `peak`, plus a combined `energy`
   *   value (0.7 * rms + 0.3 * peak, doubled and clamped to at most 1)
   */
  process(samples) {
    const instantRMS = calculateRMS(samples);
    const instantPeak = calculatePeak(samples);
    // Gate out everything at or below the noise floor. Non-finite readings
    // (e.g. a block containing Infinity) are also treated as silence;
    // otherwise the smoothed state would stay at Infinity until reset().
    const gatedRMS = Number.isFinite(instantRMS) && instantRMS > this.noiseFloor ? instantRMS : 0;
    const gatedPeak = Number.isFinite(instantPeak) && instantPeak > this.noiseFloor ? instantPeak : 0;
    // Rising level: fixed 50/50 blend. Falling level: decay by smoothingFactor.
    if (gatedRMS > this.smoothedRMS) {
      this.smoothedRMS = this.smoothedRMS * 0.5 + gatedRMS * 0.5;
    } else {
      this.smoothedRMS = this.smoothedRMS * this.smoothingFactor + gatedRMS * (1 - this.smoothingFactor);
    }
    // Peak follows rises more aggressively (30/70 blend) than RMS does.
    if (gatedPeak > this.smoothedPeak) {
      this.smoothedPeak = this.smoothedPeak * 0.3 + gatedPeak * 0.7;
    } else {
      this.smoothedPeak = this.smoothedPeak * this.smoothingFactor + gatedPeak * (1 - this.smoothingFactor);
    }
    const energy = this.smoothedRMS * 0.7 + this.smoothedPeak * 0.3;
    return {
      rms: this.smoothedRMS,
      peak: this.smoothedPeak,
      energy: Math.min(1, energy * 2)
      // Scale up and clamp
    };
  }
  /**
   * Reset analyzer state
   */
  reset() {
    this.smoothedRMS = 0;
    this.smoothedPeak = 0;
  }
  /**
   * Get current smoothed RMS value
   */
  get rms() {
    return this.smoothedRMS;
  }
  /**
   * Get current smoothed peak value
   */
  get peak() {
    return this.smoothedPeak;
  }
};
|
|
7074
|
+
var EmphasisDetector = class {
  /**
   * Flags sudden rises in energy relative to a short rolling history.
   *
   * @param historySize Number of frames to track. Default 10
   * @param emphasisThreshold Minimum energy increase to count as emphasis. Default 0.15
   */
  constructor(historySize = 10, emphasisThreshold = 0.15) {
    this.energyHistory = [];
    this.historySize = historySize;
    this.emphasisThreshold = emphasisThreshold;
  }
  /**
   * Record a new energy reading and compare it with the average of the
   * earlier readings still held in the history window.
   *
   * @param energy Current energy value (0-1)
   * @returns Object with isEmphasis flag and emphasisStrength
   */
  process(energy) {
    const history = this.energyHistory;
    history.push(energy);
    // Keep the window bounded by dropping the oldest reading.
    if (history.length > this.historySize) {
      history.shift();
    }
    // At least two prior frames are required before a rise can be judged.
    if (history.length < 3) {
      return { isEmphasis: false, emphasisStrength: 0 };
    }
    // Average every frame except the one just pushed.
    const priorCount = history.length - 1;
    let priorTotal = 0;
    for (let idx = 0; idx < priorCount; idx += 1) {
      priorTotal += history[idx];
    }
    const jump = energy - priorTotal / priorCount;
    if (jump > this.emphasisThreshold) {
      return { isEmphasis: true, emphasisStrength: Math.min(1, jump / 0.3) };
    }
    return { isEmphasis: false, emphasisStrength: 0 };
  }
  /**
   * Forget all recorded energy readings
   */
  reset() {
    this.energyHistory = [];
  }
};
|
|
7113
|
+
|
|
7429
7114
|
// src/inference/SileroVADInference.ts
|
|
7430
|
-
var
|
|
7115
|
+
var logger12 = createLogger("SileroVAD");
|
|
7431
7116
|
var SileroVADInference = class {
|
|
7432
7117
|
constructor(config) {
|
|
7433
7118
|
this.session = null;
|
|
@@ -7501,23 +7186,23 @@ var SileroVADInference = class {
|
|
|
7501
7186
|
"model.sample_rate": this.config.sampleRate
|
|
7502
7187
|
});
|
|
7503
7188
|
try {
|
|
7504
|
-
|
|
7189
|
+
logger12.info("Loading ONNX Runtime...", { preference: this.config.backend });
|
|
7505
7190
|
const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
|
|
7506
7191
|
this.ort = ort;
|
|
7507
7192
|
this._backend = backend;
|
|
7508
|
-
|
|
7193
|
+
logger12.info("ONNX Runtime loaded", { backend: this._backend });
|
|
7509
7194
|
const cache = getModelCache();
|
|
7510
7195
|
const modelUrl = this.config.modelUrl;
|
|
7511
7196
|
const isCached = await cache.has(modelUrl);
|
|
7512
7197
|
let modelBuffer;
|
|
7513
7198
|
if (isCached) {
|
|
7514
|
-
|
|
7199
|
+
logger12.debug("Loading model from cache", { modelUrl });
|
|
7515
7200
|
modelBuffer = await cache.get(modelUrl);
|
|
7516
7201
|
} else {
|
|
7517
|
-
|
|
7202
|
+
logger12.debug("Fetching and caching model", { modelUrl });
|
|
7518
7203
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
7519
7204
|
}
|
|
7520
|
-
|
|
7205
|
+
logger12.debug("Creating ONNX session", {
|
|
7521
7206
|
size: formatBytes(modelBuffer.byteLength),
|
|
7522
7207
|
backend: this._backend
|
|
7523
7208
|
});
|
|
@@ -7526,7 +7211,7 @@ var SileroVADInference = class {
|
|
|
7526
7211
|
this.session = await ort.InferenceSession.create(modelData, sessionOptions);
|
|
7527
7212
|
this.reset();
|
|
7528
7213
|
const loadTimeMs = performance.now() - startTime;
|
|
7529
|
-
|
|
7214
|
+
logger12.info("Model loaded successfully", {
|
|
7530
7215
|
backend: this._backend,
|
|
7531
7216
|
loadTimeMs: Math.round(loadTimeMs),
|
|
7532
7217
|
sampleRate: this.config.sampleRate,
|
|
@@ -7581,7 +7266,7 @@ var SileroVADInference = class {
|
|
|
7581
7266
|
[]
|
|
7582
7267
|
);
|
|
7583
7268
|
} catch (e) {
|
|
7584
|
-
|
|
7269
|
+
logger12.warn("BigInt64Array not available, using bigint array fallback", {
|
|
7585
7270
|
error: e instanceof Error ? e.message : String(e)
|
|
7586
7271
|
});
|
|
7587
7272
|
this.srTensor = new this.ort.Tensor(
|
|
@@ -7687,7 +7372,7 @@ var SileroVADInference = class {
|
|
|
7687
7372
|
this.preSpeechBuffer.shift();
|
|
7688
7373
|
}
|
|
7689
7374
|
}
|
|
7690
|
-
|
|
7375
|
+
logger12.trace("Skipping VAD inference - audio too quiet", {
|
|
7691
7376
|
rms: Math.round(rms * 1e4) / 1e4,
|
|
7692
7377
|
threshold: MIN_ENERGY_THRESHOLD
|
|
7693
7378
|
});
|
|
@@ -7741,7 +7426,7 @@ var SileroVADInference = class {
|
|
|
7741
7426
|
if (isSpeech && !this.wasSpeaking) {
|
|
7742
7427
|
preSpeechChunks = [...this.preSpeechBuffer];
|
|
7743
7428
|
this.preSpeechBuffer = [];
|
|
7744
|
-
|
|
7429
|
+
logger12.debug("Speech started with pre-speech buffer", {
|
|
7745
7430
|
preSpeechChunks: preSpeechChunks.length,
|
|
7746
7431
|
durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
|
|
7747
7432
|
});
|
|
@@ -7754,7 +7439,7 @@ var SileroVADInference = class {
|
|
|
7754
7439
|
this.preSpeechBuffer = [];
|
|
7755
7440
|
}
|
|
7756
7441
|
this.wasSpeaking = isSpeech;
|
|
7757
|
-
|
|
7442
|
+
logger12.trace("VAD inference completed", {
|
|
7758
7443
|
probability: Math.round(probability * 1e3) / 1e3,
|
|
7759
7444
|
isSpeech,
|
|
7760
7445
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100
|
|
@@ -7785,7 +7470,7 @@ var SileroVADInference = class {
|
|
|
7785
7470
|
const oomError = new Error(
|
|
7786
7471
|
`SileroVAD inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reducing concurrent model sessions or reloading the page.`
|
|
7787
7472
|
);
|
|
7788
|
-
|
|
7473
|
+
logger12.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
7789
7474
|
pointer: `0x${err.toString(16)}`,
|
|
7790
7475
|
backend: this._backend
|
|
7791
7476
|
});
|
|
@@ -7828,7 +7513,7 @@ var SileroVADInference = class {
|
|
|
7828
7513
|
SileroVADInference.isWebGPUAvailable = isWebGPUAvailable;
|
|
7829
7514
|
|
|
7830
7515
|
// src/inference/SileroVADWorker.ts
|
|
7831
|
-
var
|
|
7516
|
+
var logger13 = createLogger("SileroVADWorker");
|
|
7832
7517
|
var WASM_CDN_PATH5 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
7833
7518
|
var LOAD_TIMEOUT_MS3 = 1e4;
|
|
7834
7519
|
var INFERENCE_TIMEOUT_MS3 = 1e3;
|
|
@@ -8106,7 +7791,7 @@ var SileroVADWorker = class {
|
|
|
8106
7791
|
this.handleWorkerMessage(event.data);
|
|
8107
7792
|
};
|
|
8108
7793
|
worker.onerror = (error) => {
|
|
8109
|
-
|
|
7794
|
+
logger13.error("Worker error", { error: error.message });
|
|
8110
7795
|
for (const [, resolver] of this.pendingResolvers) {
|
|
8111
7796
|
resolver.reject(new Error(`Worker error: ${error.message}`));
|
|
8112
7797
|
}
|
|
@@ -8182,9 +7867,9 @@ var SileroVADWorker = class {
|
|
|
8182
7867
|
"model.sample_rate": this.config.sampleRate
|
|
8183
7868
|
});
|
|
8184
7869
|
try {
|
|
8185
|
-
|
|
7870
|
+
logger13.info("Creating VAD worker...");
|
|
8186
7871
|
this.worker = this.createWorker();
|
|
8187
|
-
|
|
7872
|
+
logger13.info("Loading model in worker...", {
|
|
8188
7873
|
modelUrl: this.config.modelUrl,
|
|
8189
7874
|
sampleRate: this.config.sampleRate
|
|
8190
7875
|
});
|
|
@@ -8200,7 +7885,7 @@ var SileroVADWorker = class {
|
|
|
8200
7885
|
);
|
|
8201
7886
|
this._isLoaded = true;
|
|
8202
7887
|
const loadTimeMs = performance.now() - startTime;
|
|
8203
|
-
|
|
7888
|
+
logger13.info("VAD worker loaded successfully", {
|
|
8204
7889
|
backend: "wasm",
|
|
8205
7890
|
loadTimeMs: Math.round(loadTimeMs),
|
|
8206
7891
|
workerLoadTimeMs: Math.round(result.loadTimeMs),
|
|
@@ -8307,7 +7992,7 @@ var SileroVADWorker = class {
|
|
|
8307
7992
|
if (isSpeech && !this.wasSpeaking) {
|
|
8308
7993
|
preSpeechChunks = [...this.preSpeechBuffer];
|
|
8309
7994
|
this.preSpeechBuffer = [];
|
|
8310
|
-
|
|
7995
|
+
logger13.debug("Speech started with pre-speech buffer", {
|
|
8311
7996
|
preSpeechChunks: preSpeechChunks.length,
|
|
8312
7997
|
durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
|
|
8313
7998
|
});
|
|
@@ -8320,7 +8005,7 @@ var SileroVADWorker = class {
|
|
|
8320
8005
|
this.preSpeechBuffer = [];
|
|
8321
8006
|
}
|
|
8322
8007
|
this.wasSpeaking = isSpeech;
|
|
8323
|
-
|
|
8008
|
+
logger13.trace("VAD worker inference completed", {
|
|
8324
8009
|
probability: Math.round(result.probability * 1e3) / 1e3,
|
|
8325
8010
|
isSpeech,
|
|
8326
8011
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
@@ -8388,44 +8073,44 @@ var SileroVADWorker = class {
|
|
|
8388
8073
|
};
|
|
8389
8074
|
|
|
8390
8075
|
// src/inference/createSileroVAD.ts
|
|
8391
|
-
var
|
|
8076
|
+
var logger14 = createLogger("createSileroVAD");
|
|
8392
8077
|
/**
 * Report whether this environment exposes the globals the VAD worker path
 * checks for: `Worker`, `URL.createObjectURL`, and `Blob`.
 * The first missing capability is logged at debug level.
 *
 * @returns true when all three capabilities are present, false otherwise
 */
function supportsVADWorker() {
  let unsupportedReason = null;
  if (typeof Worker === "undefined") {
    unsupportedReason = "Worker not supported: Worker constructor undefined";
  } else if (typeof URL === "undefined" || typeof URL.createObjectURL === "undefined") {
    unsupportedReason = "Worker not supported: URL.createObjectURL unavailable";
  } else if (typeof Blob === "undefined") {
    unsupportedReason = "Worker not supported: Blob constructor unavailable";
  }
  if (unsupportedReason !== null) {
    logger14.debug(unsupportedReason);
    return false;
  }
  return true;
}
|
|
8407
8092
|
function createSileroVAD(config) {
|
|
8408
8093
|
if (config.unifiedWorker) {
|
|
8409
|
-
|
|
8094
|
+
logger14.info("Creating SileroVADUnifiedAdapter (shared unified worker)");
|
|
8410
8095
|
return new SileroVADUnifiedAdapter(config.unifiedWorker, config);
|
|
8411
8096
|
}
|
|
8412
8097
|
const fallbackOnError = config.fallbackOnError ?? true;
|
|
8413
8098
|
let useWorker;
|
|
8414
8099
|
if (config.useWorker !== void 0) {
|
|
8415
8100
|
useWorker = config.useWorker;
|
|
8416
|
-
|
|
8101
|
+
logger14.debug("Worker preference explicitly set", { useWorker });
|
|
8417
8102
|
} else {
|
|
8418
8103
|
const workerSupported = supportsVADWorker();
|
|
8419
8104
|
const onMobile = isMobile();
|
|
8420
8105
|
useWorker = workerSupported && !onMobile;
|
|
8421
|
-
|
|
8106
|
+
logger14.debug("Auto-detected Worker preference", {
|
|
8422
8107
|
useWorker,
|
|
8423
8108
|
workerSupported,
|
|
8424
8109
|
onMobile
|
|
8425
8110
|
});
|
|
8426
8111
|
}
|
|
8427
8112
|
if (useWorker) {
|
|
8428
|
-
|
|
8113
|
+
logger14.info("Creating SileroVADWorker (off-main-thread)");
|
|
8429
8114
|
const worker = new SileroVADWorker({
|
|
8430
8115
|
modelUrl: config.modelUrl,
|
|
8431
8116
|
sampleRate: config.sampleRate,
|
|
@@ -8437,7 +8122,7 @@ function createSileroVAD(config) {
|
|
|
8437
8122
|
}
|
|
8438
8123
|
return worker;
|
|
8439
8124
|
}
|
|
8440
|
-
|
|
8125
|
+
logger14.info("Creating SileroVADInference (main thread)");
|
|
8441
8126
|
return new SileroVADInference(config);
|
|
8442
8127
|
}
|
|
8443
8128
|
var VADWorkerWithFallback = class {
|
|
@@ -8463,7 +8148,7 @@ var VADWorkerWithFallback = class {
|
|
|
8463
8148
|
try {
|
|
8464
8149
|
return await this.implementation.load();
|
|
8465
8150
|
} catch (error) {
|
|
8466
|
-
|
|
8151
|
+
logger14.warn("Worker load failed, falling back to main thread", {
|
|
8467
8152
|
error: error instanceof Error ? error.message : String(error)
|
|
8468
8153
|
});
|
|
8469
8154
|
try {
|
|
@@ -8472,7 +8157,7 @@ var VADWorkerWithFallback = class {
|
|
|
8472
8157
|
}
|
|
8473
8158
|
this.implementation = new SileroVADInference(this.config);
|
|
8474
8159
|
this.hasFallenBack = true;
|
|
8475
|
-
|
|
8160
|
+
logger14.info("Fallback to SileroVADInference successful");
|
|
8476
8161
|
return await this.implementation.load();
|
|
8477
8162
|
}
|
|
8478
8163
|
}
|
|
@@ -8493,8 +8178,175 @@ var VADWorkerWithFallback = class {
|
|
|
8493
8178
|
}
|
|
8494
8179
|
};
|
|
8495
8180
|
|
|
8181
|
+
// src/inference/A2EOrchestrator.ts
|
|
8182
|
+
var logger15 = createLogger("A2EOrchestrator");
|
|
8183
|
+
var A2EOrchestrator = class {
|
|
8184
|
+
constructor(config) {
|
|
8185
|
+
this.a2e = null;
|
|
8186
|
+
this.processor = null;
|
|
8187
|
+
// Mic capture state (lightweight — no dependency on MicrophoneCapture class
|
|
8188
|
+
// which requires an external EventEmitter. We do raw Web Audio here.)
|
|
8189
|
+
this.stream = null;
|
|
8190
|
+
this.audioContext = null;
|
|
8191
|
+
this.scriptProcessor = null;
|
|
8192
|
+
this.nativeSampleRate = 0;
|
|
8193
|
+
this._isReady = false;
|
|
8194
|
+
this._isStreaming = false;
|
|
8195
|
+
this._backend = null;
|
|
8196
|
+
this.disposed = false;
|
|
8197
|
+
this.config = {
|
|
8198
|
+
sampleRate: 16e3,
|
|
8199
|
+
...config
|
|
8200
|
+
};
|
|
8201
|
+
}
|
|
8202
|
+
/** Latest blendshape weights from inference (null if none yet) */
|
|
8203
|
+
get latestWeights() {
|
|
8204
|
+
return this.processor?.latestFrame ?? null;
|
|
8205
|
+
}
|
|
8206
|
+
/** Whether the model is loaded and ready for inference */
|
|
8207
|
+
get isReady() {
|
|
8208
|
+
return this._isReady;
|
|
8209
|
+
}
|
|
8210
|
+
/** Whether mic is active and inference loop is running */
|
|
8211
|
+
get isStreaming() {
|
|
8212
|
+
return this._isStreaming;
|
|
8213
|
+
}
|
|
8214
|
+
/** Current backend type (webgpu, wasm, or null) */
|
|
8215
|
+
get backend() {
|
|
8216
|
+
return this._backend;
|
|
8217
|
+
}
|
|
8218
|
+
/**
|
|
8219
|
+
* Load the A2E model and create the processor
|
|
8220
|
+
*/
|
|
8221
|
+
async load() {
|
|
8222
|
+
if (this.disposed) throw new Error("A2EOrchestrator has been disposed");
|
|
8223
|
+
logger15.info("Loading A2E model...");
|
|
8224
|
+
this.a2e = createA2E({
|
|
8225
|
+
gpuModelUrl: this.config.gpuModelUrl,
|
|
8226
|
+
gpuExternalDataUrl: this.config.gpuExternalDataUrl,
|
|
8227
|
+
cpuModelUrl: this.config.cpuModelUrl ?? this.config.gpuModelUrl,
|
|
8228
|
+
...this.config.a2eConfig
|
|
8229
|
+
});
|
|
8230
|
+
const info = await this.a2e.load();
|
|
8231
|
+
this._backend = info.backend;
|
|
8232
|
+
this.processor = new A2EProcessor({
|
|
8233
|
+
backend: this.a2e,
|
|
8234
|
+
sampleRate: this.config.sampleRate,
|
|
8235
|
+
chunkSize: this.config.chunkSize,
|
|
8236
|
+
onFrame: this.config.onFrame,
|
|
8237
|
+
onError: this.config.onError
|
|
8238
|
+
});
|
|
8239
|
+
this._isReady = true;
|
|
8240
|
+
logger15.info("A2E model loaded", {
|
|
8241
|
+
backend: info.backend,
|
|
8242
|
+
loadTimeMs: info.loadTimeMs,
|
|
8243
|
+
modelId: this.a2e.modelId
|
|
8244
|
+
});
|
|
8245
|
+
this.config.onReady?.();
|
|
8246
|
+
}
|
|
8247
|
+
/**
|
|
8248
|
+
* Start mic capture and inference loop
|
|
8249
|
+
*/
|
|
8250
|
+
async start() {
|
|
8251
|
+
if (this.disposed) throw new Error("A2EOrchestrator has been disposed");
|
|
8252
|
+
if (!this._isReady || !this.processor) throw new Error("Model not loaded. Call load() first.");
|
|
8253
|
+
if (this._isStreaming) return;
|
|
8254
|
+
try {
|
|
8255
|
+
this.stream = await navigator.mediaDevices.getUserMedia({
|
|
8256
|
+
audio: {
|
|
8257
|
+
sampleRate: { ideal: this.config.sampleRate },
|
|
8258
|
+
channelCount: 1,
|
|
8259
|
+
echoCancellation: true,
|
|
8260
|
+
noiseSuppression: true,
|
|
8261
|
+
autoGainControl: true
|
|
8262
|
+
}
|
|
8263
|
+
});
|
|
8264
|
+
this.audioContext = new AudioContext({ sampleRate: this.config.sampleRate });
|
|
8265
|
+
if (this.audioContext.state === "suspended") {
|
|
8266
|
+
await this.audioContext.resume();
|
|
8267
|
+
}
|
|
8268
|
+
this.nativeSampleRate = this.audioContext.sampleRate;
|
|
8269
|
+
const source = this.audioContext.createMediaStreamSource(this.stream);
|
|
8270
|
+
this.scriptProcessor = this.audioContext.createScriptProcessor(4096, 1, 1);
|
|
8271
|
+
this.scriptProcessor.onaudioprocess = (e) => {
|
|
8272
|
+
if (!this._isStreaming || !this.processor) return;
|
|
8273
|
+
const input = e.inputBuffer.getChannelData(0);
|
|
8274
|
+
let samples;
|
|
8275
|
+
if (this.nativeSampleRate !== this.config.sampleRate) {
|
|
8276
|
+
const ratio = this.config.sampleRate / this.nativeSampleRate;
|
|
8277
|
+
const newLen = Math.round(input.length * ratio);
|
|
8278
|
+
samples = new Float32Array(newLen);
|
|
8279
|
+
for (let i = 0; i < newLen; i++) {
|
|
8280
|
+
const srcIdx = i / ratio;
|
|
8281
|
+
const lo = Math.floor(srcIdx);
|
|
8282
|
+
const hi = Math.min(lo + 1, input.length - 1);
|
|
8283
|
+
const frac = srcIdx - lo;
|
|
8284
|
+
samples[i] = input[lo] * (1 - frac) + input[hi] * frac;
|
|
8285
|
+
}
|
|
8286
|
+
} else {
|
|
8287
|
+
samples = new Float32Array(input);
|
|
8288
|
+
}
|
|
8289
|
+
this.processor.pushAudio(samples);
|
|
8290
|
+
};
|
|
8291
|
+
source.connect(this.scriptProcessor);
|
|
8292
|
+
this.scriptProcessor.connect(this.audioContext.destination);
|
|
8293
|
+
this._isStreaming = true;
|
|
8294
|
+
this.processor.startDrip();
|
|
8295
|
+
logger15.info("Mic capture started", { sampleRate: this.nativeSampleRate });
|
|
8296
|
+
} catch (err) {
|
|
8297
|
+
const error = err instanceof Error ? err : new Error(String(err));
|
|
8298
|
+
logger15.error("Failed to start mic capture", { error: error.message });
|
|
8299
|
+
this.config.onError?.(error);
|
|
8300
|
+
throw error;
|
|
8301
|
+
}
|
|
8302
|
+
}
|
|
8303
|
+
/**
|
|
8304
|
+
* Stop mic capture and inference loop
|
|
8305
|
+
*/
|
|
8306
|
+
stop() {
|
|
8307
|
+
this._isStreaming = false;
|
|
8308
|
+
if (this.processor) {
|
|
8309
|
+
this.processor.stopDrip();
|
|
8310
|
+
this.processor.reset();
|
|
8311
|
+
}
|
|
8312
|
+
if (this.scriptProcessor) {
|
|
8313
|
+
this.scriptProcessor.disconnect();
|
|
8314
|
+
this.scriptProcessor.onaudioprocess = null;
|
|
8315
|
+
this.scriptProcessor = null;
|
|
8316
|
+
}
|
|
8317
|
+
if (this.stream) {
|
|
8318
|
+
this.stream.getTracks().forEach((t) => t.stop());
|
|
8319
|
+
this.stream = null;
|
|
8320
|
+
}
|
|
8321
|
+
if (this.audioContext) {
|
|
8322
|
+
this.audioContext.close().catch(() => {
|
|
8323
|
+
});
|
|
8324
|
+
this.audioContext = null;
|
|
8325
|
+
}
|
|
8326
|
+
logger15.info("Mic capture stopped");
|
|
8327
|
+
}
|
|
8328
|
+
/**
|
|
8329
|
+
* Dispose of all resources
|
|
8330
|
+
*/
|
|
8331
|
+
async dispose() {
|
|
8332
|
+
if (this.disposed) return;
|
|
8333
|
+
this.disposed = true;
|
|
8334
|
+
this.stop();
|
|
8335
|
+
if (this.processor) {
|
|
8336
|
+
this.processor.dispose();
|
|
8337
|
+
this.processor = null;
|
|
8338
|
+
}
|
|
8339
|
+
if (this.a2e) {
|
|
8340
|
+
await this.a2e.dispose();
|
|
8341
|
+
this.a2e = null;
|
|
8342
|
+
}
|
|
8343
|
+
this._isReady = false;
|
|
8344
|
+
this._backend = null;
|
|
8345
|
+
}
|
|
8346
|
+
};
|
|
8347
|
+
|
|
8496
8348
|
// src/inference/SafariSpeechRecognition.ts
|
|
8497
|
-
var
|
|
8349
|
+
var logger16 = createLogger("SafariSpeech");
|
|
8498
8350
|
var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
8499
8351
|
constructor(config = {}) {
|
|
8500
8352
|
this.recognition = null;
|
|
@@ -8513,7 +8365,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8513
8365
|
interimResults: config.interimResults ?? true,
|
|
8514
8366
|
maxAlternatives: config.maxAlternatives ?? 1
|
|
8515
8367
|
};
|
|
8516
|
-
|
|
8368
|
+
logger16.debug("SafariSpeechRecognition created", {
|
|
8517
8369
|
language: this.config.language,
|
|
8518
8370
|
continuous: this.config.continuous
|
|
8519
8371
|
});
|
|
@@ -8574,7 +8426,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8574
8426
|
*/
|
|
8575
8427
|
async start() {
|
|
8576
8428
|
if (this.isListening) {
|
|
8577
|
-
|
|
8429
|
+
logger16.warn("Already listening");
|
|
8578
8430
|
return;
|
|
8579
8431
|
}
|
|
8580
8432
|
if (!_SafariSpeechRecognition.isAvailable()) {
|
|
@@ -8604,7 +8456,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8604
8456
|
this.isListening = true;
|
|
8605
8457
|
this.startTime = performance.now();
|
|
8606
8458
|
this.accumulatedText = "";
|
|
8607
|
-
|
|
8459
|
+
logger16.info("Speech recognition started", {
|
|
8608
8460
|
language: this.config.language
|
|
8609
8461
|
});
|
|
8610
8462
|
span?.end();
|
|
@@ -8619,7 +8471,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8619
8471
|
*/
|
|
8620
8472
|
async stop() {
|
|
8621
8473
|
if (!this.isListening || !this.recognition) {
|
|
8622
|
-
|
|
8474
|
+
logger16.warn("Not currently listening");
|
|
8623
8475
|
return {
|
|
8624
8476
|
text: this.accumulatedText,
|
|
8625
8477
|
language: this.config.language,
|
|
@@ -8648,7 +8500,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8648
8500
|
if (this.recognition && this.isListening) {
|
|
8649
8501
|
this.recognition.abort();
|
|
8650
8502
|
this.isListening = false;
|
|
8651
|
-
|
|
8503
|
+
logger16.info("Speech recognition aborted");
|
|
8652
8504
|
}
|
|
8653
8505
|
}
|
|
8654
8506
|
/**
|
|
@@ -8679,7 +8531,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8679
8531
|
this.isListening = false;
|
|
8680
8532
|
this.resultCallbacks = [];
|
|
8681
8533
|
this.errorCallbacks = [];
|
|
8682
|
-
|
|
8534
|
+
logger16.debug("SafariSpeechRecognition disposed");
|
|
8683
8535
|
}
|
|
8684
8536
|
/**
|
|
8685
8537
|
* Set up event handlers for the recognition instance
|
|
@@ -8707,7 +8559,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8707
8559
|
confidence: alternative.confidence
|
|
8708
8560
|
};
|
|
8709
8561
|
this.emitResult(speechResult);
|
|
8710
|
-
|
|
8562
|
+
logger16.trace("Speech result", {
|
|
8711
8563
|
text: text.substring(0, 50),
|
|
8712
8564
|
isFinal,
|
|
8713
8565
|
confidence: alternative.confidence
|
|
@@ -8717,12 +8569,12 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8717
8569
|
span?.end();
|
|
8718
8570
|
} catch (error) {
|
|
8719
8571
|
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
8720
|
-
|
|
8572
|
+
logger16.error("Error processing speech result", { error });
|
|
8721
8573
|
}
|
|
8722
8574
|
};
|
|
8723
8575
|
this.recognition.onerror = (event) => {
|
|
8724
8576
|
const error = new Error(`Speech recognition error: ${event.error} - ${event.message}`);
|
|
8725
|
-
|
|
8577
|
+
logger16.error("Speech recognition error", { error: event.error, message: event.message });
|
|
8726
8578
|
this.emitError(error);
|
|
8727
8579
|
if (this.stopRejecter) {
|
|
8728
8580
|
this.stopRejecter(error);
|
|
@@ -8732,7 +8584,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8732
8584
|
};
|
|
8733
8585
|
this.recognition.onend = () => {
|
|
8734
8586
|
this.isListening = false;
|
|
8735
|
-
|
|
8587
|
+
logger16.info("Speech recognition ended", {
|
|
8736
8588
|
totalText: this.accumulatedText.length,
|
|
8737
8589
|
durationMs: performance.now() - this.startTime
|
|
8738
8590
|
});
|
|
@@ -8749,13 +8601,13 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8749
8601
|
}
|
|
8750
8602
|
};
|
|
8751
8603
|
this.recognition.onstart = () => {
|
|
8752
|
-
|
|
8604
|
+
logger16.debug("Speech recognition started by browser");
|
|
8753
8605
|
};
|
|
8754
8606
|
this.recognition.onspeechstart = () => {
|
|
8755
|
-
|
|
8607
|
+
logger16.debug("Speech detected");
|
|
8756
8608
|
};
|
|
8757
8609
|
this.recognition.onspeechend = () => {
|
|
8758
|
-
|
|
8610
|
+
logger16.debug("Speech ended");
|
|
8759
8611
|
};
|
|
8760
8612
|
}
|
|
8761
8613
|
/**
|
|
@@ -8766,7 +8618,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8766
8618
|
try {
|
|
8767
8619
|
callback(result);
|
|
8768
8620
|
} catch (error) {
|
|
8769
|
-
|
|
8621
|
+
logger16.error("Error in result callback", { error });
|
|
8770
8622
|
}
|
|
8771
8623
|
}
|
|
8772
8624
|
}
|
|
@@ -8778,7 +8630,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8778
8630
|
try {
|
|
8779
8631
|
callback(error);
|
|
8780
8632
|
} catch (callbackError) {
|
|
8781
|
-
|
|
8633
|
+
logger16.error("Error in error callback", { error: callbackError });
|
|
8782
8634
|
}
|
|
8783
8635
|
}
|
|
8784
8636
|
}
|
|
@@ -9105,11 +8957,14 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
9105
8957
|
return new Promise((resolve) => {
|
|
9106
8958
|
const timeout = setTimeout(() => resolve(false), 5e3);
|
|
9107
8959
|
const handler = (event) => {
|
|
9108
|
-
|
|
9109
|
-
|
|
9110
|
-
|
|
9111
|
-
|
|
9112
|
-
|
|
8960
|
+
try {
|
|
8961
|
+
const data = JSON.parse(event.data);
|
|
8962
|
+
if (data.type === "pong") {
|
|
8963
|
+
clearTimeout(timeout);
|
|
8964
|
+
this.ws?.removeEventListener("message", handler);
|
|
8965
|
+
resolve(true);
|
|
8966
|
+
}
|
|
8967
|
+
} catch {
|
|
9113
8968
|
}
|
|
9114
8969
|
};
|
|
9115
8970
|
this.ws?.addEventListener("message", handler);
|
|
@@ -9188,13 +9043,14 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
9188
9043
|
if (!this.lam) {
|
|
9189
9044
|
throw new Error("LAM must be initialized before pipeline");
|
|
9190
9045
|
}
|
|
9191
|
-
this.pipeline = new
|
|
9046
|
+
this.pipeline = new FullFacePipeline({
|
|
9192
9047
|
lam: this.lam,
|
|
9193
9048
|
sampleRate: 16e3,
|
|
9194
9049
|
chunkTargetMs: 200
|
|
9195
9050
|
});
|
|
9196
9051
|
await this.pipeline.initialize();
|
|
9197
|
-
this.pipeline.on("
|
|
9052
|
+
this.pipeline.on("full_frame_ready", (fullFrame) => {
|
|
9053
|
+
const frame = fullFrame.blendshapes;
|
|
9198
9054
|
this.emit("animation", {
|
|
9199
9055
|
blendshapes: frame,
|
|
9200
9056
|
get: (name) => {
|
|
@@ -9235,7 +9091,10 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
9235
9091
|
}));
|
|
9236
9092
|
};
|
|
9237
9093
|
this.ws.onmessage = (event) => {
|
|
9238
|
-
|
|
9094
|
+
try {
|
|
9095
|
+
this.handleAgentCoreMessage(JSON.parse(event.data));
|
|
9096
|
+
} catch {
|
|
9097
|
+
}
|
|
9239
9098
|
};
|
|
9240
9099
|
this.ws.onerror = () => {
|
|
9241
9100
|
reject(new Error("WebSocket connection failed"));
|
|
@@ -9247,14 +9106,17 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
9247
9106
|
reject(new Error("Auth timeout"));
|
|
9248
9107
|
}, 1e4);
|
|
9249
9108
|
const authHandler = (event) => {
|
|
9250
|
-
|
|
9251
|
-
|
|
9252
|
-
|
|
9253
|
-
|
|
9254
|
-
|
|
9255
|
-
|
|
9256
|
-
|
|
9257
|
-
|
|
9109
|
+
try {
|
|
9110
|
+
const data = JSON.parse(event.data);
|
|
9111
|
+
if (data.type === "auth_success") {
|
|
9112
|
+
clearTimeout(authTimeout);
|
|
9113
|
+
this.ws?.removeEventListener("message", authHandler);
|
|
9114
|
+
resolve();
|
|
9115
|
+
} else if (data.type === "auth_failed") {
|
|
9116
|
+
clearTimeout(authTimeout);
|
|
9117
|
+
reject(new Error(data.message));
|
|
9118
|
+
}
|
|
9119
|
+
} catch {
|
|
9258
9120
|
}
|
|
9259
9121
|
};
|
|
9260
9122
|
this.ws.addEventListener("message", authHandler);
|
|
@@ -9367,9 +9229,9 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
9367
9229
|
});
|
|
9368
9230
|
}
|
|
9369
9231
|
}
|
|
9370
|
-
// REMOVED: processAudioForAnimation() - now handled by
|
|
9232
|
+
// REMOVED: processAudioForAnimation() - now handled by FullFacePipeline
|
|
9371
9233
|
// The pipeline manages audio scheduling, LAM inference, and frame synchronization
|
|
9372
|
-
// Frames are emitted via pipeline.on('
|
|
9234
|
+
// Frames are emitted via pipeline.on('full_frame_ready') event (see initPipeline())
|
|
9373
9235
|
/**
|
|
9374
9236
|
* Detect voice activity using Silero VAD
|
|
9375
9237
|
* Falls back to simple RMS if VAD not available
|
|
@@ -10063,20 +9925,6 @@ var InterruptionHandler = class extends EventEmitter {
|
|
|
10063
9925
|
this.onSilenceDetected();
|
|
10064
9926
|
}
|
|
10065
9927
|
}
|
|
10066
|
-
/**
|
|
10067
|
-
* @deprecated Use processVADResult() instead. This method uses naive RMS detection.
|
|
10068
|
-
* Process audio samples for VAD (legacy - uses simple RMS)
|
|
10069
|
-
*/
|
|
10070
|
-
processAudio(samples) {
|
|
10071
|
-
if (!this.config.enabled) return;
|
|
10072
|
-
const rms = this.calculateRMS(samples);
|
|
10073
|
-
const vadProbability = Math.min(rms / 0.02, 1);
|
|
10074
|
-
if (vadProbability > this.config.vadThreshold) {
|
|
10075
|
-
this.onSpeechDetected(rms);
|
|
10076
|
-
} else {
|
|
10077
|
-
this.onSilenceDetected();
|
|
10078
|
-
}
|
|
10079
|
-
}
|
|
10080
9928
|
/**
|
|
10081
9929
|
* Notify that AI started speaking
|
|
10082
9930
|
*/
|
|
@@ -10121,15 +9969,6 @@ var InterruptionHandler = class extends EventEmitter {
|
|
|
10121
9969
|
};
|
|
10122
9970
|
}
|
|
10123
9971
|
// ==================== Private Methods ====================
|
|
10124
|
-
calculateRMS(samples) {
|
|
10125
|
-
let sum = 0;
|
|
10126
|
-
const scale = samples instanceof Int16Array ? 32768 : 1;
|
|
10127
|
-
for (let i = 0; i < samples.length; i++) {
|
|
10128
|
-
const sample = samples[i] / scale;
|
|
10129
|
-
sum += sample * sample;
|
|
10130
|
-
}
|
|
10131
|
-
return Math.sqrt(sum / samples.length);
|
|
10132
|
-
}
|
|
10133
9972
|
onSpeechDetected(rms) {
|
|
10134
9973
|
const now = Date.now();
|
|
10135
9974
|
this.lastSpeechTime = now;
|
|
@@ -11203,6 +11042,8 @@ function isProtocolEvent(obj) {
|
|
|
11203
11042
|
return typeof obj === "object" && obj !== null && "v" in obj && "type" in obj && "ts" in obj;
|
|
11204
11043
|
}
|
|
11205
11044
|
export {
|
|
11045
|
+
A2EOrchestrator,
|
|
11046
|
+
A2EProcessor,
|
|
11206
11047
|
ARKIT_BLENDSHAPES,
|
|
11207
11048
|
AgentCoreAdapter,
|
|
11208
11049
|
AnimationGraph,
|
|
@@ -11210,23 +11051,22 @@ export {
|
|
|
11210
11051
|
AudioEnergyAnalyzer,
|
|
11211
11052
|
AudioScheduler,
|
|
11212
11053
|
AudioSyncManager,
|
|
11054
|
+
BLENDSHAPE_TO_GROUP,
|
|
11055
|
+
BlendshapeSmoother,
|
|
11213
11056
|
CTC_VOCAB,
|
|
11214
11057
|
ConsoleExporter,
|
|
11215
11058
|
ConversationOrchestrator,
|
|
11216
11059
|
DEFAULT_ANIMATION_CONFIG,
|
|
11217
11060
|
DEFAULT_LOGGING_CONFIG,
|
|
11218
|
-
EMOTION_ARKIT_MAP,
|
|
11219
11061
|
EMOTION_NAMES,
|
|
11220
11062
|
EMOTION_VECTOR_SIZE,
|
|
11221
11063
|
EmotionController,
|
|
11222
11064
|
EmotionPresets,
|
|
11223
|
-
EmotionToBlendshapeMapper,
|
|
11224
11065
|
EmphasisDetector,
|
|
11225
11066
|
EventEmitter,
|
|
11226
11067
|
FullFacePipeline,
|
|
11227
11068
|
INFERENCE_LATENCY_BUCKETS,
|
|
11228
11069
|
InterruptionHandler,
|
|
11229
|
-
LAMPipeline,
|
|
11230
11070
|
LAM_BLENDSHAPES,
|
|
11231
11071
|
LOG_LEVEL_PRIORITY,
|
|
11232
11072
|
MODEL_LOAD_TIME_BUCKETS,
|
|
@@ -11245,73 +11085,54 @@ export {
|
|
|
11245
11085
|
SileroVADInference,
|
|
11246
11086
|
SileroVADUnifiedAdapter,
|
|
11247
11087
|
SileroVADWorker,
|
|
11248
|
-
SyncedAudioPipeline,
|
|
11249
11088
|
TenantManager,
|
|
11250
|
-
UPPER_FACE_BLENDSHAPES,
|
|
11251
11089
|
UnifiedInferenceWorker,
|
|
11252
|
-
WAV2ARKIT_BLENDSHAPES,
|
|
11253
11090
|
Wav2ArkitCpuInference,
|
|
11254
11091
|
Wav2ArkitCpuUnifiedAdapter,
|
|
11255
11092
|
Wav2ArkitCpuWorker,
|
|
11256
11093
|
Wav2Vec2Inference,
|
|
11257
|
-
applyCMVN,
|
|
11258
|
-
applyLFR,
|
|
11259
11094
|
blendEmotions,
|
|
11260
11095
|
calculatePeak,
|
|
11261
11096
|
calculateRMS,
|
|
11262
|
-
computeKaldiFbank,
|
|
11263
11097
|
configureCacheLimit,
|
|
11264
11098
|
configureLogging,
|
|
11265
11099
|
configureTelemetry,
|
|
11100
|
+
createA2E,
|
|
11266
11101
|
createEmotionVector,
|
|
11267
|
-
createLipSync,
|
|
11268
11102
|
createLogger,
|
|
11269
11103
|
createSenseVoice,
|
|
11270
|
-
createSessionWithFallback,
|
|
11271
11104
|
createSileroVAD,
|
|
11272
|
-
ctcGreedyDecode,
|
|
11273
11105
|
fetchWithCache,
|
|
11274
11106
|
formatBytes,
|
|
11275
11107
|
getCacheConfig,
|
|
11276
11108
|
getCacheKey,
|
|
11277
11109
|
getEmotionPreset,
|
|
11278
|
-
getLoadedBackend,
|
|
11279
11110
|
getLoggingConfig,
|
|
11280
11111
|
getModelCache,
|
|
11281
|
-
getOnnxRuntime,
|
|
11282
|
-
getOnnxRuntimeForPreference,
|
|
11283
11112
|
getOptimalWasmThreads,
|
|
11284
11113
|
getRecommendedBackend,
|
|
11285
|
-
getSessionOptions,
|
|
11286
11114
|
getTelemetry,
|
|
11287
11115
|
hasWebGPUApi,
|
|
11288
11116
|
isAndroid,
|
|
11289
11117
|
isIOS,
|
|
11290
11118
|
isIOSSafari,
|
|
11291
11119
|
isMobile,
|
|
11292
|
-
isOnnxRuntimeLoaded,
|
|
11293
11120
|
isProtocolEvent,
|
|
11294
11121
|
isSafari,
|
|
11295
11122
|
isSpeechRecognitionAvailable,
|
|
11296
11123
|
isWebGPUAvailable,
|
|
11124
|
+
lerpBlendshapes,
|
|
11297
11125
|
lerpEmotion,
|
|
11298
11126
|
noopLogger,
|
|
11299
|
-
parseCMVNFromMetadata,
|
|
11300
|
-
parseTokensFile,
|
|
11301
11127
|
preloadModels,
|
|
11302
|
-
preloadOnnxRuntime,
|
|
11303
|
-
remapWav2ArkitToLam,
|
|
11304
11128
|
resetLoggingConfig,
|
|
11305
11129
|
resolveBackend,
|
|
11306
|
-
resolveLanguageId,
|
|
11307
|
-
resolveTextNormId,
|
|
11308
11130
|
setLogLevel,
|
|
11309
11131
|
setLoggingEnabled,
|
|
11310
11132
|
shouldEnableWasmProxy,
|
|
11311
|
-
|
|
11133
|
+
shouldUseCpuA2E,
|
|
11312
11134
|
shouldUseNativeASR,
|
|
11313
|
-
|
|
11314
|
-
supportsVADWorker
|
|
11315
|
-
symmetrizeBlendshapes
|
|
11135
|
+
shouldUseServerA2E,
|
|
11136
|
+
supportsVADWorker
|
|
11316
11137
|
};
|
|
11317
11138
|
//# sourceMappingURL=index.mjs.map
|