@omote/core 0.4.7 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +470 -861
- package/dist/index.d.ts +470 -861
- package/dist/index.js +1383 -1565
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +949 -1131
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.mjs
CHANGED
|
@@ -485,730 +485,279 @@ var AudioChunkCoalescer = class {
|
|
|
485
485
|
}
|
|
486
486
|
};
|
|
487
487
|
|
|
488
|
-
// src/
|
|
489
|
-
var
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
this.
|
|
495
|
-
// LAM outputs 30fps
|
|
496
|
-
this.buffer = new Float32Array(0);
|
|
488
|
+
// src/inference/A2EProcessor.ts
|
|
489
|
+
var logger = createLogger("A2EProcessor");
|
|
490
|
+
var FRAME_RATE = 30;
|
|
491
|
+
var DRIP_INTERVAL_MS = 33;
|
|
492
|
+
var A2EProcessor = class {
|
|
493
|
+
constructor(config) {
|
|
494
|
+
this.writeOffset = 0;
|
|
497
495
|
this.bufferStartTime = 0;
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
this.
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
496
|
+
// Frame queues (timestamped for pull mode, plain for drip mode)
|
|
497
|
+
this.timestampedQueue = [];
|
|
498
|
+
this.plainQueue = [];
|
|
499
|
+
// Push mode state
|
|
500
|
+
this._latestFrame = null;
|
|
501
|
+
this.dripInterval = null;
|
|
502
|
+
// Last-frame-hold for pull mode (prevents avatar freezing between frames)
|
|
503
|
+
this.lastPulledFrame = null;
|
|
504
|
+
// Inference serialization
|
|
505
|
+
this.inferenceRunning = false;
|
|
506
|
+
this.pendingChunks = [];
|
|
507
|
+
// Diagnostic: track getFrameForTime calls
|
|
508
|
+
this.getFrameCallCount = 0;
|
|
509
|
+
this.disposed = false;
|
|
510
|
+
this.backend = config.backend;
|
|
511
|
+
this.sampleRate = config.sampleRate ?? 16e3;
|
|
512
|
+
this.chunkSize = config.chunkSize ?? config.backend.chunkSize ?? 16e3;
|
|
513
|
+
this.onFrame = config.onFrame;
|
|
514
|
+
this.onError = config.onError;
|
|
515
|
+
this.bufferCapacity = this.chunkSize * 2;
|
|
516
|
+
this.buffer = new Float32Array(this.bufferCapacity);
|
|
517
|
+
}
|
|
518
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
519
|
+
// Audio Input
|
|
520
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
521
|
+
/**
|
|
522
|
+
* Push audio samples for inference (any source: mic, TTS, file).
|
|
507
523
|
*
|
|
508
|
-
*
|
|
509
|
-
*
|
|
524
|
+
* - With `timestamp`: frames stored with timestamps (pull mode)
|
|
525
|
+
* - Without `timestamp`: frames stored in plain queue (drip/push mode)
|
|
510
526
|
*
|
|
511
|
-
*
|
|
512
|
-
* @param timestamp - AudioContext time when these samples start playing
|
|
513
|
-
* @param lam - LAM inference engine
|
|
527
|
+
* Fire-and-forget: returns immediately, inference runs async.
|
|
514
528
|
*/
|
|
515
|
-
|
|
516
|
-
if (this.
|
|
529
|
+
pushAudio(samples, timestamp) {
|
|
530
|
+
if (this.disposed) return;
|
|
531
|
+
if (this.writeOffset === 0 && timestamp !== void 0) {
|
|
517
532
|
this.bufferStartTime = timestamp;
|
|
518
533
|
}
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
this.
|
|
540
|
-
const
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
this.
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
this.buffer = new Float32Array(0);
|
|
551
|
-
this.bufferStartTime = 0;
|
|
552
|
-
}
|
|
553
|
-
}
|
|
554
|
-
/**
|
|
555
|
-
* Get the frame that should be displayed at the current time
|
|
556
|
-
*
|
|
557
|
-
* Automatically removes frames that have already been displayed.
|
|
558
|
-
* This prevents memory leaks from accumulating old frames.
|
|
559
|
-
*
|
|
560
|
-
* Discard Window (prevents premature frame discarding):
|
|
561
|
-
* - WebGPU: 0.5s (LAM inference 20-100ms + RAF jitter + React stalls)
|
|
562
|
-
* - WASM: 1.0s (LAM inference 50-500ms + higher variability)
|
|
563
|
-
*
|
|
564
|
-
* Last-Frame-Hold: Returns last valid frame instead of null to prevent
|
|
565
|
-
* avatar freezing when between frames (RAF at 60fps vs LAM at 30fps).
|
|
566
|
-
*
|
|
567
|
-
* @param currentTime - Current AudioContext time
|
|
568
|
-
* @param lam - LAM inference engine (optional, for backend detection)
|
|
569
|
-
* @returns Current frame, or last frame as fallback, or null if no frames yet
|
|
570
|
-
*/
|
|
571
|
-
getFrameForTime(currentTime, lam) {
|
|
572
|
-
const discardWindow = lam?.backend === "wasm" ? 1 : 0.5;
|
|
573
|
-
let discardedCount = 0;
|
|
574
|
-
while (this.frameQueue.length > 0 && this.frameQueue[0].timestamp < currentTime - discardWindow) {
|
|
575
|
-
const discarded = this.frameQueue.shift();
|
|
576
|
-
discardedCount++;
|
|
577
|
-
if (discardedCount === 1) {
|
|
578
|
-
const ageMs = ((currentTime - discarded.timestamp) * 1e3).toFixed(0);
|
|
579
|
-
console.warn("[LAM] Frame(s) discarded as too old", {
|
|
580
|
-
ageMs,
|
|
581
|
-
discardWindowMs: discardWindow * 1e3,
|
|
582
|
-
queueLength: this.frameQueue.length,
|
|
583
|
-
backend: lam?.backend ?? "unknown"
|
|
584
|
-
});
|
|
585
|
-
}
|
|
586
|
-
}
|
|
587
|
-
if (this.frameQueue.length > 0 && this.frameQueue[0].timestamp <= currentTime) {
|
|
588
|
-
const { frame } = this.frameQueue.shift();
|
|
589
|
-
this.lastFrame = frame;
|
|
590
|
-
return frame;
|
|
591
|
-
}
|
|
592
|
-
return this.lastFrame;
|
|
593
|
-
}
|
|
594
|
-
/**
|
|
595
|
-
* Get all frames in the queue (for debugging/monitoring)
|
|
596
|
-
*/
|
|
597
|
-
getQueuedFrames() {
|
|
598
|
-
return [...this.frameQueue];
|
|
599
|
-
}
|
|
600
|
-
/**
|
|
601
|
-
* Get current buffer fill level (0-1)
|
|
602
|
-
*/
|
|
603
|
-
get fillLevel() {
|
|
604
|
-
return Math.min(1, this.buffer.length / this.REQUIRED_SAMPLES);
|
|
605
|
-
}
|
|
606
|
-
/**
|
|
607
|
-
* Get number of frames queued
|
|
608
|
-
*/
|
|
609
|
-
get queuedFrameCount() {
|
|
610
|
-
return this.frameQueue.length;
|
|
611
|
-
}
|
|
612
|
-
/**
|
|
613
|
-
* Get buffered audio duration in seconds
|
|
614
|
-
*/
|
|
615
|
-
get bufferedDuration() {
|
|
616
|
-
return this.buffer.length / (this.options.sampleRate ?? 16e3);
|
|
617
|
-
}
|
|
618
|
-
/**
|
|
619
|
-
* Flush remaining buffered audio
|
|
620
|
-
*
|
|
621
|
-
* Processes any remaining audio in the buffer, even if less than REQUIRED_SAMPLES.
|
|
622
|
-
* This ensures the final audio chunk generates blendshape frames.
|
|
623
|
-
*
|
|
624
|
-
* Should be called when audio stream ends to prevent losing the last 0-1 seconds.
|
|
625
|
-
*
|
|
626
|
-
* @param lam - LAM inference engine
|
|
627
|
-
*/
|
|
628
|
-
async flush(lam) {
|
|
629
|
-
if (this.buffer.length === 0) {
|
|
630
|
-
return;
|
|
631
|
-
}
|
|
632
|
-
const padded = new Float32Array(this.REQUIRED_SAMPLES);
|
|
633
|
-
padded.set(this.buffer, 0);
|
|
634
|
-
const processedStartTime = this.bufferStartTime;
|
|
635
|
-
try {
|
|
636
|
-
const result = await lam.infer(padded);
|
|
637
|
-
const actualDuration = this.buffer.length / (this.options.sampleRate ?? 16e3);
|
|
638
|
-
const frameDuration = 1 / this.FRAME_RATE;
|
|
639
|
-
const actualFrameCount = Math.ceil(actualDuration * this.FRAME_RATE);
|
|
640
|
-
for (let i = 0; i < Math.min(actualFrameCount, result.blendshapes.length); i++) {
|
|
641
|
-
const frame = result.blendshapes[i];
|
|
642
|
-
const timestamp = processedStartTime + i * frameDuration;
|
|
643
|
-
this.frameQueue.push({ frame, timestamp });
|
|
534
|
+
if (this.writeOffset + samples.length > this.bufferCapacity) {
|
|
535
|
+
this.bufferCapacity = (this.writeOffset + samples.length) * 2;
|
|
536
|
+
const grown = new Float32Array(this.bufferCapacity);
|
|
537
|
+
grown.set(this.buffer.subarray(0, this.writeOffset));
|
|
538
|
+
this.buffer = grown;
|
|
539
|
+
}
|
|
540
|
+
this.buffer.set(samples, this.writeOffset);
|
|
541
|
+
this.writeOffset += samples.length;
|
|
542
|
+
logger.debug("pushAudio", {
|
|
543
|
+
samplesIn: samples.length,
|
|
544
|
+
writeOffset: this.writeOffset,
|
|
545
|
+
chunkSize: this.chunkSize,
|
|
546
|
+
willExtract: this.writeOffset >= this.chunkSize,
|
|
547
|
+
inferenceRunning: this.inferenceRunning,
|
|
548
|
+
pendingChunks: this.pendingChunks.length,
|
|
549
|
+
queuedFrames: this.timestampedQueue.length + this.plainQueue.length
|
|
550
|
+
});
|
|
551
|
+
while (this.writeOffset >= this.chunkSize) {
|
|
552
|
+
const chunk = this.buffer.slice(0, this.chunkSize);
|
|
553
|
+
this.buffer.copyWithin(0, this.chunkSize, this.writeOffset);
|
|
554
|
+
this.writeOffset -= this.chunkSize;
|
|
555
|
+
const chunkTimestamp = timestamp !== void 0 ? this.bufferStartTime : void 0;
|
|
556
|
+
this.pendingChunks.push({ chunk, timestamp: chunkTimestamp });
|
|
557
|
+
logger.info("Chunk queued for inference", {
|
|
558
|
+
chunkSize: chunk.length,
|
|
559
|
+
chunkTimestamp,
|
|
560
|
+
pendingChunks: this.pendingChunks.length,
|
|
561
|
+
remainderOffset: this.writeOffset
|
|
562
|
+
});
|
|
563
|
+
if (timestamp !== void 0) {
|
|
564
|
+
this.bufferStartTime += this.chunkSize / this.sampleRate;
|
|
644
565
|
}
|
|
645
|
-
this.buffer = new Float32Array(0);
|
|
646
|
-
this.bufferStartTime = 0;
|
|
647
|
-
this.options.onInference?.(Math.min(actualFrameCount, result.blendshapes.length));
|
|
648
|
-
} catch (error) {
|
|
649
|
-
this.options.onError?.(error);
|
|
650
|
-
this.buffer = new Float32Array(0);
|
|
651
|
-
this.bufferStartTime = 0;
|
|
652
566
|
}
|
|
567
|
+
this.drainPendingChunks();
|
|
653
568
|
}
|
|
654
569
|
/**
|
|
655
|
-
*
|
|
656
|
-
*
|
|
657
|
-
* Used for synchronization when audio scheduling time differs from
|
|
658
|
-
* the estimated time used during LAM processing.
|
|
570
|
+
* Flush remaining buffered audio (pads to chunkSize).
|
|
571
|
+
* Call at end of stream to process final partial chunk.
|
|
659
572
|
*
|
|
660
|
-
*
|
|
573
|
+
* Routes through the serialized pendingChunks pipeline to maintain
|
|
574
|
+
* correct frame ordering. Without this, flush() could push frames
|
|
575
|
+
* with the latest timestamp to the queue before drainPendingChunks()
|
|
576
|
+
* finishes pushing frames with earlier timestamps — causing
|
|
577
|
+
* getFrameForTime() to see out-of-order timestamps and stall.
|
|
661
578
|
*/
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
579
|
+
async flush() {
|
|
580
|
+
if (this.disposed || this.writeOffset === 0) return;
|
|
581
|
+
const padded = new Float32Array(this.chunkSize);
|
|
582
|
+
padded.set(this.buffer.subarray(0, this.writeOffset), 0);
|
|
583
|
+
const chunkTimestamp = this.bufferStartTime > 0 ? this.bufferStartTime : void 0;
|
|
584
|
+
logger.info("flush: routing through drain pipeline", {
|
|
585
|
+
actualSamples: this.writeOffset,
|
|
586
|
+
chunkTimestamp: chunkTimestamp?.toFixed(3),
|
|
587
|
+
pendingChunks: this.pendingChunks.length,
|
|
588
|
+
inferenceRunning: this.inferenceRunning
|
|
589
|
+
});
|
|
590
|
+
this.writeOffset = 0;
|
|
591
|
+
this.bufferStartTime = 0;
|
|
592
|
+
this.pendingChunks.push({ chunk: padded, timestamp: chunkTimestamp });
|
|
593
|
+
this.drainPendingChunks();
|
|
666
594
|
}
|
|
667
595
|
/**
|
|
668
|
-
* Reset
|
|
596
|
+
* Reset buffer and frame queues
|
|
669
597
|
*/
|
|
670
598
|
reset() {
|
|
671
|
-
this.
|
|
599
|
+
this.writeOffset = 0;
|
|
672
600
|
this.bufferStartTime = 0;
|
|
673
|
-
this.
|
|
674
|
-
this.
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
return float32;
|
|
687
|
-
}
|
|
688
|
-
function int16ToFloat32(int16) {
|
|
689
|
-
const float32 = new Float32Array(int16.length);
|
|
690
|
-
for (let i = 0; i < int16.length; i++) {
|
|
691
|
-
float32[i] = int16[i] / 32768;
|
|
692
|
-
}
|
|
693
|
-
return float32;
|
|
694
|
-
}
|
|
695
|
-
|
|
696
|
-
// src/audio/SyncedAudioPipeline.ts
|
|
697
|
-
var SyncedAudioPipeline = class extends EventEmitter {
|
|
698
|
-
constructor(options) {
|
|
699
|
-
super();
|
|
700
|
-
this.options = options;
|
|
701
|
-
this.playbackStarted = false;
|
|
702
|
-
this.monitorInterval = null;
|
|
703
|
-
this.frameAnimationId = null;
|
|
704
|
-
const sampleRate = options.sampleRate ?? 16e3;
|
|
705
|
-
const autoDelay = options.lam.modelId === "wav2arkit_cpu" ? 750 : options.lam.backend === "wasm" ? 350 : 50;
|
|
706
|
-
const audioDelayMs = options.audioDelayMs ?? autoDelay;
|
|
707
|
-
this.scheduler = new AudioScheduler({
|
|
708
|
-
sampleRate,
|
|
709
|
-
initialLookaheadSec: audioDelayMs / 1e3
|
|
710
|
-
});
|
|
711
|
-
this.coalescer = new AudioChunkCoalescer({
|
|
712
|
-
sampleRate,
|
|
713
|
-
targetDurationMs: options.chunkTargetMs ?? 200
|
|
714
|
-
});
|
|
715
|
-
this.lamPipeline = new LAMPipeline({
|
|
716
|
-
sampleRate,
|
|
717
|
-
onError: (error) => {
|
|
718
|
-
this.emit("error", error);
|
|
719
|
-
}
|
|
720
|
-
});
|
|
721
|
-
}
|
|
722
|
-
/**
|
|
723
|
-
* Initialize the pipeline
|
|
724
|
-
*/
|
|
725
|
-
async initialize() {
|
|
726
|
-
await this.scheduler.initialize();
|
|
727
|
-
}
|
|
728
|
-
/**
|
|
729
|
-
* Start a new playback session
|
|
730
|
-
*
|
|
731
|
-
* Resets all state and prepares for incoming audio chunks.
|
|
732
|
-
* Audio will be scheduled immediately as chunks arrive (no buffering).
|
|
733
|
-
*/
|
|
734
|
-
start() {
|
|
735
|
-
this.stopMonitoring();
|
|
736
|
-
this.scheduler.reset();
|
|
737
|
-
this.coalescer.reset();
|
|
738
|
-
this.lamPipeline.reset();
|
|
739
|
-
this.playbackStarted = false;
|
|
740
|
-
this.scheduler.warmup();
|
|
741
|
-
this.startFrameLoop();
|
|
742
|
-
this.startMonitoring();
|
|
743
|
-
}
|
|
744
|
-
/**
|
|
745
|
-
* Receive audio chunk from network
|
|
601
|
+
this.timestampedQueue = [];
|
|
602
|
+
this.plainQueue = [];
|
|
603
|
+
this._latestFrame = null;
|
|
604
|
+
this.lastPulledFrame = null;
|
|
605
|
+
this.pendingChunks = [];
|
|
606
|
+
this.inferenceRunning = false;
|
|
607
|
+
this.getFrameCallCount = 0;
|
|
608
|
+
}
|
|
609
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
610
|
+
// Frame Output — Pull Mode (TTS playback)
|
|
611
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
612
|
+
/**
|
|
613
|
+
* Get frame synced to external clock (e.g. AudioContext.currentTime).
|
|
746
614
|
*
|
|
747
|
-
*
|
|
748
|
-
*
|
|
749
|
-
* which caused audible stuttering with continuous audio streams.
|
|
615
|
+
* Discards frames that are too old, returns the current frame,
|
|
616
|
+
* or holds last frame as fallback to prevent avatar freezing.
|
|
750
617
|
*
|
|
751
|
-
* @param
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
618
|
+
* @param currentTime - Current playback time (seconds)
|
|
619
|
+
* @returns Blendshape frame, or null if no frames yet
|
|
620
|
+
*/
|
|
621
|
+
getFrameForTime(currentTime) {
|
|
622
|
+
this.getFrameCallCount++;
|
|
623
|
+
const discardWindow = this.backend.backend === "wasm" ? 1 : 0.5;
|
|
624
|
+
let discardCount = 0;
|
|
625
|
+
while (this.timestampedQueue.length > 0 && this.timestampedQueue[0].timestamp < currentTime - discardWindow) {
|
|
626
|
+
this.timestampedQueue.shift();
|
|
627
|
+
discardCount++;
|
|
628
|
+
}
|
|
629
|
+
if (discardCount > 0) {
|
|
630
|
+
logger.warn("getFrameForTime DISCARDED stale frames", {
|
|
631
|
+
discardCount,
|
|
632
|
+
currentTime: currentTime.toFixed(3),
|
|
633
|
+
discardWindow,
|
|
634
|
+
remainingFrames: this.timestampedQueue.length,
|
|
635
|
+
nextFrameTs: this.timestampedQueue.length > 0 ? this.timestampedQueue[0].timestamp.toFixed(3) : "none"
|
|
636
|
+
});
|
|
757
637
|
}
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
this.emit("playback_start", scheduleTime);
|
|
638
|
+
if (this.timestampedQueue.length > 0 && this.timestampedQueue[0].timestamp <= currentTime) {
|
|
639
|
+
const { frame } = this.timestampedQueue.shift();
|
|
640
|
+
this.lastPulledFrame = frame;
|
|
641
|
+
return frame;
|
|
763
642
|
}
|
|
764
|
-
this.
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
*/
|
|
773
|
-
async end() {
|
|
774
|
-
const remaining = this.coalescer.flush();
|
|
775
|
-
if (remaining) {
|
|
776
|
-
const chunk = new Uint8Array(remaining);
|
|
777
|
-
await this.onAudioChunk(chunk);
|
|
643
|
+
if (this.timestampedQueue.length > 0 && this.getFrameCallCount % 60 === 0) {
|
|
644
|
+
logger.warn("getFrameForTime: frames in queue but NOT consumable", {
|
|
645
|
+
queueLen: this.timestampedQueue.length,
|
|
646
|
+
frontTimestamp: this.timestampedQueue[0].timestamp.toFixed(4),
|
|
647
|
+
currentTime: currentTime.toFixed(4),
|
|
648
|
+
delta: (this.timestampedQueue[0].timestamp - currentTime).toFixed(4),
|
|
649
|
+
callCount: this.getFrameCallCount
|
|
650
|
+
});
|
|
778
651
|
}
|
|
779
|
-
|
|
652
|
+
return this.lastPulledFrame;
|
|
780
653
|
}
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
* - Clears all buffers and queues
|
|
788
|
-
* - Emits 'playback_complete' event
|
|
789
|
-
*
|
|
790
|
-
* Use this for interruptions (e.g., user barge-in during AI speech).
|
|
791
|
-
*
|
|
792
|
-
* @param fadeOutMs - Fade-out duration in milliseconds (default: 50ms)
|
|
793
|
-
* @returns Promise that resolves when fade-out completes
|
|
794
|
-
*/
|
|
795
|
-
async stop(fadeOutMs = 50) {
|
|
796
|
-
this.stopMonitoring();
|
|
797
|
-
await this.scheduler.cancelAll(fadeOutMs);
|
|
798
|
-
this.coalescer.reset();
|
|
799
|
-
this.lamPipeline.reset();
|
|
800
|
-
this.playbackStarted = false;
|
|
801
|
-
this.emit("playback_complete", void 0);
|
|
654
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
655
|
+
// Frame Output — Push Mode (live mic, game loop)
|
|
656
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
657
|
+
/** Latest frame from drip-feed (live mic, game loop) */
|
|
658
|
+
get latestFrame() {
|
|
659
|
+
return this._latestFrame;
|
|
802
660
|
}
|
|
803
|
-
/**
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
*
|
|
809
|
-
* Frame Emission Strategy:
|
|
810
|
-
* - LAMPipeline uses last-frame-hold to prevent null returns
|
|
811
|
-
* - Always emit frames (even repeated frames) to maintain smooth animation
|
|
812
|
-
* - Renderer is responsible for detecting duplicate frames if needed
|
|
813
|
-
*/
|
|
814
|
-
startFrameLoop() {
|
|
815
|
-
const updateFrame = () => {
|
|
816
|
-
const currentTime = this.scheduler.getCurrentTime();
|
|
817
|
-
const frame = this.lamPipeline.getFrameForTime(currentTime, this.options.lam);
|
|
661
|
+
/** Start 30fps drip-feed timer (push mode) */
|
|
662
|
+
startDrip() {
|
|
663
|
+
if (this.dripInterval) return;
|
|
664
|
+
this.dripInterval = setInterval(() => {
|
|
665
|
+
const frame = this.plainQueue.shift();
|
|
818
666
|
if (frame) {
|
|
819
|
-
this.
|
|
667
|
+
this._latestFrame = frame;
|
|
668
|
+
this.onFrame?.(frame);
|
|
820
669
|
}
|
|
821
|
-
|
|
822
|
-
};
|
|
823
|
-
this.frameAnimationId = requestAnimationFrame(updateFrame);
|
|
670
|
+
}, DRIP_INTERVAL_MS);
|
|
824
671
|
}
|
|
825
|
-
/**
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
clearInterval(this.monitorInterval);
|
|
672
|
+
/** Stop drip-feed timer */
|
|
673
|
+
stopDrip() {
|
|
674
|
+
if (this.dripInterval) {
|
|
675
|
+
clearInterval(this.dripInterval);
|
|
676
|
+
this.dripInterval = null;
|
|
831
677
|
}
|
|
832
|
-
this.monitorInterval = window.setInterval(() => {
|
|
833
|
-
if (this.scheduler.isComplete() && this.lamPipeline.queuedFrameCount === 0) {
|
|
834
|
-
this.emit("playback_complete", void 0);
|
|
835
|
-
this.stopMonitoring();
|
|
836
|
-
}
|
|
837
|
-
}, 100);
|
|
838
678
|
}
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
this.monitorInterval = null;
|
|
846
|
-
}
|
|
847
|
-
if (this.frameAnimationId) {
|
|
848
|
-
cancelAnimationFrame(this.frameAnimationId);
|
|
849
|
-
this.frameAnimationId = null;
|
|
850
|
-
}
|
|
679
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
680
|
+
// State
|
|
681
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
682
|
+
/** Number of frames waiting in queue (both modes combined) */
|
|
683
|
+
get queuedFrameCount() {
|
|
684
|
+
return this.timestampedQueue.length + this.plainQueue.length;
|
|
851
685
|
}
|
|
852
|
-
/**
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
getState() {
|
|
856
|
-
return {
|
|
857
|
-
playbackStarted: this.playbackStarted,
|
|
858
|
-
coalescerFill: this.coalescer.fillLevel,
|
|
859
|
-
lamFill: this.lamPipeline.fillLevel,
|
|
860
|
-
queuedFrames: this.lamPipeline.queuedFrameCount,
|
|
861
|
-
currentTime: this.scheduler.getCurrentTime(),
|
|
862
|
-
playbackEndTime: this.scheduler.getPlaybackEndTime()
|
|
863
|
-
};
|
|
686
|
+
/** Buffer fill level as fraction of chunkSize (0-1) */
|
|
687
|
+
get fillLevel() {
|
|
688
|
+
return Math.min(1, this.writeOffset / this.chunkSize);
|
|
864
689
|
}
|
|
865
|
-
/**
|
|
866
|
-
* Cleanup resources
|
|
867
|
-
*/
|
|
690
|
+
/** Dispose resources */
|
|
868
691
|
dispose() {
|
|
869
|
-
this.
|
|
870
|
-
this.
|
|
871
|
-
this.
|
|
872
|
-
this.
|
|
873
|
-
}
|
|
874
|
-
};
|
|
875
|
-
|
|
876
|
-
// src/animation/EmotionToBlendshapeMapper.ts
|
|
877
|
-
var UPPER_FACE_BLENDSHAPES = [
|
|
878
|
-
// Brows (5)
|
|
879
|
-
"browDownLeft",
|
|
880
|
-
"browDownRight",
|
|
881
|
-
"browInnerUp",
|
|
882
|
-
"browOuterUpLeft",
|
|
883
|
-
"browOuterUpRight",
|
|
884
|
-
// Eyes (4)
|
|
885
|
-
"eyeSquintLeft",
|
|
886
|
-
"eyeSquintRight",
|
|
887
|
-
"eyeWideLeft",
|
|
888
|
-
"eyeWideRight",
|
|
889
|
-
// Cheeks (2)
|
|
890
|
-
"cheekSquintLeft",
|
|
891
|
-
"cheekSquintRight"
|
|
892
|
-
];
|
|
893
|
-
var EMOTION_ARKIT_MAP = {
|
|
894
|
-
happy: {
|
|
895
|
-
// AU6 - Cheek raiser (primary Duchenne smile marker)
|
|
896
|
-
cheekSquintLeft: 0.5,
|
|
897
|
-
cheekSquintRight: 0.5,
|
|
898
|
-
// Slight eye squint from genuine smile (orbicularis oculi activation)
|
|
899
|
-
eyeSquintLeft: 0.2,
|
|
900
|
-
eyeSquintRight: 0.2
|
|
901
|
-
},
|
|
902
|
-
angry: {
|
|
903
|
-
// AU4 - Brow lowerer (intense, primary anger marker)
|
|
904
|
-
browDownLeft: 0.7,
|
|
905
|
-
browDownRight: 0.7,
|
|
906
|
-
// AU5 - Upper lid raiser (wide eyes, part of the "glare")
|
|
907
|
-
eyeWideLeft: 0.4,
|
|
908
|
-
eyeWideRight: 0.4,
|
|
909
|
-
// AU7 - Lid tightener (tense stare, combines with AU5 for angry glare)
|
|
910
|
-
eyeSquintLeft: 0.3,
|
|
911
|
-
eyeSquintRight: 0.3
|
|
912
|
-
},
|
|
913
|
-
sad: {
|
|
914
|
-
// AU1 - Inner brow raiser (primary sadness marker)
|
|
915
|
-
browInnerUp: 0.6,
|
|
916
|
-
// AU4 - Brow lowerer (brows drawn together)
|
|
917
|
-
browDownLeft: 0.3,
|
|
918
|
-
browDownRight: 0.3
|
|
919
|
-
},
|
|
920
|
-
neutral: {}
|
|
921
|
-
// All zeros - no expression overlay
|
|
922
|
-
};
|
|
923
|
-
var DEFAULT_CONFIG = {
|
|
924
|
-
smoothingFactor: 0.15,
|
|
925
|
-
confidenceThreshold: 0.3,
|
|
926
|
-
intensity: 1,
|
|
927
|
-
blendMode: "dominant",
|
|
928
|
-
minBlendProbability: 0.1,
|
|
929
|
-
energyModulation: false,
|
|
930
|
-
minEnergyScale: 0.3,
|
|
931
|
-
maxEnergyScale: 1
|
|
932
|
-
};
|
|
933
|
-
function createZeroBlendshapes() {
|
|
934
|
-
const result = {};
|
|
935
|
-
for (const name of UPPER_FACE_BLENDSHAPES) {
|
|
936
|
-
result[name] = 0;
|
|
937
|
-
}
|
|
938
|
-
return result;
|
|
939
|
-
}
|
|
940
|
-
function clamp01(value) {
|
|
941
|
-
return Math.max(0, Math.min(1, value));
|
|
942
|
-
}
|
|
943
|
-
var EmotionToBlendshapeMapper = class {
|
|
944
|
-
/**
|
|
945
|
-
* Create a new EmotionToBlendshapeMapper
|
|
946
|
-
*
|
|
947
|
-
* @param config - Optional configuration
|
|
948
|
-
*/
|
|
949
|
-
constructor(config) {
|
|
950
|
-
this.currentEnergy = 1;
|
|
951
|
-
this.config = {
|
|
952
|
-
...DEFAULT_CONFIG,
|
|
953
|
-
...config
|
|
954
|
-
};
|
|
955
|
-
this.targetBlendshapes = createZeroBlendshapes();
|
|
956
|
-
this.currentBlendshapes = createZeroBlendshapes();
|
|
957
|
-
}
|
|
958
|
-
/**
|
|
959
|
-
* Map an emotion frame to target blendshapes
|
|
960
|
-
*
|
|
961
|
-
* This sets the target values that the mapper will smoothly interpolate
|
|
962
|
-
* towards. Call update() each frame to apply smoothing.
|
|
963
|
-
*
|
|
964
|
-
* @param frame - Emotion frame from Emotion2VecInference
|
|
965
|
-
* @param audioEnergy - Optional audio energy (0-1) for energy modulation
|
|
966
|
-
* @returns Target upper face blendshapes (before smoothing)
|
|
967
|
-
*/
|
|
968
|
-
mapFrame(frame, audioEnergy) {
|
|
969
|
-
this.targetBlendshapes = createZeroBlendshapes();
|
|
970
|
-
if (audioEnergy !== void 0) {
|
|
971
|
-
this.currentEnergy = clamp01(audioEnergy);
|
|
972
|
-
}
|
|
973
|
-
if (!frame) {
|
|
974
|
-
return { ...this.targetBlendshapes };
|
|
975
|
-
}
|
|
976
|
-
if (this.config.blendMode === "weighted") {
|
|
977
|
-
this.mapFrameWeighted(frame);
|
|
978
|
-
} else {
|
|
979
|
-
this.mapFrameDominant(frame);
|
|
980
|
-
}
|
|
981
|
-
if (this.config.energyModulation) {
|
|
982
|
-
this.applyEnergyModulation();
|
|
983
|
-
}
|
|
984
|
-
return { ...this.targetBlendshapes };
|
|
692
|
+
if (this.disposed) return;
|
|
693
|
+
this.disposed = true;
|
|
694
|
+
this.stopDrip();
|
|
695
|
+
this.reset();
|
|
985
696
|
}
|
|
697
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
698
|
+
// Private
|
|
699
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
986
700
|
/**
|
|
987
|
-
*
|
|
701
|
+
* Process pending chunks sequentially.
|
|
702
|
+
* Fire-and-forget — called from pushAudio() without awaiting.
|
|
988
703
|
*/
|
|
989
|
-
|
|
990
|
-
if (
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
|
|
995
|
-
if (!mapping) {
|
|
996
|
-
return;
|
|
997
|
-
}
|
|
998
|
-
const scale = this.config.intensity * frame.confidence;
|
|
999
|
-
for (const [name, value] of Object.entries(mapping)) {
|
|
1000
|
-
const blendshapeName = name;
|
|
1001
|
-
if (value !== void 0) {
|
|
1002
|
-
this.targetBlendshapes[blendshapeName] = clamp01(value * scale);
|
|
704
|
+
drainPendingChunks() {
|
|
705
|
+
if (this.inferenceRunning || this.pendingChunks.length === 0) {
|
|
706
|
+
if (this.inferenceRunning && this.pendingChunks.length > 0) {
|
|
707
|
+
logger.debug("drainPendingChunks skipped (inference running)", {
|
|
708
|
+
pendingChunks: this.pendingChunks.length
|
|
709
|
+
});
|
|
1003
710
|
}
|
|
1004
|
-
}
|
|
1005
|
-
}
|
|
1006
|
-
/**
|
|
1007
|
-
* Map using weighted blend of all emotions by probability
|
|
1008
|
-
* Creates more nuanced expressions (e.g., bittersweet = happy + sad)
|
|
1009
|
-
*/
|
|
1010
|
-
mapFrameWeighted(frame) {
|
|
1011
|
-
if (!frame.probabilities) {
|
|
1012
|
-
this.mapFrameDominant(frame);
|
|
1013
711
|
return;
|
|
1014
712
|
}
|
|
1015
|
-
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
713
|
+
this.inferenceRunning = true;
|
|
714
|
+
logger.info("drainPendingChunks starting", { pendingChunks: this.pendingChunks.length });
|
|
715
|
+
const processNext = async () => {
|
|
716
|
+
while (this.pendingChunks.length > 0 && !this.disposed) {
|
|
717
|
+
const { chunk, timestamp } = this.pendingChunks.shift();
|
|
718
|
+
try {
|
|
719
|
+
const t0 = performance.now();
|
|
720
|
+
const result = await this.backend.infer(chunk);
|
|
721
|
+
const inferMs = Math.round(performance.now() - t0);
|
|
722
|
+
const actualDuration = chunk.length / this.sampleRate;
|
|
723
|
+
const actualFrameCount = Math.ceil(actualDuration * FRAME_RATE);
|
|
724
|
+
const framesToQueue = Math.min(actualFrameCount, result.blendshapes.length);
|
|
725
|
+
logger.info("Inference complete", {
|
|
726
|
+
inferMs,
|
|
727
|
+
modelFrames: result.blendshapes.length,
|
|
728
|
+
framesToQueue,
|
|
729
|
+
timestamp,
|
|
730
|
+
totalQueued: this.timestampedQueue.length + framesToQueue,
|
|
731
|
+
remainingPending: this.pendingChunks.length
|
|
732
|
+
});
|
|
733
|
+
for (let i = 0; i < framesToQueue; i++) {
|
|
734
|
+
if (timestamp !== void 0) {
|
|
735
|
+
this.timestampedQueue.push({
|
|
736
|
+
frame: result.blendshapes[i],
|
|
737
|
+
timestamp: timestamp + i / FRAME_RATE
|
|
738
|
+
});
|
|
739
|
+
} else {
|
|
740
|
+
this.plainQueue.push(result.blendshapes[i]);
|
|
741
|
+
}
|
|
742
|
+
}
|
|
743
|
+
} catch (err) {
|
|
744
|
+
this.handleError(err);
|
|
745
|
+
}
|
|
746
|
+
if (this.pendingChunks.length > 0) {
|
|
747
|
+
await new Promise((r) => setTimeout(r, 0));
|
|
1028
748
|
}
|
|
1029
749
|
}
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
}
|
|
1035
|
-
/**
|
|
1036
|
-
* Apply energy modulation to scale emotion intensity by audio energy
|
|
1037
|
-
* Louder speech = stronger expressions
|
|
1038
|
-
*/
|
|
1039
|
-
applyEnergyModulation() {
|
|
1040
|
-
const { minEnergyScale, maxEnergyScale } = this.config;
|
|
1041
|
-
const energyScale = minEnergyScale + this.currentEnergy * (maxEnergyScale - minEnergyScale);
|
|
1042
|
-
for (const name of UPPER_FACE_BLENDSHAPES) {
|
|
1043
|
-
this.targetBlendshapes[name] = clamp01(this.targetBlendshapes[name] * energyScale);
|
|
1044
|
-
}
|
|
1045
|
-
}
|
|
1046
|
-
/**
|
|
1047
|
-
* Apply smoothing to interpolate current values towards target
|
|
1048
|
-
*
|
|
1049
|
-
* Uses exponential moving average:
|
|
1050
|
-
* current = current + smoothingFactor * (target - current)
|
|
1051
|
-
*
|
|
1052
|
-
* @param _deltaMs - Delta time in milliseconds (reserved for future time-based smoothing)
|
|
1053
|
-
*/
|
|
1054
|
-
update(_deltaMs) {
|
|
1055
|
-
const factor = this.config.smoothingFactor;
|
|
1056
|
-
for (const name of UPPER_FACE_BLENDSHAPES) {
|
|
1057
|
-
const target = this.targetBlendshapes[name];
|
|
1058
|
-
const current = this.currentBlendshapes[name];
|
|
1059
|
-
this.currentBlendshapes[name] = clamp01(current + factor * (target - current));
|
|
1060
|
-
}
|
|
1061
|
-
}
|
|
1062
|
-
/**
|
|
1063
|
-
* Get current smoothed blendshape values
|
|
1064
|
-
*
|
|
1065
|
-
* @returns Current upper face blendshapes (after smoothing)
|
|
1066
|
-
*/
|
|
1067
|
-
getCurrentBlendshapes() {
|
|
1068
|
-
return { ...this.currentBlendshapes };
|
|
1069
|
-
}
|
|
1070
|
-
/**
|
|
1071
|
-
* Reset mapper to neutral state
|
|
1072
|
-
*
|
|
1073
|
-
* Sets both target and current blendshapes to zero.
|
|
1074
|
-
*/
|
|
1075
|
-
reset() {
|
|
1076
|
-
this.targetBlendshapes = createZeroBlendshapes();
|
|
1077
|
-
this.currentBlendshapes = createZeroBlendshapes();
|
|
1078
|
-
this.currentEnergy = 1;
|
|
1079
|
-
}
|
|
1080
|
-
/**
|
|
1081
|
-
* Get current configuration
|
|
1082
|
-
*/
|
|
1083
|
-
getConfig() {
|
|
1084
|
-
return { ...this.config };
|
|
1085
|
-
}
|
|
1086
|
-
/**
|
|
1087
|
-
* Update configuration
|
|
1088
|
-
*
|
|
1089
|
-
* @param config - Partial configuration to update
|
|
1090
|
-
*/
|
|
1091
|
-
setConfig(config) {
|
|
1092
|
-
this.config = {
|
|
1093
|
-
...this.config,
|
|
1094
|
-
...config
|
|
1095
|
-
};
|
|
1096
|
-
}
|
|
1097
|
-
};
|
|
1098
|
-
|
|
1099
|
-
// src/animation/audioEnergy.ts
|
|
1100
|
-
function calculateRMS(samples) {
|
|
1101
|
-
if (samples.length === 0) return 0;
|
|
1102
|
-
let sumSquares = 0;
|
|
1103
|
-
for (let i = 0; i < samples.length; i++) {
|
|
1104
|
-
sumSquares += samples[i] * samples[i];
|
|
1105
|
-
}
|
|
1106
|
-
return Math.sqrt(sumSquares / samples.length);
|
|
1107
|
-
}
|
|
1108
|
-
function calculatePeak(samples) {
|
|
1109
|
-
let peak = 0;
|
|
1110
|
-
for (let i = 0; i < samples.length; i++) {
|
|
1111
|
-
const abs = Math.abs(samples[i]);
|
|
1112
|
-
if (abs > peak) peak = abs;
|
|
1113
|
-
}
|
|
1114
|
-
return peak;
|
|
1115
|
-
}
|
|
1116
|
-
var AudioEnergyAnalyzer = class {
|
|
1117
|
-
/**
|
|
1118
|
-
* @param smoothingFactor How much to smooth (0 = no smoothing, 1 = infinite smoothing). Default 0.85
|
|
1119
|
-
* @param noiseFloor Minimum energy threshold to consider as signal. Default 0.01
|
|
1120
|
-
*/
|
|
1121
|
-
constructor(smoothingFactor = 0.85, noiseFloor = 0.01) {
|
|
1122
|
-
this.smoothedRMS = 0;
|
|
1123
|
-
this.smoothedPeak = 0;
|
|
1124
|
-
this.smoothingFactor = Math.max(0, Math.min(0.99, smoothingFactor));
|
|
1125
|
-
this.noiseFloor = noiseFloor;
|
|
1126
|
-
}
|
|
1127
|
-
/**
|
|
1128
|
-
* Process audio samples and return smoothed energy values
|
|
1129
|
-
* @param samples Audio samples (Float32Array)
|
|
1130
|
-
* @returns Object with rms and peak values
|
|
1131
|
-
*/
|
|
1132
|
-
process(samples) {
|
|
1133
|
-
const instantRMS = calculateRMS(samples);
|
|
1134
|
-
const instantPeak = calculatePeak(samples);
|
|
1135
|
-
const gatedRMS = instantRMS > this.noiseFloor ? instantRMS : 0;
|
|
1136
|
-
const gatedPeak = instantPeak > this.noiseFloor ? instantPeak : 0;
|
|
1137
|
-
if (gatedRMS > this.smoothedRMS) {
|
|
1138
|
-
this.smoothedRMS = this.smoothedRMS * 0.5 + gatedRMS * 0.5;
|
|
1139
|
-
} else {
|
|
1140
|
-
this.smoothedRMS = this.smoothedRMS * this.smoothingFactor + gatedRMS * (1 - this.smoothingFactor);
|
|
1141
|
-
}
|
|
1142
|
-
if (gatedPeak > this.smoothedPeak) {
|
|
1143
|
-
this.smoothedPeak = this.smoothedPeak * 0.3 + gatedPeak * 0.7;
|
|
1144
|
-
} else {
|
|
1145
|
-
this.smoothedPeak = this.smoothedPeak * this.smoothingFactor + gatedPeak * (1 - this.smoothingFactor);
|
|
1146
|
-
}
|
|
1147
|
-
const energy = this.smoothedRMS * 0.7 + this.smoothedPeak * 0.3;
|
|
1148
|
-
return {
|
|
1149
|
-
rms: this.smoothedRMS,
|
|
1150
|
-
peak: this.smoothedPeak,
|
|
1151
|
-
energy: Math.min(1, energy * 2)
|
|
1152
|
-
// Scale up and clamp
|
|
1153
|
-
};
|
|
1154
|
-
}
|
|
1155
|
-
/**
|
|
1156
|
-
* Reset analyzer state
|
|
1157
|
-
*/
|
|
1158
|
-
reset() {
|
|
1159
|
-
this.smoothedRMS = 0;
|
|
1160
|
-
this.smoothedPeak = 0;
|
|
1161
|
-
}
|
|
1162
|
-
/**
|
|
1163
|
-
* Get current smoothed RMS value
|
|
1164
|
-
*/
|
|
1165
|
-
get rms() {
|
|
1166
|
-
return this.smoothedRMS;
|
|
1167
|
-
}
|
|
1168
|
-
/**
|
|
1169
|
-
* Get current smoothed peak value
|
|
1170
|
-
*/
|
|
1171
|
-
get peak() {
|
|
1172
|
-
return this.smoothedPeak;
|
|
1173
|
-
}
|
|
1174
|
-
};
|
|
1175
|
-
var EmphasisDetector = class {
|
|
1176
|
-
/**
|
|
1177
|
-
* @param historySize Number of frames to track. Default 10
|
|
1178
|
-
* @param emphasisThreshold Minimum energy increase to count as emphasis. Default 0.15
|
|
1179
|
-
*/
|
|
1180
|
-
constructor(historySize = 10, emphasisThreshold = 0.15) {
|
|
1181
|
-
this.energyHistory = [];
|
|
1182
|
-
this.historySize = historySize;
|
|
1183
|
-
this.emphasisThreshold = emphasisThreshold;
|
|
1184
|
-
}
|
|
1185
|
-
/**
|
|
1186
|
-
* Process energy value and detect emphasis
|
|
1187
|
-
* @param energy Current energy value (0-1)
|
|
1188
|
-
* @returns Object with isEmphasis flag and emphasisStrength
|
|
1189
|
-
*/
|
|
1190
|
-
process(energy) {
|
|
1191
|
-
this.energyHistory.push(energy);
|
|
1192
|
-
if (this.energyHistory.length > this.historySize) {
|
|
1193
|
-
this.energyHistory.shift();
|
|
1194
|
-
}
|
|
1195
|
-
if (this.energyHistory.length < 3) {
|
|
1196
|
-
return { isEmphasis: false, emphasisStrength: 0 };
|
|
1197
|
-
}
|
|
1198
|
-
const prevFrames = this.energyHistory.slice(0, -1);
|
|
1199
|
-
const avgPrev = prevFrames.reduce((a, b) => a + b, 0) / prevFrames.length;
|
|
1200
|
-
const increase = energy - avgPrev;
|
|
1201
|
-
const isEmphasis = increase > this.emphasisThreshold;
|
|
1202
|
-
return {
|
|
1203
|
-
isEmphasis,
|
|
1204
|
-
emphasisStrength: isEmphasis ? Math.min(1, increase / 0.3) : 0
|
|
750
|
+
this.inferenceRunning = false;
|
|
751
|
+
if (this.pendingChunks.length > 0) {
|
|
752
|
+
this.drainPendingChunks();
|
|
753
|
+
}
|
|
1205
754
|
};
|
|
755
|
+
processNext().catch((err) => this.handleError(err));
|
|
1206
756
|
}
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
this.energyHistory = [];
|
|
757
|
+
handleError(err) {
|
|
758
|
+
const error = err instanceof Error ? err : new Error(String(err));
|
|
759
|
+
logger.warn("A2EProcessor inference error", { error: error.message });
|
|
760
|
+
this.onError?.(error);
|
|
1212
761
|
}
|
|
1213
762
|
};
|
|
1214
763
|
|
|
@@ -2461,7 +2010,7 @@ function isSafari() {
|
|
|
2461
2010
|
const ua = navigator.userAgent.toLowerCase();
|
|
2462
2011
|
return /safari/.test(ua) && !/chrome|crios|fxios|chromium|edg/.test(ua);
|
|
2463
2012
|
}
|
|
2464
|
-
function
|
|
2013
|
+
function shouldUseCpuA2E() {
|
|
2465
2014
|
return isSafari() || isIOS();
|
|
2466
2015
|
}
|
|
2467
2016
|
function isSpeechRecognitionAvailable() {
|
|
@@ -2471,22 +2020,22 @@ function isSpeechRecognitionAvailable() {
|
|
|
2471
2020
|
function shouldUseNativeASR() {
|
|
2472
2021
|
return (isIOS() || isSafari()) && isSpeechRecognitionAvailable();
|
|
2473
2022
|
}
|
|
2474
|
-
function
|
|
2023
|
+
function shouldUseServerA2E() {
|
|
2475
2024
|
return isIOS();
|
|
2476
2025
|
}
|
|
2477
2026
|
|
|
2478
2027
|
// src/inference/onnxLoader.ts
|
|
2479
|
-
var
|
|
2028
|
+
var logger2 = createLogger("OnnxLoader");
|
|
2480
2029
|
var ortInstance = null;
|
|
2481
2030
|
var loadedBackend = null;
|
|
2482
2031
|
var WASM_CDN_PATH = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
2483
2032
|
async function isWebGPUAvailable() {
|
|
2484
2033
|
if (isIOS()) {
|
|
2485
|
-
|
|
2034
|
+
logger2.debug("WebGPU check: disabled on iOS (asyncify bundle crashes WebKit)");
|
|
2486
2035
|
return false;
|
|
2487
2036
|
}
|
|
2488
2037
|
if (!hasWebGPUApi()) {
|
|
2489
|
-
|
|
2038
|
+
logger2.debug("WebGPU check: navigator.gpu not available", {
|
|
2490
2039
|
isSecureContext: typeof window !== "undefined" ? window.isSecureContext : "N/A"
|
|
2491
2040
|
});
|
|
2492
2041
|
return false;
|
|
@@ -2494,19 +2043,19 @@ async function isWebGPUAvailable() {
|
|
|
2494
2043
|
try {
|
|
2495
2044
|
const adapter = await navigator.gpu.requestAdapter();
|
|
2496
2045
|
if (!adapter) {
|
|
2497
|
-
|
|
2046
|
+
logger2.debug("WebGPU check: No adapter available");
|
|
2498
2047
|
return false;
|
|
2499
2048
|
}
|
|
2500
2049
|
const device = await adapter.requestDevice();
|
|
2501
2050
|
if (!device) {
|
|
2502
|
-
|
|
2051
|
+
logger2.debug("WebGPU check: Could not create device");
|
|
2503
2052
|
return false;
|
|
2504
2053
|
}
|
|
2505
2054
|
device.destroy();
|
|
2506
|
-
|
|
2055
|
+
logger2.debug("WebGPU check: Available and working");
|
|
2507
2056
|
return true;
|
|
2508
2057
|
} catch (err) {
|
|
2509
|
-
|
|
2058
|
+
logger2.debug("WebGPU check: Error during availability check", { error: err });
|
|
2510
2059
|
return false;
|
|
2511
2060
|
}
|
|
2512
2061
|
}
|
|
@@ -2516,11 +2065,11 @@ function applyIOSWasmMemoryPatch() {
|
|
|
2516
2065
|
iosWasmPatched = true;
|
|
2517
2066
|
const OrigMemory = WebAssembly.Memory;
|
|
2518
2067
|
const MAX_IOS_PAGES = 32768;
|
|
2519
|
-
|
|
2068
|
+
logger2.info("Applying iOS WASM memory patch (max\u21922GB, shared preserved)");
|
|
2520
2069
|
WebAssembly.Memory = function IOSPatchedMemory(descriptor) {
|
|
2521
2070
|
const patched = { ...descriptor };
|
|
2522
2071
|
if (patched.maximum !== void 0 && patched.maximum > MAX_IOS_PAGES) {
|
|
2523
|
-
|
|
2072
|
+
logger2.info("iOS memory patch: capping maximum", {
|
|
2524
2073
|
original: patched.maximum,
|
|
2525
2074
|
capped: MAX_IOS_PAGES,
|
|
2526
2075
|
shared: patched.shared,
|
|
@@ -2539,7 +2088,7 @@ function configureWasm(ort) {
|
|
|
2539
2088
|
ort.env.wasm.numThreads = numThreads;
|
|
2540
2089
|
ort.env.wasm.simd = true;
|
|
2541
2090
|
ort.env.wasm.proxy = enableProxy;
|
|
2542
|
-
|
|
2091
|
+
logger2.info("WASM configured", {
|
|
2543
2092
|
numThreads,
|
|
2544
2093
|
simd: true,
|
|
2545
2094
|
proxy: enableProxy,
|
|
@@ -2551,12 +2100,12 @@ async function getOnnxRuntime(backend) {
|
|
|
2551
2100
|
return ortInstance;
|
|
2552
2101
|
}
|
|
2553
2102
|
if (ortInstance && loadedBackend !== backend) {
|
|
2554
|
-
|
|
2103
|
+
logger2.warn(
|
|
2555
2104
|
`ONNX Runtime already loaded with ${loadedBackend} backend. Cannot switch to ${backend}. Returning existing instance.`
|
|
2556
2105
|
);
|
|
2557
2106
|
return ortInstance;
|
|
2558
2107
|
}
|
|
2559
|
-
|
|
2108
|
+
logger2.info(`Loading ONNX Runtime with ${backend} backend...`);
|
|
2560
2109
|
applyIOSWasmMemoryPatch();
|
|
2561
2110
|
try {
|
|
2562
2111
|
if (backend === "wasm" && (isIOS() || isSafari())) {
|
|
@@ -2571,10 +2120,10 @@ async function getOnnxRuntime(backend) {
|
|
|
2571
2120
|
}
|
|
2572
2121
|
loadedBackend = backend;
|
|
2573
2122
|
configureWasm(ortInstance);
|
|
2574
|
-
|
|
2123
|
+
logger2.info(`ONNX Runtime loaded successfully`, { backend });
|
|
2575
2124
|
return ortInstance;
|
|
2576
2125
|
} catch (err) {
|
|
2577
|
-
|
|
2126
|
+
logger2.error(`Failed to load ONNX Runtime with ${backend} backend`, {
|
|
2578
2127
|
error: err
|
|
2579
2128
|
});
|
|
2580
2129
|
throw new Error(
|
|
@@ -2585,7 +2134,7 @@ async function getOnnxRuntime(backend) {
|
|
|
2585
2134
|
async function getOnnxRuntimeForPreference(preference = "auto") {
|
|
2586
2135
|
const webgpuAvailable = await isWebGPUAvailable();
|
|
2587
2136
|
const backend = resolveBackend(preference, webgpuAvailable);
|
|
2588
|
-
|
|
2137
|
+
logger2.info("Resolved backend preference", {
|
|
2589
2138
|
preference,
|
|
2590
2139
|
webgpuAvailable,
|
|
2591
2140
|
resolvedBackend: backend
|
|
@@ -2619,42 +2168,6 @@ function getSessionOptions(backend) {
|
|
|
2619
2168
|
graphOptimizationLevel: "all"
|
|
2620
2169
|
};
|
|
2621
2170
|
}
|
|
2622
|
-
async function createSessionWithFallback(modelBuffer, preferredBackend) {
|
|
2623
|
-
const ort = await getOnnxRuntime(preferredBackend);
|
|
2624
|
-
const modelData = new Uint8Array(modelBuffer);
|
|
2625
|
-
if (preferredBackend === "webgpu") {
|
|
2626
|
-
try {
|
|
2627
|
-
const options2 = getSessionOptions("webgpu");
|
|
2628
|
-
const session2 = await ort.InferenceSession.create(modelData, options2);
|
|
2629
|
-
logger.info("Session created with WebGPU backend");
|
|
2630
|
-
return { session: session2, backend: "webgpu" };
|
|
2631
|
-
} catch (err) {
|
|
2632
|
-
logger.warn("WebGPU session creation failed, falling back to WASM", {
|
|
2633
|
-
error: err instanceof Error ? err.message : String(err)
|
|
2634
|
-
});
|
|
2635
|
-
}
|
|
2636
|
-
}
|
|
2637
|
-
const options = getSessionOptions("wasm");
|
|
2638
|
-
const session = await ort.InferenceSession.create(modelData, options);
|
|
2639
|
-
logger.info("Session created with WASM backend");
|
|
2640
|
-
return { session, backend: "wasm" };
|
|
2641
|
-
}
|
|
2642
|
-
function getLoadedBackend() {
|
|
2643
|
-
return loadedBackend;
|
|
2644
|
-
}
|
|
2645
|
-
function isOnnxRuntimeLoaded() {
|
|
2646
|
-
return ortInstance !== null;
|
|
2647
|
-
}
|
|
2648
|
-
async function preloadOnnxRuntime(preference = "auto") {
|
|
2649
|
-
if (ortInstance) {
|
|
2650
|
-
logger.info("ONNX Runtime already preloaded", { backend: loadedBackend });
|
|
2651
|
-
return loadedBackend;
|
|
2652
|
-
}
|
|
2653
|
-
logger.info("Preloading ONNX Runtime...", { preference });
|
|
2654
|
-
const { backend } = await getOnnxRuntimeForPreference(preference);
|
|
2655
|
-
logger.info("ONNX Runtime preloaded", { backend });
|
|
2656
|
-
return backend;
|
|
2657
|
-
}
|
|
2658
2171
|
|
|
2659
2172
|
// src/inference/blendshapeUtils.ts
|
|
2660
2173
|
var LAM_BLENDSHAPES = [
|
|
@@ -2804,16 +2317,19 @@ var WAV2ARKIT_BLENDSHAPES = [
|
|
|
2804
2317
|
var REMAP_WAV2ARKIT_TO_LAM = WAV2ARKIT_BLENDSHAPES.map(
|
|
2805
2318
|
(name) => LAM_BLENDSHAPES.indexOf(name)
|
|
2806
2319
|
);
|
|
2807
|
-
function
|
|
2808
|
-
const
|
|
2809
|
-
|
|
2810
|
-
|
|
2320
|
+
function lerpBlendshapes(current, target, factor = 0.3) {
|
|
2321
|
+
const len = Math.max(current.length, target.length);
|
|
2322
|
+
const result = new Array(len);
|
|
2323
|
+
for (let i = 0; i < len; i++) {
|
|
2324
|
+
const c = current[i] ?? 0;
|
|
2325
|
+
const t = target[i] ?? 0;
|
|
2326
|
+
result[i] = c + (t - c) * factor;
|
|
2811
2327
|
}
|
|
2812
2328
|
return result;
|
|
2813
2329
|
}
|
|
2814
2330
|
|
|
2815
2331
|
// src/inference/Wav2Vec2Inference.ts
|
|
2816
|
-
var
|
|
2332
|
+
var logger3 = createLogger("Wav2Vec2");
|
|
2817
2333
|
var CTC_VOCAB = [
|
|
2818
2334
|
"<pad>",
|
|
2819
2335
|
"<s>",
|
|
@@ -2863,6 +2379,7 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
2863
2379
|
this.poisoned = false;
|
|
2864
2380
|
this.config = config;
|
|
2865
2381
|
this.numIdentityClasses = config.numIdentityClasses ?? 12;
|
|
2382
|
+
this.chunkSize = config.chunkSize ?? 16e3;
|
|
2866
2383
|
}
|
|
2867
2384
|
get backend() {
|
|
2868
2385
|
return this.session ? this._backend : null;
|
|
@@ -2892,30 +2409,30 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
2892
2409
|
"model.backend_requested": this.config.backend || "auto"
|
|
2893
2410
|
});
|
|
2894
2411
|
try {
|
|
2895
|
-
|
|
2412
|
+
logger3.info("Loading ONNX Runtime...", { preference: this.config.backend || "auto" });
|
|
2896
2413
|
const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend || "auto");
|
|
2897
2414
|
this.ort = ort;
|
|
2898
2415
|
this._backend = backend;
|
|
2899
|
-
|
|
2416
|
+
logger3.info("ONNX Runtime loaded", { backend: this._backend });
|
|
2900
2417
|
const modelUrl = this.config.modelUrl;
|
|
2901
2418
|
const dataUrl = this.config.externalDataUrl !== false ? typeof this.config.externalDataUrl === "string" ? this.config.externalDataUrl : `${modelUrl}.data` : null;
|
|
2902
2419
|
const sessionOptions = getSessionOptions(this._backend);
|
|
2903
2420
|
let isCached = false;
|
|
2904
2421
|
if (isIOS()) {
|
|
2905
|
-
|
|
2422
|
+
logger3.info("iOS: passing model URLs directly to ORT (low-memory path)", {
|
|
2906
2423
|
modelUrl,
|
|
2907
2424
|
dataUrl
|
|
2908
2425
|
});
|
|
2909
2426
|
if (dataUrl) {
|
|
2910
2427
|
const dataFilename = dataUrl.split("/").pop();
|
|
2911
|
-
|
|
2428
|
+
logger3.info("iOS: setting externalData", { dataFilename, dataUrl });
|
|
2912
2429
|
sessionOptions.externalData = [{
|
|
2913
2430
|
path: dataFilename,
|
|
2914
2431
|
data: dataUrl
|
|
2915
2432
|
// URL string — ORT fetches directly into WASM
|
|
2916
2433
|
}];
|
|
2917
2434
|
}
|
|
2918
|
-
|
|
2435
|
+
logger3.info("iOS: calling InferenceSession.create() with URL string", {
|
|
2919
2436
|
modelUrl,
|
|
2920
2437
|
sessionOptions: JSON.stringify(
|
|
2921
2438
|
sessionOptions,
|
|
@@ -2925,14 +2442,14 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
2925
2442
|
try {
|
|
2926
2443
|
this.session = await this.ort.InferenceSession.create(modelUrl, sessionOptions);
|
|
2927
2444
|
} catch (sessionErr) {
|
|
2928
|
-
|
|
2445
|
+
logger3.error("iOS: InferenceSession.create() failed", {
|
|
2929
2446
|
error: sessionErr instanceof Error ? sessionErr.message : String(sessionErr),
|
|
2930
2447
|
errorType: sessionErr?.constructor?.name,
|
|
2931
2448
|
stack: sessionErr instanceof Error ? sessionErr.stack : void 0
|
|
2932
2449
|
});
|
|
2933
2450
|
throw sessionErr;
|
|
2934
2451
|
}
|
|
2935
|
-
|
|
2452
|
+
logger3.info("iOS: session created successfully", {
|
|
2936
2453
|
inputNames: this.session.inputNames,
|
|
2937
2454
|
outputNames: this.session.outputNames
|
|
2938
2455
|
});
|
|
@@ -2941,15 +2458,15 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
2941
2458
|
isCached = await cache.has(modelUrl);
|
|
2942
2459
|
let modelBuffer;
|
|
2943
2460
|
if (isCached) {
|
|
2944
|
-
|
|
2461
|
+
logger3.debug("Loading model from cache", { modelUrl });
|
|
2945
2462
|
modelBuffer = await cache.get(modelUrl);
|
|
2946
2463
|
if (!modelBuffer) {
|
|
2947
|
-
|
|
2464
|
+
logger3.warn("Cache corruption detected, clearing and retrying", { modelUrl });
|
|
2948
2465
|
await cache.delete(modelUrl);
|
|
2949
2466
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
2950
2467
|
}
|
|
2951
2468
|
} else {
|
|
2952
|
-
|
|
2469
|
+
logger3.debug("Fetching and caching model", { modelUrl });
|
|
2953
2470
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
2954
2471
|
}
|
|
2955
2472
|
if (!modelBuffer) {
|
|
@@ -2960,31 +2477,31 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
2960
2477
|
try {
|
|
2961
2478
|
const isDataCached = await cache.has(dataUrl);
|
|
2962
2479
|
if (isDataCached) {
|
|
2963
|
-
|
|
2480
|
+
logger3.debug("Loading external data from cache", { dataUrl });
|
|
2964
2481
|
externalDataBuffer = await cache.get(dataUrl);
|
|
2965
2482
|
if (!externalDataBuffer) {
|
|
2966
|
-
|
|
2483
|
+
logger3.warn("Cache corruption for external data, retrying", { dataUrl });
|
|
2967
2484
|
await cache.delete(dataUrl);
|
|
2968
2485
|
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
2969
2486
|
}
|
|
2970
2487
|
} else {
|
|
2971
|
-
|
|
2488
|
+
logger3.info("Fetching external model data", {
|
|
2972
2489
|
dataUrl,
|
|
2973
2490
|
note: "This may be a large download (383MB+)"
|
|
2974
2491
|
});
|
|
2975
2492
|
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
2976
2493
|
}
|
|
2977
|
-
|
|
2494
|
+
logger3.info("External data loaded", {
|
|
2978
2495
|
size: formatBytes(externalDataBuffer.byteLength)
|
|
2979
2496
|
});
|
|
2980
2497
|
} catch (err) {
|
|
2981
|
-
|
|
2498
|
+
logger3.debug("No external data file found (single-file model)", {
|
|
2982
2499
|
dataUrl,
|
|
2983
2500
|
error: err.message
|
|
2984
2501
|
});
|
|
2985
2502
|
}
|
|
2986
2503
|
}
|
|
2987
|
-
|
|
2504
|
+
logger3.debug("Creating ONNX session", {
|
|
2988
2505
|
graphSize: formatBytes(modelBuffer.byteLength),
|
|
2989
2506
|
externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
|
|
2990
2507
|
backend: this._backend
|
|
@@ -2999,12 +2516,12 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
2999
2516
|
const modelData = new Uint8Array(modelBuffer);
|
|
3000
2517
|
this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
|
|
3001
2518
|
}
|
|
3002
|
-
|
|
2519
|
+
logger3.info("ONNX session created successfully", {
|
|
3003
2520
|
executionProvider: this._backend,
|
|
3004
2521
|
backend: this._backend
|
|
3005
2522
|
});
|
|
3006
2523
|
const loadTimeMs = performance.now() - startTime;
|
|
3007
|
-
|
|
2524
|
+
logger3.info("Model loaded successfully", {
|
|
3008
2525
|
backend: this._backend,
|
|
3009
2526
|
loadTimeMs: Math.round(loadTimeMs),
|
|
3010
2527
|
inputs: this.session.inputNames,
|
|
@@ -3020,13 +2537,13 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3020
2537
|
model: "wav2vec2",
|
|
3021
2538
|
backend: this._backend
|
|
3022
2539
|
});
|
|
3023
|
-
|
|
2540
|
+
logger3.debug("Running warmup inference to initialize GPU context");
|
|
3024
2541
|
const warmupStart = performance.now();
|
|
3025
|
-
const warmupAudio = new Float32Array(
|
|
2542
|
+
const warmupAudio = new Float32Array(this.chunkSize);
|
|
3026
2543
|
const warmupIdentity = new Float32Array(this.numIdentityClasses);
|
|
3027
2544
|
warmupIdentity[0] = 1;
|
|
3028
2545
|
const warmupFeeds = {
|
|
3029
|
-
"audio": new this.ort.Tensor("float32", warmupAudio, [1,
|
|
2546
|
+
"audio": new this.ort.Tensor("float32", warmupAudio, [1, this.chunkSize]),
|
|
3030
2547
|
"identity": new this.ort.Tensor("float32", warmupIdentity, [1, this.numIdentityClasses])
|
|
3031
2548
|
};
|
|
3032
2549
|
const WARMUP_TIMEOUT_MS = 15e3;
|
|
@@ -3036,12 +2553,12 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3036
2553
|
]);
|
|
3037
2554
|
const warmupTimeMs = performance.now() - warmupStart;
|
|
3038
2555
|
if (warmupResult === "timeout") {
|
|
3039
|
-
|
|
2556
|
+
logger3.warn("Warmup inference timed out \u2014 GPU may be unresponsive. Continuing without warmup.", {
|
|
3040
2557
|
timeoutMs: WARMUP_TIMEOUT_MS,
|
|
3041
2558
|
backend: this._backend
|
|
3042
2559
|
});
|
|
3043
2560
|
} else {
|
|
3044
|
-
|
|
2561
|
+
logger3.info("Warmup inference complete", {
|
|
3045
2562
|
warmupTimeMs: Math.round(warmupTimeMs),
|
|
3046
2563
|
backend: this._backend
|
|
3047
2564
|
});
|
|
@@ -3069,11 +2586,10 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3069
2586
|
}
|
|
3070
2587
|
/**
|
|
3071
2588
|
* Run inference on raw audio
|
|
3072
|
-
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
2589
|
+
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
3073
2590
|
* @param identityIndex - Optional identity index (0-11, default 0 = neutral)
|
|
3074
2591
|
*
|
|
3075
|
-
*
|
|
3076
|
-
* Audio will be zero-padded or truncated to 16000 samples.
|
|
2592
|
+
* Audio will be zero-padded or truncated to chunkSize samples.
|
|
3077
2593
|
*/
|
|
3078
2594
|
async infer(audioSamples, identityIndex = 0) {
|
|
3079
2595
|
if (!this.session) {
|
|
@@ -3084,20 +2600,20 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3084
2600
|
}
|
|
3085
2601
|
const audioSamplesCopy = new Float32Array(audioSamples);
|
|
3086
2602
|
let audio;
|
|
3087
|
-
if (audioSamplesCopy.length ===
|
|
2603
|
+
if (audioSamplesCopy.length === this.chunkSize) {
|
|
3088
2604
|
audio = audioSamplesCopy;
|
|
3089
|
-
} else if (audioSamplesCopy.length <
|
|
3090
|
-
audio = new Float32Array(
|
|
2605
|
+
} else if (audioSamplesCopy.length < this.chunkSize) {
|
|
2606
|
+
audio = new Float32Array(this.chunkSize);
|
|
3091
2607
|
audio.set(audioSamplesCopy, 0);
|
|
3092
2608
|
} else {
|
|
3093
|
-
audio = audioSamplesCopy.slice(0,
|
|
2609
|
+
audio = audioSamplesCopy.slice(0, this.chunkSize);
|
|
3094
2610
|
}
|
|
3095
2611
|
const identity = new Float32Array(this.numIdentityClasses);
|
|
3096
2612
|
identity[Math.max(0, Math.min(identityIndex, this.numIdentityClasses - 1))] = 1;
|
|
3097
2613
|
const audioCopy = new Float32Array(audio);
|
|
3098
2614
|
const identityCopy = new Float32Array(identity);
|
|
3099
2615
|
const feeds = {
|
|
3100
|
-
"audio": new this.ort.Tensor("float32", audioCopy, [1,
|
|
2616
|
+
"audio": new this.ort.Tensor("float32", audioCopy, [1, this.chunkSize]),
|
|
3101
2617
|
"identity": new this.ort.Tensor("float32", identityCopy, [1, this.numIdentityClasses])
|
|
3102
2618
|
};
|
|
3103
2619
|
return this.queueInference(feeds);
|
|
@@ -3133,7 +2649,7 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3133
2649
|
const telemetry = getTelemetry();
|
|
3134
2650
|
const span = telemetry?.startSpan("Wav2Vec2.infer", {
|
|
3135
2651
|
"inference.backend": this._backend,
|
|
3136
|
-
"inference.input_samples":
|
|
2652
|
+
"inference.input_samples": this.chunkSize
|
|
3137
2653
|
});
|
|
3138
2654
|
try {
|
|
3139
2655
|
const startTime = performance.now();
|
|
@@ -3172,7 +2688,7 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3172
2688
|
blendshapes.push(symmetrizeBlendshapes(rawFrame));
|
|
3173
2689
|
}
|
|
3174
2690
|
const text = this.decodeCTC(asrLogits);
|
|
3175
|
-
|
|
2691
|
+
logger3.trace("Inference completed", {
|
|
3176
2692
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
3177
2693
|
numA2EFrames,
|
|
3178
2694
|
numASRFrames,
|
|
@@ -3206,12 +2722,12 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3206
2722
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
3207
2723
|
if (errMsg.includes("timed out")) {
|
|
3208
2724
|
this.poisoned = true;
|
|
3209
|
-
|
|
2725
|
+
logger3.error("CRITICAL: Inference session timed out \u2014 LAM is dead. Page reload required.", {
|
|
3210
2726
|
backend: this._backend,
|
|
3211
2727
|
timeoutMs: _Wav2Vec2Inference.INFERENCE_TIMEOUT_MS
|
|
3212
2728
|
});
|
|
3213
2729
|
} else {
|
|
3214
|
-
|
|
2730
|
+
logger3.error("Inference failed", { error: errMsg, backend: this._backend });
|
|
3215
2731
|
}
|
|
3216
2732
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
3217
2733
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -3252,56 +2768,74 @@ _Wav2Vec2Inference.INFERENCE_TIMEOUT_MS = 5e3;
|
|
|
3252
2768
|
_Wav2Vec2Inference.isWebGPUAvailable = isWebGPUAvailable;
|
|
3253
2769
|
var Wav2Vec2Inference = _Wav2Vec2Inference;
|
|
3254
2770
|
|
|
2771
|
+
// src/audio/audioUtils.ts
|
|
2772
|
+
function pcm16ToFloat32(buffer) {
|
|
2773
|
+
const byteLen = buffer.byteLength & ~1;
|
|
2774
|
+
const int16 = byteLen === buffer.byteLength ? new Int16Array(buffer) : new Int16Array(buffer, 0, byteLen / 2);
|
|
2775
|
+
const float32 = new Float32Array(int16.length);
|
|
2776
|
+
for (let i = 0; i < int16.length; i++) {
|
|
2777
|
+
float32[i] = int16[i] / 32768;
|
|
2778
|
+
}
|
|
2779
|
+
return float32;
|
|
2780
|
+
}
|
|
2781
|
+
function int16ToFloat32(int16) {
|
|
2782
|
+
const float32 = new Float32Array(int16.length);
|
|
2783
|
+
for (let i = 0; i < int16.length; i++) {
|
|
2784
|
+
float32[i] = int16[i] / 32768;
|
|
2785
|
+
}
|
|
2786
|
+
return float32;
|
|
2787
|
+
}
|
|
2788
|
+
|
|
3255
2789
|
// src/audio/FullFacePipeline.ts
|
|
3256
|
-
var
|
|
3257
|
-
var
|
|
3258
|
-
|
|
3259
|
-
|
|
3260
|
-
|
|
3261
|
-
|
|
3262
|
-
|
|
3263
|
-
|
|
3264
|
-
|
|
3265
|
-
|
|
3266
|
-
|
|
3267
|
-
|
|
3268
|
-
|
|
3269
|
-
|
|
3270
|
-
|
|
3271
|
-
|
|
3272
|
-
|
|
3273
|
-
|
|
3274
|
-
|
|
3275
|
-
|
|
3276
|
-
disappointed: "sad",
|
|
3277
|
-
frustrated: "angry",
|
|
3278
|
-
irritated: "angry",
|
|
3279
|
-
furious: "angry",
|
|
3280
|
-
annoyed: "angry",
|
|
3281
|
-
// SenseVoice labels
|
|
3282
|
-
fearful: "sad",
|
|
3283
|
-
disgusted: "angry",
|
|
3284
|
-
surprised: "happy"
|
|
3285
|
-
};
|
|
3286
|
-
var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
2790
|
+
var logger4 = createLogger("FullFacePipeline");
|
|
2791
|
+
var BLENDSHAPE_TO_GROUP = /* @__PURE__ */ new Map();
|
|
2792
|
+
for (const name of LAM_BLENDSHAPES) {
|
|
2793
|
+
if (name.startsWith("eye")) {
|
|
2794
|
+
BLENDSHAPE_TO_GROUP.set(name, "eyes");
|
|
2795
|
+
} else if (name.startsWith("brow")) {
|
|
2796
|
+
BLENDSHAPE_TO_GROUP.set(name, "brows");
|
|
2797
|
+
} else if (name.startsWith("jaw")) {
|
|
2798
|
+
BLENDSHAPE_TO_GROUP.set(name, "jaw");
|
|
2799
|
+
} else if (name.startsWith("mouth")) {
|
|
2800
|
+
BLENDSHAPE_TO_GROUP.set(name, "mouth");
|
|
2801
|
+
} else if (name.startsWith("cheek")) {
|
|
2802
|
+
BLENDSHAPE_TO_GROUP.set(name, "cheeks");
|
|
2803
|
+
} else if (name.startsWith("nose")) {
|
|
2804
|
+
BLENDSHAPE_TO_GROUP.set(name, "nose");
|
|
2805
|
+
} else if (name.startsWith("tongue")) {
|
|
2806
|
+
BLENDSHAPE_TO_GROUP.set(name, "tongue");
|
|
2807
|
+
}
|
|
2808
|
+
}
|
|
2809
|
+
var FullFacePipeline = class extends EventEmitter {
|
|
3287
2810
|
constructor(options) {
|
|
3288
2811
|
super();
|
|
3289
2812
|
this.options = options;
|
|
3290
2813
|
this.playbackStarted = false;
|
|
3291
2814
|
this.monitorInterval = null;
|
|
3292
2815
|
this.frameAnimationId = null;
|
|
3293
|
-
// Emotion state
|
|
3294
|
-
this.lastEmotionFrame = null;
|
|
3295
|
-
this.currentAudioEnergy = 0;
|
|
3296
2816
|
// Stale frame detection
|
|
3297
2817
|
this.lastNewFrameTime = 0;
|
|
3298
2818
|
this.lastKnownLamFrame = null;
|
|
3299
2819
|
this.staleWarningEmitted = false;
|
|
2820
|
+
// Diagnostic logging counter
|
|
2821
|
+
this.frameLoopCount = 0;
|
|
3300
2822
|
const sampleRate = options.sampleRate ?? 16e3;
|
|
3301
|
-
this.
|
|
3302
|
-
this.
|
|
3303
|
-
const
|
|
2823
|
+
this.profile = options.profile ?? {};
|
|
2824
|
+
this.staleThresholdMs = options.staleThresholdMs ?? 2e3;
|
|
2825
|
+
const isCpuModel = options.lam.modelId === "wav2arkit_cpu";
|
|
2826
|
+
const chunkSize = options.chunkSize ?? options.lam.chunkSize ?? 16e3;
|
|
2827
|
+
const chunkAccumulationMs = chunkSize / sampleRate * 1e3;
|
|
2828
|
+
const inferenceEstimateMs = isCpuModel ? 300 : options.lam.backend === "wasm" ? 250 : 80;
|
|
2829
|
+
const marginMs = 100;
|
|
2830
|
+
const autoDelay = Math.ceil(chunkAccumulationMs + inferenceEstimateMs + marginMs);
|
|
3304
2831
|
const audioDelayMs = options.audioDelayMs ?? autoDelay;
|
|
2832
|
+
logger4.info("FullFacePipeline config", {
|
|
2833
|
+
chunkSize,
|
|
2834
|
+
audioDelayMs,
|
|
2835
|
+
autoDelay,
|
|
2836
|
+
backend: options.lam.backend,
|
|
2837
|
+
modelId: options.lam.modelId
|
|
2838
|
+
});
|
|
3305
2839
|
this.scheduler = new AudioScheduler({
|
|
3306
2840
|
sampleRate,
|
|
3307
2841
|
initialLookaheadSec: audioDelayMs / 1e3
|
|
@@ -3310,20 +2844,15 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3310
2844
|
sampleRate,
|
|
3311
2845
|
targetDurationMs: options.chunkTargetMs ?? 200
|
|
3312
2846
|
});
|
|
3313
|
-
this.
|
|
2847
|
+
this.processor = new A2EProcessor({
|
|
2848
|
+
backend: options.lam,
|
|
3314
2849
|
sampleRate,
|
|
2850
|
+
chunkSize,
|
|
3315
2851
|
onError: (error) => {
|
|
3316
|
-
|
|
2852
|
+
logger4.error("A2E inference error", { message: error.message, stack: error.stack });
|
|
3317
2853
|
this.emit("error", error);
|
|
3318
2854
|
}
|
|
3319
2855
|
});
|
|
3320
|
-
this.emotionMapper = new EmotionToBlendshapeMapper({
|
|
3321
|
-
smoothingFactor: 0.15,
|
|
3322
|
-
confidenceThreshold: 0.3,
|
|
3323
|
-
intensity: 1,
|
|
3324
|
-
energyModulation: true
|
|
3325
|
-
});
|
|
3326
|
-
this.energyAnalyzer = new AudioEnergyAnalyzer();
|
|
3327
2856
|
}
|
|
3328
2857
|
/**
|
|
3329
2858
|
* Initialize the pipeline
|
|
@@ -3332,40 +2861,33 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3332
2861
|
await this.scheduler.initialize();
|
|
3333
2862
|
}
|
|
3334
2863
|
/**
|
|
3335
|
-
*
|
|
3336
|
-
|
|
3337
|
-
|
|
3338
|
-
|
|
3339
|
-
*
|
|
3340
|
-
* Supported labels: happy, excited, joyful, sad, melancholic, angry,
|
|
3341
|
-
* frustrated, neutral, etc.
|
|
3342
|
-
*
|
|
3343
|
-
* @param label - Emotion label string (case-insensitive)
|
|
3344
|
-
*/
|
|
3345
|
-
setEmotionLabel(label) {
|
|
3346
|
-
const normalized = label.toLowerCase();
|
|
3347
|
-
const mapped = EMOTION_LABEL_MAP[normalized] ?? "neutral";
|
|
3348
|
-
const probabilities = {
|
|
3349
|
-
neutral: 0.1,
|
|
3350
|
-
happy: 0.1,
|
|
3351
|
-
angry: 0.1,
|
|
3352
|
-
sad: 0.1
|
|
3353
|
-
};
|
|
3354
|
-
probabilities[mapped] = 0.7;
|
|
3355
|
-
const frame = {
|
|
3356
|
-
emotion: mapped,
|
|
3357
|
-
confidence: 0.7,
|
|
3358
|
-
probabilities
|
|
3359
|
-
};
|
|
3360
|
-
this.lastEmotionFrame = frame;
|
|
3361
|
-
logger3.info("Emotion label set", { label, mapped });
|
|
2864
|
+
* Update the ExpressionProfile at runtime (e.g., character switch).
|
|
2865
|
+
*/
|
|
2866
|
+
setProfile(profile) {
|
|
2867
|
+
this.profile = profile;
|
|
3362
2868
|
}
|
|
3363
2869
|
/**
|
|
3364
|
-
*
|
|
3365
|
-
*
|
|
2870
|
+
* Apply ExpressionProfile scaling to raw A2E blendshapes.
|
|
2871
|
+
*
|
|
2872
|
+
* For each blendshape:
|
|
2873
|
+
* 1. If an override exists for the blendshape name, use override as scaler
|
|
2874
|
+
* 2. Otherwise, use the group scaler (default 1.0)
|
|
2875
|
+
* 3. Clamp result to [0, 1]
|
|
3366
2876
|
*/
|
|
3367
|
-
|
|
3368
|
-
|
|
2877
|
+
applyProfile(raw) {
|
|
2878
|
+
const scaled = new Float32Array(52);
|
|
2879
|
+
for (let i = 0; i < 52; i++) {
|
|
2880
|
+
const name = LAM_BLENDSHAPES[i];
|
|
2881
|
+
let scaler;
|
|
2882
|
+
if (this.profile.overrides && this.profile.overrides[name] !== void 0) {
|
|
2883
|
+
scaler = this.profile.overrides[name];
|
|
2884
|
+
} else {
|
|
2885
|
+
const group = BLENDSHAPE_TO_GROUP.get(name);
|
|
2886
|
+
scaler = group ? this.profile[group] ?? 1 : 1;
|
|
2887
|
+
}
|
|
2888
|
+
scaled[i] = Math.min(1, Math.max(0, raw[i] * scaler));
|
|
2889
|
+
}
|
|
2890
|
+
return scaled;
|
|
3369
2891
|
}
|
|
3370
2892
|
/**
|
|
3371
2893
|
* Start a new playback session
|
|
@@ -3377,15 +2899,12 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3377
2899
|
this.stopMonitoring();
|
|
3378
2900
|
this.scheduler.reset();
|
|
3379
2901
|
this.coalescer.reset();
|
|
3380
|
-
this.
|
|
2902
|
+
this.processor.reset();
|
|
3381
2903
|
this.playbackStarted = false;
|
|
3382
|
-
this.lastEmotionFrame = null;
|
|
3383
|
-
this.currentAudioEnergy = 0;
|
|
3384
|
-
this.emotionMapper.reset();
|
|
3385
|
-
this.energyAnalyzer.reset();
|
|
3386
2904
|
this.lastNewFrameTime = 0;
|
|
3387
2905
|
this.lastKnownLamFrame = null;
|
|
3388
2906
|
this.staleWarningEmitted = false;
|
|
2907
|
+
this.frameLoopCount = 0;
|
|
3389
2908
|
this.scheduler.warmup();
|
|
3390
2909
|
this.startFrameLoop();
|
|
3391
2910
|
this.startMonitoring();
|
|
@@ -3393,8 +2912,8 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3393
2912
|
/**
|
|
3394
2913
|
* Receive audio chunk from network
|
|
3395
2914
|
*
|
|
3396
|
-
* Audio-first design: schedules audio immediately,
|
|
3397
|
-
* This prevents
|
|
2915
|
+
* Audio-first design: schedules audio immediately, A2E runs in background.
|
|
2916
|
+
* This prevents A2E inference (50-300ms) from blocking audio scheduling.
|
|
3398
2917
|
*
|
|
3399
2918
|
* @param chunk - Uint8Array containing Int16 PCM audio
|
|
3400
2919
|
*/
|
|
@@ -3409,100 +2928,69 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3409
2928
|
this.playbackStarted = true;
|
|
3410
2929
|
this.emit("playback_start", scheduleTime);
|
|
3411
2930
|
}
|
|
3412
|
-
|
|
3413
|
-
|
|
3414
|
-
|
|
3415
|
-
this.
|
|
2931
|
+
logger4.info("onAudioChunk \u2192 pushAudio", {
|
|
2932
|
+
float32Samples: float32.length,
|
|
2933
|
+
scheduleTime: scheduleTime.toFixed(3),
|
|
2934
|
+
currentTime: this.scheduler.getCurrentTime().toFixed(3),
|
|
2935
|
+
deltaToPlayback: (scheduleTime - this.scheduler.getCurrentTime()).toFixed(3)
|
|
3416
2936
|
});
|
|
3417
|
-
|
|
3418
|
-
/**
|
|
3419
|
-
* Get emotion frame for current animation.
|
|
3420
|
-
*
|
|
3421
|
-
* Priority:
|
|
3422
|
-
* 1. Explicit emotion label from setEmotionLabel()
|
|
3423
|
-
* 2. Prosody fallback: subtle brow movement from audio energy
|
|
3424
|
-
*/
|
|
3425
|
-
getEmotionFrame() {
|
|
3426
|
-
if (this.lastEmotionFrame) {
|
|
3427
|
-
return { frame: this.lastEmotionFrame, energy: this.currentAudioEnergy };
|
|
3428
|
-
}
|
|
3429
|
-
return { frame: null, energy: this.currentAudioEnergy };
|
|
3430
|
-
}
|
|
3431
|
-
/**
|
|
3432
|
-
* Merge LAM blendshapes with emotion upper face blendshapes
|
|
3433
|
-
*/
|
|
3434
|
-
mergeBlendshapes(lamFrame, emotionFrame, audioEnergy) {
|
|
3435
|
-
const merged = new Float32Array(52);
|
|
3436
|
-
let emotionBlendshapes;
|
|
3437
|
-
if (emotionFrame) {
|
|
3438
|
-
this.emotionMapper.mapFrame(emotionFrame, audioEnergy);
|
|
3439
|
-
this.emotionMapper.update(33);
|
|
3440
|
-
emotionBlendshapes = this.emotionMapper.getCurrentBlendshapes();
|
|
3441
|
-
} else {
|
|
3442
|
-
emotionBlendshapes = {};
|
|
3443
|
-
for (const name of UPPER_FACE_BLENDSHAPES) {
|
|
3444
|
-
emotionBlendshapes[name] = 0;
|
|
3445
|
-
}
|
|
3446
|
-
}
|
|
3447
|
-
for (let i = 0; i < 52; i++) {
|
|
3448
|
-
const name = LAM_BLENDSHAPES[i];
|
|
3449
|
-
if (UPPER_FACE_SET.has(name)) {
|
|
3450
|
-
const emotionValue = emotionBlendshapes[name] ?? 0;
|
|
3451
|
-
const lamValue = lamFrame[i];
|
|
3452
|
-
merged[i] = emotionValue * this.emotionBlendFactor + lamValue * this.lamBlendFactor;
|
|
3453
|
-
} else {
|
|
3454
|
-
merged[i] = lamFrame[i];
|
|
3455
|
-
}
|
|
3456
|
-
}
|
|
3457
|
-
return { merged, emotionBlendshapes };
|
|
2937
|
+
this.processor.pushAudio(float32, scheduleTime);
|
|
3458
2938
|
}
|
|
3459
2939
|
/**
|
|
3460
2940
|
* Start frame animation loop
|
|
2941
|
+
*
|
|
2942
|
+
* Polls A2EProcessor at render rate (60fps) for the latest inference frame
|
|
2943
|
+
* matching the current AudioContext time. Between inference batches (~30fps
|
|
2944
|
+
* bursts), getFrameForTime() holds the last frame.
|
|
3461
2945
|
*/
|
|
3462
2946
|
startFrameLoop() {
|
|
3463
2947
|
const updateFrame = () => {
|
|
2948
|
+
this.frameLoopCount++;
|
|
3464
2949
|
const currentTime = this.scheduler.getCurrentTime();
|
|
3465
|
-
const lamFrame = this.
|
|
3466
|
-
if (lamFrame) {
|
|
3467
|
-
|
|
3468
|
-
|
|
3469
|
-
|
|
3470
|
-
|
|
2950
|
+
const lamFrame = this.processor.getFrameForTime(currentTime);
|
|
2951
|
+
if (lamFrame && lamFrame !== this.lastKnownLamFrame) {
|
|
2952
|
+
this.lastNewFrameTime = performance.now();
|
|
2953
|
+
this.lastKnownLamFrame = lamFrame;
|
|
2954
|
+
this.staleWarningEmitted = false;
|
|
2955
|
+
logger4.info("New A2E frame", {
|
|
2956
|
+
jawOpen: lamFrame[24]?.toFixed(3),
|
|
2957
|
+
mouthClose: lamFrame[26]?.toFixed(3),
|
|
2958
|
+
browInnerUp: lamFrame[2]?.toFixed(3),
|
|
2959
|
+
browDownL: lamFrame[0]?.toFixed(3),
|
|
2960
|
+
browOuterUpL: lamFrame[3]?.toFixed(3),
|
|
2961
|
+
currentTime: currentTime.toFixed(3),
|
|
2962
|
+
queuedFrames: this.processor.queuedFrameCount
|
|
2963
|
+
});
|
|
2964
|
+
}
|
|
2965
|
+
if (this.frameLoopCount % 60 === 0) {
|
|
2966
|
+
logger4.info("Frame loop heartbeat", {
|
|
2967
|
+
frameLoopCount: this.frameLoopCount,
|
|
2968
|
+
currentTime: currentTime.toFixed(3),
|
|
2969
|
+
playbackEndTime: this.scheduler.getPlaybackEndTime().toFixed(3),
|
|
2970
|
+
queuedFrames: this.processor.queuedFrameCount,
|
|
2971
|
+
playbackStarted: this.playbackStarted,
|
|
2972
|
+
msSinceNewFrame: this.lastNewFrameTime > 0 ? Math.round(performance.now() - this.lastNewFrameTime) : -1,
|
|
2973
|
+
processorFill: this.processor.fillLevel.toFixed(2)
|
|
2974
|
+
});
|
|
2975
|
+
}
|
|
2976
|
+
if (this.playbackStarted && this.lastNewFrameTime > 0 && performance.now() - this.lastNewFrameTime > this.staleThresholdMs) {
|
|
2977
|
+
if (!this.staleWarningEmitted) {
|
|
2978
|
+
this.staleWarningEmitted = true;
|
|
2979
|
+
logger4.warn("A2E stalled \u2014 no new inference frames", {
|
|
2980
|
+
staleDurationMs: Math.round(performance.now() - this.lastNewFrameTime),
|
|
2981
|
+
queuedFrames: this.processor.queuedFrameCount
|
|
2982
|
+
});
|
|
3471
2983
|
}
|
|
3472
|
-
|
|
3473
|
-
|
|
2984
|
+
}
|
|
2985
|
+
if (lamFrame) {
|
|
2986
|
+
const scaled = this.applyProfile(lamFrame);
|
|
3474
2987
|
const fullFrame = {
|
|
3475
|
-
blendshapes:
|
|
3476
|
-
|
|
3477
|
-
emotionBlendshapes,
|
|
3478
|
-
emotion: emotionFrame,
|
|
2988
|
+
blendshapes: scaled,
|
|
2989
|
+
rawBlendshapes: lamFrame,
|
|
3479
2990
|
timestamp: currentTime
|
|
3480
2991
|
};
|
|
3481
2992
|
this.emit("full_frame_ready", fullFrame);
|
|
3482
2993
|
this.emit("lam_frame_ready", lamFrame);
|
|
3483
|
-
if (emotionFrame) {
|
|
3484
|
-
this.emit("emotion_frame_ready", emotionFrame);
|
|
3485
|
-
}
|
|
3486
|
-
} else if (this.playbackStarted && !this.lastKnownLamFrame) {
|
|
3487
|
-
const { frame: emotionFrame, energy } = this.getEmotionFrame();
|
|
3488
|
-
if (emotionFrame && energy > 0.05) {
|
|
3489
|
-
const startupFrame = new Float32Array(52);
|
|
3490
|
-
const { merged, emotionBlendshapes } = this.mergeBlendshapes(startupFrame, emotionFrame, energy);
|
|
3491
|
-
this.emit("full_frame_ready", {
|
|
3492
|
-
blendshapes: merged,
|
|
3493
|
-
lamBlendshapes: startupFrame,
|
|
3494
|
-
emotionBlendshapes,
|
|
3495
|
-
emotion: emotionFrame,
|
|
3496
|
-
timestamp: currentTime
|
|
3497
|
-
});
|
|
3498
|
-
}
|
|
3499
|
-
}
|
|
3500
|
-
if (this.playbackStarted && this.lastNewFrameTime > 0 && !this.staleWarningEmitted && performance.now() - this.lastNewFrameTime > _FullFacePipeline.STALE_FRAME_THRESHOLD_MS) {
|
|
3501
|
-
this.staleWarningEmitted = true;
|
|
3502
|
-
logger3.warn("LAM appears stalled \u2014 no new frames for 3+ seconds during playback", {
|
|
3503
|
-
staleDurationMs: Math.round(performance.now() - this.lastNewFrameTime),
|
|
3504
|
-
queuedFrames: this.lamPipeline.queuedFrameCount
|
|
3505
|
-
});
|
|
3506
2994
|
}
|
|
3507
2995
|
this.frameAnimationId = requestAnimationFrame(updateFrame);
|
|
3508
2996
|
};
|
|
@@ -3517,7 +3005,7 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3517
3005
|
const chunk = new Uint8Array(remaining);
|
|
3518
3006
|
await this.onAudioChunk(chunk);
|
|
3519
3007
|
}
|
|
3520
|
-
await this.
|
|
3008
|
+
await this.processor.flush();
|
|
3521
3009
|
}
|
|
3522
3010
|
/**
|
|
3523
3011
|
* Stop playback immediately with smooth fade-out
|
|
@@ -3526,12 +3014,8 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3526
3014
|
this.stopMonitoring();
|
|
3527
3015
|
await this.scheduler.cancelAll(fadeOutMs);
|
|
3528
3016
|
this.coalescer.reset();
|
|
3529
|
-
this.
|
|
3017
|
+
this.processor.reset();
|
|
3530
3018
|
this.playbackStarted = false;
|
|
3531
|
-
this.lastEmotionFrame = null;
|
|
3532
|
-
this.currentAudioEnergy = 0;
|
|
3533
|
-
this.emotionMapper.reset();
|
|
3534
|
-
this.energyAnalyzer.reset();
|
|
3535
3019
|
this.lastNewFrameTime = 0;
|
|
3536
3020
|
this.lastKnownLamFrame = null;
|
|
3537
3021
|
this.staleWarningEmitted = false;
|
|
@@ -3545,7 +3029,7 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3545
3029
|
clearInterval(this.monitorInterval);
|
|
3546
3030
|
}
|
|
3547
3031
|
this.monitorInterval = setInterval(() => {
|
|
3548
|
-
if (this.scheduler.isComplete() && this.
|
|
3032
|
+
if (this.scheduler.isComplete() && this.processor.queuedFrameCount === 0) {
|
|
3549
3033
|
this.emit("playback_complete", void 0);
|
|
3550
3034
|
this.stopMonitoring();
|
|
3551
3035
|
}
|
|
@@ -3571,20 +3055,12 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3571
3055
|
return {
|
|
3572
3056
|
playbackStarted: this.playbackStarted,
|
|
3573
3057
|
coalescerFill: this.coalescer.fillLevel,
|
|
3574
|
-
|
|
3575
|
-
|
|
3576
|
-
emotionLabel: this.lastEmotionFrame?.emotion ?? null,
|
|
3577
|
-
currentAudioEnergy: this.currentAudioEnergy,
|
|
3058
|
+
processorFill: this.processor.fillLevel,
|
|
3059
|
+
queuedFrames: this.processor.queuedFrameCount,
|
|
3578
3060
|
currentTime: this.scheduler.getCurrentTime(),
|
|
3579
3061
|
playbackEndTime: this.scheduler.getPlaybackEndTime()
|
|
3580
3062
|
};
|
|
3581
3063
|
}
|
|
3582
|
-
/**
|
|
3583
|
-
* Check if an explicit emotion label is currently set
|
|
3584
|
-
*/
|
|
3585
|
-
get hasEmotionLabel() {
|
|
3586
|
-
return this.lastEmotionFrame !== null;
|
|
3587
|
-
}
|
|
3588
3064
|
/**
|
|
3589
3065
|
* Cleanup resources
|
|
3590
3066
|
*/
|
|
@@ -3592,13 +3068,9 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3592
3068
|
this.stopMonitoring();
|
|
3593
3069
|
this.scheduler.dispose();
|
|
3594
3070
|
this.coalescer.reset();
|
|
3595
|
-
this.
|
|
3596
|
-
this.lastEmotionFrame = null;
|
|
3597
|
-
this.currentAudioEnergy = 0;
|
|
3071
|
+
this.processor.dispose();
|
|
3598
3072
|
}
|
|
3599
3073
|
};
|
|
3600
|
-
_FullFacePipeline.STALE_FRAME_THRESHOLD_MS = 3e3;
|
|
3601
|
-
var FullFacePipeline = _FullFacePipeline;
|
|
3602
3074
|
|
|
3603
3075
|
// src/inference/kaldiFbank.ts
|
|
3604
3076
|
function fft(re, im) {
|
|
@@ -3885,7 +3357,7 @@ function ctcGreedyDecode(logits, seqLen, vocabSize, tokenMap) {
|
|
|
3885
3357
|
}
|
|
3886
3358
|
|
|
3887
3359
|
// src/inference/SenseVoiceInference.ts
|
|
3888
|
-
var
|
|
3360
|
+
var logger5 = createLogger("SenseVoice");
|
|
3889
3361
|
var _SenseVoiceInference = class _SenseVoiceInference {
|
|
3890
3362
|
constructor(config) {
|
|
3891
3363
|
this.session = null;
|
|
@@ -3938,26 +3410,26 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
3938
3410
|
"model.backend_requested": this.config.backend
|
|
3939
3411
|
});
|
|
3940
3412
|
try {
|
|
3941
|
-
|
|
3413
|
+
logger5.info("Loading ONNX Runtime...", { preference: this.config.backend });
|
|
3942
3414
|
const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
|
|
3943
3415
|
this.ort = ort;
|
|
3944
3416
|
this._backend = backend;
|
|
3945
|
-
|
|
3946
|
-
|
|
3417
|
+
logger5.info("ONNX Runtime loaded", { backend: this._backend });
|
|
3418
|
+
logger5.debug("Fetching tokens vocabulary", { tokensUrl: this.config.tokensUrl });
|
|
3947
3419
|
const tokensResponse = await fetch(this.config.tokensUrl);
|
|
3948
3420
|
if (!tokensResponse.ok) {
|
|
3949
3421
|
throw new Error(`Failed to fetch tokens.txt: ${tokensResponse.status} ${tokensResponse.statusText}`);
|
|
3950
3422
|
}
|
|
3951
3423
|
const tokensText = await tokensResponse.text();
|
|
3952
3424
|
this.tokenMap = parseTokensFile(tokensText);
|
|
3953
|
-
|
|
3425
|
+
logger5.debug("Tokens loaded", { vocabSize: this.tokenMap.size });
|
|
3954
3426
|
const sessionOptions = getSessionOptions(this._backend);
|
|
3955
3427
|
if (this._backend === "webgpu") {
|
|
3956
3428
|
sessionOptions.graphOptimizationLevel = "basic";
|
|
3957
3429
|
}
|
|
3958
3430
|
let isCached = false;
|
|
3959
3431
|
if (isIOS()) {
|
|
3960
|
-
|
|
3432
|
+
logger5.info("iOS: passing model URL directly to ORT (low-memory path)", {
|
|
3961
3433
|
modelUrl: this.config.modelUrl
|
|
3962
3434
|
});
|
|
3963
3435
|
this.session = await this.ort.InferenceSession.create(
|
|
@@ -3969,14 +3441,14 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
3969
3441
|
isCached = await cache.has(this.config.modelUrl);
|
|
3970
3442
|
let modelBuffer;
|
|
3971
3443
|
if (isCached) {
|
|
3972
|
-
|
|
3444
|
+
logger5.debug("Loading model from cache", { modelUrl: this.config.modelUrl });
|
|
3973
3445
|
modelBuffer = await cache.get(this.config.modelUrl);
|
|
3974
3446
|
onProgress?.(modelBuffer.byteLength, modelBuffer.byteLength);
|
|
3975
3447
|
} else {
|
|
3976
|
-
|
|
3448
|
+
logger5.debug("Fetching and caching model", { modelUrl: this.config.modelUrl });
|
|
3977
3449
|
modelBuffer = await fetchWithCache(this.config.modelUrl, onProgress);
|
|
3978
3450
|
}
|
|
3979
|
-
|
|
3451
|
+
logger5.debug("Creating ONNX session", {
|
|
3980
3452
|
size: formatBytes(modelBuffer.byteLength),
|
|
3981
3453
|
backend: this._backend
|
|
3982
3454
|
});
|
|
@@ -3989,15 +3461,15 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
3989
3461
|
const cmvn = parseCMVNFromMetadata(metadata.neg_mean, metadata.inv_stddev);
|
|
3990
3462
|
this.negMean = cmvn.negMean;
|
|
3991
3463
|
this.invStddev = cmvn.invStddev;
|
|
3992
|
-
|
|
3464
|
+
logger5.debug("CMVN loaded from model metadata", { dim: this.negMean.length });
|
|
3993
3465
|
} else {
|
|
3994
|
-
|
|
3466
|
+
logger5.warn("CMVN not found in model metadata \u2014 features will not be normalized");
|
|
3995
3467
|
}
|
|
3996
3468
|
} catch (cmvnErr) {
|
|
3997
|
-
|
|
3469
|
+
logger5.warn("Failed to read CMVN from model metadata", { error: cmvnErr });
|
|
3998
3470
|
}
|
|
3999
3471
|
const loadTimeMs = performance.now() - startTime;
|
|
4000
|
-
|
|
3472
|
+
logger5.info("SenseVoice model loaded", {
|
|
4001
3473
|
backend: this._backend,
|
|
4002
3474
|
loadTimeMs: Math.round(loadTimeMs),
|
|
4003
3475
|
vocabSize: this.tokenMap.size,
|
|
@@ -4108,7 +3580,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
4108
3580
|
const vocabSize = logitsDims[2];
|
|
4109
3581
|
const decoded = ctcGreedyDecode(logitsData, seqLen, vocabSize, this.tokenMap);
|
|
4110
3582
|
const inferenceTimeMs = performance.now() - startTime;
|
|
4111
|
-
|
|
3583
|
+
logger5.trace("Transcription complete", {
|
|
4112
3584
|
text: decoded.text.substring(0, 50),
|
|
4113
3585
|
language: decoded.language,
|
|
4114
3586
|
emotion: decoded.emotion,
|
|
@@ -4146,7 +3618,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
4146
3618
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
4147
3619
|
if (errMsg.includes("timed out")) {
|
|
4148
3620
|
this.poisoned = true;
|
|
4149
|
-
|
|
3621
|
+
logger5.error("CRITICAL: Inference session timed out \u2014 SenseVoice is dead. Page reload required.", {
|
|
4150
3622
|
backend: this._backend,
|
|
4151
3623
|
timeoutMs: _SenseVoiceInference.INFERENCE_TIMEOUT_MS
|
|
4152
3624
|
});
|
|
@@ -4154,7 +3626,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
4154
3626
|
const oomError = new Error(
|
|
4155
3627
|
`SenseVoice inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
|
|
4156
3628
|
);
|
|
4157
|
-
|
|
3629
|
+
logger5.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
4158
3630
|
pointer: `0x${err.toString(16)}`,
|
|
4159
3631
|
backend: this._backend
|
|
4160
3632
|
});
|
|
@@ -4167,7 +3639,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
4167
3639
|
reject(oomError);
|
|
4168
3640
|
return;
|
|
4169
3641
|
} else {
|
|
4170
|
-
|
|
3642
|
+
logger5.error("Inference failed", { error: errMsg, backend: this._backend });
|
|
4171
3643
|
}
|
|
4172
3644
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
4173
3645
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -4196,7 +3668,7 @@ _SenseVoiceInference.INFERENCE_TIMEOUT_MS = 1e4;
|
|
|
4196
3668
|
var SenseVoiceInference = _SenseVoiceInference;
|
|
4197
3669
|
|
|
4198
3670
|
// src/inference/SenseVoiceWorker.ts
|
|
4199
|
-
var
|
|
3671
|
+
var logger6 = createLogger("SenseVoiceWorker");
|
|
4200
3672
|
var WASM_CDN_PATH2 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
4201
3673
|
var LOAD_TIMEOUT_MS = 3e4;
|
|
4202
3674
|
var INFERENCE_TIMEOUT_MS = 1e4;
|
|
@@ -4929,7 +4401,7 @@ var SenseVoiceWorker = class {
|
|
|
4929
4401
|
this.handleWorkerMessage(event.data);
|
|
4930
4402
|
};
|
|
4931
4403
|
worker.onerror = (error) => {
|
|
4932
|
-
|
|
4404
|
+
logger6.error("Worker error", { error: error.message });
|
|
4933
4405
|
for (const [, resolver] of this.pendingResolvers) {
|
|
4934
4406
|
resolver.reject(new Error(`Worker error: ${error.message}`));
|
|
4935
4407
|
}
|
|
@@ -5009,9 +4481,9 @@ var SenseVoiceWorker = class {
|
|
|
5009
4481
|
"model.language": this.config.language
|
|
5010
4482
|
});
|
|
5011
4483
|
try {
|
|
5012
|
-
|
|
4484
|
+
logger6.info("Creating SenseVoice worker...");
|
|
5013
4485
|
this.worker = this.createWorker();
|
|
5014
|
-
|
|
4486
|
+
logger6.info("Loading model in worker...", {
|
|
5015
4487
|
modelUrl: this.config.modelUrl,
|
|
5016
4488
|
tokensUrl: this.config.tokensUrl,
|
|
5017
4489
|
language: this.config.language,
|
|
@@ -5033,7 +4505,7 @@ var SenseVoiceWorker = class {
|
|
|
5033
4505
|
this._isLoaded = true;
|
|
5034
4506
|
const loadTimeMs = performance.now() - startTime;
|
|
5035
4507
|
onProgress?.(1, 1);
|
|
5036
|
-
|
|
4508
|
+
logger6.info("SenseVoice worker loaded successfully", {
|
|
5037
4509
|
backend: "wasm",
|
|
5038
4510
|
loadTimeMs: Math.round(loadTimeMs),
|
|
5039
4511
|
workerLoadTimeMs: Math.round(result.loadTimeMs),
|
|
@@ -5112,7 +4584,7 @@ var SenseVoiceWorker = class {
|
|
|
5112
4584
|
INFERENCE_TIMEOUT_MS
|
|
5113
4585
|
);
|
|
5114
4586
|
const totalTimeMs = performance.now() - startTime;
|
|
5115
|
-
|
|
4587
|
+
logger6.trace("Worker transcription complete", {
|
|
5116
4588
|
text: result.text.substring(0, 50),
|
|
5117
4589
|
language: result.language,
|
|
5118
4590
|
emotion: result.emotion,
|
|
@@ -5148,11 +4620,11 @@ var SenseVoiceWorker = class {
|
|
|
5148
4620
|
} catch (err) {
|
|
5149
4621
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
5150
4622
|
if (errMsg.includes("timed out")) {
|
|
5151
|
-
|
|
4623
|
+
logger6.error("CRITICAL: Worker inference timed out \u2014 SenseVoice worker is dead. Page reload required.", {
|
|
5152
4624
|
timeoutMs: INFERENCE_TIMEOUT_MS
|
|
5153
4625
|
});
|
|
5154
4626
|
} else {
|
|
5155
|
-
|
|
4627
|
+
logger6.error("Worker inference failed", { error: errMsg });
|
|
5156
4628
|
}
|
|
5157
4629
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
5158
4630
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -5190,7 +4662,7 @@ var SenseVoiceWorker = class {
|
|
|
5190
4662
|
};
|
|
5191
4663
|
|
|
5192
4664
|
// src/inference/UnifiedInferenceWorker.ts
|
|
5193
|
-
var
|
|
4665
|
+
var logger7 = createLogger("UnifiedInferenceWorker");
|
|
5194
4666
|
var WASM_CDN_PATH3 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
5195
4667
|
var INIT_TIMEOUT_MS = 15e3;
|
|
5196
4668
|
var SV_LOAD_TIMEOUT_MS = 3e4;
|
|
@@ -5886,7 +5358,7 @@ var UnifiedInferenceWorker = class {
|
|
|
5886
5358
|
const telemetry = getTelemetry();
|
|
5887
5359
|
const span = telemetry?.startSpan("UnifiedInferenceWorker.init");
|
|
5888
5360
|
try {
|
|
5889
|
-
|
|
5361
|
+
logger7.info("Creating unified inference worker...");
|
|
5890
5362
|
this.worker = this.createWorker();
|
|
5891
5363
|
await this.sendMessage(
|
|
5892
5364
|
{ type: "init", wasmPaths: WASM_CDN_PATH3, isIOS: isIOS() },
|
|
@@ -5895,7 +5367,7 @@ var UnifiedInferenceWorker = class {
|
|
|
5895
5367
|
);
|
|
5896
5368
|
this.initialized = true;
|
|
5897
5369
|
const loadTimeMs = performance.now() - startTime;
|
|
5898
|
-
|
|
5370
|
+
logger7.info("Unified worker initialized", { loadTimeMs: Math.round(loadTimeMs) });
|
|
5899
5371
|
span?.setAttributes({ "worker.init_time_ms": loadTimeMs });
|
|
5900
5372
|
span?.end();
|
|
5901
5373
|
} catch (error) {
|
|
@@ -5949,8 +5421,8 @@ var UnifiedInferenceWorker = class {
|
|
|
5949
5421
|
if (!this.worker) return;
|
|
5950
5422
|
await this.sendMessage({ type: "sv:dispose" }, "sv:disposed", DISPOSE_TIMEOUT_MS);
|
|
5951
5423
|
}
|
|
5952
|
-
// ── Wav2ArkitCpu (
|
|
5953
|
-
async
|
|
5424
|
+
// ── Wav2ArkitCpu (A2E) ──────────────────────────────────────────────
|
|
5425
|
+
async loadA2E(config) {
|
|
5954
5426
|
this.assertReady();
|
|
5955
5427
|
const startTime = performance.now();
|
|
5956
5428
|
const result = await this.sendMessage(
|
|
@@ -5971,7 +5443,7 @@ var UnifiedInferenceWorker = class {
|
|
|
5971
5443
|
outputNames: result.outputNames
|
|
5972
5444
|
};
|
|
5973
5445
|
}
|
|
5974
|
-
async
|
|
5446
|
+
async inferA2E(audio) {
|
|
5975
5447
|
this.assertReady();
|
|
5976
5448
|
return this.sendMessage(
|
|
5977
5449
|
{ type: "cpu:infer", audio },
|
|
@@ -5979,7 +5451,7 @@ var UnifiedInferenceWorker = class {
|
|
|
5979
5451
|
CPU_INFER_TIMEOUT_MS
|
|
5980
5452
|
);
|
|
5981
5453
|
}
|
|
5982
|
-
async
|
|
5454
|
+
async disposeA2E() {
|
|
5983
5455
|
if (!this.worker) return;
|
|
5984
5456
|
await this.sendMessage({ type: "cpu:dispose" }, "cpu:disposed", DISPOSE_TIMEOUT_MS);
|
|
5985
5457
|
}
|
|
@@ -6069,7 +5541,7 @@ var UnifiedInferenceWorker = class {
|
|
|
6069
5541
|
this.handleWorkerMessage(event.data);
|
|
6070
5542
|
};
|
|
6071
5543
|
worker.onerror = (error) => {
|
|
6072
|
-
|
|
5544
|
+
logger7.error("Unified worker error", { error: error.message });
|
|
6073
5545
|
this.rejectAllPending(`Worker error: ${error.message}`);
|
|
6074
5546
|
};
|
|
6075
5547
|
return worker;
|
|
@@ -6083,7 +5555,7 @@ var UnifiedInferenceWorker = class {
|
|
|
6083
5555
|
this.pendingRequests.delete(requestId);
|
|
6084
5556
|
pending.reject(new Error(data.error));
|
|
6085
5557
|
} else {
|
|
6086
|
-
|
|
5558
|
+
logger7.error("Worker broadcast error", { error: data.error });
|
|
6087
5559
|
this.rejectAllPending(data.error);
|
|
6088
5560
|
}
|
|
6089
5561
|
return;
|
|
@@ -6105,7 +5577,7 @@ var UnifiedInferenceWorker = class {
|
|
|
6105
5577
|
const timeout = setTimeout(() => {
|
|
6106
5578
|
this.pendingRequests.delete(requestId);
|
|
6107
5579
|
this.poisoned = true;
|
|
6108
|
-
|
|
5580
|
+
logger7.error("CRITICAL: Worker operation timed out \u2014 worker is dead", {
|
|
6109
5581
|
type: message.type,
|
|
6110
5582
|
timeoutMs
|
|
6111
5583
|
});
|
|
@@ -6171,7 +5643,7 @@ var SenseVoiceUnifiedAdapter = class {
|
|
|
6171
5643
|
});
|
|
6172
5644
|
this._isLoaded = true;
|
|
6173
5645
|
onProgress?.(1, 1);
|
|
6174
|
-
|
|
5646
|
+
logger7.info("SenseVoice loaded via unified worker", {
|
|
6175
5647
|
backend: "wasm",
|
|
6176
5648
|
loadTimeMs: Math.round(result.loadTimeMs),
|
|
6177
5649
|
vocabSize: result.vocabSize
|
|
@@ -6212,6 +5684,7 @@ var SenseVoiceUnifiedAdapter = class {
|
|
|
6212
5684
|
var Wav2ArkitCpuUnifiedAdapter = class {
|
|
6213
5685
|
constructor(worker, config) {
|
|
6214
5686
|
this.modelId = "wav2arkit_cpu";
|
|
5687
|
+
this.chunkSize = 16e3;
|
|
6215
5688
|
this._isLoaded = false;
|
|
6216
5689
|
this.inferenceQueue = Promise.resolve();
|
|
6217
5690
|
this.worker = worker;
|
|
@@ -6230,12 +5703,12 @@ var Wav2ArkitCpuUnifiedAdapter = class {
|
|
|
6230
5703
|
});
|
|
6231
5704
|
try {
|
|
6232
5705
|
const externalDataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${this.config.modelUrl}.data` : null;
|
|
6233
|
-
const result = await this.worker.
|
|
5706
|
+
const result = await this.worker.loadA2E({
|
|
6234
5707
|
modelUrl: this.config.modelUrl,
|
|
6235
5708
|
externalDataUrl: externalDataUrl || null
|
|
6236
5709
|
});
|
|
6237
5710
|
this._isLoaded = true;
|
|
6238
|
-
|
|
5711
|
+
logger7.info("Wav2ArkitCpu loaded via unified worker", {
|
|
6239
5712
|
backend: "wasm",
|
|
6240
5713
|
loadTimeMs: Math.round(result.loadTimeMs)
|
|
6241
5714
|
});
|
|
@@ -6262,7 +5735,7 @@ var Wav2ArkitCpuUnifiedAdapter = class {
|
|
|
6262
5735
|
});
|
|
6263
5736
|
try {
|
|
6264
5737
|
const startTime = performance.now();
|
|
6265
|
-
const result = await this.worker.
|
|
5738
|
+
const result = await this.worker.inferA2E(audioCopy);
|
|
6266
5739
|
const inferenceTimeMs = performance.now() - startTime;
|
|
6267
5740
|
const flatBuffer = result.blendshapes;
|
|
6268
5741
|
const { numFrames, numBlendshapes } = result;
|
|
@@ -6285,7 +5758,7 @@ var Wav2ArkitCpuUnifiedAdapter = class {
|
|
|
6285
5758
|
}
|
|
6286
5759
|
async dispose() {
|
|
6287
5760
|
if (this._isLoaded) {
|
|
6288
|
-
await this.worker.
|
|
5761
|
+
await this.worker.disposeA2E();
|
|
6289
5762
|
this._isLoaded = false;
|
|
6290
5763
|
}
|
|
6291
5764
|
}
|
|
@@ -6341,7 +5814,7 @@ var SileroVADUnifiedAdapter = class {
|
|
|
6341
5814
|
sampleRate: this.config.sampleRate
|
|
6342
5815
|
});
|
|
6343
5816
|
this._isLoaded = true;
|
|
6344
|
-
|
|
5817
|
+
logger7.info("SileroVAD loaded via unified worker", {
|
|
6345
5818
|
backend: "wasm",
|
|
6346
5819
|
loadTimeMs: Math.round(result.loadTimeMs),
|
|
6347
5820
|
sampleRate: this.config.sampleRate,
|
|
@@ -6422,10 +5895,10 @@ var SileroVADUnifiedAdapter = class {
|
|
|
6422
5895
|
};
|
|
6423
5896
|
|
|
6424
5897
|
// src/inference/createSenseVoice.ts
|
|
6425
|
-
var
|
|
5898
|
+
var logger8 = createLogger("createSenseVoice");
|
|
6426
5899
|
function createSenseVoice(config) {
|
|
6427
5900
|
if (config.unifiedWorker) {
|
|
6428
|
-
|
|
5901
|
+
logger8.info("Creating SenseVoiceUnifiedAdapter (shared unified worker)");
|
|
6429
5902
|
return new SenseVoiceUnifiedAdapter(config.unifiedWorker, {
|
|
6430
5903
|
modelUrl: config.modelUrl,
|
|
6431
5904
|
tokensUrl: config.tokensUrl,
|
|
@@ -6438,7 +5911,7 @@ function createSenseVoice(config) {
|
|
|
6438
5911
|
if (!SenseVoiceWorker.isSupported()) {
|
|
6439
5912
|
throw new Error("Web Workers are not supported in this environment");
|
|
6440
5913
|
}
|
|
6441
|
-
|
|
5914
|
+
logger8.info("Creating SenseVoiceWorker (off-main-thread)");
|
|
6442
5915
|
return new SenseVoiceWorker({
|
|
6443
5916
|
modelUrl: config.modelUrl,
|
|
6444
5917
|
tokensUrl: config.tokensUrl,
|
|
@@ -6447,7 +5920,7 @@ function createSenseVoice(config) {
|
|
|
6447
5920
|
});
|
|
6448
5921
|
}
|
|
6449
5922
|
if (useWorker === false) {
|
|
6450
|
-
|
|
5923
|
+
logger8.info("Creating SenseVoiceInference (main thread)");
|
|
6451
5924
|
return new SenseVoiceInference({
|
|
6452
5925
|
modelUrl: config.modelUrl,
|
|
6453
5926
|
tokensUrl: config.tokensUrl,
|
|
@@ -6456,7 +5929,7 @@ function createSenseVoice(config) {
|
|
|
6456
5929
|
});
|
|
6457
5930
|
}
|
|
6458
5931
|
if (SenseVoiceWorker.isSupported() && !isIOS()) {
|
|
6459
|
-
|
|
5932
|
+
logger8.info("Auto-detected: creating SenseVoiceWorker (off-main-thread)");
|
|
6460
5933
|
return new SenseVoiceWorker({
|
|
6461
5934
|
modelUrl: config.modelUrl,
|
|
6462
5935
|
tokensUrl: config.tokensUrl,
|
|
@@ -6464,7 +5937,7 @@ function createSenseVoice(config) {
|
|
|
6464
5937
|
textNorm: config.textNorm
|
|
6465
5938
|
});
|
|
6466
5939
|
}
|
|
6467
|
-
|
|
5940
|
+
logger8.info("Auto-detected: creating SenseVoiceInference (main thread)", {
|
|
6468
5941
|
reason: isIOS() ? "iOS (shared ORT instance)" : "Worker unsupported"
|
|
6469
5942
|
});
|
|
6470
5943
|
return new SenseVoiceInference({
|
|
@@ -6476,10 +5949,11 @@ function createSenseVoice(config) {
|
|
|
6476
5949
|
}
|
|
6477
5950
|
|
|
6478
5951
|
// src/inference/Wav2ArkitCpuInference.ts
|
|
6479
|
-
var
|
|
5952
|
+
var logger9 = createLogger("Wav2ArkitCpu");
|
|
6480
5953
|
var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
6481
5954
|
constructor(config) {
|
|
6482
5955
|
this.modelId = "wav2arkit_cpu";
|
|
5956
|
+
this.chunkSize = 16e3;
|
|
6483
5957
|
this.session = null;
|
|
6484
5958
|
this.ort = null;
|
|
6485
5959
|
this._backend = "wasm";
|
|
@@ -6517,16 +5991,16 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6517
5991
|
});
|
|
6518
5992
|
try {
|
|
6519
5993
|
const preference = this.config.backend || "wasm";
|
|
6520
|
-
|
|
5994
|
+
logger9.info("Loading ONNX Runtime...", { preference });
|
|
6521
5995
|
const { ort, backend } = await getOnnxRuntimeForPreference(preference);
|
|
6522
5996
|
this.ort = ort;
|
|
6523
5997
|
this._backend = backend;
|
|
6524
|
-
|
|
5998
|
+
logger9.info("ONNX Runtime loaded", { backend: this._backend });
|
|
6525
5999
|
const modelUrl = this.config.modelUrl;
|
|
6526
6000
|
const dataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${modelUrl}.data` : null;
|
|
6527
6001
|
const sessionOptions = getSessionOptions(this._backend);
|
|
6528
6002
|
if (isIOS()) {
|
|
6529
|
-
|
|
6003
|
+
logger9.info("iOS: passing model URLs directly to ORT (low-memory path)", {
|
|
6530
6004
|
modelUrl,
|
|
6531
6005
|
dataUrl
|
|
6532
6006
|
});
|
|
@@ -6544,15 +6018,15 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6544
6018
|
const isCached = await cache.has(modelUrl);
|
|
6545
6019
|
let modelBuffer;
|
|
6546
6020
|
if (isCached) {
|
|
6547
|
-
|
|
6021
|
+
logger9.debug("Loading model from cache", { modelUrl });
|
|
6548
6022
|
modelBuffer = await cache.get(modelUrl);
|
|
6549
6023
|
if (!modelBuffer) {
|
|
6550
|
-
|
|
6024
|
+
logger9.warn("Cache corruption detected, clearing and retrying", { modelUrl });
|
|
6551
6025
|
await cache.delete(modelUrl);
|
|
6552
6026
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
6553
6027
|
}
|
|
6554
6028
|
} else {
|
|
6555
|
-
|
|
6029
|
+
logger9.debug("Fetching and caching model graph", { modelUrl });
|
|
6556
6030
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
6557
6031
|
}
|
|
6558
6032
|
if (!modelBuffer) {
|
|
@@ -6563,31 +6037,31 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6563
6037
|
try {
|
|
6564
6038
|
const isDataCached = await cache.has(dataUrl);
|
|
6565
6039
|
if (isDataCached) {
|
|
6566
|
-
|
|
6040
|
+
logger9.debug("Loading external data from cache", { dataUrl });
|
|
6567
6041
|
externalDataBuffer = await cache.get(dataUrl);
|
|
6568
6042
|
if (!externalDataBuffer) {
|
|
6569
|
-
|
|
6043
|
+
logger9.warn("Cache corruption for external data, retrying", { dataUrl });
|
|
6570
6044
|
await cache.delete(dataUrl);
|
|
6571
6045
|
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
6572
6046
|
}
|
|
6573
6047
|
} else {
|
|
6574
|
-
|
|
6048
|
+
logger9.info("Fetching external model data", {
|
|
6575
6049
|
dataUrl,
|
|
6576
6050
|
note: "This may be a large download (400MB+)"
|
|
6577
6051
|
});
|
|
6578
6052
|
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
6579
6053
|
}
|
|
6580
|
-
|
|
6054
|
+
logger9.info("External data loaded", {
|
|
6581
6055
|
size: formatBytes(externalDataBuffer.byteLength)
|
|
6582
6056
|
});
|
|
6583
6057
|
} catch (err) {
|
|
6584
|
-
|
|
6058
|
+
logger9.debug("No external data file found (single-file model)", {
|
|
6585
6059
|
dataUrl,
|
|
6586
6060
|
error: err.message
|
|
6587
6061
|
});
|
|
6588
6062
|
}
|
|
6589
6063
|
}
|
|
6590
|
-
|
|
6064
|
+
logger9.debug("Creating ONNX session", {
|
|
6591
6065
|
graphSize: formatBytes(modelBuffer.byteLength),
|
|
6592
6066
|
externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
|
|
6593
6067
|
backend: this._backend
|
|
@@ -6603,7 +6077,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6603
6077
|
this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
|
|
6604
6078
|
}
|
|
6605
6079
|
const loadTimeMs = performance.now() - startTime;
|
|
6606
|
-
|
|
6080
|
+
logger9.info("Model loaded successfully", {
|
|
6607
6081
|
backend: this._backend,
|
|
6608
6082
|
loadTimeMs: Math.round(loadTimeMs),
|
|
6609
6083
|
inputs: this.session.inputNames,
|
|
@@ -6619,12 +6093,12 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6619
6093
|
model: "wav2arkit_cpu",
|
|
6620
6094
|
backend: this._backend
|
|
6621
6095
|
});
|
|
6622
|
-
|
|
6096
|
+
logger9.debug("Running warmup inference");
|
|
6623
6097
|
const warmupStart = performance.now();
|
|
6624
6098
|
const silentAudio = new Float32Array(16e3);
|
|
6625
6099
|
await this.infer(silentAudio);
|
|
6626
6100
|
const warmupTimeMs = performance.now() - warmupStart;
|
|
6627
|
-
|
|
6101
|
+
logger9.info("Warmup inference complete", {
|
|
6628
6102
|
warmupTimeMs: Math.round(warmupTimeMs),
|
|
6629
6103
|
backend: this._backend
|
|
6630
6104
|
});
|
|
@@ -6711,7 +6185,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6711
6185
|
const symmetrized = symmetrizeBlendshapes(rawFrame);
|
|
6712
6186
|
blendshapes.push(symmetrized);
|
|
6713
6187
|
}
|
|
6714
|
-
|
|
6188
|
+
logger9.trace("Inference completed", {
|
|
6715
6189
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
6716
6190
|
numFrames,
|
|
6717
6191
|
inputSamples
|
|
@@ -6739,7 +6213,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6739
6213
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
6740
6214
|
if (errMsg.includes("timed out")) {
|
|
6741
6215
|
this.poisoned = true;
|
|
6742
|
-
|
|
6216
|
+
logger9.error("CRITICAL: Inference session timed out \u2014 Wav2ArkitCpu is dead. Page reload required.", {
|
|
6743
6217
|
backend: this._backend,
|
|
6744
6218
|
timeoutMs: _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS
|
|
6745
6219
|
});
|
|
@@ -6747,7 +6221,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6747
6221
|
const oomError = new Error(
|
|
6748
6222
|
`Wav2ArkitCpu inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
|
|
6749
6223
|
);
|
|
6750
|
-
|
|
6224
|
+
logger9.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
6751
6225
|
pointer: `0x${err.toString(16)}`,
|
|
6752
6226
|
backend: this._backend
|
|
6753
6227
|
});
|
|
@@ -6760,7 +6234,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6760
6234
|
reject(oomError);
|
|
6761
6235
|
return;
|
|
6762
6236
|
} else {
|
|
6763
|
-
|
|
6237
|
+
logger9.error("Inference failed", { error: errMsg, backend: this._backend });
|
|
6764
6238
|
}
|
|
6765
6239
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
6766
6240
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -6787,7 +6261,7 @@ _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS = 5e3;
|
|
|
6787
6261
|
var Wav2ArkitCpuInference = _Wav2ArkitCpuInference;
|
|
6788
6262
|
|
|
6789
6263
|
// src/inference/Wav2ArkitCpuWorker.ts
|
|
6790
|
-
var
|
|
6264
|
+
var logger10 = createLogger("Wav2ArkitCpuWorker");
|
|
6791
6265
|
var WASM_CDN_PATH4 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
6792
6266
|
var LOAD_TIMEOUT_MS2 = 6e4;
|
|
6793
6267
|
var INFERENCE_TIMEOUT_MS2 = 5e3;
|
|
@@ -7033,6 +6507,7 @@ self.onerror = function(err) {
|
|
|
7033
6507
|
var Wav2ArkitCpuWorker = class {
|
|
7034
6508
|
constructor(config) {
|
|
7035
6509
|
this.modelId = "wav2arkit_cpu";
|
|
6510
|
+
this.chunkSize = 16e3;
|
|
7036
6511
|
this.worker = null;
|
|
7037
6512
|
this.isLoading = false;
|
|
7038
6513
|
this._isLoaded = false;
|
|
@@ -7067,7 +6542,7 @@ var Wav2ArkitCpuWorker = class {
|
|
|
7067
6542
|
this.handleWorkerMessage(event.data);
|
|
7068
6543
|
};
|
|
7069
6544
|
worker.onerror = (error) => {
|
|
7070
|
-
|
|
6545
|
+
logger10.error("Worker error", { error: error.message });
|
|
7071
6546
|
for (const [, resolver] of this.pendingResolvers) {
|
|
7072
6547
|
resolver.reject(new Error(`Worker error: ${error.message}`));
|
|
7073
6548
|
}
|
|
@@ -7143,10 +6618,10 @@ var Wav2ArkitCpuWorker = class {
|
|
|
7143
6618
|
"model.backend_requested": "wasm"
|
|
7144
6619
|
});
|
|
7145
6620
|
try {
|
|
7146
|
-
|
|
6621
|
+
logger10.info("Creating wav2arkit_cpu worker...");
|
|
7147
6622
|
this.worker = this.createWorker();
|
|
7148
6623
|
const externalDataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${this.config.modelUrl}.data` : null;
|
|
7149
|
-
|
|
6624
|
+
logger10.info("Loading model in worker...", {
|
|
7150
6625
|
modelUrl: this.config.modelUrl,
|
|
7151
6626
|
externalDataUrl,
|
|
7152
6627
|
isIOS: isIOS()
|
|
@@ -7164,7 +6639,7 @@ var Wav2ArkitCpuWorker = class {
|
|
|
7164
6639
|
);
|
|
7165
6640
|
this._isLoaded = true;
|
|
7166
6641
|
const loadTimeMs = performance.now() - startTime;
|
|
7167
|
-
|
|
6642
|
+
logger10.info("Wav2ArkitCpu worker loaded successfully", {
|
|
7168
6643
|
backend: "wasm",
|
|
7169
6644
|
loadTimeMs: Math.round(loadTimeMs),
|
|
7170
6645
|
workerLoadTimeMs: Math.round(result.loadTimeMs),
|
|
@@ -7249,7 +6724,7 @@ var Wav2ArkitCpuWorker = class {
|
|
|
7249
6724
|
for (let f = 0; f < numFrames; f++) {
|
|
7250
6725
|
blendshapes.push(flatBuffer.slice(f * numBlendshapes, (f + 1) * numBlendshapes));
|
|
7251
6726
|
}
|
|
7252
|
-
|
|
6727
|
+
logger10.trace("Worker inference completed", {
|
|
7253
6728
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
7254
6729
|
workerTimeMs: Math.round(result.inferenceTimeMs * 100) / 100,
|
|
7255
6730
|
numFrames,
|
|
@@ -7279,12 +6754,12 @@ var Wav2ArkitCpuWorker = class {
|
|
|
7279
6754
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
7280
6755
|
if (errMsg.includes("timed out")) {
|
|
7281
6756
|
this.poisoned = true;
|
|
7282
|
-
|
|
6757
|
+
logger10.error("CRITICAL: Worker inference timed out \u2014 Wav2ArkitCpu worker is dead. Page reload required.", {
|
|
7283
6758
|
backend: "wasm",
|
|
7284
6759
|
timeoutMs: INFERENCE_TIMEOUT_MS2
|
|
7285
6760
|
});
|
|
7286
6761
|
} else {
|
|
7287
|
-
|
|
6762
|
+
logger10.error("Worker inference failed", { error: errMsg, backend: "wasm" });
|
|
7288
6763
|
}
|
|
7289
6764
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
7290
6765
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -7321,39 +6796,39 @@ var Wav2ArkitCpuWorker = class {
|
|
|
7321
6796
|
}
|
|
7322
6797
|
};
|
|
7323
6798
|
|
|
7324
|
-
// src/inference/
|
|
7325
|
-
var
|
|
7326
|
-
function
|
|
6799
|
+
// src/inference/createA2E.ts
|
|
6800
|
+
var logger11 = createLogger("createA2E");
|
|
6801
|
+
function createA2E(config) {
|
|
7327
6802
|
const mode = config.mode ?? "auto";
|
|
7328
6803
|
const fallbackOnError = config.fallbackOnError ?? true;
|
|
7329
6804
|
let useCpu;
|
|
7330
6805
|
if (mode === "cpu") {
|
|
7331
6806
|
useCpu = true;
|
|
7332
|
-
|
|
6807
|
+
logger11.info("Forcing CPU A2E model (wav2arkit_cpu)");
|
|
7333
6808
|
} else if (mode === "gpu") {
|
|
7334
6809
|
useCpu = false;
|
|
7335
|
-
|
|
6810
|
+
logger11.info("Forcing GPU A2E model (Wav2Vec2)");
|
|
7336
6811
|
} else {
|
|
7337
|
-
useCpu =
|
|
7338
|
-
|
|
6812
|
+
useCpu = shouldUseCpuA2E();
|
|
6813
|
+
logger11.info("Auto-detected A2E model", {
|
|
7339
6814
|
useCpu,
|
|
7340
6815
|
isSafari: isSafari()
|
|
7341
6816
|
});
|
|
7342
6817
|
}
|
|
7343
6818
|
if (useCpu) {
|
|
7344
6819
|
if (config.unifiedWorker) {
|
|
7345
|
-
|
|
6820
|
+
logger11.info("Creating Wav2ArkitCpuUnifiedAdapter (404MB, WASM, shared unified worker)");
|
|
7346
6821
|
return new Wav2ArkitCpuUnifiedAdapter(config.unifiedWorker, {
|
|
7347
6822
|
modelUrl: config.cpuModelUrl
|
|
7348
6823
|
});
|
|
7349
6824
|
}
|
|
7350
6825
|
if (config.useWorker && Wav2ArkitCpuWorker.isSupported() && !isIOS()) {
|
|
7351
|
-
|
|
6826
|
+
logger11.info("Creating Wav2ArkitCpuWorker (404MB, WASM, off-main-thread)");
|
|
7352
6827
|
return new Wav2ArkitCpuWorker({
|
|
7353
6828
|
modelUrl: config.cpuModelUrl
|
|
7354
6829
|
});
|
|
7355
6830
|
}
|
|
7356
|
-
|
|
6831
|
+
logger11.info("Creating Wav2ArkitCpuInference (404MB, WASM)");
|
|
7357
6832
|
return new Wav2ArkitCpuInference({
|
|
7358
6833
|
modelUrl: config.cpuModelUrl
|
|
7359
6834
|
});
|
|
@@ -7365,13 +6840,13 @@ function createLipSync(config) {
|
|
|
7365
6840
|
numIdentityClasses: config.numIdentityClasses
|
|
7366
6841
|
});
|
|
7367
6842
|
if (fallbackOnError) {
|
|
7368
|
-
|
|
7369
|
-
return new
|
|
6843
|
+
logger11.info("Creating Wav2Vec2Inference with CPU fallback");
|
|
6844
|
+
return new A2EWithFallback(gpuInstance, config);
|
|
7370
6845
|
}
|
|
7371
|
-
|
|
6846
|
+
logger11.info("Creating Wav2Vec2Inference (no fallback)");
|
|
7372
6847
|
return gpuInstance;
|
|
7373
6848
|
}
|
|
7374
|
-
var
|
|
6849
|
+
var A2EWithFallback = class {
|
|
7375
6850
|
constructor(gpuInstance, config) {
|
|
7376
6851
|
this.hasFallenBack = false;
|
|
7377
6852
|
this.implementation = gpuInstance;
|
|
@@ -7380,6 +6855,9 @@ var LipSyncWithFallback = class {
|
|
|
7380
6855
|
get modelId() {
|
|
7381
6856
|
return this.implementation.modelId;
|
|
7382
6857
|
}
|
|
6858
|
+
get chunkSize() {
|
|
6859
|
+
return this.implementation.chunkSize;
|
|
6860
|
+
}
|
|
7383
6861
|
get backend() {
|
|
7384
6862
|
return this.implementation.backend;
|
|
7385
6863
|
}
|
|
@@ -7394,7 +6872,7 @@ var LipSyncWithFallback = class {
|
|
|
7394
6872
|
}
|
|
7395
6873
|
}
|
|
7396
6874
|
async fallbackToCpu(reason) {
|
|
7397
|
-
|
|
6875
|
+
logger11.warn("GPU model load failed, falling back to CPU model", { reason });
|
|
7398
6876
|
try {
|
|
7399
6877
|
await this.implementation.dispose();
|
|
7400
6878
|
} catch {
|
|
@@ -7403,17 +6881,17 @@ var LipSyncWithFallback = class {
|
|
|
7403
6881
|
this.implementation = new Wav2ArkitCpuUnifiedAdapter(this.config.unifiedWorker, {
|
|
7404
6882
|
modelUrl: this.config.cpuModelUrl
|
|
7405
6883
|
});
|
|
7406
|
-
|
|
6884
|
+
logger11.info("Fallback to Wav2ArkitCpuUnifiedAdapter successful");
|
|
7407
6885
|
} else if (this.config.useWorker && Wav2ArkitCpuWorker.isSupported() && !isIOS()) {
|
|
7408
6886
|
this.implementation = new Wav2ArkitCpuWorker({
|
|
7409
6887
|
modelUrl: this.config.cpuModelUrl
|
|
7410
6888
|
});
|
|
7411
|
-
|
|
6889
|
+
logger11.info("Fallback to Wav2ArkitCpuWorker successful");
|
|
7412
6890
|
} else {
|
|
7413
6891
|
this.implementation = new Wav2ArkitCpuInference({
|
|
7414
6892
|
modelUrl: this.config.cpuModelUrl
|
|
7415
6893
|
});
|
|
7416
|
-
|
|
6894
|
+
logger11.info("Fallback to Wav2ArkitCpuInference successful");
|
|
7417
6895
|
}
|
|
7418
6896
|
this.hasFallenBack = true;
|
|
7419
6897
|
return await this.implementation.load();
|
|
@@ -7426,8 +6904,198 @@ var LipSyncWithFallback = class {
|
|
|
7426
6904
|
}
|
|
7427
6905
|
};
|
|
7428
6906
|
|
|
6907
|
+
// src/inference/BlendshapeSmoother.ts
|
|
6908
|
+
var NUM_BLENDSHAPES = 52;
/**
 * Per-channel critically damped spring smoother for blendshape frames.
 *
 * Holds three parallel Float32Array(52) buffers (current values, velocities,
 * targets). `setTarget()` moves the goal, `update(dt)` advances every channel
 * toward it and clamps the result to [0, 1].
 */
var BlendshapeSmoother = class {
  /**
   * @param {{ halflife?: number }} [config] - Optional settings. `halflife`
   *   is in seconds and defaults to 0.06; a value <= 0 disables smoothing
   *   (values snap straight to the target on update).
   */
  constructor(config) {
    /** Whether any target has been set */
    this._hasTarget = false;
    this.halflife = config?.halflife ?? 0.06;
    this.values = new Float32Array(NUM_BLENDSHAPES);
    this.velocities = new Float32Array(NUM_BLENDSHAPES);
    this.targets = new Float32Array(NUM_BLENDSHAPES);
  }
  /** Whether a target frame has been set (false until first setTarget call) */
  get hasTarget() {
    return this._hasTarget;
  }
  /**
   * Set new target frame from inference output.
   * Springs will converge toward these values on subsequent update() calls.
   *
   * Only the first 52 entries are used. A longer frame no longer throws a
   * RangeError (as `TypedArray.prototype.set` would); a shorter frame leaves
   * the remaining targets at their previous values.
   *
   * @param {ArrayLike<number>} frame - Blendshape weights to converge toward.
   */
  setTarget(frame) {
    const count = Math.min(frame.length, NUM_BLENDSHAPES);
    for (let i = 0; i < count; i++) {
      this.targets[i] = frame[i];
    }
    this._hasTarget = true;
  }
  /**
   * Advance all 52 springs by `dt` seconds and return the smoothed frame.
   *
   * Call this every render frame (e.g., inside requestAnimationFrame).
   * Returns the internal values buffer — do NOT mutate the returned array.
   *
   * @param {number} dt - Time step in seconds (e.g., 1/60 for 60fps)
   * @returns {Float32Array} Smoothed blendshape values (Float32Array of 52)
   */
  update(dt) {
    if (!this._hasTarget) {
      return this.values;
    }
    if (this.halflife <= 0) {
      this.values.set(this.targets);
      this.velocities.fill(0);
      return this.values;
    }
    // A NaN, zero or negative step must not advance the springs: NaN would
    // poison every channel permanently (the clamp below keeps NaN as NaN),
    // and a negative step makes the exponential grow instead of decay.
    if (!(dt > 0)) {
      return this.values;
    }
    const damping = Math.LN2 / this.halflife;
    const eydt = Math.exp(-damping * dt);
    if (eydt === 0) {
      // The step is so long (or infinite) that the decay term underflowed:
      // the spring has fully settled. Snap instead of evaluating 0 * Infinity.
      this.values.set(this.targets);
      this.velocities.fill(0);
      return this.values;
    }
    for (let i = 0; i < NUM_BLENDSHAPES; i++) {
      // Closed-form critically damped step: x(t) = e^{-yt} (j0 + j1 t) + goal.
      const j0 = this.values[i] - this.targets[i];
      const j1 = this.velocities[i] + j0 * damping;
      this.values[i] = eydt * (j0 + j1 * dt) + this.targets[i];
      this.velocities[i] = eydt * (this.velocities[i] - j1 * damping * dt);
      this.values[i] = Math.max(0, Math.min(1, this.values[i]));
    }
    return this.values;
  }
  /**
   * Decay all spring targets to neutral (0).
   *
   * Call when inference stalls (no new frames for threshold duration).
   * The springs will smoothly close the mouth / relax the face over
   * the halflife period rather than freezing.
   */
  decayToNeutral() {
    this.targets.fill(0);
  }
  /**
   * Reset all state (values, velocities, targets).
   * Call when starting a new playback session.
   */
  reset() {
    this.values.fill(0);
    this.velocities.fill(0);
    this.targets.fill(0);
    this._hasTarget = false;
  }
};
|
|
6980
|
+
|
|
6981
|
+
// src/animation/audioEnergy.ts
|
|
6982
|
+
/**
 * Root-mean-square amplitude of a buffer of audio samples.
 *
 * @param {ArrayLike<number>} samples - Sample values.
 * @returns {number} sqrt(mean(sample^2)), or 0 for an empty buffer.
 */
function calculateRMS(samples) {
  // Guard the empty case explicitly: 0 / 0 below would yield NaN.
  if (samples.length === 0) return 0;
  let sumSquares = 0;
  for (let i = 0; i < samples.length; i++) {
    sumSquares += samples[i] * samples[i];
  }
  return Math.sqrt(sumSquares / samples.length);
}
|
|
6990
|
+
/**
 * Largest absolute sample value in a buffer.
 *
 * @param {ArrayLike<number>} samples - Sample values.
 * @returns {number} max(|sample|), or 0 for an empty buffer.
 */
function calculatePeak(samples) {
  let peak = 0;
  for (let i = 0; i < samples.length; i++) {
    const abs = Math.abs(samples[i]);
    if (abs > peak) peak = abs;
  }
  return peak;
}
|
|
6998
|
+
/**
 * Tracks smoothed RMS and peak energy of successive audio buffers.
 *
 * Rising levels are followed with fixed fast weights; falling levels decay
 * with `smoothingFactor`. Levels at or below `noiseFloor` are treated as 0.
 */
var AudioEnergyAnalyzer = class {
  /**
   * @param {number} [smoothingFactor=0.85] Decay smoothing (0 = no smoothing).
   *   Clamped to [0, 0.99]; a value that is not a number falls back to 0.85
   *   so it cannot turn the smoothed levels into NaN.
   * @param {number} [noiseFloor=0.01] Minimum level to consider as signal.
   */
  constructor(smoothingFactor = 0.85, noiseFloor = 0.01) {
    this.smoothedRMS = 0;
    this.smoothedPeak = 0;
    const clamped = Math.max(0, Math.min(0.99, smoothingFactor));
    this.smoothingFactor = Number.isNaN(clamped) ? 0.85 : clamped;
    this.noiseFloor = noiseFloor;
  }
  /**
   * Process audio samples and return smoothed energy values
   * @param {ArrayLike<number>} samples Audio samples (Float32Array)
   * @returns {{ rms: number, peak: number, energy: number }} Smoothed RMS,
   *   smoothed peak, and a combined energy value capped at 1.
   */
  process(samples) {
    const instantRMS = calculateRMS(samples);
    const instantPeak = calculatePeak(samples);
    // Noise gate: anything not strictly above the floor counts as silence.
    const gatedRMS = instantRMS > this.noiseFloor ? instantRMS : 0;
    const gatedPeak = instantPeak > this.noiseFloor ? instantPeak : 0;
    if (gatedRMS > this.smoothedRMS) {
      // Attack: move halfway toward the louder level.
      this.smoothedRMS = this.smoothedRMS * 0.5 + gatedRMS * 0.5;
    } else {
      // Release: decay with the configured smoothing factor.
      this.smoothedRMS = this.smoothedRMS * this.smoothingFactor + gatedRMS * (1 - this.smoothingFactor);
    }
    if (gatedPeak > this.smoothedPeak) {
      this.smoothedPeak = this.smoothedPeak * 0.3 + gatedPeak * 0.7;
    } else {
      this.smoothedPeak = this.smoothedPeak * this.smoothingFactor + gatedPeak * (1 - this.smoothingFactor);
    }
    const energy = this.smoothedRMS * 0.7 + this.smoothedPeak * 0.3;
    return {
      rms: this.smoothedRMS,
      peak: this.smoothedPeak,
      // Scale up and clamp
      energy: Math.min(1, energy * 2)
    };
  }
  /**
   * Reset analyzer state
   */
  reset() {
    this.smoothedRMS = 0;
    this.smoothedPeak = 0;
  }
  /**
   * Get current smoothed RMS value
   */
  get rms() {
    return this.smoothedRMS;
  }
  /**
   * Get current smoothed peak value
   */
  get peak() {
    return this.smoothedPeak;
  }
};
|
|
7057
|
+
/**
 * Flags sudden rises in an energy signal by comparing each new value with
 * the average of the values that preceded it in a short rolling history.
 */
var EmphasisDetector = class {
  /**
   * @param {number} [historySize=10] Number of frames to track. NaN falls
   *   back to 10, because a NaN bound would never trim the history and it
   *   would grow without limit.
   * @param {number} [emphasisThreshold=0.15] Minimum energy increase to count as emphasis.
   */
  constructor(historySize = 10, emphasisThreshold = 0.15) {
    this.energyHistory = [];
    this.historySize = Number.isNaN(historySize) ? 10 : historySize;
    this.emphasisThreshold = emphasisThreshold;
  }
  /**
   * Process energy value and detect emphasis
   * @param {number} energy Current energy value (0-1)
   * @returns {{ isEmphasis: boolean, emphasisStrength: number }} Whether the
   *   rise over the previous average exceeds the threshold, and its strength
   *   (rise / 0.3, capped at 1; 0 when not an emphasis).
   */
  process(energy) {
    const history = this.energyHistory;
    history.push(energy);
    if (history.length > this.historySize) {
      history.shift();
    }
    // Need at least two earlier frames to form a meaningful baseline.
    if (history.length < 3) {
      return { isEmphasis: false, emphasisStrength: 0 };
    }
    // Average every frame except the one just pushed (no temporary array).
    const prevCount = history.length - 1;
    let prevSum = 0;
    for (let i = 0; i < prevCount; i++) {
      prevSum += history[i];
    }
    const increase = energy - prevSum / prevCount;
    const isEmphasis = increase > this.emphasisThreshold;
    return {
      isEmphasis,
      emphasisStrength: isEmphasis ? Math.min(1, increase / 0.3) : 0
    };
  }
  /**
   * Reset detector state
   */
  reset() {
    this.energyHistory = [];
  }
};
|
|
7096
|
+
|
|
7429
7097
|
// src/inference/SileroVADInference.ts
|
|
7430
|
-
var
|
|
7098
|
+
var logger12 = createLogger("SileroVAD");
|
|
7431
7099
|
var SileroVADInference = class {
|
|
7432
7100
|
constructor(config) {
|
|
7433
7101
|
this.session = null;
|
|
@@ -7501,23 +7169,23 @@ var SileroVADInference = class {
|
|
|
7501
7169
|
"model.sample_rate": this.config.sampleRate
|
|
7502
7170
|
});
|
|
7503
7171
|
try {
|
|
7504
|
-
|
|
7172
|
+
logger12.info("Loading ONNX Runtime...", { preference: this.config.backend });
|
|
7505
7173
|
const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
|
|
7506
7174
|
this.ort = ort;
|
|
7507
7175
|
this._backend = backend;
|
|
7508
|
-
|
|
7176
|
+
logger12.info("ONNX Runtime loaded", { backend: this._backend });
|
|
7509
7177
|
const cache = getModelCache();
|
|
7510
7178
|
const modelUrl = this.config.modelUrl;
|
|
7511
7179
|
const isCached = await cache.has(modelUrl);
|
|
7512
7180
|
let modelBuffer;
|
|
7513
7181
|
if (isCached) {
|
|
7514
|
-
|
|
7182
|
+
logger12.debug("Loading model from cache", { modelUrl });
|
|
7515
7183
|
modelBuffer = await cache.get(modelUrl);
|
|
7516
7184
|
} else {
|
|
7517
|
-
|
|
7185
|
+
logger12.debug("Fetching and caching model", { modelUrl });
|
|
7518
7186
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
7519
7187
|
}
|
|
7520
|
-
|
|
7188
|
+
logger12.debug("Creating ONNX session", {
|
|
7521
7189
|
size: formatBytes(modelBuffer.byteLength),
|
|
7522
7190
|
backend: this._backend
|
|
7523
7191
|
});
|
|
@@ -7526,7 +7194,7 @@ var SileroVADInference = class {
|
|
|
7526
7194
|
this.session = await ort.InferenceSession.create(modelData, sessionOptions);
|
|
7527
7195
|
this.reset();
|
|
7528
7196
|
const loadTimeMs = performance.now() - startTime;
|
|
7529
|
-
|
|
7197
|
+
logger12.info("Model loaded successfully", {
|
|
7530
7198
|
backend: this._backend,
|
|
7531
7199
|
loadTimeMs: Math.round(loadTimeMs),
|
|
7532
7200
|
sampleRate: this.config.sampleRate,
|
|
@@ -7581,7 +7249,7 @@ var SileroVADInference = class {
|
|
|
7581
7249
|
[]
|
|
7582
7250
|
);
|
|
7583
7251
|
} catch (e) {
|
|
7584
|
-
|
|
7252
|
+
logger12.warn("BigInt64Array not available, using bigint array fallback", {
|
|
7585
7253
|
error: e instanceof Error ? e.message : String(e)
|
|
7586
7254
|
});
|
|
7587
7255
|
this.srTensor = new this.ort.Tensor(
|
|
@@ -7687,7 +7355,7 @@ var SileroVADInference = class {
|
|
|
7687
7355
|
this.preSpeechBuffer.shift();
|
|
7688
7356
|
}
|
|
7689
7357
|
}
|
|
7690
|
-
|
|
7358
|
+
logger12.trace("Skipping VAD inference - audio too quiet", {
|
|
7691
7359
|
rms: Math.round(rms * 1e4) / 1e4,
|
|
7692
7360
|
threshold: MIN_ENERGY_THRESHOLD
|
|
7693
7361
|
});
|
|
@@ -7741,7 +7409,7 @@ var SileroVADInference = class {
|
|
|
7741
7409
|
if (isSpeech && !this.wasSpeaking) {
|
|
7742
7410
|
preSpeechChunks = [...this.preSpeechBuffer];
|
|
7743
7411
|
this.preSpeechBuffer = [];
|
|
7744
|
-
|
|
7412
|
+
logger12.debug("Speech started with pre-speech buffer", {
|
|
7745
7413
|
preSpeechChunks: preSpeechChunks.length,
|
|
7746
7414
|
durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
|
|
7747
7415
|
});
|
|
@@ -7754,7 +7422,7 @@ var SileroVADInference = class {
|
|
|
7754
7422
|
this.preSpeechBuffer = [];
|
|
7755
7423
|
}
|
|
7756
7424
|
this.wasSpeaking = isSpeech;
|
|
7757
|
-
|
|
7425
|
+
logger12.trace("VAD inference completed", {
|
|
7758
7426
|
probability: Math.round(probability * 1e3) / 1e3,
|
|
7759
7427
|
isSpeech,
|
|
7760
7428
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100
|
|
@@ -7785,7 +7453,7 @@ var SileroVADInference = class {
|
|
|
7785
7453
|
const oomError = new Error(
|
|
7786
7454
|
`SileroVAD inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reducing concurrent model sessions or reloading the page.`
|
|
7787
7455
|
);
|
|
7788
|
-
|
|
7456
|
+
logger12.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
7789
7457
|
pointer: `0x${err.toString(16)}`,
|
|
7790
7458
|
backend: this._backend
|
|
7791
7459
|
});
|
|
@@ -7828,7 +7496,7 @@ var SileroVADInference = class {
|
|
|
7828
7496
|
SileroVADInference.isWebGPUAvailable = isWebGPUAvailable;
|
|
7829
7497
|
|
|
7830
7498
|
// src/inference/SileroVADWorker.ts
|
|
7831
|
-
var
|
|
7499
|
+
var logger13 = createLogger("SileroVADWorker");
|
|
7832
7500
|
var WASM_CDN_PATH5 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
7833
7501
|
var LOAD_TIMEOUT_MS3 = 1e4;
|
|
7834
7502
|
var INFERENCE_TIMEOUT_MS3 = 1e3;
|
|
@@ -8106,7 +7774,7 @@ var SileroVADWorker = class {
|
|
|
8106
7774
|
this.handleWorkerMessage(event.data);
|
|
8107
7775
|
};
|
|
8108
7776
|
worker.onerror = (error) => {
|
|
8109
|
-
|
|
7777
|
+
logger13.error("Worker error", { error: error.message });
|
|
8110
7778
|
for (const [, resolver] of this.pendingResolvers) {
|
|
8111
7779
|
resolver.reject(new Error(`Worker error: ${error.message}`));
|
|
8112
7780
|
}
|
|
@@ -8182,9 +7850,9 @@ var SileroVADWorker = class {
|
|
|
8182
7850
|
"model.sample_rate": this.config.sampleRate
|
|
8183
7851
|
});
|
|
8184
7852
|
try {
|
|
8185
|
-
|
|
7853
|
+
logger13.info("Creating VAD worker...");
|
|
8186
7854
|
this.worker = this.createWorker();
|
|
8187
|
-
|
|
7855
|
+
logger13.info("Loading model in worker...", {
|
|
8188
7856
|
modelUrl: this.config.modelUrl,
|
|
8189
7857
|
sampleRate: this.config.sampleRate
|
|
8190
7858
|
});
|
|
@@ -8200,7 +7868,7 @@ var SileroVADWorker = class {
|
|
|
8200
7868
|
);
|
|
8201
7869
|
this._isLoaded = true;
|
|
8202
7870
|
const loadTimeMs = performance.now() - startTime;
|
|
8203
|
-
|
|
7871
|
+
logger13.info("VAD worker loaded successfully", {
|
|
8204
7872
|
backend: "wasm",
|
|
8205
7873
|
loadTimeMs: Math.round(loadTimeMs),
|
|
8206
7874
|
workerLoadTimeMs: Math.round(result.loadTimeMs),
|
|
@@ -8307,7 +7975,7 @@ var SileroVADWorker = class {
|
|
|
8307
7975
|
if (isSpeech && !this.wasSpeaking) {
|
|
8308
7976
|
preSpeechChunks = [...this.preSpeechBuffer];
|
|
8309
7977
|
this.preSpeechBuffer = [];
|
|
8310
|
-
|
|
7978
|
+
logger13.debug("Speech started with pre-speech buffer", {
|
|
8311
7979
|
preSpeechChunks: preSpeechChunks.length,
|
|
8312
7980
|
durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
|
|
8313
7981
|
});
|
|
@@ -8320,7 +7988,7 @@ var SileroVADWorker = class {
|
|
|
8320
7988
|
this.preSpeechBuffer = [];
|
|
8321
7989
|
}
|
|
8322
7990
|
this.wasSpeaking = isSpeech;
|
|
8323
|
-
|
|
7991
|
+
logger13.trace("VAD worker inference completed", {
|
|
8324
7992
|
probability: Math.round(result.probability * 1e3) / 1e3,
|
|
8325
7993
|
isSpeech,
|
|
8326
7994
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
@@ -8388,44 +8056,44 @@ var SileroVADWorker = class {
|
|
|
8388
8056
|
};
|
|
8389
8057
|
|
|
8390
8058
|
// src/inference/createSileroVAD.ts
|
|
8391
|
-
var
|
|
8059
|
+
var logger14 = createLogger("createSileroVAD");
|
|
8392
8060
|
function supportsVADWorker() {
|
|
8393
8061
|
if (typeof Worker === "undefined") {
|
|
8394
|
-
|
|
8062
|
+
logger14.debug("Worker not supported: Worker constructor undefined");
|
|
8395
8063
|
return false;
|
|
8396
8064
|
}
|
|
8397
8065
|
if (typeof URL === "undefined" || typeof URL.createObjectURL === "undefined") {
|
|
8398
|
-
|
|
8066
|
+
logger14.debug("Worker not supported: URL.createObjectURL unavailable");
|
|
8399
8067
|
return false;
|
|
8400
8068
|
}
|
|
8401
8069
|
if (typeof Blob === "undefined") {
|
|
8402
|
-
|
|
8070
|
+
logger14.debug("Worker not supported: Blob constructor unavailable");
|
|
8403
8071
|
return false;
|
|
8404
8072
|
}
|
|
8405
8073
|
return true;
|
|
8406
8074
|
}
|
|
8407
8075
|
function createSileroVAD(config) {
|
|
8408
8076
|
if (config.unifiedWorker) {
|
|
8409
|
-
|
|
8077
|
+
logger14.info("Creating SileroVADUnifiedAdapter (shared unified worker)");
|
|
8410
8078
|
return new SileroVADUnifiedAdapter(config.unifiedWorker, config);
|
|
8411
8079
|
}
|
|
8412
8080
|
const fallbackOnError = config.fallbackOnError ?? true;
|
|
8413
8081
|
let useWorker;
|
|
8414
8082
|
if (config.useWorker !== void 0) {
|
|
8415
8083
|
useWorker = config.useWorker;
|
|
8416
|
-
|
|
8084
|
+
logger14.debug("Worker preference explicitly set", { useWorker });
|
|
8417
8085
|
} else {
|
|
8418
8086
|
const workerSupported = supportsVADWorker();
|
|
8419
8087
|
const onMobile = isMobile();
|
|
8420
8088
|
useWorker = workerSupported && !onMobile;
|
|
8421
|
-
|
|
8089
|
+
logger14.debug("Auto-detected Worker preference", {
|
|
8422
8090
|
useWorker,
|
|
8423
8091
|
workerSupported,
|
|
8424
8092
|
onMobile
|
|
8425
8093
|
});
|
|
8426
8094
|
}
|
|
8427
8095
|
if (useWorker) {
|
|
8428
|
-
|
|
8096
|
+
logger14.info("Creating SileroVADWorker (off-main-thread)");
|
|
8429
8097
|
const worker = new SileroVADWorker({
|
|
8430
8098
|
modelUrl: config.modelUrl,
|
|
8431
8099
|
sampleRate: config.sampleRate,
|
|
@@ -8437,7 +8105,7 @@ function createSileroVAD(config) {
|
|
|
8437
8105
|
}
|
|
8438
8106
|
return worker;
|
|
8439
8107
|
}
|
|
8440
|
-
|
|
8108
|
+
logger14.info("Creating SileroVADInference (main thread)");
|
|
8441
8109
|
return new SileroVADInference(config);
|
|
8442
8110
|
}
|
|
8443
8111
|
var VADWorkerWithFallback = class {
|
|
@@ -8463,7 +8131,7 @@ var VADWorkerWithFallback = class {
|
|
|
8463
8131
|
try {
|
|
8464
8132
|
return await this.implementation.load();
|
|
8465
8133
|
} catch (error) {
|
|
8466
|
-
|
|
8134
|
+
logger14.warn("Worker load failed, falling back to main thread", {
|
|
8467
8135
|
error: error instanceof Error ? error.message : String(error)
|
|
8468
8136
|
});
|
|
8469
8137
|
try {
|
|
@@ -8472,7 +8140,7 @@ var VADWorkerWithFallback = class {
|
|
|
8472
8140
|
}
|
|
8473
8141
|
this.implementation = new SileroVADInference(this.config);
|
|
8474
8142
|
this.hasFallenBack = true;
|
|
8475
|
-
|
|
8143
|
+
logger14.info("Fallback to SileroVADInference successful");
|
|
8476
8144
|
return await this.implementation.load();
|
|
8477
8145
|
}
|
|
8478
8146
|
}
|
|
@@ -8493,8 +8161,175 @@ var VADWorkerWithFallback = class {
|
|
|
8493
8161
|
}
|
|
8494
8162
|
};
|
|
8495
8163
|
|
|
8164
|
+
// src/inference/A2EOrchestrator.ts
|
|
8165
|
+
var logger15 = createLogger("A2EOrchestrator");
|
|
8166
|
+
var A2EOrchestrator = class {
|
|
8167
|
+
constructor(config) {
|
|
8168
|
+
this.a2e = null;
|
|
8169
|
+
this.processor = null;
|
|
8170
|
+
// Mic capture state (lightweight — no dependency on MicrophoneCapture class
|
|
8171
|
+
// which requires an external EventEmitter. We do raw Web Audio here.)
|
|
8172
|
+
this.stream = null;
|
|
8173
|
+
this.audioContext = null;
|
|
8174
|
+
this.scriptProcessor = null;
|
|
8175
|
+
this.nativeSampleRate = 0;
|
|
8176
|
+
this._isReady = false;
|
|
8177
|
+
this._isStreaming = false;
|
|
8178
|
+
this._backend = null;
|
|
8179
|
+
this.disposed = false;
|
|
8180
|
+
this.config = {
|
|
8181
|
+
sampleRate: 16e3,
|
|
8182
|
+
...config
|
|
8183
|
+
};
|
|
8184
|
+
}
|
|
8185
|
+
/** Latest blendshape weights from inference (null if none yet) */
|
|
8186
|
+
get latestWeights() {
|
|
8187
|
+
return this.processor?.latestFrame ?? null;
|
|
8188
|
+
}
|
|
8189
|
+
/** Whether the model is loaded and ready for inference */
|
|
8190
|
+
get isReady() {
|
|
8191
|
+
return this._isReady;
|
|
8192
|
+
}
|
|
8193
|
+
/** Whether mic is active and inference loop is running */
|
|
8194
|
+
get isStreaming() {
|
|
8195
|
+
return this._isStreaming;
|
|
8196
|
+
}
|
|
8197
|
+
/** Current backend type (webgpu, wasm, or null) */
|
|
8198
|
+
get backend() {
|
|
8199
|
+
return this._backend;
|
|
8200
|
+
}
|
|
8201
|
+
/**
|
|
8202
|
+
* Load the A2E model and create the processor
|
|
8203
|
+
*/
|
|
8204
|
+
async load() {
|
|
8205
|
+
if (this.disposed) throw new Error("A2EOrchestrator has been disposed");
|
|
8206
|
+
logger15.info("Loading A2E model...");
|
|
8207
|
+
this.a2e = createA2E({
|
|
8208
|
+
gpuModelUrl: this.config.gpuModelUrl,
|
|
8209
|
+
gpuExternalDataUrl: this.config.gpuExternalDataUrl,
|
|
8210
|
+
cpuModelUrl: this.config.cpuModelUrl ?? this.config.gpuModelUrl,
|
|
8211
|
+
...this.config.a2eConfig
|
|
8212
|
+
});
|
|
8213
|
+
const info = await this.a2e.load();
|
|
8214
|
+
this._backend = info.backend;
|
|
8215
|
+
this.processor = new A2EProcessor({
|
|
8216
|
+
backend: this.a2e,
|
|
8217
|
+
sampleRate: this.config.sampleRate,
|
|
8218
|
+
chunkSize: this.config.chunkSize,
|
|
8219
|
+
onFrame: this.config.onFrame,
|
|
8220
|
+
onError: this.config.onError
|
|
8221
|
+
});
|
|
8222
|
+
this._isReady = true;
|
|
8223
|
+
logger15.info("A2E model loaded", {
|
|
8224
|
+
backend: info.backend,
|
|
8225
|
+
loadTimeMs: info.loadTimeMs,
|
|
8226
|
+
modelId: this.a2e.modelId
|
|
8227
|
+
});
|
|
8228
|
+
this.config.onReady?.();
|
|
8229
|
+
}
|
|
8230
|
+
/**
|
|
8231
|
+
* Start mic capture and inference loop
|
|
8232
|
+
*/
|
|
8233
|
+
async start() {
|
|
8234
|
+
if (this.disposed) throw new Error("A2EOrchestrator has been disposed");
|
|
8235
|
+
if (!this._isReady || !this.processor) throw new Error("Model not loaded. Call load() first.");
|
|
8236
|
+
if (this._isStreaming) return;
|
|
8237
|
+
try {
|
|
8238
|
+
this.stream = await navigator.mediaDevices.getUserMedia({
|
|
8239
|
+
audio: {
|
|
8240
|
+
sampleRate: { ideal: this.config.sampleRate },
|
|
8241
|
+
channelCount: 1,
|
|
8242
|
+
echoCancellation: true,
|
|
8243
|
+
noiseSuppression: true,
|
|
8244
|
+
autoGainControl: true
|
|
8245
|
+
}
|
|
8246
|
+
});
|
|
8247
|
+
this.audioContext = new AudioContext({ sampleRate: this.config.sampleRate });
|
|
8248
|
+
if (this.audioContext.state === "suspended") {
|
|
8249
|
+
await this.audioContext.resume();
|
|
8250
|
+
}
|
|
8251
|
+
this.nativeSampleRate = this.audioContext.sampleRate;
|
|
8252
|
+
const source = this.audioContext.createMediaStreamSource(this.stream);
|
|
8253
|
+
this.scriptProcessor = this.audioContext.createScriptProcessor(4096, 1, 1);
|
|
8254
|
+
this.scriptProcessor.onaudioprocess = (e) => {
|
|
8255
|
+
if (!this._isStreaming || !this.processor) return;
|
|
8256
|
+
const input = e.inputBuffer.getChannelData(0);
|
|
8257
|
+
let samples;
|
|
8258
|
+
if (this.nativeSampleRate !== this.config.sampleRate) {
|
|
8259
|
+
const ratio = this.config.sampleRate / this.nativeSampleRate;
|
|
8260
|
+
const newLen = Math.round(input.length * ratio);
|
|
8261
|
+
samples = new Float32Array(newLen);
|
|
8262
|
+
for (let i = 0; i < newLen; i++) {
|
|
8263
|
+
const srcIdx = i / ratio;
|
|
8264
|
+
const lo = Math.floor(srcIdx);
|
|
8265
|
+
const hi = Math.min(lo + 1, input.length - 1);
|
|
8266
|
+
const frac = srcIdx - lo;
|
|
8267
|
+
samples[i] = input[lo] * (1 - frac) + input[hi] * frac;
|
|
8268
|
+
}
|
|
8269
|
+
} else {
|
|
8270
|
+
samples = new Float32Array(input);
|
|
8271
|
+
}
|
|
8272
|
+
this.processor.pushAudio(samples);
|
|
8273
|
+
};
|
|
8274
|
+
source.connect(this.scriptProcessor);
|
|
8275
|
+
this.scriptProcessor.connect(this.audioContext.destination);
|
|
8276
|
+
this._isStreaming = true;
|
|
8277
|
+
this.processor.startDrip();
|
|
8278
|
+
logger15.info("Mic capture started", { sampleRate: this.nativeSampleRate });
|
|
8279
|
+
} catch (err) {
|
|
8280
|
+
const error = err instanceof Error ? err : new Error(String(err));
|
|
8281
|
+
logger15.error("Failed to start mic capture", { error: error.message });
|
|
8282
|
+
this.config.onError?.(error);
|
|
8283
|
+
throw error;
|
|
8284
|
+
}
|
|
8285
|
+
}
|
|
8286
|
+
/**
|
|
8287
|
+
* Stop mic capture and inference loop
|
|
8288
|
+
*/
|
|
8289
|
+
stop() {
|
|
8290
|
+
this._isStreaming = false;
|
|
8291
|
+
if (this.processor) {
|
|
8292
|
+
this.processor.stopDrip();
|
|
8293
|
+
this.processor.reset();
|
|
8294
|
+
}
|
|
8295
|
+
if (this.scriptProcessor) {
|
|
8296
|
+
this.scriptProcessor.disconnect();
|
|
8297
|
+
this.scriptProcessor.onaudioprocess = null;
|
|
8298
|
+
this.scriptProcessor = null;
|
|
8299
|
+
}
|
|
8300
|
+
if (this.stream) {
|
|
8301
|
+
this.stream.getTracks().forEach((t) => t.stop());
|
|
8302
|
+
this.stream = null;
|
|
8303
|
+
}
|
|
8304
|
+
if (this.audioContext) {
|
|
8305
|
+
this.audioContext.close().catch(() => {
|
|
8306
|
+
});
|
|
8307
|
+
this.audioContext = null;
|
|
8308
|
+
}
|
|
8309
|
+
logger15.info("Mic capture stopped");
|
|
8310
|
+
}
|
|
8311
|
+
/**
|
|
8312
|
+
* Dispose of all resources
|
|
8313
|
+
*/
|
|
8314
|
+
async dispose() {
|
|
8315
|
+
if (this.disposed) return;
|
|
8316
|
+
this.disposed = true;
|
|
8317
|
+
this.stop();
|
|
8318
|
+
if (this.processor) {
|
|
8319
|
+
this.processor.dispose();
|
|
8320
|
+
this.processor = null;
|
|
8321
|
+
}
|
|
8322
|
+
if (this.a2e) {
|
|
8323
|
+
await this.a2e.dispose();
|
|
8324
|
+
this.a2e = null;
|
|
8325
|
+
}
|
|
8326
|
+
this._isReady = false;
|
|
8327
|
+
this._backend = null;
|
|
8328
|
+
}
|
|
8329
|
+
};
|
|
8330
|
+
|
|
8496
8331
|
// src/inference/SafariSpeechRecognition.ts
|
|
8497
|
-
var
|
|
8332
|
+
var logger16 = createLogger("SafariSpeech");
|
|
8498
8333
|
var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
8499
8334
|
constructor(config = {}) {
|
|
8500
8335
|
this.recognition = null;
|
|
@@ -8513,7 +8348,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8513
8348
|
interimResults: config.interimResults ?? true,
|
|
8514
8349
|
maxAlternatives: config.maxAlternatives ?? 1
|
|
8515
8350
|
};
|
|
8516
|
-
|
|
8351
|
+
logger16.debug("SafariSpeechRecognition created", {
|
|
8517
8352
|
language: this.config.language,
|
|
8518
8353
|
continuous: this.config.continuous
|
|
8519
8354
|
});
|
|
@@ -8574,7 +8409,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8574
8409
|
*/
|
|
8575
8410
|
async start() {
|
|
8576
8411
|
if (this.isListening) {
|
|
8577
|
-
|
|
8412
|
+
logger16.warn("Already listening");
|
|
8578
8413
|
return;
|
|
8579
8414
|
}
|
|
8580
8415
|
if (!_SafariSpeechRecognition.isAvailable()) {
|
|
@@ -8604,7 +8439,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8604
8439
|
this.isListening = true;
|
|
8605
8440
|
this.startTime = performance.now();
|
|
8606
8441
|
this.accumulatedText = "";
|
|
8607
|
-
|
|
8442
|
+
logger16.info("Speech recognition started", {
|
|
8608
8443
|
language: this.config.language
|
|
8609
8444
|
});
|
|
8610
8445
|
span?.end();
|
|
@@ -8619,7 +8454,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8619
8454
|
*/
|
|
8620
8455
|
async stop() {
|
|
8621
8456
|
if (!this.isListening || !this.recognition) {
|
|
8622
|
-
|
|
8457
|
+
logger16.warn("Not currently listening");
|
|
8623
8458
|
return {
|
|
8624
8459
|
text: this.accumulatedText,
|
|
8625
8460
|
language: this.config.language,
|
|
@@ -8648,7 +8483,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8648
8483
|
if (this.recognition && this.isListening) {
|
|
8649
8484
|
this.recognition.abort();
|
|
8650
8485
|
this.isListening = false;
|
|
8651
|
-
|
|
8486
|
+
logger16.info("Speech recognition aborted");
|
|
8652
8487
|
}
|
|
8653
8488
|
}
|
|
8654
8489
|
/**
|
|
@@ -8679,7 +8514,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8679
8514
|
this.isListening = false;
|
|
8680
8515
|
this.resultCallbacks = [];
|
|
8681
8516
|
this.errorCallbacks = [];
|
|
8682
|
-
|
|
8517
|
+
logger16.debug("SafariSpeechRecognition disposed");
|
|
8683
8518
|
}
|
|
8684
8519
|
/**
|
|
8685
8520
|
* Set up event handlers for the recognition instance
|
|
@@ -8707,7 +8542,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8707
8542
|
confidence: alternative.confidence
|
|
8708
8543
|
};
|
|
8709
8544
|
this.emitResult(speechResult);
|
|
8710
|
-
|
|
8545
|
+
logger16.trace("Speech result", {
|
|
8711
8546
|
text: text.substring(0, 50),
|
|
8712
8547
|
isFinal,
|
|
8713
8548
|
confidence: alternative.confidence
|
|
@@ -8717,12 +8552,12 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8717
8552
|
span?.end();
|
|
8718
8553
|
} catch (error) {
|
|
8719
8554
|
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
8720
|
-
|
|
8555
|
+
logger16.error("Error processing speech result", { error });
|
|
8721
8556
|
}
|
|
8722
8557
|
};
|
|
8723
8558
|
this.recognition.onerror = (event) => {
|
|
8724
8559
|
const error = new Error(`Speech recognition error: ${event.error} - ${event.message}`);
|
|
8725
|
-
|
|
8560
|
+
logger16.error("Speech recognition error", { error: event.error, message: event.message });
|
|
8726
8561
|
this.emitError(error);
|
|
8727
8562
|
if (this.stopRejecter) {
|
|
8728
8563
|
this.stopRejecter(error);
|
|
@@ -8732,7 +8567,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8732
8567
|
};
|
|
8733
8568
|
this.recognition.onend = () => {
|
|
8734
8569
|
this.isListening = false;
|
|
8735
|
-
|
|
8570
|
+
logger16.info("Speech recognition ended", {
|
|
8736
8571
|
totalText: this.accumulatedText.length,
|
|
8737
8572
|
durationMs: performance.now() - this.startTime
|
|
8738
8573
|
});
|
|
@@ -8749,13 +8584,13 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8749
8584
|
}
|
|
8750
8585
|
};
|
|
8751
8586
|
this.recognition.onstart = () => {
|
|
8752
|
-
|
|
8587
|
+
logger16.debug("Speech recognition started by browser");
|
|
8753
8588
|
};
|
|
8754
8589
|
this.recognition.onspeechstart = () => {
|
|
8755
|
-
|
|
8590
|
+
logger16.debug("Speech detected");
|
|
8756
8591
|
};
|
|
8757
8592
|
this.recognition.onspeechend = () => {
|
|
8758
|
-
|
|
8593
|
+
logger16.debug("Speech ended");
|
|
8759
8594
|
};
|
|
8760
8595
|
}
|
|
8761
8596
|
/**
|
|
@@ -8766,7 +8601,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8766
8601
|
try {
|
|
8767
8602
|
callback(result);
|
|
8768
8603
|
} catch (error) {
|
|
8769
|
-
|
|
8604
|
+
logger16.error("Error in result callback", { error });
|
|
8770
8605
|
}
|
|
8771
8606
|
}
|
|
8772
8607
|
}
|
|
@@ -8778,7 +8613,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8778
8613
|
try {
|
|
8779
8614
|
callback(error);
|
|
8780
8615
|
} catch (callbackError) {
|
|
8781
|
-
|
|
8616
|
+
logger16.error("Error in error callback", { error: callbackError });
|
|
8782
8617
|
}
|
|
8783
8618
|
}
|
|
8784
8619
|
}
|
|
@@ -9191,13 +9026,14 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
9191
9026
|
if (!this.lam) {
|
|
9192
9027
|
throw new Error("LAM must be initialized before pipeline");
|
|
9193
9028
|
}
|
|
9194
|
-
this.pipeline = new
|
|
9029
|
+
this.pipeline = new FullFacePipeline({
|
|
9195
9030
|
lam: this.lam,
|
|
9196
9031
|
sampleRate: 16e3,
|
|
9197
9032
|
chunkTargetMs: 200
|
|
9198
9033
|
});
|
|
9199
9034
|
await this.pipeline.initialize();
|
|
9200
|
-
this.pipeline.on("
|
|
9035
|
+
this.pipeline.on("full_frame_ready", (fullFrame) => {
|
|
9036
|
+
const frame = fullFrame.blendshapes;
|
|
9201
9037
|
this.emit("animation", {
|
|
9202
9038
|
blendshapes: frame,
|
|
9203
9039
|
get: (name) => {
|
|
@@ -9376,9 +9212,9 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
9376
9212
|
});
|
|
9377
9213
|
}
|
|
9378
9214
|
}
|
|
9379
|
-
// REMOVED: processAudioForAnimation() - now handled by
|
|
9215
|
+
// REMOVED: processAudioForAnimation() - now handled by FullFacePipeline
|
|
9380
9216
|
// The pipeline manages audio scheduling, LAM inference, and frame synchronization
|
|
9381
|
-
// Frames are emitted via pipeline.on('
|
|
9217
|
+
// Frames are emitted via pipeline.on('full_frame_ready') event (see initPipeline())
|
|
9382
9218
|
/**
|
|
9383
9219
|
* Detect voice activity using Silero VAD
|
|
9384
9220
|
* Falls back to simple RMS if VAD not available
|
|
@@ -11189,6 +11025,8 @@ function isProtocolEvent(obj) {
|
|
|
11189
11025
|
return typeof obj === "object" && obj !== null && "v" in obj && "type" in obj && "ts" in obj;
|
|
11190
11026
|
}
|
|
11191
11027
|
export {
|
|
11028
|
+
A2EOrchestrator,
|
|
11029
|
+
A2EProcessor,
|
|
11192
11030
|
ARKIT_BLENDSHAPES,
|
|
11193
11031
|
AgentCoreAdapter,
|
|
11194
11032
|
AnimationGraph,
|
|
@@ -11196,23 +11034,22 @@ export {
|
|
|
11196
11034
|
AudioEnergyAnalyzer,
|
|
11197
11035
|
AudioScheduler,
|
|
11198
11036
|
AudioSyncManager,
|
|
11037
|
+
BLENDSHAPE_TO_GROUP,
|
|
11038
|
+
BlendshapeSmoother,
|
|
11199
11039
|
CTC_VOCAB,
|
|
11200
11040
|
ConsoleExporter,
|
|
11201
11041
|
ConversationOrchestrator,
|
|
11202
11042
|
DEFAULT_ANIMATION_CONFIG,
|
|
11203
11043
|
DEFAULT_LOGGING_CONFIG,
|
|
11204
|
-
EMOTION_ARKIT_MAP,
|
|
11205
11044
|
EMOTION_NAMES,
|
|
11206
11045
|
EMOTION_VECTOR_SIZE,
|
|
11207
11046
|
EmotionController,
|
|
11208
11047
|
EmotionPresets,
|
|
11209
|
-
EmotionToBlendshapeMapper,
|
|
11210
11048
|
EmphasisDetector,
|
|
11211
11049
|
EventEmitter,
|
|
11212
11050
|
FullFacePipeline,
|
|
11213
11051
|
INFERENCE_LATENCY_BUCKETS,
|
|
11214
11052
|
InterruptionHandler,
|
|
11215
|
-
LAMPipeline,
|
|
11216
11053
|
LAM_BLENDSHAPES,
|
|
11217
11054
|
LOG_LEVEL_PRIORITY,
|
|
11218
11055
|
MODEL_LOAD_TIME_BUCKETS,
|
|
@@ -11231,73 +11068,54 @@ export {
|
|
|
11231
11068
|
SileroVADInference,
|
|
11232
11069
|
SileroVADUnifiedAdapter,
|
|
11233
11070
|
SileroVADWorker,
|
|
11234
|
-
SyncedAudioPipeline,
|
|
11235
11071
|
TenantManager,
|
|
11236
|
-
UPPER_FACE_BLENDSHAPES,
|
|
11237
11072
|
UnifiedInferenceWorker,
|
|
11238
|
-
WAV2ARKIT_BLENDSHAPES,
|
|
11239
11073
|
Wav2ArkitCpuInference,
|
|
11240
11074
|
Wav2ArkitCpuUnifiedAdapter,
|
|
11241
11075
|
Wav2ArkitCpuWorker,
|
|
11242
11076
|
Wav2Vec2Inference,
|
|
11243
|
-
applyCMVN,
|
|
11244
|
-
applyLFR,
|
|
11245
11077
|
blendEmotions,
|
|
11246
11078
|
calculatePeak,
|
|
11247
11079
|
calculateRMS,
|
|
11248
|
-
computeKaldiFbank,
|
|
11249
11080
|
configureCacheLimit,
|
|
11250
11081
|
configureLogging,
|
|
11251
11082
|
configureTelemetry,
|
|
11083
|
+
createA2E,
|
|
11252
11084
|
createEmotionVector,
|
|
11253
|
-
createLipSync,
|
|
11254
11085
|
createLogger,
|
|
11255
11086
|
createSenseVoice,
|
|
11256
|
-
createSessionWithFallback,
|
|
11257
11087
|
createSileroVAD,
|
|
11258
|
-
ctcGreedyDecode,
|
|
11259
11088
|
fetchWithCache,
|
|
11260
11089
|
formatBytes,
|
|
11261
11090
|
getCacheConfig,
|
|
11262
11091
|
getCacheKey,
|
|
11263
11092
|
getEmotionPreset,
|
|
11264
|
-
getLoadedBackend,
|
|
11265
11093
|
getLoggingConfig,
|
|
11266
11094
|
getModelCache,
|
|
11267
|
-
getOnnxRuntime,
|
|
11268
|
-
getOnnxRuntimeForPreference,
|
|
11269
11095
|
getOptimalWasmThreads,
|
|
11270
11096
|
getRecommendedBackend,
|
|
11271
|
-
getSessionOptions,
|
|
11272
11097
|
getTelemetry,
|
|
11273
11098
|
hasWebGPUApi,
|
|
11274
11099
|
isAndroid,
|
|
11275
11100
|
isIOS,
|
|
11276
11101
|
isIOSSafari,
|
|
11277
11102
|
isMobile,
|
|
11278
|
-
isOnnxRuntimeLoaded,
|
|
11279
11103
|
isProtocolEvent,
|
|
11280
11104
|
isSafari,
|
|
11281
11105
|
isSpeechRecognitionAvailable,
|
|
11282
11106
|
isWebGPUAvailable,
|
|
11107
|
+
lerpBlendshapes,
|
|
11283
11108
|
lerpEmotion,
|
|
11284
11109
|
noopLogger,
|
|
11285
|
-
parseCMVNFromMetadata,
|
|
11286
|
-
parseTokensFile,
|
|
11287
11110
|
preloadModels,
|
|
11288
|
-
preloadOnnxRuntime,
|
|
11289
|
-
remapWav2ArkitToLam,
|
|
11290
11111
|
resetLoggingConfig,
|
|
11291
11112
|
resolveBackend,
|
|
11292
|
-
resolveLanguageId,
|
|
11293
|
-
resolveTextNormId,
|
|
11294
11113
|
setLogLevel,
|
|
11295
11114
|
setLoggingEnabled,
|
|
11296
11115
|
shouldEnableWasmProxy,
|
|
11297
|
-
|
|
11116
|
+
shouldUseCpuA2E,
|
|
11298
11117
|
shouldUseNativeASR,
|
|
11299
|
-
|
|
11300
|
-
supportsVADWorker
|
|
11301
|
-
symmetrizeBlendshapes
|
|
11118
|
+
shouldUseServerA2E,
|
|
11119
|
+
supportsVADWorker
|
|
11302
11120
|
};
|
|
11303
11121
|
//# sourceMappingURL=index.mjs.map
|