@omote/core 0.4.7 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +484 -861
- package/dist/index.d.ts +484 -861
- package/dist/index.js +1275 -1440
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +948 -1113
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -30,6 +30,8 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
|
|
|
30
30
|
// src/index.ts
|
|
31
31
|
var index_exports = {};
|
|
32
32
|
__export(index_exports, {
|
|
33
|
+
A2EOrchestrator: () => A2EOrchestrator,
|
|
34
|
+
A2EProcessor: () => A2EProcessor,
|
|
33
35
|
ARKIT_BLENDSHAPES: () => ARKIT_BLENDSHAPES,
|
|
34
36
|
AgentCoreAdapter: () => AgentCoreAdapter,
|
|
35
37
|
AnimationGraph: () => AnimationGraph,
|
|
@@ -37,23 +39,22 @@ __export(index_exports, {
|
|
|
37
39
|
AudioEnergyAnalyzer: () => AudioEnergyAnalyzer,
|
|
38
40
|
AudioScheduler: () => AudioScheduler,
|
|
39
41
|
AudioSyncManager: () => AudioSyncManager,
|
|
42
|
+
BLENDSHAPE_TO_GROUP: () => BLENDSHAPE_TO_GROUP,
|
|
43
|
+
BlendshapeSmoother: () => BlendshapeSmoother,
|
|
40
44
|
CTC_VOCAB: () => CTC_VOCAB,
|
|
41
45
|
ConsoleExporter: () => ConsoleExporter,
|
|
42
46
|
ConversationOrchestrator: () => ConversationOrchestrator,
|
|
43
47
|
DEFAULT_ANIMATION_CONFIG: () => DEFAULT_ANIMATION_CONFIG,
|
|
44
48
|
DEFAULT_LOGGING_CONFIG: () => DEFAULT_LOGGING_CONFIG,
|
|
45
|
-
EMOTION_ARKIT_MAP: () => EMOTION_ARKIT_MAP,
|
|
46
49
|
EMOTION_NAMES: () => EMOTION_NAMES,
|
|
47
50
|
EMOTION_VECTOR_SIZE: () => EMOTION_VECTOR_SIZE,
|
|
48
51
|
EmotionController: () => EmotionController,
|
|
49
52
|
EmotionPresets: () => EmotionPresets,
|
|
50
|
-
EmotionToBlendshapeMapper: () => EmotionToBlendshapeMapper,
|
|
51
53
|
EmphasisDetector: () => EmphasisDetector,
|
|
52
54
|
EventEmitter: () => EventEmitter,
|
|
53
55
|
FullFacePipeline: () => FullFacePipeline,
|
|
54
56
|
INFERENCE_LATENCY_BUCKETS: () => INFERENCE_LATENCY_BUCKETS,
|
|
55
57
|
InterruptionHandler: () => InterruptionHandler,
|
|
56
|
-
LAMPipeline: () => LAMPipeline,
|
|
57
58
|
LAM_BLENDSHAPES: () => LAM_BLENDSHAPES,
|
|
58
59
|
LOG_LEVEL_PRIORITY: () => LOG_LEVEL_PRIORITY,
|
|
59
60
|
MODEL_LOAD_TIME_BUCKETS: () => MODEL_LOAD_TIME_BUCKETS,
|
|
@@ -72,74 +73,55 @@ __export(index_exports, {
|
|
|
72
73
|
SileroVADInference: () => SileroVADInference,
|
|
73
74
|
SileroVADUnifiedAdapter: () => SileroVADUnifiedAdapter,
|
|
74
75
|
SileroVADWorker: () => SileroVADWorker,
|
|
75
|
-
SyncedAudioPipeline: () => SyncedAudioPipeline,
|
|
76
76
|
TenantManager: () => TenantManager,
|
|
77
|
-
UPPER_FACE_BLENDSHAPES: () => UPPER_FACE_BLENDSHAPES,
|
|
78
77
|
UnifiedInferenceWorker: () => UnifiedInferenceWorker,
|
|
79
|
-
WAV2ARKIT_BLENDSHAPES: () => WAV2ARKIT_BLENDSHAPES,
|
|
80
78
|
Wav2ArkitCpuInference: () => Wav2ArkitCpuInference,
|
|
81
79
|
Wav2ArkitCpuUnifiedAdapter: () => Wav2ArkitCpuUnifiedAdapter,
|
|
82
80
|
Wav2ArkitCpuWorker: () => Wav2ArkitCpuWorker,
|
|
83
81
|
Wav2Vec2Inference: () => Wav2Vec2Inference,
|
|
84
|
-
applyCMVN: () => applyCMVN,
|
|
85
|
-
applyLFR: () => applyLFR,
|
|
86
82
|
blendEmotions: () => blendEmotions,
|
|
87
83
|
calculatePeak: () => calculatePeak,
|
|
88
84
|
calculateRMS: () => calculateRMS,
|
|
89
|
-
computeKaldiFbank: () => computeKaldiFbank,
|
|
90
85
|
configureCacheLimit: () => configureCacheLimit,
|
|
91
86
|
configureLogging: () => configureLogging,
|
|
92
87
|
configureTelemetry: () => configureTelemetry,
|
|
88
|
+
createA2E: () => createA2E,
|
|
93
89
|
createEmotionVector: () => createEmotionVector,
|
|
94
|
-
createLipSync: () => createLipSync,
|
|
95
90
|
createLogger: () => createLogger,
|
|
96
91
|
createSenseVoice: () => createSenseVoice,
|
|
97
|
-
createSessionWithFallback: () => createSessionWithFallback,
|
|
98
92
|
createSileroVAD: () => createSileroVAD,
|
|
99
|
-
ctcGreedyDecode: () => ctcGreedyDecode,
|
|
100
93
|
fetchWithCache: () => fetchWithCache,
|
|
101
94
|
formatBytes: () => formatBytes,
|
|
102
95
|
getCacheConfig: () => getCacheConfig,
|
|
103
96
|
getCacheKey: () => getCacheKey,
|
|
104
97
|
getEmotionPreset: () => getEmotionPreset,
|
|
105
|
-
getLoadedBackend: () => getLoadedBackend,
|
|
106
98
|
getLoggingConfig: () => getLoggingConfig,
|
|
107
99
|
getModelCache: () => getModelCache,
|
|
108
|
-
getOnnxRuntime: () => getOnnxRuntime,
|
|
109
|
-
getOnnxRuntimeForPreference: () => getOnnxRuntimeForPreference,
|
|
110
100
|
getOptimalWasmThreads: () => getOptimalWasmThreads,
|
|
111
101
|
getRecommendedBackend: () => getRecommendedBackend,
|
|
112
|
-
getSessionOptions: () => getSessionOptions,
|
|
113
102
|
getTelemetry: () => getTelemetry,
|
|
114
103
|
hasWebGPUApi: () => hasWebGPUApi,
|
|
115
104
|
isAndroid: () => isAndroid,
|
|
116
105
|
isIOS: () => isIOS,
|
|
117
106
|
isIOSSafari: () => isIOSSafari,
|
|
118
107
|
isMobile: () => isMobile,
|
|
119
|
-
isOnnxRuntimeLoaded: () => isOnnxRuntimeLoaded,
|
|
120
108
|
isProtocolEvent: () => isProtocolEvent,
|
|
121
109
|
isSafari: () => isSafari,
|
|
122
110
|
isSpeechRecognitionAvailable: () => isSpeechRecognitionAvailable,
|
|
123
111
|
isWebGPUAvailable: () => isWebGPUAvailable,
|
|
112
|
+
lerpBlendshapes: () => lerpBlendshapes,
|
|
124
113
|
lerpEmotion: () => lerpEmotion,
|
|
125
114
|
noopLogger: () => noopLogger,
|
|
126
|
-
parseCMVNFromMetadata: () => parseCMVNFromMetadata,
|
|
127
|
-
parseTokensFile: () => parseTokensFile,
|
|
128
115
|
preloadModels: () => preloadModels,
|
|
129
|
-
preloadOnnxRuntime: () => preloadOnnxRuntime,
|
|
130
|
-
remapWav2ArkitToLam: () => remapWav2ArkitToLam,
|
|
131
116
|
resetLoggingConfig: () => resetLoggingConfig,
|
|
132
117
|
resolveBackend: () => resolveBackend,
|
|
133
|
-
resolveLanguageId: () => resolveLanguageId,
|
|
134
|
-
resolveTextNormId: () => resolveTextNormId,
|
|
135
118
|
setLogLevel: () => setLogLevel,
|
|
136
119
|
setLoggingEnabled: () => setLoggingEnabled,
|
|
137
120
|
shouldEnableWasmProxy: () => shouldEnableWasmProxy,
|
|
138
|
-
|
|
121
|
+
shouldUseCpuA2E: () => shouldUseCpuA2E,
|
|
139
122
|
shouldUseNativeASR: () => shouldUseNativeASR,
|
|
140
|
-
|
|
141
|
-
supportsVADWorker: () => supportsVADWorker
|
|
142
|
-
symmetrizeBlendshapes: () => symmetrizeBlendshapes
|
|
123
|
+
shouldUseServerA2E: () => shouldUseServerA2E,
|
|
124
|
+
supportsVADWorker: () => supportsVADWorker
|
|
143
125
|
});
|
|
144
126
|
module.exports = __toCommonJS(index_exports);
|
|
145
127
|
|
|
@@ -649,730 +631,617 @@ var AudioChunkCoalescer = class {
|
|
|
649
631
|
}
|
|
650
632
|
};
|
|
651
633
|
|
|
652
|
-
// src/
|
|
653
|
-
var
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
this.lastFrame = null;
|
|
668
|
-
}
|
|
669
|
-
/**
|
|
670
|
-
* Push audio samples into the pipeline
|
|
671
|
-
*
|
|
672
|
-
* Accumulates samples and triggers LAM inference when buffer is full.
|
|
673
|
-
* Multiple calls may be needed to accumulate enough samples.
|
|
674
|
-
*
|
|
675
|
-
* @param samples - Float32Array of audio samples
|
|
676
|
-
* @param timestamp - AudioContext time when these samples start playing
|
|
677
|
-
* @param lam - LAM inference engine
|
|
678
|
-
*/
|
|
679
|
-
async push(samples, timestamp, lam) {
|
|
680
|
-
if (this.buffer.length === 0) {
|
|
681
|
-
this.bufferStartTime = timestamp;
|
|
682
|
-
}
|
|
683
|
-
const newBuffer = new Float32Array(this.buffer.length + samples.length);
|
|
684
|
-
newBuffer.set(this.buffer, 0);
|
|
685
|
-
newBuffer.set(samples, this.buffer.length);
|
|
686
|
-
this.buffer = newBuffer;
|
|
687
|
-
while (this.buffer.length >= this.REQUIRED_SAMPLES) {
|
|
688
|
-
await this.processBuffer(lam);
|
|
689
|
-
if (this.buffer.length >= this.REQUIRED_SAMPLES) {
|
|
690
|
-
await new Promise((r) => setTimeout(r, 0));
|
|
691
|
-
}
|
|
692
|
-
}
|
|
693
|
-
}
|
|
694
|
-
/**
|
|
695
|
-
* Process accumulated buffer through LAM inference
|
|
696
|
-
*/
|
|
697
|
-
async processBuffer(lam) {
|
|
698
|
-
try {
|
|
699
|
-
const toProcess = this.buffer.slice(0, this.REQUIRED_SAMPLES);
|
|
700
|
-
const processedStartTime = this.bufferStartTime;
|
|
701
|
-
this.buffer = this.buffer.slice(this.REQUIRED_SAMPLES);
|
|
702
|
-
const processedDuration = this.REQUIRED_SAMPLES / (this.options.sampleRate ?? 16e3);
|
|
703
|
-
this.bufferStartTime = processedStartTime + processedDuration;
|
|
704
|
-
const result = await lam.infer(toProcess);
|
|
705
|
-
const frameDuration = 1 / this.FRAME_RATE;
|
|
706
|
-
for (let i = 0; i < result.blendshapes.length; i++) {
|
|
707
|
-
const frame = result.blendshapes[i];
|
|
708
|
-
const timestamp = processedStartTime + i * frameDuration;
|
|
709
|
-
this.frameQueue.push({ frame, timestamp });
|
|
710
|
-
}
|
|
711
|
-
this.options.onInference?.(result.blendshapes.length);
|
|
712
|
-
} catch (error) {
|
|
713
|
-
this.options.onError?.(error);
|
|
714
|
-
this.buffer = new Float32Array(0);
|
|
715
|
-
this.bufferStartTime = 0;
|
|
716
|
-
}
|
|
717
|
-
}
|
|
718
|
-
/**
|
|
719
|
-
* Get the frame that should be displayed at the current time
|
|
720
|
-
*
|
|
721
|
-
* Automatically removes frames that have already been displayed.
|
|
722
|
-
* This prevents memory leaks from accumulating old frames.
|
|
723
|
-
*
|
|
724
|
-
* Discard Window (prevents premature frame discarding):
|
|
725
|
-
* - WebGPU: 0.5s (LAM inference 20-100ms + RAF jitter + React stalls)
|
|
726
|
-
* - WASM: 1.0s (LAM inference 50-500ms + higher variability)
|
|
727
|
-
*
|
|
728
|
-
* Last-Frame-Hold: Returns last valid frame instead of null to prevent
|
|
729
|
-
* avatar freezing when between frames (RAF at 60fps vs LAM at 30fps).
|
|
730
|
-
*
|
|
731
|
-
* @param currentTime - Current AudioContext time
|
|
732
|
-
* @param lam - LAM inference engine (optional, for backend detection)
|
|
733
|
-
* @returns Current frame, or last frame as fallback, or null if no frames yet
|
|
734
|
-
*/
|
|
735
|
-
getFrameForTime(currentTime, lam) {
|
|
736
|
-
const discardWindow = lam?.backend === "wasm" ? 1 : 0.5;
|
|
737
|
-
let discardedCount = 0;
|
|
738
|
-
while (this.frameQueue.length > 0 && this.frameQueue[0].timestamp < currentTime - discardWindow) {
|
|
739
|
-
const discarded = this.frameQueue.shift();
|
|
740
|
-
discardedCount++;
|
|
741
|
-
if (discardedCount === 1) {
|
|
742
|
-
const ageMs = ((currentTime - discarded.timestamp) * 1e3).toFixed(0);
|
|
743
|
-
console.warn("[LAM] Frame(s) discarded as too old", {
|
|
744
|
-
ageMs,
|
|
745
|
-
discardWindowMs: discardWindow * 1e3,
|
|
746
|
-
queueLength: this.frameQueue.length,
|
|
747
|
-
backend: lam?.backend ?? "unknown"
|
|
748
|
-
});
|
|
749
|
-
}
|
|
750
|
-
}
|
|
751
|
-
if (this.frameQueue.length > 0 && this.frameQueue[0].timestamp <= currentTime) {
|
|
752
|
-
const { frame } = this.frameQueue.shift();
|
|
753
|
-
this.lastFrame = frame;
|
|
754
|
-
return frame;
|
|
755
|
-
}
|
|
756
|
-
return this.lastFrame;
|
|
757
|
-
}
|
|
758
|
-
/**
|
|
759
|
-
* Get all frames in the queue (for debugging/monitoring)
|
|
760
|
-
*/
|
|
761
|
-
getQueuedFrames() {
|
|
762
|
-
return [...this.frameQueue];
|
|
763
|
-
}
|
|
764
|
-
/**
|
|
765
|
-
* Get current buffer fill level (0-1)
|
|
766
|
-
*/
|
|
767
|
-
get fillLevel() {
|
|
768
|
-
return Math.min(1, this.buffer.length / this.REQUIRED_SAMPLES);
|
|
769
|
-
}
|
|
770
|
-
/**
|
|
771
|
-
* Get number of frames queued
|
|
772
|
-
*/
|
|
773
|
-
get queuedFrameCount() {
|
|
774
|
-
return this.frameQueue.length;
|
|
775
|
-
}
|
|
776
|
-
/**
|
|
777
|
-
* Get buffered audio duration in seconds
|
|
778
|
-
*/
|
|
779
|
-
get bufferedDuration() {
|
|
780
|
-
return this.buffer.length / (this.options.sampleRate ?? 16e3);
|
|
781
|
-
}
|
|
782
|
-
/**
|
|
783
|
-
* Flush remaining buffered audio
|
|
784
|
-
*
|
|
785
|
-
* Processes any remaining audio in the buffer, even if less than REQUIRED_SAMPLES.
|
|
786
|
-
* This ensures the final audio chunk generates blendshape frames.
|
|
787
|
-
*
|
|
788
|
-
* Should be called when audio stream ends to prevent losing the last 0-1 seconds.
|
|
789
|
-
*
|
|
790
|
-
* @param lam - LAM inference engine
|
|
791
|
-
*/
|
|
792
|
-
async flush(lam) {
|
|
793
|
-
if (this.buffer.length === 0) {
|
|
794
|
-
return;
|
|
795
|
-
}
|
|
796
|
-
const padded = new Float32Array(this.REQUIRED_SAMPLES);
|
|
797
|
-
padded.set(this.buffer, 0);
|
|
798
|
-
const processedStartTime = this.bufferStartTime;
|
|
799
|
-
try {
|
|
800
|
-
const result = await lam.infer(padded);
|
|
801
|
-
const actualDuration = this.buffer.length / (this.options.sampleRate ?? 16e3);
|
|
802
|
-
const frameDuration = 1 / this.FRAME_RATE;
|
|
803
|
-
const actualFrameCount = Math.ceil(actualDuration * this.FRAME_RATE);
|
|
804
|
-
for (let i = 0; i < Math.min(actualFrameCount, result.blendshapes.length); i++) {
|
|
805
|
-
const frame = result.blendshapes[i];
|
|
806
|
-
const timestamp = processedStartTime + i * frameDuration;
|
|
807
|
-
this.frameQueue.push({ frame, timestamp });
|
|
808
|
-
}
|
|
809
|
-
this.buffer = new Float32Array(0);
|
|
810
|
-
this.bufferStartTime = 0;
|
|
811
|
-
this.options.onInference?.(Math.min(actualFrameCount, result.blendshapes.length));
|
|
812
|
-
} catch (error) {
|
|
813
|
-
this.options.onError?.(error);
|
|
814
|
-
this.buffer = new Float32Array(0);
|
|
815
|
-
this.bufferStartTime = 0;
|
|
816
|
-
}
|
|
817
|
-
}
|
|
818
|
-
/**
|
|
819
|
-
* Adjust all queued frame timestamps by an offset
|
|
820
|
-
*
|
|
821
|
-
* Used for synchronization when audio scheduling time differs from
|
|
822
|
-
* the estimated time used during LAM processing.
|
|
823
|
-
*
|
|
824
|
-
* @param offset - Time offset in seconds to add to all timestamps
|
|
825
|
-
*/
|
|
826
|
-
adjustTimestamps(offset) {
|
|
827
|
-
for (const frame of this.frameQueue) {
|
|
828
|
-
frame.timestamp += offset;
|
|
829
|
-
}
|
|
830
|
-
}
|
|
831
|
-
/**
|
|
832
|
-
* Reset the pipeline
|
|
833
|
-
*/
|
|
834
|
-
reset() {
|
|
835
|
-
this.buffer = new Float32Array(0);
|
|
836
|
-
this.bufferStartTime = 0;
|
|
837
|
-
this.frameQueue = [];
|
|
838
|
-
this.lastFrame = null;
|
|
839
|
-
}
|
|
634
|
+
// src/logging/types.ts
|
|
635
|
+
var LOG_LEVEL_PRIORITY = {
|
|
636
|
+
error: 0,
|
|
637
|
+
warn: 1,
|
|
638
|
+
info: 2,
|
|
639
|
+
debug: 3,
|
|
640
|
+
trace: 4,
|
|
641
|
+
verbose: 5
|
|
642
|
+
};
|
|
643
|
+
var DEFAULT_LOGGING_CONFIG = {
|
|
644
|
+
level: "info",
|
|
645
|
+
enabled: true,
|
|
646
|
+
format: "pretty",
|
|
647
|
+
timestamps: true,
|
|
648
|
+
includeModule: true
|
|
840
649
|
};
|
|
841
650
|
|
|
842
|
-
// src/
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
651
|
+
// src/logging/formatters.ts
|
|
652
|
+
var COLORS = {
|
|
653
|
+
reset: "\x1B[0m",
|
|
654
|
+
red: "\x1B[31m",
|
|
655
|
+
yellow: "\x1B[33m",
|
|
656
|
+
blue: "\x1B[34m",
|
|
657
|
+
cyan: "\x1B[36m",
|
|
658
|
+
gray: "\x1B[90m",
|
|
659
|
+
white: "\x1B[37m",
|
|
660
|
+
magenta: "\x1B[35m"
|
|
661
|
+
};
|
|
662
|
+
var LEVEL_COLORS = {
|
|
663
|
+
error: COLORS.red,
|
|
664
|
+
warn: COLORS.yellow,
|
|
665
|
+
info: COLORS.blue,
|
|
666
|
+
debug: COLORS.cyan,
|
|
667
|
+
trace: COLORS.magenta,
|
|
668
|
+
verbose: COLORS.gray
|
|
669
|
+
};
|
|
670
|
+
var LEVEL_NAMES = {
|
|
671
|
+
error: "ERROR ",
|
|
672
|
+
warn: "WARN ",
|
|
673
|
+
info: "INFO ",
|
|
674
|
+
debug: "DEBUG ",
|
|
675
|
+
trace: "TRACE ",
|
|
676
|
+
verbose: "VERBOSE"
|
|
677
|
+
};
|
|
678
|
+
var isBrowser = typeof window !== "undefined";
|
|
679
|
+
function formatTimestamp(timestamp) {
|
|
680
|
+
const date = new Date(timestamp);
|
|
681
|
+
return date.toISOString().substring(11, 23);
|
|
858
682
|
}
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
this.playbackStarted = false;
|
|
866
|
-
this.monitorInterval = null;
|
|
867
|
-
this.frameAnimationId = null;
|
|
868
|
-
const sampleRate = options.sampleRate ?? 16e3;
|
|
869
|
-
const autoDelay = options.lam.modelId === "wav2arkit_cpu" ? 750 : options.lam.backend === "wasm" ? 350 : 50;
|
|
870
|
-
const audioDelayMs = options.audioDelayMs ?? autoDelay;
|
|
871
|
-
this.scheduler = new AudioScheduler({
|
|
872
|
-
sampleRate,
|
|
873
|
-
initialLookaheadSec: audioDelayMs / 1e3
|
|
874
|
-
});
|
|
875
|
-
this.coalescer = new AudioChunkCoalescer({
|
|
876
|
-
sampleRate,
|
|
877
|
-
targetDurationMs: options.chunkTargetMs ?? 200
|
|
878
|
-
});
|
|
879
|
-
this.lamPipeline = new LAMPipeline({
|
|
880
|
-
sampleRate,
|
|
881
|
-
onError: (error) => {
|
|
882
|
-
this.emit("error", error);
|
|
683
|
+
function safeStringify(data) {
|
|
684
|
+
const seen = /* @__PURE__ */ new WeakSet();
|
|
685
|
+
return JSON.stringify(data, (key, value) => {
|
|
686
|
+
if (typeof value === "object" && value !== null) {
|
|
687
|
+
if (seen.has(value)) {
|
|
688
|
+
return "[Circular]";
|
|
883
689
|
}
|
|
884
|
-
|
|
885
|
-
}
|
|
886
|
-
/**
|
|
887
|
-
* Initialize the pipeline
|
|
888
|
-
*/
|
|
889
|
-
async initialize() {
|
|
890
|
-
await this.scheduler.initialize();
|
|
891
|
-
}
|
|
892
|
-
/**
|
|
893
|
-
* Start a new playback session
|
|
894
|
-
*
|
|
895
|
-
* Resets all state and prepares for incoming audio chunks.
|
|
896
|
-
* Audio will be scheduled immediately as chunks arrive (no buffering).
|
|
897
|
-
*/
|
|
898
|
-
start() {
|
|
899
|
-
this.stopMonitoring();
|
|
900
|
-
this.scheduler.reset();
|
|
901
|
-
this.coalescer.reset();
|
|
902
|
-
this.lamPipeline.reset();
|
|
903
|
-
this.playbackStarted = false;
|
|
904
|
-
this.scheduler.warmup();
|
|
905
|
-
this.startFrameLoop();
|
|
906
|
-
this.startMonitoring();
|
|
907
|
-
}
|
|
908
|
-
/**
|
|
909
|
-
* Receive audio chunk from network
|
|
910
|
-
*
|
|
911
|
-
* Audio-first design: schedules audio immediately, LAM runs in background.
|
|
912
|
-
* This prevents LAM inference (50-300ms) from blocking audio scheduling,
|
|
913
|
-
* which caused audible stuttering with continuous audio streams.
|
|
914
|
-
*
|
|
915
|
-
* @param chunk - Uint8Array containing Int16 PCM audio
|
|
916
|
-
*/
|
|
917
|
-
async onAudioChunk(chunk) {
|
|
918
|
-
const combined = this.coalescer.add(chunk);
|
|
919
|
-
if (!combined) {
|
|
920
|
-
return;
|
|
690
|
+
seen.add(value);
|
|
921
691
|
}
|
|
922
|
-
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
|
|
692
|
+
if (value instanceof Error) {
|
|
693
|
+
return {
|
|
694
|
+
name: value.name,
|
|
695
|
+
message: value.message,
|
|
696
|
+
stack: value.stack
|
|
697
|
+
};
|
|
927
698
|
}
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
});
|
|
931
|
-
}
|
|
932
|
-
/**
|
|
933
|
-
* End of audio stream
|
|
934
|
-
*
|
|
935
|
-
* Flushes any remaining buffered data.
|
|
936
|
-
*/
|
|
937
|
-
async end() {
|
|
938
|
-
const remaining = this.coalescer.flush();
|
|
939
|
-
if (remaining) {
|
|
940
|
-
const chunk = new Uint8Array(remaining);
|
|
941
|
-
await this.onAudioChunk(chunk);
|
|
699
|
+
if (value instanceof Float32Array || value instanceof Int16Array) {
|
|
700
|
+
return `${value.constructor.name}(${value.length})`;
|
|
942
701
|
}
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
*/
|
|
959
|
-
async stop(fadeOutMs = 50) {
|
|
960
|
-
this.stopMonitoring();
|
|
961
|
-
await this.scheduler.cancelAll(fadeOutMs);
|
|
962
|
-
this.coalescer.reset();
|
|
963
|
-
this.lamPipeline.reset();
|
|
964
|
-
this.playbackStarted = false;
|
|
965
|
-
this.emit("playback_complete", void 0);
|
|
702
|
+
if (ArrayBuffer.isView(value)) {
|
|
703
|
+
return `${value.constructor.name}(${value.byteLength})`;
|
|
704
|
+
}
|
|
705
|
+
return value;
|
|
706
|
+
});
|
|
707
|
+
}
|
|
708
|
+
var jsonFormatter = (entry) => {
|
|
709
|
+
const output = {
|
|
710
|
+
timestamp: entry.timestamp,
|
|
711
|
+
level: entry.level,
|
|
712
|
+
module: entry.module,
|
|
713
|
+
message: entry.message
|
|
714
|
+
};
|
|
715
|
+
if (entry.data && Object.keys(entry.data).length > 0) {
|
|
716
|
+
output.data = entry.data;
|
|
966
717
|
}
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
*
|
|
973
|
-
* Frame Emission Strategy:
|
|
974
|
-
* - LAMPipeline uses last-frame-hold to prevent null returns
|
|
975
|
-
* - Always emit frames (even repeated frames) to maintain smooth animation
|
|
976
|
-
* - Renderer is responsible for detecting duplicate frames if needed
|
|
977
|
-
*/
|
|
978
|
-
startFrameLoop() {
|
|
979
|
-
const updateFrame = () => {
|
|
980
|
-
const currentTime = this.scheduler.getCurrentTime();
|
|
981
|
-
const frame = this.lamPipeline.getFrameForTime(currentTime, this.options.lam);
|
|
982
|
-
if (frame) {
|
|
983
|
-
this.emit("frame_ready", frame);
|
|
984
|
-
}
|
|
985
|
-
this.frameAnimationId = requestAnimationFrame(updateFrame);
|
|
718
|
+
if (entry.error) {
|
|
719
|
+
output.error = {
|
|
720
|
+
name: entry.error.name,
|
|
721
|
+
message: entry.error.message,
|
|
722
|
+
stack: entry.error.stack
|
|
986
723
|
};
|
|
987
|
-
this.frameAnimationId = requestAnimationFrame(updateFrame);
|
|
988
724
|
}
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
}
|
|
725
|
+
return safeStringify(output);
|
|
726
|
+
};
|
|
727
|
+
var prettyFormatter = (entry) => {
|
|
728
|
+
const time = formatTimestamp(entry.timestamp);
|
|
729
|
+
const level = LEVEL_NAMES[entry.level];
|
|
730
|
+
const module2 = entry.module;
|
|
731
|
+
const message = entry.message;
|
|
732
|
+
let output;
|
|
733
|
+
if (isBrowser) {
|
|
734
|
+
output = `${time} ${level} [${module2}] ${message}`;
|
|
735
|
+
} else {
|
|
736
|
+
const color = LEVEL_COLORS[entry.level];
|
|
737
|
+
output = `${COLORS.gray}${time}${COLORS.reset} ${color}${level}${COLORS.reset} ${COLORS.cyan}[${module2}]${COLORS.reset} ${message}`;
|
|
1002
738
|
}
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
|
|
1009
|
-
this.monitorInterval = null;
|
|
1010
|
-
}
|
|
1011
|
-
if (this.frameAnimationId) {
|
|
1012
|
-
cancelAnimationFrame(this.frameAnimationId);
|
|
1013
|
-
this.frameAnimationId = null;
|
|
739
|
+
if (entry.data && Object.keys(entry.data).length > 0) {
|
|
740
|
+
const dataStr = safeStringify(entry.data);
|
|
741
|
+
if (dataStr.length > 80) {
|
|
742
|
+
output += "\n " + JSON.stringify(entry.data, null, 2).replace(/\n/g, "\n ");
|
|
743
|
+
} else {
|
|
744
|
+
output += " " + dataStr;
|
|
1014
745
|
}
|
|
1015
746
|
}
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
lamFill: this.lamPipeline.fillLevel,
|
|
1024
|
-
queuedFrames: this.lamPipeline.queuedFrameCount,
|
|
1025
|
-
currentTime: this.scheduler.getCurrentTime(),
|
|
1026
|
-
playbackEndTime: this.scheduler.getPlaybackEndTime()
|
|
1027
|
-
};
|
|
1028
|
-
}
|
|
1029
|
-
/**
|
|
1030
|
-
* Cleanup resources
|
|
1031
|
-
*/
|
|
1032
|
-
dispose() {
|
|
1033
|
-
this.stopMonitoring();
|
|
1034
|
-
this.scheduler.dispose();
|
|
1035
|
-
this.coalescer.reset();
|
|
1036
|
-
this.lamPipeline.reset();
|
|
747
|
+
if (entry.error) {
|
|
748
|
+
output += `
|
|
749
|
+
${entry.error.name}: ${entry.error.message}`;
|
|
750
|
+
if (entry.error.stack) {
|
|
751
|
+
const stackLines = entry.error.stack.split("\n").slice(1, 4);
|
|
752
|
+
output += "\n " + stackLines.join("\n ");
|
|
753
|
+
}
|
|
1037
754
|
}
|
|
755
|
+
return output;
|
|
1038
756
|
};
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
1058
|
-
|
|
1059
|
-
|
|
1060
|
-
|
|
1061
|
-
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
eyeWideRight: 0.4,
|
|
1073
|
-
// AU7 - Lid tightener (tense stare, combines with AU5 for angry glare)
|
|
1074
|
-
eyeSquintLeft: 0.3,
|
|
1075
|
-
eyeSquintRight: 0.3
|
|
1076
|
-
},
|
|
1077
|
-
sad: {
|
|
1078
|
-
// AU1 - Inner brow raiser (primary sadness marker)
|
|
1079
|
-
browInnerUp: 0.6,
|
|
1080
|
-
// AU4 - Brow lowerer (brows drawn together)
|
|
1081
|
-
browDownLeft: 0.3,
|
|
1082
|
-
browDownRight: 0.3
|
|
1083
|
-
},
|
|
1084
|
-
neutral: {}
|
|
1085
|
-
// All zeros - no expression overlay
|
|
1086
|
-
};
|
|
1087
|
-
var DEFAULT_CONFIG = {
|
|
1088
|
-
smoothingFactor: 0.15,
|
|
1089
|
-
confidenceThreshold: 0.3,
|
|
1090
|
-
intensity: 1,
|
|
1091
|
-
blendMode: "dominant",
|
|
1092
|
-
minBlendProbability: 0.1,
|
|
1093
|
-
energyModulation: false,
|
|
1094
|
-
minEnergyScale: 0.3,
|
|
1095
|
-
maxEnergyScale: 1
|
|
1096
|
-
};
|
|
1097
|
-
function createZeroBlendshapes() {
|
|
1098
|
-
const result = {};
|
|
1099
|
-
for (const name of UPPER_FACE_BLENDSHAPES) {
|
|
1100
|
-
result[name] = 0;
|
|
757
|
+
function getFormatter(format) {
|
|
758
|
+
return format === "json" ? jsonFormatter : prettyFormatter;
|
|
759
|
+
}
|
|
760
|
+
function createBrowserConsoleArgs(entry) {
|
|
761
|
+
const time = formatTimestamp(entry.timestamp);
|
|
762
|
+
const level = entry.level.toUpperCase().padEnd(7);
|
|
763
|
+
const module2 = entry.module;
|
|
764
|
+
const message = entry.message;
|
|
765
|
+
const styles = {
|
|
766
|
+
time: "color: gray;",
|
|
767
|
+
error: "color: red; font-weight: bold;",
|
|
768
|
+
warn: "color: orange; font-weight: bold;",
|
|
769
|
+
info: "color: blue;",
|
|
770
|
+
debug: "color: cyan;",
|
|
771
|
+
trace: "color: magenta;",
|
|
772
|
+
verbose: "color: gray;",
|
|
773
|
+
module: "color: teal; font-weight: bold;",
|
|
774
|
+
message: "color: inherit;"
|
|
775
|
+
};
|
|
776
|
+
let formatStr = "%c%s %c%s %c[%s]%c %s";
|
|
777
|
+
const args = [
|
|
778
|
+
styles.time,
|
|
779
|
+
time,
|
|
780
|
+
styles[entry.level],
|
|
781
|
+
level,
|
|
782
|
+
styles.module,
|
|
783
|
+
module2,
|
|
784
|
+
styles.message,
|
|
785
|
+
message
|
|
786
|
+
];
|
|
787
|
+
if (entry.data && Object.keys(entry.data).length > 0) {
|
|
788
|
+
formatStr += " %o";
|
|
789
|
+
args.push(entry.data);
|
|
1101
790
|
}
|
|
1102
|
-
return
|
|
791
|
+
return [formatStr, ...args];
|
|
1103
792
|
}
|
|
1104
|
-
|
|
1105
|
-
|
|
793
|
+
|
|
794
|
+
// src/logging/Logger.ts
|
|
795
|
+
var isBrowser2 = typeof window !== "undefined";
|
|
796
|
+
var globalConfig = { ...DEFAULT_LOGGING_CONFIG };
|
|
797
|
+
function configureLogging(config) {
|
|
798
|
+
globalConfig = { ...globalConfig, ...config };
|
|
1106
799
|
}
|
|
1107
|
-
|
|
1108
|
-
|
|
1109
|
-
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
|
|
1113
|
-
|
|
1114
|
-
|
|
1115
|
-
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
|
|
1120
|
-
|
|
800
|
+
function getLoggingConfig() {
|
|
801
|
+
return { ...globalConfig };
|
|
802
|
+
}
|
|
803
|
+
function resetLoggingConfig() {
|
|
804
|
+
globalConfig = { ...DEFAULT_LOGGING_CONFIG };
|
|
805
|
+
}
|
|
806
|
+
function setLogLevel(level) {
|
|
807
|
+
globalConfig.level = level;
|
|
808
|
+
}
|
|
809
|
+
function setLoggingEnabled(enabled) {
|
|
810
|
+
globalConfig.enabled = enabled;
|
|
811
|
+
}
|
|
812
|
+
var consoleSink = (entry) => {
|
|
813
|
+
const consoleMethod = entry.level === "error" ? "error" : entry.level === "warn" ? "warn" : "log";
|
|
814
|
+
if (globalConfig.format === "pretty" && isBrowser2) {
|
|
815
|
+
const args = createBrowserConsoleArgs(entry);
|
|
816
|
+
console[consoleMethod](...args);
|
|
817
|
+
} else {
|
|
818
|
+
const formatter = getFormatter(globalConfig.format);
|
|
819
|
+
const formatted = formatter(entry);
|
|
820
|
+
console[consoleMethod](formatted);
|
|
1121
821
|
}
|
|
1122
|
-
|
|
1123
|
-
|
|
1124
|
-
|
|
1125
|
-
|
|
1126
|
-
|
|
1127
|
-
|
|
1128
|
-
|
|
1129
|
-
|
|
1130
|
-
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
this.targetBlendshapes = createZeroBlendshapes();
|
|
1134
|
-
if (audioEnergy !== void 0) {
|
|
1135
|
-
this.currentEnergy = clamp01(audioEnergy);
|
|
1136
|
-
}
|
|
1137
|
-
if (!frame) {
|
|
1138
|
-
return { ...this.targetBlendshapes };
|
|
1139
|
-
}
|
|
1140
|
-
if (this.config.blendMode === "weighted") {
|
|
1141
|
-
this.mapFrameWeighted(frame);
|
|
1142
|
-
} else {
|
|
1143
|
-
this.mapFrameDominant(frame);
|
|
1144
|
-
}
|
|
1145
|
-
if (this.config.energyModulation) {
|
|
1146
|
-
this.applyEnergyModulation();
|
|
1147
|
-
}
|
|
1148
|
-
return { ...this.targetBlendshapes };
|
|
822
|
+
};
|
|
823
|
+
function getActiveSink() {
|
|
824
|
+
return globalConfig.sink || consoleSink;
|
|
825
|
+
}
|
|
826
|
+
function shouldLog(level) {
|
|
827
|
+
if (!globalConfig.enabled) return false;
|
|
828
|
+
return LOG_LEVEL_PRIORITY[level] <= LOG_LEVEL_PRIORITY[globalConfig.level];
|
|
829
|
+
}
|
|
830
|
+
var Logger = class _Logger {
|
|
831
|
+
constructor(module2) {
|
|
832
|
+
this.module = module2;
|
|
1149
833
|
}
|
|
1150
|
-
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
if (
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
for (const [name, value] of Object.entries(mapping)) {
|
|
1164
|
-
const blendshapeName = name;
|
|
1165
|
-
if (value !== void 0) {
|
|
1166
|
-
this.targetBlendshapes[blendshapeName] = clamp01(value * scale);
|
|
1167
|
-
}
|
|
834
|
+
log(level, message, data) {
|
|
835
|
+
if (!shouldLog(level)) return;
|
|
836
|
+
const entry = {
|
|
837
|
+
timestamp: Date.now(),
|
|
838
|
+
level,
|
|
839
|
+
module: this.module,
|
|
840
|
+
message,
|
|
841
|
+
data
|
|
842
|
+
};
|
|
843
|
+
if (data?.error instanceof Error) {
|
|
844
|
+
entry.error = data.error;
|
|
845
|
+
const { error, ...rest } = data;
|
|
846
|
+
entry.data = Object.keys(rest).length > 0 ? rest : void 0;
|
|
1168
847
|
}
|
|
848
|
+
getActiveSink()(entry);
|
|
1169
849
|
}
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
* Creates more nuanced expressions (e.g., bittersweet = happy + sad)
|
|
1173
|
-
*/
|
|
1174
|
-
mapFrameWeighted(frame) {
|
|
1175
|
-
if (!frame.probabilities) {
|
|
1176
|
-
this.mapFrameDominant(frame);
|
|
1177
|
-
return;
|
|
1178
|
-
}
|
|
1179
|
-
for (const [emotion, probability] of Object.entries(frame.probabilities)) {
|
|
1180
|
-
if (probability < this.config.minBlendProbability) {
|
|
1181
|
-
continue;
|
|
1182
|
-
}
|
|
1183
|
-
const mapping = EMOTION_ARKIT_MAP[emotion];
|
|
1184
|
-
if (!mapping) {
|
|
1185
|
-
continue;
|
|
1186
|
-
}
|
|
1187
|
-
const scale = this.config.intensity * probability;
|
|
1188
|
-
for (const [name, value] of Object.entries(mapping)) {
|
|
1189
|
-
const blendshapeName = name;
|
|
1190
|
-
if (value !== void 0) {
|
|
1191
|
-
this.targetBlendshapes[blendshapeName] += value * scale;
|
|
1192
|
-
}
|
|
1193
|
-
}
|
|
1194
|
-
}
|
|
1195
|
-
for (const name of UPPER_FACE_BLENDSHAPES) {
|
|
1196
|
-
this.targetBlendshapes[name] = clamp01(this.targetBlendshapes[name]);
|
|
1197
|
-
}
|
|
850
|
+
error(message, data) {
|
|
851
|
+
this.log("error", message, data);
|
|
1198
852
|
}
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
* Louder speech = stronger expressions
|
|
1202
|
-
*/
|
|
1203
|
-
applyEnergyModulation() {
|
|
1204
|
-
const { minEnergyScale, maxEnergyScale } = this.config;
|
|
1205
|
-
const energyScale = minEnergyScale + this.currentEnergy * (maxEnergyScale - minEnergyScale);
|
|
1206
|
-
for (const name of UPPER_FACE_BLENDSHAPES) {
|
|
1207
|
-
this.targetBlendshapes[name] = clamp01(this.targetBlendshapes[name] * energyScale);
|
|
1208
|
-
}
|
|
853
|
+
warn(message, data) {
|
|
854
|
+
this.log("warn", message, data);
|
|
1209
855
|
}
|
|
1210
|
-
|
|
1211
|
-
|
|
1212
|
-
*
|
|
1213
|
-
* Uses exponential moving average:
|
|
1214
|
-
* current = current + smoothingFactor * (target - current)
|
|
1215
|
-
*
|
|
1216
|
-
* @param _deltaMs - Delta time in milliseconds (reserved for future time-based smoothing)
|
|
1217
|
-
*/
|
|
1218
|
-
update(_deltaMs) {
|
|
1219
|
-
const factor = this.config.smoothingFactor;
|
|
1220
|
-
for (const name of UPPER_FACE_BLENDSHAPES) {
|
|
1221
|
-
const target = this.targetBlendshapes[name];
|
|
1222
|
-
const current = this.currentBlendshapes[name];
|
|
1223
|
-
this.currentBlendshapes[name] = clamp01(current + factor * (target - current));
|
|
1224
|
-
}
|
|
856
|
+
info(message, data) {
|
|
857
|
+
this.log("info", message, data);
|
|
1225
858
|
}
|
|
1226
|
-
|
|
1227
|
-
|
|
1228
|
-
*
|
|
1229
|
-
* @returns Current upper face blendshapes (after smoothing)
|
|
1230
|
-
*/
|
|
1231
|
-
getCurrentBlendshapes() {
|
|
1232
|
-
return { ...this.currentBlendshapes };
|
|
859
|
+
debug(message, data) {
|
|
860
|
+
this.log("debug", message, data);
|
|
1233
861
|
}
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
*
|
|
1237
|
-
* Sets both target and current blendshapes to zero.
|
|
1238
|
-
*/
|
|
1239
|
-
reset() {
|
|
1240
|
-
this.targetBlendshapes = createZeroBlendshapes();
|
|
1241
|
-
this.currentBlendshapes = createZeroBlendshapes();
|
|
1242
|
-
this.currentEnergy = 1;
|
|
862
|
+
trace(message, data) {
|
|
863
|
+
this.log("trace", message, data);
|
|
1243
864
|
}
|
|
1244
|
-
|
|
1245
|
-
|
|
1246
|
-
*/
|
|
1247
|
-
getConfig() {
|
|
1248
|
-
return { ...this.config };
|
|
865
|
+
verbose(message, data) {
|
|
866
|
+
this.log("verbose", message, data);
|
|
1249
867
|
}
|
|
1250
|
-
|
|
1251
|
-
|
|
1252
|
-
*
|
|
1253
|
-
* @param config - Partial configuration to update
|
|
1254
|
-
*/
|
|
1255
|
-
setConfig(config) {
|
|
1256
|
-
this.config = {
|
|
1257
|
-
...this.config,
|
|
1258
|
-
...config
|
|
1259
|
-
};
|
|
868
|
+
child(subModule) {
|
|
869
|
+
return new _Logger(`${this.module}.${subModule}`);
|
|
1260
870
|
}
|
|
1261
871
|
};
|
|
1262
|
-
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
if (
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
sumSquares += samples[i] * samples[i];
|
|
1269
|
-
}
|
|
1270
|
-
return Math.sqrt(sumSquares / samples.length);
|
|
1271
|
-
}
|
|
1272
|
-
function calculatePeak(samples) {
|
|
1273
|
-
let peak = 0;
|
|
1274
|
-
for (let i = 0; i < samples.length; i++) {
|
|
1275
|
-
const abs = Math.abs(samples[i]);
|
|
1276
|
-
if (abs > peak) peak = abs;
|
|
872
|
+
// One Logger instance per module name, shared across createLogger() calls.
var loggerCache = /* @__PURE__ */ new Map();
/**
 * Return the logger registered for `module2`, constructing and caching it
 * on first request.
 *
 * @param module2 - Module name used as the cache key and passed to Logger
 * @returns The cached (or newly created) logger
 */
function createLogger(module2) {
  const cached = loggerCache.get(module2);
  if (cached) {
    return cached;
  }
  const created = new Logger(module2);
  loggerCache.set(module2, created);
  return created;
}
|
|
1280
|
-
var
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1287
|
-
|
|
1288
|
-
|
|
1289
|
-
|
|
1290
|
-
|
|
1291
|
-
|
|
1292
|
-
|
|
1293
|
-
|
|
1294
|
-
|
|
881
|
+
/**
 * Logger stand-in whose level methods do nothing and return undefined.
 * `child()` hands back this same object, so nested loggers stay silent too.
 */
var noopLogger = {
  module: "noop",
  error() {
  },
  warn() {
  },
  info() {
  },
  debug() {
  },
  trace() {
  },
  verbose() {
  },
  child() {
    return noopLogger;
  }
};
|
|
897
|
+
|
|
898
|
+
// src/inference/A2EProcessor.ts
|
|
899
|
+
// Module-level logger; entries it emits carry the module name "A2EProcessor".
var logger = createLogger("A2EProcessor");
|
|
900
|
+
// Frames per second assumed for inference output: used to cap the number of
// frames kept per chunk and to space their timestamps (i / FRAME_RATE).
var FRAME_RATE = 30;
|
|
901
|
+
// Period, in milliseconds, of the setInterval timer started by startDrip().
var DRIP_INTERVAL_MS = 33;
|
|
902
|
+
/**
 * Buffers incoming audio, feeds it to an inference backend one chunk at a
 * time, and exposes the resulting blendshape frames.
 *
 * Two output paths exist:
 *  - pull mode: audio pushed WITH a timestamp produces timestamped frames
 *    that are read back with getFrameForTime();
 *  - push/drip mode: audio pushed WITHOUT a timestamp produces plain frames
 *    that startDrip() releases on a timer via `latestFrame` / `onFrame`.
 *
 * Inference is serialized: at most one drain loop owned by the current
 * session is active, and chunks are processed in arrival order.
 */
var A2EProcessor = class {
  /**
   * @param config - Processor options:
   *   `backend` (required; must expose `infer(chunk)`),
   *   `sampleRate` (default 16000),
   *   `chunkSize` (default `backend.chunkSize`, else 16000),
   *   `onFrame` (called for each dripped frame),
   *   `onError` (called with an Error when inference fails).
   */
  constructor(config) {
    this.writeOffset = 0;
    this.bufferStartTime = 0;
    // Whether the most recent pushAudio() call carried a timestamp. flush()
    // uses this instead of testing `bufferStartTime > 0`, so a legitimate
    // timestamp of 0 still lands in the timestamped queue.
    this.bufferTimestamped = false;
    // Frame queues (timestamped for pull mode, plain for drip mode)
    this.timestampedQueue = [];
    this.plainQueue = [];
    // Push mode state
    this._latestFrame = null;
    this.dripInterval = null;
    // Last-frame-hold for pull mode (prevents avatar freezing between frames)
    this.lastPulledFrame = null;
    // Inference serialization
    this.inferenceRunning = false;
    this.pendingChunks = [];
    // Incremented by reset(). A drain loop remembers the epoch it started in
    // and abandons itself when the epoch has moved on.
    this.epoch = 0;
    // Diagnostic: track getFrameForTime calls
    this.getFrameCallCount = 0;
    this.disposed = false;
    this.backend = config.backend;
    this.sampleRate = config.sampleRate ?? 16e3;
    this.chunkSize = config.chunkSize ?? config.backend.chunkSize ?? 16e3;
    this.onFrame = config.onFrame;
    this.onError = config.onError;
    this.bufferCapacity = this.chunkSize * 2;
    this.buffer = new Float32Array(this.bufferCapacity);
  }
  // ═══════════════════════════════════════════════════════════════════════
  // Audio Input
  // ═══════════════════════════════════════════════════════════════════════
  /**
   * Push audio samples for inference (any source: mic, TTS, file).
   *
   * - With `timestamp`: frames stored with timestamps (pull mode)
   * - Without `timestamp`: frames stored in plain queue (drip/push mode)
   *
   * Fire-and-forget: returns immediately, inference runs async.
   *
   * @param samples - Audio samples to append to the internal buffer
   * @param timestamp - Optional start time (seconds) of `samples`; only
   *   applied as the buffer start time when the buffer is currently empty
   */
  pushAudio(samples, timestamp) {
    if (this.disposed) return;
    const timestamped = timestamp !== void 0;
    if (this.writeOffset === 0 && timestamped) {
      this.bufferStartTime = timestamp;
    }
    this.bufferTimestamped = timestamped;
    // Grow the buffer (to twice the required size) when the new samples do not fit.
    if (this.writeOffset + samples.length > this.bufferCapacity) {
      this.bufferCapacity = (this.writeOffset + samples.length) * 2;
      const grown = new Float32Array(this.bufferCapacity);
      grown.set(this.buffer.subarray(0, this.writeOffset));
      this.buffer = grown;
    }
    this.buffer.set(samples, this.writeOffset);
    this.writeOffset += samples.length;
    logger.debug("pushAudio", {
      samplesIn: samples.length,
      writeOffset: this.writeOffset,
      chunkSize: this.chunkSize,
      willExtract: this.writeOffset >= this.chunkSize,
      inferenceRunning: this.inferenceRunning,
      pendingChunks: this.pendingChunks.length,
      queuedFrames: this.timestampedQueue.length + this.plainQueue.length
    });
    // Peel off every complete chunk; the remainder is shifted to the buffer start.
    while (this.writeOffset >= this.chunkSize) {
      const chunk = this.buffer.slice(0, this.chunkSize);
      this.buffer.copyWithin(0, this.chunkSize, this.writeOffset);
      this.writeOffset -= this.chunkSize;
      const chunkTimestamp = timestamped ? this.bufferStartTime : void 0;
      this.pendingChunks.push({ chunk, timestamp: chunkTimestamp });
      logger.info("Chunk queued for inference", {
        chunkSize: chunk.length,
        chunkTimestamp,
        pendingChunks: this.pendingChunks.length,
        remainderOffset: this.writeOffset
      });
      if (timestamped) {
        // The remainder starts one chunk duration later.
        this.bufferStartTime += this.chunkSize / this.sampleRate;
      }
    }
    this.drainPendingChunks();
  }
  /**
   * Flush remaining buffered audio (pads to chunkSize).
   * Call at end of stream to process final partial chunk.
   *
   * Routes through the serialized pendingChunks pipeline to maintain
   * correct frame ordering. Without this, flush() could push frames
   * with the latest timestamp to the queue before drainPendingChunks()
   * finishes pushing frames with earlier timestamps — causing
   * getFrameForTime() to see out-of-order timestamps and stall.
   *
   * The returned promise resolves once the padded chunk has been queued;
   * it does not wait for inference to finish.
   */
  async flush() {
    if (this.disposed || this.writeOffset === 0) return;
    const padded = new Float32Array(this.chunkSize);
    padded.set(this.buffer.subarray(0, this.writeOffset), 0);
    // Use the explicit mode flag: a buffer start time of exactly 0 is a
    // valid pull-mode timestamp and must not be mistaken for drip mode.
    const chunkTimestamp = this.bufferTimestamped ? this.bufferStartTime : void 0;
    logger.info("flush: routing through drain pipeline", {
      actualSamples: this.writeOffset,
      chunkTimestamp: chunkTimestamp?.toFixed(3),
      pendingChunks: this.pendingChunks.length,
      inferenceRunning: this.inferenceRunning
    });
    this.writeOffset = 0;
    this.bufferStartTime = 0;
    this.bufferTimestamped = false;
    this.pendingChunks.push({ chunk: padded, timestamp: chunkTimestamp });
    this.drainPendingChunks();
  }
  /**
   * Reset buffer and frame queues.
   *
   * Also advances the epoch so that a drain loop still awaiting the backend
   * discards its result instead of writing stale frames into the new session.
   */
  reset() {
    this.epoch++;
    this.writeOffset = 0;
    this.bufferStartTime = 0;
    this.bufferTimestamped = false;
    this.timestampedQueue = [];
    this.plainQueue = [];
    this._latestFrame = null;
    this.lastPulledFrame = null;
    this.pendingChunks = [];
    this.inferenceRunning = false;
    this.getFrameCallCount = 0;
  }
  // ═══════════════════════════════════════════════════════════════════════
  // Frame Output — Pull Mode (TTS playback)
  // ═══════════════════════════════════════════════════════════════════════
  /**
   * Get frame synced to external clock (e.g. AudioContext.currentTime).
   *
   * Discards frames that are too old, returns the current frame,
   * or holds last frame as fallback to prevent avatar freezing.
   *
   * @param currentTime - Current playback time (seconds)
   * @returns Blendshape frame, or null if no frames yet
   */
  getFrameForTime(currentTime) {
    this.getFrameCallCount++;
    // Frames older than this many seconds are dropped; the window is wider
    // when the backend reports itself as "wasm".
    const discardWindow = this.backend.backend === "wasm" ? 1 : 0.5;
    let discardCount = 0;
    while (this.timestampedQueue.length > 0 && this.timestampedQueue[0].timestamp < currentTime - discardWindow) {
      this.timestampedQueue.shift();
      discardCount++;
    }
    if (discardCount > 0) {
      logger.warn("getFrameForTime DISCARDED stale frames", {
        discardCount,
        currentTime: currentTime.toFixed(3),
        discardWindow,
        remainingFrames: this.timestampedQueue.length,
        nextFrameTs: this.timestampedQueue.length > 0 ? this.timestampedQueue[0].timestamp.toFixed(3) : "none"
      });
    }
    if (this.timestampedQueue.length > 0 && this.timestampedQueue[0].timestamp <= currentTime) {
      const { frame } = this.timestampedQueue.shift();
      this.lastPulledFrame = frame;
      return frame;
    }
    // Rate-limited diagnostic: frames exist but all lie in the future.
    if (this.timestampedQueue.length > 0 && this.getFrameCallCount % 60 === 0) {
      logger.warn("getFrameForTime: frames in queue but NOT consumable", {
        queueLen: this.timestampedQueue.length,
        frontTimestamp: this.timestampedQueue[0].timestamp.toFixed(4),
        currentTime: currentTime.toFixed(4),
        delta: (this.timestampedQueue[0].timestamp - currentTime).toFixed(4),
        callCount: this.getFrameCallCount
      });
    }
    return this.lastPulledFrame;
  }
  // ═══════════════════════════════════════════════════════════════════════
  // Frame Output — Push Mode (live mic, game loop)
  // ═══════════════════════════════════════════════════════════════════════
  /** Latest frame from drip-feed (live mic, game loop) */
  get latestFrame() {
    return this._latestFrame;
  }
  /** Start the drip-feed timer (push mode); a no-op if it is already running. */
  startDrip() {
    if (this.dripInterval) return;
    this.dripInterval = setInterval(() => {
      const frame = this.plainQueue.shift();
      if (frame) {
        this._latestFrame = frame;
        this.onFrame?.(frame);
      }
    }, DRIP_INTERVAL_MS);
  }
  /** Stop drip-feed timer */
  stopDrip() {
    if (this.dripInterval) {
      clearInterval(this.dripInterval);
      this.dripInterval = null;
    }
  }
  // ═══════════════════════════════════════════════════════════════════════
  // State
  // ═══════════════════════════════════════════════════════════════════════
  /** Number of frames waiting in queue (both modes combined) */
  get queuedFrameCount() {
    return this.timestampedQueue.length + this.plainQueue.length;
  }
  /** Buffer fill level as fraction of chunkSize (0-1) */
  get fillLevel() {
    return Math.min(1, this.writeOffset / this.chunkSize);
  }
  /** Dispose resources: stops the drip timer and clears all state. Idempotent. */
  dispose() {
    if (this.disposed) return;
    this.disposed = true;
    this.stopDrip();
    this.reset();
  }
  // ═══════════════════════════════════════════════════════════════════════
  // Private
  // ═══════════════════════════════════════════════════════════════════════
  /**
   * Process pending chunks sequentially.
   * Fire-and-forget — called from pushAudio() without awaiting.
   *
   * The loop captures the epoch it started in. If reset() (or dispose())
   * runs while it is suspended, the loop returns without queueing frames
   * and without touching `inferenceRunning` / `pendingChunks`, which by
   * then belong to the new session.
   */
  drainPendingChunks() {
    if (this.inferenceRunning || this.pendingChunks.length === 0) {
      if (this.inferenceRunning && this.pendingChunks.length > 0) {
        logger.debug("drainPendingChunks skipped (inference running)", {
          pendingChunks: this.pendingChunks.length
        });
      }
      return;
    }
    this.inferenceRunning = true;
    logger.info("drainPendingChunks starting", { pendingChunks: this.pendingChunks.length });
    const startedInEpoch = this.epoch;
    const isStale = () => startedInEpoch !== this.epoch;
    const processNext = async () => {
      while (this.pendingChunks.length > 0 && !this.disposed) {
        const { chunk, timestamp } = this.pendingChunks.shift();
        try {
          const t0 = performance.now();
          const result = await this.backend.infer(chunk);
          if (isStale()) return;
          const inferMs = Math.round(performance.now() - t0);
          const actualDuration = chunk.length / this.sampleRate;
          const actualFrameCount = Math.ceil(actualDuration * FRAME_RATE);
          // Never queue more frames than the chunk's duration accounts for.
          const framesToQueue = Math.min(actualFrameCount, result.blendshapes.length);
          logger.info("Inference complete", {
            inferMs,
            modelFrames: result.blendshapes.length,
            framesToQueue,
            timestamp,
            totalQueued: this.timestampedQueue.length + framesToQueue,
            remainingPending: this.pendingChunks.length
          });
          for (let i = 0; i < framesToQueue; i++) {
            if (timestamp !== void 0) {
              this.timestampedQueue.push({
                frame: result.blendshapes[i],
                timestamp: timestamp + i / FRAME_RATE
              });
            } else {
              this.plainQueue.push(result.blendshapes[i]);
            }
          }
        } catch (err) {
          // Failures are still reported, even for an abandoned session.
          this.handleError(err);
        }
        if (isStale()) return;
        if (this.pendingChunks.length > 0) {
          // Yield to the event loop between chunks.
          await new Promise((r) => setTimeout(r, 0));
          if (isStale()) return;
        }
      }
      this.inferenceRunning = false;
      if (this.pendingChunks.length > 0) {
        this.drainPendingChunks();
      }
    };
    processNext().catch((err) => this.handleError(err));
  }
  /**
   * Normalize `err` to an Error, log it, and notify `onError` if provided.
   *
   * @param err - Value thrown or rejected by the inference pipeline
   */
  handleError(err) {
    const error = err instanceof Error ? err : new Error(String(err));
    logger.warn("A2EProcessor inference error", { error: error.message });
    this.onError?.(error);
  }
};
|
|
1339
|
-
|
|
1173
|
+
|
|
1174
|
+
// src/inference/BlendshapeSmoother.ts
|
|
1175
|
+
// Number of blendshape channels tracked by the smoother.
var NUM_BLENDSHAPES = 52;
/**
 * Per-channel spring smoother for blendshape frames.
 *
 * Each channel keeps a value, a velocity and a target. setTarget() moves the
 * targets; update(dt) advances every channel toward its target using a
 * closed-form damped-spring step and clamps the result to [0, 1].
 */
var BlendshapeSmoother = class {
  /**
   * @param config - Optional settings; `halflife` (seconds, default 0.06)
   *   controls how quickly values approach their targets. A non-positive or
   *   NaN halflife disables smoothing (values snap to targets).
   */
  constructor(config) {
    /** Whether any target has been set */
    this._hasTarget = false;
    this.halflife = config?.halflife ?? 0.06;
    this.values = new Float32Array(NUM_BLENDSHAPES);
    this.velocities = new Float32Array(NUM_BLENDSHAPES);
    this.targets = new Float32Array(NUM_BLENDSHAPES);
  }
  /** Whether a target frame has been set (false until first setTarget call) */
  get hasTarget() {
    return this._hasTarget;
  }
  /**
   * Set new target frame from inference output.
   * Springs will converge toward these values on subsequent update() calls.
   *
   * @param frame - Array-like of target values, copied into the internal
   *   target buffer starting at index 0
   */
  setTarget(frame) {
    this.targets.set(frame);
    this._hasTarget = true;
  }
  /**
   * Advance all 52 springs by `dt` seconds and return the smoothed frame.
   *
   * Call this every render frame (e.g., inside requestAnimationFrame).
   * Returns the internal values buffer — do NOT mutate the returned array.
   *
   * A `dt` that is NaN, undefined, infinite or negative is ignored: the
   * current values are returned unchanged. Without this guard a single bad
   * time step would write NaN into the state, and the clamp cannot undo it.
   *
   * @param dt - Time step in seconds (e.g., 1/60 for 60fps)
   * @returns Smoothed blendshape values (Float32Array of 52)
   */
  update(dt) {
    if (!this._hasTarget) {
      return this.values;
    }
    // `!(x > 0)` also catches a NaN halflife, which would otherwise yield NaN damping.
    if (!(this.halflife > 0)) {
      this.values.set(this.targets);
      this.velocities.fill(0);
      return this.values;
    }
    if (!Number.isFinite(dt) || dt < 0) {
      return this.values;
    }
    const damping = Math.LN2 / this.halflife;
    const eydt = Math.exp(-damping * dt);
    for (let i = 0; i < NUM_BLENDSHAPES; i++) {
      const j0 = this.values[i] - this.targets[i];
      const j1 = this.velocities[i] + j0 * damping;
      this.values[i] = eydt * (j0 + j1 * dt) + this.targets[i];
      this.velocities[i] = eydt * (this.velocities[i] - j1 * damping * dt);
      // Keep the output inside the valid blendshape range.
      this.values[i] = Math.max(0, Math.min(1, this.values[i]));
    }
    return this.values;
  }
  /**
   * Decay all spring targets to neutral (0).
   *
   * Call when inference stalls (no new frames for threshold duration).
   * The springs will smoothly close the mouth / relax the face over
   * the halflife period rather than freezing.
   */
  decayToNeutral() {
    this.targets.fill(0);
  }
  /**
   * Reset all state (values, velocities, targets).
   * Call when starting a new playback session.
   */
  reset() {
    this.values.fill(0);
    this.velocities.fill(0);
    this.targets.fill(0);
    this._hasTarget = false;
  }
};
|
|
1378
1247
|
|
|
@@ -2485,340 +2354,76 @@ async function fetchWithCache(url, optionsOrProgress) {
|
|
|
2485
2354
|
if (!response.ok) {
|
|
2486
2355
|
throw new Error(`Failed to fetch ${url}: ${response.status}`);
|
|
2487
2356
|
}
|
|
2488
|
-
const contentLength = response.headers.get("content-length");
|
|
2489
|
-
const total = contentLength ? parseInt(contentLength, 10) : 0;
|
|
2490
|
-
const etag = response.headers.get("etag") ?? void 0;
|
|
2491
|
-
const tooLargeForCache = total > MAX_CACHE_SIZE_BYTES;
|
|
2492
|
-
if (tooLargeForCache) {
|
|
2493
|
-
console.log(`[ModelCache] File too large for IndexedDB (${(total / 1024 / 1024).toFixed(0)}MB > 500MB), using HTTP cache only`);
|
|
2494
|
-
}
|
|
2495
|
-
if (!response.body) {
|
|
2496
|
-
const data2 = await response.arrayBuffer();
|
|
2497
|
-
if (!tooLargeForCache) {
|
|
2498
|
-
await cache.set(cacheKey, data2, etag, version);
|
|
2499
|
-
}
|
|
2500
|
-
span?.setAttributes({
|
|
2501
|
-
"fetch.size_bytes": data2.byteLength,
|
|
2502
|
-
"fetch.cached_to_indexeddb": !tooLargeForCache
|
|
2503
|
-
});
|
|
2504
|
-
span?.end();
|
|
2505
|
-
return data2;
|
|
2506
|
-
}
|
|
2507
|
-
const reader = response.body.getReader();
|
|
2508
|
-
const chunks = [];
|
|
2509
|
-
let loaded = 0;
|
|
2510
|
-
while (true) {
|
|
2511
|
-
const { done, value } = await reader.read();
|
|
2512
|
-
if (done) break;
|
|
2513
|
-
chunks.push(value);
|
|
2514
|
-
loaded += value.length;
|
|
2515
|
-
onProgress?.(loaded, total || loaded);
|
|
2516
|
-
}
|
|
2517
|
-
const data = new Uint8Array(loaded);
|
|
2518
|
-
let offset = 0;
|
|
2519
|
-
for (const chunk of chunks) {
|
|
2520
|
-
data.set(chunk, offset);
|
|
2521
|
-
offset += chunk.length;
|
|
2522
|
-
}
|
|
2523
|
-
const buffer = data.buffer;
|
|
2524
|
-
if (!tooLargeForCache) {
|
|
2525
|
-
await cache.set(cacheKey, buffer, etag, version);
|
|
2526
|
-
console.log(`[ModelCache] Cached: ${url} (${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB)`);
|
|
2527
|
-
}
|
|
2528
|
-
span?.setAttributes({
|
|
2529
|
-
"fetch.size_bytes": buffer.byteLength,
|
|
2530
|
-
"fetch.cached_to_indexeddb": !tooLargeForCache
|
|
2531
|
-
});
|
|
2532
|
-
span?.end();
|
|
2533
|
-
return buffer;
|
|
2534
|
-
} catch (error) {
|
|
2535
|
-
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
2536
|
-
throw error;
|
|
2537
|
-
}
|
|
2538
|
-
}
|
|
2539
|
-
async function preloadModels(urls, onProgress) {
|
|
2540
|
-
const cache = getModelCache();
|
|
2541
|
-
for (let i = 0; i < urls.length; i++) {
|
|
2542
|
-
const url = urls[i];
|
|
2543
|
-
onProgress?.(i, urls.length, url);
|
|
2544
|
-
if (await cache.has(url)) {
|
|
2545
|
-
console.log(`[ModelCache] Already cached: ${url}`);
|
|
2546
|
-
continue;
|
|
2547
|
-
}
|
|
2548
|
-
await fetchWithCache(url);
|
|
2549
|
-
}
|
|
2550
|
-
onProgress?.(urls.length, urls.length, "done");
|
|
2551
|
-
}
|
|
2552
|
-
function formatBytes(bytes) {
|
|
2553
|
-
if (bytes < 1024) return `${bytes} B`;
|
|
2554
|
-
if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
|
|
2555
|
-
if (bytes < 1024 * 1024 * 1024) return `${(bytes / 1024 / 1024).toFixed(1)} MB`;
|
|
2556
|
-
return `${(bytes / 1024 / 1024 / 1024).toFixed(1)} GB`;
|
|
2557
|
-
}
|
|
2558
|
-
|
|
2559
|
-
// src/logging/types.ts
|
|
2560
|
-
var LOG_LEVEL_PRIORITY = {
|
|
2561
|
-
error: 0,
|
|
2562
|
-
warn: 1,
|
|
2563
|
-
info: 2,
|
|
2564
|
-
debug: 3,
|
|
2565
|
-
trace: 4,
|
|
2566
|
-
verbose: 5
|
|
2567
|
-
};
|
|
2568
|
-
var DEFAULT_LOGGING_CONFIG = {
|
|
2569
|
-
level: "info",
|
|
2570
|
-
enabled: true,
|
|
2571
|
-
format: "pretty",
|
|
2572
|
-
timestamps: true,
|
|
2573
|
-
includeModule: true
|
|
2574
|
-
};
|
|
2575
|
-
|
|
2576
|
-
// src/logging/formatters.ts
|
|
2577
|
-
var COLORS = {
|
|
2578
|
-
reset: "\x1B[0m",
|
|
2579
|
-
red: "\x1B[31m",
|
|
2580
|
-
yellow: "\x1B[33m",
|
|
2581
|
-
blue: "\x1B[34m",
|
|
2582
|
-
cyan: "\x1B[36m",
|
|
2583
|
-
gray: "\x1B[90m",
|
|
2584
|
-
white: "\x1B[37m",
|
|
2585
|
-
magenta: "\x1B[35m"
|
|
2586
|
-
};
|
|
2587
|
-
var LEVEL_COLORS = {
|
|
2588
|
-
error: COLORS.red,
|
|
2589
|
-
warn: COLORS.yellow,
|
|
2590
|
-
info: COLORS.blue,
|
|
2591
|
-
debug: COLORS.cyan,
|
|
2592
|
-
trace: COLORS.magenta,
|
|
2593
|
-
verbose: COLORS.gray
|
|
2594
|
-
};
|
|
2595
|
-
var LEVEL_NAMES = {
|
|
2596
|
-
error: "ERROR ",
|
|
2597
|
-
warn: "WARN ",
|
|
2598
|
-
info: "INFO ",
|
|
2599
|
-
debug: "DEBUG ",
|
|
2600
|
-
trace: "TRACE ",
|
|
2601
|
-
verbose: "VERBOSE"
|
|
2602
|
-
};
|
|
2603
|
-
var isBrowser = typeof window !== "undefined";
|
|
2604
|
-
function formatTimestamp(timestamp) {
|
|
2605
|
-
const date = new Date(timestamp);
|
|
2606
|
-
return date.toISOString().substring(11, 23);
|
|
2607
|
-
}
|
|
2608
|
-
function safeStringify(data) {
|
|
2609
|
-
const seen = /* @__PURE__ */ new WeakSet();
|
|
2610
|
-
return JSON.stringify(data, (key, value) => {
|
|
2611
|
-
if (typeof value === "object" && value !== null) {
|
|
2612
|
-
if (seen.has(value)) {
|
|
2613
|
-
return "[Circular]";
|
|
2614
|
-
}
|
|
2615
|
-
seen.add(value);
|
|
2616
|
-
}
|
|
2617
|
-
if (value instanceof Error) {
|
|
2618
|
-
return {
|
|
2619
|
-
name: value.name,
|
|
2620
|
-
message: value.message,
|
|
2621
|
-
stack: value.stack
|
|
2622
|
-
};
|
|
2357
|
+
const contentLength = response.headers.get("content-length");
|
|
2358
|
+
const total = contentLength ? parseInt(contentLength, 10) : 0;
|
|
2359
|
+
const etag = response.headers.get("etag") ?? void 0;
|
|
2360
|
+
const tooLargeForCache = total > MAX_CACHE_SIZE_BYTES;
|
|
2361
|
+
if (tooLargeForCache) {
|
|
2362
|
+
console.log(`[ModelCache] File too large for IndexedDB (${(total / 1024 / 1024).toFixed(0)}MB > 500MB), using HTTP cache only`);
|
|
2623
2363
|
}
|
|
2624
|
-
if (
|
|
2625
|
-
|
|
2364
|
+
if (!response.body) {
|
|
2365
|
+
const data2 = await response.arrayBuffer();
|
|
2366
|
+
if (!tooLargeForCache) {
|
|
2367
|
+
await cache.set(cacheKey, data2, etag, version);
|
|
2368
|
+
}
|
|
2369
|
+
span?.setAttributes({
|
|
2370
|
+
"fetch.size_bytes": data2.byteLength,
|
|
2371
|
+
"fetch.cached_to_indexeddb": !tooLargeForCache
|
|
2372
|
+
});
|
|
2373
|
+
span?.end();
|
|
2374
|
+
return data2;
|
|
2626
2375
|
}
|
|
2627
|
-
|
|
2628
|
-
|
|
2376
|
+
const reader = response.body.getReader();
|
|
2377
|
+
const chunks = [];
|
|
2378
|
+
let loaded = 0;
|
|
2379
|
+
while (true) {
|
|
2380
|
+
const { done, value } = await reader.read();
|
|
2381
|
+
if (done) break;
|
|
2382
|
+
chunks.push(value);
|
|
2383
|
+
loaded += value.length;
|
|
2384
|
+
onProgress?.(loaded, total || loaded);
|
|
2629
2385
|
}
|
|
2630
|
-
|
|
2631
|
-
|
|
2632
|
-
|
|
2633
|
-
|
|
2634
|
-
|
|
2635
|
-
timestamp: entry.timestamp,
|
|
2636
|
-
level: entry.level,
|
|
2637
|
-
module: entry.module,
|
|
2638
|
-
message: entry.message
|
|
2639
|
-
};
|
|
2640
|
-
if (entry.data && Object.keys(entry.data).length > 0) {
|
|
2641
|
-
output.data = entry.data;
|
|
2642
|
-
}
|
|
2643
|
-
if (entry.error) {
|
|
2644
|
-
output.error = {
|
|
2645
|
-
name: entry.error.name,
|
|
2646
|
-
message: entry.error.message,
|
|
2647
|
-
stack: entry.error.stack
|
|
2648
|
-
};
|
|
2649
|
-
}
|
|
2650
|
-
return safeStringify(output);
|
|
2651
|
-
};
|
|
2652
|
-
var prettyFormatter = (entry) => {
|
|
2653
|
-
const time = formatTimestamp(entry.timestamp);
|
|
2654
|
-
const level = LEVEL_NAMES[entry.level];
|
|
2655
|
-
const module2 = entry.module;
|
|
2656
|
-
const message = entry.message;
|
|
2657
|
-
let output;
|
|
2658
|
-
if (isBrowser) {
|
|
2659
|
-
output = `${time} ${level} [${module2}] ${message}`;
|
|
2660
|
-
} else {
|
|
2661
|
-
const color = LEVEL_COLORS[entry.level];
|
|
2662
|
-
output = `${COLORS.gray}${time}${COLORS.reset} ${color}${level}${COLORS.reset} ${COLORS.cyan}[${module2}]${COLORS.reset} ${message}`;
|
|
2663
|
-
}
|
|
2664
|
-
if (entry.data && Object.keys(entry.data).length > 0) {
|
|
2665
|
-
const dataStr = safeStringify(entry.data);
|
|
2666
|
-
if (dataStr.length > 80) {
|
|
2667
|
-
output += "\n " + JSON.stringify(entry.data, null, 2).replace(/\n/g, "\n ");
|
|
2668
|
-
} else {
|
|
2669
|
-
output += " " + dataStr;
|
|
2386
|
+
const data = new Uint8Array(loaded);
|
|
2387
|
+
let offset = 0;
|
|
2388
|
+
for (const chunk of chunks) {
|
|
2389
|
+
data.set(chunk, offset);
|
|
2390
|
+
offset += chunk.length;
|
|
2670
2391
|
}
|
|
2671
|
-
|
|
2672
|
-
|
|
2673
|
-
|
|
2674
|
-
|
|
2675
|
-
if (entry.error.stack) {
|
|
2676
|
-
const stackLines = entry.error.stack.split("\n").slice(1, 4);
|
|
2677
|
-
output += "\n " + stackLines.join("\n ");
|
|
2392
|
+
const buffer = data.buffer;
|
|
2393
|
+
if (!tooLargeForCache) {
|
|
2394
|
+
await cache.set(cacheKey, buffer, etag, version);
|
|
2395
|
+
console.log(`[ModelCache] Cached: ${url} (${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB)`);
|
|
2678
2396
|
}
|
|
2397
|
+
span?.setAttributes({
|
|
2398
|
+
"fetch.size_bytes": buffer.byteLength,
|
|
2399
|
+
"fetch.cached_to_indexeddb": !tooLargeForCache
|
|
2400
|
+
});
|
|
2401
|
+
span?.end();
|
|
2402
|
+
return buffer;
|
|
2403
|
+
} catch (error) {
|
|
2404
|
+
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
2405
|
+
throw error;
|
|
2679
2406
|
}
|
|
2680
|
-
return output;
|
|
2681
|
-
};
|
|
2682
|
-
function getFormatter(format) {
|
|
2683
|
-
return format === "json" ? jsonFormatter : prettyFormatter;
|
|
2684
|
-
}
|
|
2685
|
-
function createBrowserConsoleArgs(entry) {
|
|
2686
|
-
const time = formatTimestamp(entry.timestamp);
|
|
2687
|
-
const level = entry.level.toUpperCase().padEnd(7);
|
|
2688
|
-
const module2 = entry.module;
|
|
2689
|
-
const message = entry.message;
|
|
2690
|
-
const styles = {
|
|
2691
|
-
time: "color: gray;",
|
|
2692
|
-
error: "color: red; font-weight: bold;",
|
|
2693
|
-
warn: "color: orange; font-weight: bold;",
|
|
2694
|
-
info: "color: blue;",
|
|
2695
|
-
debug: "color: cyan;",
|
|
2696
|
-
trace: "color: magenta;",
|
|
2697
|
-
verbose: "color: gray;",
|
|
2698
|
-
module: "color: teal; font-weight: bold;",
|
|
2699
|
-
message: "color: inherit;"
|
|
2700
|
-
};
|
|
2701
|
-
let formatStr = "%c%s %c%s %c[%s]%c %s";
|
|
2702
|
-
const args = [
|
|
2703
|
-
styles.time,
|
|
2704
|
-
time,
|
|
2705
|
-
styles[entry.level],
|
|
2706
|
-
level,
|
|
2707
|
-
styles.module,
|
|
2708
|
-
module2,
|
|
2709
|
-
styles.message,
|
|
2710
|
-
message
|
|
2711
|
-
];
|
|
2712
|
-
if (entry.data && Object.keys(entry.data).length > 0) {
|
|
2713
|
-
formatStr += " %o";
|
|
2714
|
-
args.push(entry.data);
|
|
2715
|
-
}
|
|
2716
|
-
return [formatStr, ...args];
|
|
2717
|
-
}
|
|
2718
|
-
|
|
2719
|
-
// src/logging/Logger.ts
|
|
2720
|
-
var isBrowser2 = typeof window !== "undefined";
|
|
2721
|
-
var globalConfig = { ...DEFAULT_LOGGING_CONFIG };
|
|
2722
|
-
function configureLogging(config) {
|
|
2723
|
-
globalConfig = { ...globalConfig, ...config };
|
|
2724
|
-
}
|
|
2725
|
-
function getLoggingConfig() {
|
|
2726
|
-
return { ...globalConfig };
|
|
2727
|
-
}
|
|
2728
|
-
function resetLoggingConfig() {
|
|
2729
|
-
globalConfig = { ...DEFAULT_LOGGING_CONFIG };
|
|
2730
|
-
}
|
|
2731
|
-
function setLogLevel(level) {
|
|
2732
|
-
globalConfig.level = level;
|
|
2733
|
-
}
|
|
2734
|
-
function setLoggingEnabled(enabled) {
|
|
2735
|
-
globalConfig.enabled = enabled;
|
|
2736
|
-
}
|
|
2737
|
-
var consoleSink = (entry) => {
|
|
2738
|
-
const consoleMethod = entry.level === "error" ? "error" : entry.level === "warn" ? "warn" : "log";
|
|
2739
|
-
if (globalConfig.format === "pretty" && isBrowser2) {
|
|
2740
|
-
const args = createBrowserConsoleArgs(entry);
|
|
2741
|
-
console[consoleMethod](...args);
|
|
2742
|
-
} else {
|
|
2743
|
-
const formatter = getFormatter(globalConfig.format);
|
|
2744
|
-
const formatted = formatter(entry);
|
|
2745
|
-
console[consoleMethod](formatted);
|
|
2746
|
-
}
|
|
2747
|
-
};
|
|
2748
|
-
function getActiveSink() {
|
|
2749
|
-
return globalConfig.sink || consoleSink;
|
|
2750
|
-
}
|
|
2751
|
-
function shouldLog(level) {
|
|
2752
|
-
if (!globalConfig.enabled) return false;
|
|
2753
|
-
return LOG_LEVEL_PRIORITY[level] <= LOG_LEVEL_PRIORITY[globalConfig.level];
|
|
2754
2407
|
}
|
|
2755
|
-
|
|
2756
|
-
|
|
2757
|
-
|
|
2758
|
-
|
|
2759
|
-
|
|
2760
|
-
if (
|
|
2761
|
-
|
|
2762
|
-
|
|
2763
|
-
level,
|
|
2764
|
-
module: this.module,
|
|
2765
|
-
message,
|
|
2766
|
-
data
|
|
2767
|
-
};
|
|
2768
|
-
if (data?.error instanceof Error) {
|
|
2769
|
-
entry.error = data.error;
|
|
2770
|
-
const { error, ...rest } = data;
|
|
2771
|
-
entry.data = Object.keys(rest).length > 0 ? rest : void 0;
|
|
2408
|
+
async function preloadModels(urls, onProgress) {
|
|
2409
|
+
const cache = getModelCache();
|
|
2410
|
+
for (let i = 0; i < urls.length; i++) {
|
|
2411
|
+
const url = urls[i];
|
|
2412
|
+
onProgress?.(i, urls.length, url);
|
|
2413
|
+
if (await cache.has(url)) {
|
|
2414
|
+
console.log(`[ModelCache] Already cached: ${url}`);
|
|
2415
|
+
continue;
|
|
2772
2416
|
}
|
|
2773
|
-
|
|
2774
|
-
}
|
|
2775
|
-
error(message, data) {
|
|
2776
|
-
this.log("error", message, data);
|
|
2777
|
-
}
|
|
2778
|
-
warn(message, data) {
|
|
2779
|
-
this.log("warn", message, data);
|
|
2780
|
-
}
|
|
2781
|
-
info(message, data) {
|
|
2782
|
-
this.log("info", message, data);
|
|
2783
|
-
}
|
|
2784
|
-
debug(message, data) {
|
|
2785
|
-
this.log("debug", message, data);
|
|
2786
|
-
}
|
|
2787
|
-
trace(message, data) {
|
|
2788
|
-
this.log("trace", message, data);
|
|
2789
|
-
}
|
|
2790
|
-
verbose(message, data) {
|
|
2791
|
-
this.log("verbose", message, data);
|
|
2792
|
-
}
|
|
2793
|
-
child(subModule) {
|
|
2794
|
-
return new _Logger(`${this.module}.${subModule}`);
|
|
2795
|
-
}
|
|
2796
|
-
};
|
|
2797
|
-
var loggerCache = /* @__PURE__ */ new Map();
|
|
2798
|
-
function createLogger(module2) {
|
|
2799
|
-
let logger15 = loggerCache.get(module2);
|
|
2800
|
-
if (!logger15) {
|
|
2801
|
-
logger15 = new Logger(module2);
|
|
2802
|
-
loggerCache.set(module2, logger15);
|
|
2417
|
+
await fetchWithCache(url);
|
|
2803
2418
|
}
|
|
2804
|
-
|
|
2419
|
+
onProgress?.(urls.length, urls.length, "done");
|
|
2420
|
+
}
|
|
2421
|
+
/**
 * Format a byte count as a short human-readable string using binary units.
 *
 * Values below 1024 are printed verbatim with a " B" suffix; larger values are
 * printed with one decimal place in KB, MB or GB (GB is the largest unit, so
 * terabyte-scale inputs print as e.g. "2048.0 GB").
 *
 * @param {number} bytes - Size in bytes.
 * @returns {string} Formatted size, e.g. "512 B", "1.5 KB", "383.0 MB".
 */
function formatBytes(bytes) {
  if (bytes < 1024) return `${bytes} B`;
  let value = bytes / 1024;
  for (const unit of ["KB", "MB"]) {
    const text = value.toFixed(1);
    // A value just under the next unit (e.g. 1023.99 KB) rounds to "1024.0";
    // promote it to the next unit instead of printing "1024.0 KB".
    if (value < 1024 && text !== "1024.0") return `${text} ${unit}`;
    value /= 1024;
  }
  return `${value.toFixed(1)} GB`;
}
|
|
2806
|
-
var noopLogger = {
|
|
2807
|
-
module: "noop",
|
|
2808
|
-
error: () => {
|
|
2809
|
-
},
|
|
2810
|
-
warn: () => {
|
|
2811
|
-
},
|
|
2812
|
-
info: () => {
|
|
2813
|
-
},
|
|
2814
|
-
debug: () => {
|
|
2815
|
-
},
|
|
2816
|
-
trace: () => {
|
|
2817
|
-
},
|
|
2818
|
-
verbose: () => {
|
|
2819
|
-
},
|
|
2820
|
-
child: () => noopLogger
|
|
2821
|
-
};
|
|
2822
2427
|
|
|
2823
2428
|
// src/utils/runtime.ts
|
|
2824
2429
|
function isIOSSafari() {
|
|
@@ -2889,7 +2494,7 @@ function isSafari() {
|
|
|
2889
2494
|
const ua = navigator.userAgent.toLowerCase();
|
|
2890
2495
|
return /safari/.test(ua) && !/chrome|crios|fxios|chromium|edg/.test(ua);
|
|
2891
2496
|
}
|
|
2892
|
-
function
|
|
2497
|
+
/**
 * Platform check: evaluates to a truthy value when either isSafari() or
 * isIOS() does.
 * NOTE(review): presumably used to select the CPU A2E model on Safari/iOS —
 * the callers are not visible here, confirm before relying on that.
 */
function shouldUseCpuA2E() {
|
|
2893
2498
|
return isSafari() || isIOS();
|
|
2894
2499
|
}
|
|
2895
2500
|
function isSpeechRecognitionAvailable() {
|
|
@@ -2899,22 +2504,22 @@ function isSpeechRecognitionAvailable() {
|
|
|
2899
2504
|
/**
 * Platform check: truthy only when the platform is iOS or Safari (isIOS() /
 * isSafari()) AND isSpeechRecognitionAvailable() reports support.
 * The availability probe is not called at all on other platforms
 * (short-circuit evaluation).
 */
function shouldUseNativeASR() {
|
|
2900
2505
|
return (isIOS() || isSafari()) && isSpeechRecognitionAvailable();
|
|
2901
2506
|
}
|
|
2902
|
-
function
|
|
2507
|
+
/**
 * Platform check: returns whatever isIOS() returns.
 * NOTE(review): presumably signals that A2E should run server-side on iOS —
 * the callers are not visible here, confirm before relying on that.
 */
function shouldUseServerA2E() {
|
|
2903
2508
|
return isIOS();
|
|
2904
2509
|
}
|
|
2905
2510
|
|
|
2906
2511
|
// src/inference/onnxLoader.ts
|
|
2907
|
-
var
|
|
2512
|
+
var logger2 = createLogger("OnnxLoader");
|
|
2908
2513
|
var ortInstance = null;
|
|
2909
2514
|
var loadedBackend = null;
|
|
2910
2515
|
var WASM_CDN_PATH = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
2911
2516
|
async function isWebGPUAvailable() {
|
|
2912
2517
|
if (isIOS()) {
|
|
2913
|
-
|
|
2518
|
+
logger2.debug("WebGPU check: disabled on iOS (asyncify bundle crashes WebKit)");
|
|
2914
2519
|
return false;
|
|
2915
2520
|
}
|
|
2916
2521
|
if (!hasWebGPUApi()) {
|
|
2917
|
-
|
|
2522
|
+
logger2.debug("WebGPU check: navigator.gpu not available", {
|
|
2918
2523
|
isSecureContext: typeof window !== "undefined" ? window.isSecureContext : "N/A"
|
|
2919
2524
|
});
|
|
2920
2525
|
return false;
|
|
@@ -2922,19 +2527,19 @@ async function isWebGPUAvailable() {
|
|
|
2922
2527
|
try {
|
|
2923
2528
|
const adapter = await navigator.gpu.requestAdapter();
|
|
2924
2529
|
if (!adapter) {
|
|
2925
|
-
|
|
2530
|
+
logger2.debug("WebGPU check: No adapter available");
|
|
2926
2531
|
return false;
|
|
2927
2532
|
}
|
|
2928
2533
|
const device = await adapter.requestDevice();
|
|
2929
2534
|
if (!device) {
|
|
2930
|
-
|
|
2535
|
+
logger2.debug("WebGPU check: Could not create device");
|
|
2931
2536
|
return false;
|
|
2932
2537
|
}
|
|
2933
2538
|
device.destroy();
|
|
2934
|
-
|
|
2539
|
+
logger2.debug("WebGPU check: Available and working");
|
|
2935
2540
|
return true;
|
|
2936
2541
|
} catch (err) {
|
|
2937
|
-
|
|
2542
|
+
logger2.debug("WebGPU check: Error during availability check", { error: err });
|
|
2938
2543
|
return false;
|
|
2939
2544
|
}
|
|
2940
2545
|
}
|
|
@@ -2944,11 +2549,11 @@ function applyIOSWasmMemoryPatch() {
|
|
|
2944
2549
|
iosWasmPatched = true;
|
|
2945
2550
|
const OrigMemory = WebAssembly.Memory;
|
|
2946
2551
|
const MAX_IOS_PAGES = 32768;
|
|
2947
|
-
|
|
2552
|
+
logger2.info("Applying iOS WASM memory patch (max\u21922GB, shared preserved)");
|
|
2948
2553
|
WebAssembly.Memory = function IOSPatchedMemory(descriptor) {
|
|
2949
2554
|
const patched = { ...descriptor };
|
|
2950
2555
|
if (patched.maximum !== void 0 && patched.maximum > MAX_IOS_PAGES) {
|
|
2951
|
-
|
|
2556
|
+
logger2.info("iOS memory patch: capping maximum", {
|
|
2952
2557
|
original: patched.maximum,
|
|
2953
2558
|
capped: MAX_IOS_PAGES,
|
|
2954
2559
|
shared: patched.shared,
|
|
@@ -2967,7 +2572,7 @@ function configureWasm(ort) {
|
|
|
2967
2572
|
ort.env.wasm.numThreads = numThreads;
|
|
2968
2573
|
ort.env.wasm.simd = true;
|
|
2969
2574
|
ort.env.wasm.proxy = enableProxy;
|
|
2970
|
-
|
|
2575
|
+
logger2.info("WASM configured", {
|
|
2971
2576
|
numThreads,
|
|
2972
2577
|
simd: true,
|
|
2973
2578
|
proxy: enableProxy,
|
|
@@ -2979,12 +2584,12 @@ async function getOnnxRuntime(backend) {
|
|
|
2979
2584
|
return ortInstance;
|
|
2980
2585
|
}
|
|
2981
2586
|
if (ortInstance && loadedBackend !== backend) {
|
|
2982
|
-
|
|
2587
|
+
logger2.warn(
|
|
2983
2588
|
`ONNX Runtime already loaded with ${loadedBackend} backend. Cannot switch to ${backend}. Returning existing instance.`
|
|
2984
2589
|
);
|
|
2985
2590
|
return ortInstance;
|
|
2986
2591
|
}
|
|
2987
|
-
|
|
2592
|
+
logger2.info(`Loading ONNX Runtime with ${backend} backend...`);
|
|
2988
2593
|
applyIOSWasmMemoryPatch();
|
|
2989
2594
|
try {
|
|
2990
2595
|
if (backend === "wasm" && (isIOS() || isSafari())) {
|
|
@@ -2999,10 +2604,10 @@ async function getOnnxRuntime(backend) {
|
|
|
2999
2604
|
}
|
|
3000
2605
|
loadedBackend = backend;
|
|
3001
2606
|
configureWasm(ortInstance);
|
|
3002
|
-
|
|
2607
|
+
logger2.info(`ONNX Runtime loaded successfully`, { backend });
|
|
3003
2608
|
return ortInstance;
|
|
3004
2609
|
} catch (err) {
|
|
3005
|
-
|
|
2610
|
+
logger2.error(`Failed to load ONNX Runtime with ${backend} backend`, {
|
|
3006
2611
|
error: err
|
|
3007
2612
|
});
|
|
3008
2613
|
throw new Error(
|
|
@@ -3013,7 +2618,7 @@ async function getOnnxRuntime(backend) {
|
|
|
3013
2618
|
async function getOnnxRuntimeForPreference(preference = "auto") {
|
|
3014
2619
|
const webgpuAvailable = await isWebGPUAvailable();
|
|
3015
2620
|
const backend = resolveBackend(preference, webgpuAvailable);
|
|
3016
|
-
|
|
2621
|
+
logger2.info("Resolved backend preference", {
|
|
3017
2622
|
preference,
|
|
3018
2623
|
webgpuAvailable,
|
|
3019
2624
|
resolvedBackend: backend
|
|
@@ -3047,42 +2652,6 @@ function getSessionOptions(backend) {
|
|
|
3047
2652
|
graphOptimizationLevel: "all"
|
|
3048
2653
|
};
|
|
3049
2654
|
}
|
|
3050
|
-
async function createSessionWithFallback(modelBuffer, preferredBackend) {
|
|
3051
|
-
const ort = await getOnnxRuntime(preferredBackend);
|
|
3052
|
-
const modelData = new Uint8Array(modelBuffer);
|
|
3053
|
-
if (preferredBackend === "webgpu") {
|
|
3054
|
-
try {
|
|
3055
|
-
const options2 = getSessionOptions("webgpu");
|
|
3056
|
-
const session2 = await ort.InferenceSession.create(modelData, options2);
|
|
3057
|
-
logger.info("Session created with WebGPU backend");
|
|
3058
|
-
return { session: session2, backend: "webgpu" };
|
|
3059
|
-
} catch (err) {
|
|
3060
|
-
logger.warn("WebGPU session creation failed, falling back to WASM", {
|
|
3061
|
-
error: err instanceof Error ? err.message : String(err)
|
|
3062
|
-
});
|
|
3063
|
-
}
|
|
3064
|
-
}
|
|
3065
|
-
const options = getSessionOptions("wasm");
|
|
3066
|
-
const session = await ort.InferenceSession.create(modelData, options);
|
|
3067
|
-
logger.info("Session created with WASM backend");
|
|
3068
|
-
return { session, backend: "wasm" };
|
|
3069
|
-
}
|
|
3070
|
-
function getLoadedBackend() {
|
|
3071
|
-
return loadedBackend;
|
|
3072
|
-
}
|
|
3073
|
-
function isOnnxRuntimeLoaded() {
|
|
3074
|
-
return ortInstance !== null;
|
|
3075
|
-
}
|
|
3076
|
-
async function preloadOnnxRuntime(preference = "auto") {
|
|
3077
|
-
if (ortInstance) {
|
|
3078
|
-
logger.info("ONNX Runtime already preloaded", { backend: loadedBackend });
|
|
3079
|
-
return loadedBackend;
|
|
3080
|
-
}
|
|
3081
|
-
logger.info("Preloading ONNX Runtime...", { preference });
|
|
3082
|
-
const { backend } = await getOnnxRuntimeForPreference(preference);
|
|
3083
|
-
logger.info("ONNX Runtime preloaded", { backend });
|
|
3084
|
-
return backend;
|
|
3085
|
-
}
|
|
3086
2655
|
|
|
3087
2656
|
// src/inference/blendshapeUtils.ts
|
|
3088
2657
|
var LAM_BLENDSHAPES = [
|
|
@@ -3232,16 +2801,19 @@ var WAV2ARKIT_BLENDSHAPES = [
|
|
|
3232
2801
|
var REMAP_WAV2ARKIT_TO_LAM = WAV2ARKIT_BLENDSHAPES.map(
|
|
3233
2802
|
(name) => LAM_BLENDSHAPES.indexOf(name)
|
|
3234
2803
|
);
|
|
3235
|
-
function
|
|
3236
|
-
const
|
|
3237
|
-
|
|
3238
|
-
|
|
2804
|
+
/**
 * Linearly interpolate between two blendshape weight lists.
 *
 * The result is as long as the longer input; entries missing (null/undefined)
 * from either input are treated as 0.
 *
 * @param {ArrayLike<number>} current - Weights to move away from.
 * @param {ArrayLike<number>} target - Weights to move towards.
 * @param {number} [factor=0.3] - Interpolation amount (0 keeps `current`, 1 yields `target`).
 * @returns {number[]} Newly allocated array of interpolated weights.
 */
function lerpBlendshapes(current, target, factor = 0.3) {
  const size = Math.max(current.length, target.length);
  const blended = new Array(size);
  for (const index of blended.keys()) {
    const from = current[index] ?? 0;
    const to = target[index] ?? 0;
    blended[index] = from + (to - from) * factor;
  }
  return blended;
}
|
|
3242
2814
|
|
|
3243
2815
|
// src/inference/Wav2Vec2Inference.ts
|
|
3244
|
-
var
|
|
2816
|
+
var logger3 = createLogger("Wav2Vec2");
|
|
3245
2817
|
var CTC_VOCAB = [
|
|
3246
2818
|
"<pad>",
|
|
3247
2819
|
"<s>",
|
|
@@ -3291,6 +2863,7 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3291
2863
|
this.poisoned = false;
|
|
3292
2864
|
this.config = config;
|
|
3293
2865
|
this.numIdentityClasses = config.numIdentityClasses ?? 12;
|
|
2866
|
+
this.chunkSize = config.chunkSize ?? 16e3;
|
|
3294
2867
|
}
|
|
3295
2868
|
get backend() {
|
|
3296
2869
|
return this.session ? this._backend : null;
|
|
@@ -3320,30 +2893,30 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3320
2893
|
"model.backend_requested": this.config.backend || "auto"
|
|
3321
2894
|
});
|
|
3322
2895
|
try {
|
|
3323
|
-
|
|
2896
|
+
logger3.info("Loading ONNX Runtime...", { preference: this.config.backend || "auto" });
|
|
3324
2897
|
const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend || "auto");
|
|
3325
2898
|
this.ort = ort;
|
|
3326
2899
|
this._backend = backend;
|
|
3327
|
-
|
|
2900
|
+
logger3.info("ONNX Runtime loaded", { backend: this._backend });
|
|
3328
2901
|
const modelUrl = this.config.modelUrl;
|
|
3329
2902
|
const dataUrl = this.config.externalDataUrl !== false ? typeof this.config.externalDataUrl === "string" ? this.config.externalDataUrl : `${modelUrl}.data` : null;
|
|
3330
2903
|
const sessionOptions = getSessionOptions(this._backend);
|
|
3331
2904
|
let isCached = false;
|
|
3332
2905
|
if (isIOS()) {
|
|
3333
|
-
|
|
2906
|
+
logger3.info("iOS: passing model URLs directly to ORT (low-memory path)", {
|
|
3334
2907
|
modelUrl,
|
|
3335
2908
|
dataUrl
|
|
3336
2909
|
});
|
|
3337
2910
|
if (dataUrl) {
|
|
3338
2911
|
const dataFilename = dataUrl.split("/").pop();
|
|
3339
|
-
|
|
2912
|
+
logger3.info("iOS: setting externalData", { dataFilename, dataUrl });
|
|
3340
2913
|
sessionOptions.externalData = [{
|
|
3341
2914
|
path: dataFilename,
|
|
3342
2915
|
data: dataUrl
|
|
3343
2916
|
// URL string — ORT fetches directly into WASM
|
|
3344
2917
|
}];
|
|
3345
2918
|
}
|
|
3346
|
-
|
|
2919
|
+
logger3.info("iOS: calling InferenceSession.create() with URL string", {
|
|
3347
2920
|
modelUrl,
|
|
3348
2921
|
sessionOptions: JSON.stringify(
|
|
3349
2922
|
sessionOptions,
|
|
@@ -3353,14 +2926,14 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3353
2926
|
try {
|
|
3354
2927
|
this.session = await this.ort.InferenceSession.create(modelUrl, sessionOptions);
|
|
3355
2928
|
} catch (sessionErr) {
|
|
3356
|
-
|
|
2929
|
+
logger3.error("iOS: InferenceSession.create() failed", {
|
|
3357
2930
|
error: sessionErr instanceof Error ? sessionErr.message : String(sessionErr),
|
|
3358
2931
|
errorType: sessionErr?.constructor?.name,
|
|
3359
2932
|
stack: sessionErr instanceof Error ? sessionErr.stack : void 0
|
|
3360
2933
|
});
|
|
3361
2934
|
throw sessionErr;
|
|
3362
2935
|
}
|
|
3363
|
-
|
|
2936
|
+
logger3.info("iOS: session created successfully", {
|
|
3364
2937
|
inputNames: this.session.inputNames,
|
|
3365
2938
|
outputNames: this.session.outputNames
|
|
3366
2939
|
});
|
|
@@ -3369,15 +2942,15 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3369
2942
|
isCached = await cache.has(modelUrl);
|
|
3370
2943
|
let modelBuffer;
|
|
3371
2944
|
if (isCached) {
|
|
3372
|
-
|
|
2945
|
+
logger3.debug("Loading model from cache", { modelUrl });
|
|
3373
2946
|
modelBuffer = await cache.get(modelUrl);
|
|
3374
2947
|
if (!modelBuffer) {
|
|
3375
|
-
|
|
2948
|
+
logger3.warn("Cache corruption detected, clearing and retrying", { modelUrl });
|
|
3376
2949
|
await cache.delete(modelUrl);
|
|
3377
2950
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
3378
2951
|
}
|
|
3379
2952
|
} else {
|
|
3380
|
-
|
|
2953
|
+
logger3.debug("Fetching and caching model", { modelUrl });
|
|
3381
2954
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
3382
2955
|
}
|
|
3383
2956
|
if (!modelBuffer) {
|
|
@@ -3388,31 +2961,31 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3388
2961
|
try {
|
|
3389
2962
|
const isDataCached = await cache.has(dataUrl);
|
|
3390
2963
|
if (isDataCached) {
|
|
3391
|
-
|
|
2964
|
+
logger3.debug("Loading external data from cache", { dataUrl });
|
|
3392
2965
|
externalDataBuffer = await cache.get(dataUrl);
|
|
3393
2966
|
if (!externalDataBuffer) {
|
|
3394
|
-
|
|
2967
|
+
logger3.warn("Cache corruption for external data, retrying", { dataUrl });
|
|
3395
2968
|
await cache.delete(dataUrl);
|
|
3396
2969
|
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
3397
2970
|
}
|
|
3398
2971
|
} else {
|
|
3399
|
-
|
|
2972
|
+
logger3.info("Fetching external model data", {
|
|
3400
2973
|
dataUrl,
|
|
3401
2974
|
note: "This may be a large download (383MB+)"
|
|
3402
2975
|
});
|
|
3403
2976
|
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
3404
2977
|
}
|
|
3405
|
-
|
|
2978
|
+
logger3.info("External data loaded", {
|
|
3406
2979
|
size: formatBytes(externalDataBuffer.byteLength)
|
|
3407
2980
|
});
|
|
3408
2981
|
} catch (err) {
|
|
3409
|
-
|
|
2982
|
+
logger3.debug("No external data file found (single-file model)", {
|
|
3410
2983
|
dataUrl,
|
|
3411
2984
|
error: err.message
|
|
3412
2985
|
});
|
|
3413
2986
|
}
|
|
3414
2987
|
}
|
|
3415
|
-
|
|
2988
|
+
logger3.debug("Creating ONNX session", {
|
|
3416
2989
|
graphSize: formatBytes(modelBuffer.byteLength),
|
|
3417
2990
|
externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
|
|
3418
2991
|
backend: this._backend
|
|
@@ -3427,12 +3000,12 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3427
3000
|
const modelData = new Uint8Array(modelBuffer);
|
|
3428
3001
|
this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
|
|
3429
3002
|
}
|
|
3430
|
-
|
|
3003
|
+
logger3.info("ONNX session created successfully", {
|
|
3431
3004
|
executionProvider: this._backend,
|
|
3432
3005
|
backend: this._backend
|
|
3433
3006
|
});
|
|
3434
3007
|
const loadTimeMs = performance.now() - startTime;
|
|
3435
|
-
|
|
3008
|
+
logger3.info("Model loaded successfully", {
|
|
3436
3009
|
backend: this._backend,
|
|
3437
3010
|
loadTimeMs: Math.round(loadTimeMs),
|
|
3438
3011
|
inputs: this.session.inputNames,
|
|
@@ -3448,13 +3021,13 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3448
3021
|
model: "wav2vec2",
|
|
3449
3022
|
backend: this._backend
|
|
3450
3023
|
});
|
|
3451
|
-
|
|
3024
|
+
logger3.debug("Running warmup inference to initialize GPU context");
|
|
3452
3025
|
const warmupStart = performance.now();
|
|
3453
|
-
const warmupAudio = new Float32Array(
|
|
3026
|
+
const warmupAudio = new Float32Array(this.chunkSize);
|
|
3454
3027
|
const warmupIdentity = new Float32Array(this.numIdentityClasses);
|
|
3455
3028
|
warmupIdentity[0] = 1;
|
|
3456
3029
|
const warmupFeeds = {
|
|
3457
|
-
"audio": new this.ort.Tensor("float32", warmupAudio, [1,
|
|
3030
|
+
"audio": new this.ort.Tensor("float32", warmupAudio, [1, this.chunkSize]),
|
|
3458
3031
|
"identity": new this.ort.Tensor("float32", warmupIdentity, [1, this.numIdentityClasses])
|
|
3459
3032
|
};
|
|
3460
3033
|
const WARMUP_TIMEOUT_MS = 15e3;
|
|
@@ -3464,12 +3037,12 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3464
3037
|
]);
|
|
3465
3038
|
const warmupTimeMs = performance.now() - warmupStart;
|
|
3466
3039
|
if (warmupResult === "timeout") {
|
|
3467
|
-
|
|
3040
|
+
logger3.warn("Warmup inference timed out \u2014 GPU may be unresponsive. Continuing without warmup.", {
|
|
3468
3041
|
timeoutMs: WARMUP_TIMEOUT_MS,
|
|
3469
3042
|
backend: this._backend
|
|
3470
3043
|
});
|
|
3471
3044
|
} else {
|
|
3472
|
-
|
|
3045
|
+
logger3.info("Warmup inference complete", {
|
|
3473
3046
|
warmupTimeMs: Math.round(warmupTimeMs),
|
|
3474
3047
|
backend: this._backend
|
|
3475
3048
|
});
|
|
@@ -3497,11 +3070,10 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3497
3070
|
}
|
|
3498
3071
|
/**
|
|
3499
3072
|
* Run inference on raw audio
|
|
3500
|
-
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
3073
|
+
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
3501
3074
|
* @param identityIndex - Optional identity index (0-11, default 0 = neutral)
|
|
3502
3075
|
*
|
|
3503
|
-
*
|
|
3504
|
-
* Audio will be zero-padded or truncated to 16000 samples.
|
|
3076
|
+
* Audio will be zero-padded or truncated to chunkSize samples.
|
|
3505
3077
|
*/
|
|
3506
3078
|
async infer(audioSamples, identityIndex = 0) {
|
|
3507
3079
|
if (!this.session) {
|
|
@@ -3512,20 +3084,20 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3512
3084
|
}
|
|
3513
3085
|
const audioSamplesCopy = new Float32Array(audioSamples);
|
|
3514
3086
|
let audio;
|
|
3515
|
-
if (audioSamplesCopy.length ===
|
|
3087
|
+
if (audioSamplesCopy.length === this.chunkSize) {
|
|
3516
3088
|
audio = audioSamplesCopy;
|
|
3517
|
-
} else if (audioSamplesCopy.length <
|
|
3518
|
-
audio = new Float32Array(
|
|
3089
|
+
} else if (audioSamplesCopy.length < this.chunkSize) {
|
|
3090
|
+
audio = new Float32Array(this.chunkSize);
|
|
3519
3091
|
audio.set(audioSamplesCopy, 0);
|
|
3520
3092
|
} else {
|
|
3521
|
-
audio = audioSamplesCopy.slice(0,
|
|
3093
|
+
audio = audioSamplesCopy.slice(0, this.chunkSize);
|
|
3522
3094
|
}
|
|
3523
3095
|
const identity = new Float32Array(this.numIdentityClasses);
|
|
3524
3096
|
identity[Math.max(0, Math.min(identityIndex, this.numIdentityClasses - 1))] = 1;
|
|
3525
3097
|
const audioCopy = new Float32Array(audio);
|
|
3526
3098
|
const identityCopy = new Float32Array(identity);
|
|
3527
3099
|
const feeds = {
|
|
3528
|
-
"audio": new this.ort.Tensor("float32", audioCopy, [1,
|
|
3100
|
+
"audio": new this.ort.Tensor("float32", audioCopy, [1, this.chunkSize]),
|
|
3529
3101
|
"identity": new this.ort.Tensor("float32", identityCopy, [1, this.numIdentityClasses])
|
|
3530
3102
|
};
|
|
3531
3103
|
return this.queueInference(feeds);
|
|
@@ -3561,7 +3133,7 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3561
3133
|
const telemetry = getTelemetry();
|
|
3562
3134
|
const span = telemetry?.startSpan("Wav2Vec2.infer", {
|
|
3563
3135
|
"inference.backend": this._backend,
|
|
3564
|
-
"inference.input_samples":
|
|
3136
|
+
"inference.input_samples": this.chunkSize
|
|
3565
3137
|
});
|
|
3566
3138
|
try {
|
|
3567
3139
|
const startTime = performance.now();
|
|
@@ -3600,7 +3172,7 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3600
3172
|
blendshapes.push(symmetrizeBlendshapes(rawFrame));
|
|
3601
3173
|
}
|
|
3602
3174
|
const text = this.decodeCTC(asrLogits);
|
|
3603
|
-
|
|
3175
|
+
logger3.trace("Inference completed", {
|
|
3604
3176
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
3605
3177
|
numA2EFrames,
|
|
3606
3178
|
numASRFrames,
|
|
@@ -3634,12 +3206,12 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3634
3206
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
3635
3207
|
if (errMsg.includes("timed out")) {
|
|
3636
3208
|
this.poisoned = true;
|
|
3637
|
-
|
|
3209
|
+
logger3.error("CRITICAL: Inference session timed out \u2014 LAM is dead. Page reload required.", {
|
|
3638
3210
|
backend: this._backend,
|
|
3639
3211
|
timeoutMs: _Wav2Vec2Inference.INFERENCE_TIMEOUT_MS
|
|
3640
3212
|
});
|
|
3641
3213
|
} else {
|
|
3642
|
-
|
|
3214
|
+
logger3.error("Inference failed", { error: errMsg, backend: this._backend });
|
|
3643
3215
|
}
|
|
3644
3216
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
3645
3217
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -3680,56 +3252,79 @@ _Wav2Vec2Inference.INFERENCE_TIMEOUT_MS = 5e3;
|
|
|
3680
3252
|
_Wav2Vec2Inference.isWebGPUAvailable = isWebGPUAvailable;
|
|
3681
3253
|
var Wav2Vec2Inference = _Wav2Vec2Inference;
|
|
3682
3254
|
|
|
3255
|
+
// src/audio/audioUtils.ts
|
|
3256
|
+
/**
 * Convert signed 16-bit PCM bytes into normalized Float32 samples.
 *
 * Accepts either an ArrayBuffer or any typed-array / DataView over one. For a
 * view, only the bytes the view covers are read (byteOffset and byteLength are
 * honoured) and they are reinterpreted as 16-bit words rather than converted
 * element by element. A trailing odd byte is ignored.
 *
 * @param {ArrayBuffer|ArrayBufferView} buffer - PCM16 bytes in platform byte order.
 * @returns {Float32Array} One sample per 16-bit word, scaled by 1/32768.
 */
function pcm16ToFloat32(buffer) {
  const isView = ArrayBuffer.isView(buffer);
  const backing = isView ? buffer.buffer : buffer;
  const byteOffset = isView ? buffer.byteOffset : 0;
  const sampleCount = Math.floor(buffer.byteLength / 2);
  let int16;
  if (byteOffset % 2 === 0) {
    int16 = new Int16Array(backing, byteOffset, sampleCount);
  } else {
    // Int16Array requires a 2-byte-aligned offset; copy the bytes to a fresh,
    // aligned buffer when the view starts on an odd byte.
    const aligned = new Uint8Array(backing, byteOffset, sampleCount * 2).slice();
    int16 = new Int16Array(aligned.buffer);
  }
  const float32 = new Float32Array(sampleCount);
  for (let i = 0; i < sampleCount; i++) {
    float32[i] = int16[i] / 32768;
  }
  return float32;
}
|
|
3265
|
+
/**
 * Scale 16-bit integer samples to floats by dividing each one by 32768.
 *
 * @param {ArrayLike<number>} int16 - Integer samples (e.g. an Int16Array).
 * @returns {Float32Array} New array of the same length holding sample / 32768.
 */
function int16ToFloat32(int16) {
  return Float32Array.from(
    { length: int16.length },
    (_, index) => int16[index] / 32768
  );
}
|
|
3272
|
+
|
|
3683
3273
|
// src/audio/FullFacePipeline.ts
|
|
3684
|
-
var
|
|
3685
|
-
var
|
|
3686
|
-
|
|
3687
|
-
|
|
3688
|
-
|
|
3689
|
-
|
|
3690
|
-
|
|
3691
|
-
|
|
3692
|
-
|
|
3693
|
-
|
|
3694
|
-
|
|
3695
|
-
|
|
3696
|
-
|
|
3697
|
-
|
|
3698
|
-
|
|
3699
|
-
|
|
3700
|
-
|
|
3701
|
-
|
|
3702
|
-
|
|
3703
|
-
|
|
3704
|
-
disappointed: "sad",
|
|
3705
|
-
frustrated: "angry",
|
|
3706
|
-
irritated: "angry",
|
|
3707
|
-
furious: "angry",
|
|
3708
|
-
annoyed: "angry",
|
|
3709
|
-
// SenseVoice labels
|
|
3710
|
-
fearful: "sad",
|
|
3711
|
-
disgusted: "angry",
|
|
3712
|
-
surprised: "happy"
|
|
3713
|
-
};
|
|
3714
|
-
var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
3274
|
+
var logger4 = createLogger("FullFacePipeline");
|
|
3275
|
+
// Maps each LAM blendshape name to a facial group, chosen by the name's prefix.
// Names that match none of the prefixes below are left out of the map.
var BLENDSHAPE_TO_GROUP = /* @__PURE__ */ new Map();
{
  // [name prefix, group] pairs, checked in order; the first match wins.
  const groupByPrefix = [
    ["eye", "eyes"],
    ["brow", "brows"],
    ["jaw", "jaw"],
    ["mouth", "mouth"],
    ["cheek", "cheeks"],
    ["nose", "nose"],
    ["tongue", "tongue"]
  ];
  for (const blendshape of LAM_BLENDSHAPES) {
    const rule = groupByPrefix.find(([prefix]) => blendshape.startsWith(prefix));
    if (rule) {
      BLENDSHAPE_TO_GROUP.set(blendshape, rule[1]);
    }
  }
}
|
|
3293
|
+
var FullFacePipeline = class extends EventEmitter {
|
|
3715
3294
|
constructor(options) {
|
|
3716
3295
|
super();
|
|
3717
3296
|
this.options = options;
|
|
3718
3297
|
this.playbackStarted = false;
|
|
3719
3298
|
this.monitorInterval = null;
|
|
3720
3299
|
this.frameAnimationId = null;
|
|
3721
|
-
// Emotion state
|
|
3722
|
-
this.lastEmotionFrame = null;
|
|
3723
|
-
this.currentAudioEnergy = 0;
|
|
3724
3300
|
// Stale frame detection
|
|
3725
3301
|
this.lastNewFrameTime = 0;
|
|
3726
3302
|
this.lastKnownLamFrame = null;
|
|
3727
3303
|
this.staleWarningEmitted = false;
|
|
3304
|
+
// Frame loop timing (for dt calculation)
|
|
3305
|
+
this.lastFrameLoopTime = 0;
|
|
3306
|
+
// Diagnostic logging counter
|
|
3307
|
+
this.frameLoopCount = 0;
|
|
3728
3308
|
const sampleRate = options.sampleRate ?? 16e3;
|
|
3729
|
-
this.
|
|
3730
|
-
this.
|
|
3731
|
-
|
|
3309
|
+
this.profile = options.profile ?? {};
|
|
3310
|
+
this.staleThresholdMs = options.staleThresholdMs ?? 2e3;
|
|
3311
|
+
this.smoother = new BlendshapeSmoother({
|
|
3312
|
+
halflife: options.smoothingHalflife ?? 0.06
|
|
3313
|
+
});
|
|
3314
|
+
const isCpuModel = options.lam.modelId === "wav2arkit_cpu";
|
|
3315
|
+
const chunkSize = options.chunkSize ?? options.lam.chunkSize ?? 16e3;
|
|
3316
|
+
const chunkAccumulationMs = chunkSize / sampleRate * 1e3;
|
|
3317
|
+
const inferenceEstimateMs = isCpuModel ? 300 : options.lam.backend === "wasm" ? 250 : 80;
|
|
3318
|
+
const marginMs = 100;
|
|
3319
|
+
const autoDelay = Math.ceil(chunkAccumulationMs + inferenceEstimateMs + marginMs);
|
|
3732
3320
|
const audioDelayMs = options.audioDelayMs ?? autoDelay;
|
|
3321
|
+
logger4.info("FullFacePipeline config", {
|
|
3322
|
+
chunkSize,
|
|
3323
|
+
audioDelayMs,
|
|
3324
|
+
autoDelay,
|
|
3325
|
+
backend: options.lam.backend,
|
|
3326
|
+
modelId: options.lam.modelId
|
|
3327
|
+
});
|
|
3733
3328
|
this.scheduler = new AudioScheduler({
|
|
3734
3329
|
sampleRate,
|
|
3735
3330
|
initialLookaheadSec: audioDelayMs / 1e3
|
|
@@ -3738,20 +3333,15 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3738
3333
|
sampleRate,
|
|
3739
3334
|
targetDurationMs: options.chunkTargetMs ?? 200
|
|
3740
3335
|
});
|
|
3741
|
-
this.
|
|
3336
|
+
this.processor = new A2EProcessor({
|
|
3337
|
+
backend: options.lam,
|
|
3742
3338
|
sampleRate,
|
|
3339
|
+
chunkSize,
|
|
3743
3340
|
onError: (error) => {
|
|
3744
|
-
|
|
3341
|
+
logger4.error("A2E inference error", { message: error.message, stack: error.stack });
|
|
3745
3342
|
this.emit("error", error);
|
|
3746
3343
|
}
|
|
3747
3344
|
});
|
|
3748
|
-
this.emotionMapper = new EmotionToBlendshapeMapper({
|
|
3749
|
-
smoothingFactor: 0.15,
|
|
3750
|
-
confidenceThreshold: 0.3,
|
|
3751
|
-
intensity: 1,
|
|
3752
|
-
energyModulation: true
|
|
3753
|
-
});
|
|
3754
|
-
this.energyAnalyzer = new AudioEnergyAnalyzer();
|
|
3755
3345
|
}
|
|
3756
3346
|
/**
|
|
3757
3347
|
* Initialize the pipeline
|
|
@@ -3760,40 +3350,33 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3760
3350
|
await this.scheduler.initialize();
|
|
3761
3351
|
}
|
|
3762
3352
|
/**
|
|
3763
|
-
*
|
|
3764
|
-
|
|
3765
|
-
|
|
3766
|
-
|
|
3767
|
-
*
|
|
3768
|
-
* Supported labels: happy, excited, joyful, sad, melancholic, angry,
|
|
3769
|
-
* frustrated, neutral, etc.
|
|
3770
|
-
*
|
|
3771
|
-
* @param label - Emotion label string (case-insensitive)
|
|
3772
|
-
*/
|
|
3773
|
-
setEmotionLabel(label) {
|
|
3774
|
-
const normalized = label.toLowerCase();
|
|
3775
|
-
const mapped = EMOTION_LABEL_MAP[normalized] ?? "neutral";
|
|
3776
|
-
const probabilities = {
|
|
3777
|
-
neutral: 0.1,
|
|
3778
|
-
happy: 0.1,
|
|
3779
|
-
angry: 0.1,
|
|
3780
|
-
sad: 0.1
|
|
3781
|
-
};
|
|
3782
|
-
probabilities[mapped] = 0.7;
|
|
3783
|
-
const frame = {
|
|
3784
|
-
emotion: mapped,
|
|
3785
|
-
confidence: 0.7,
|
|
3786
|
-
probabilities
|
|
3787
|
-
};
|
|
3788
|
-
this.lastEmotionFrame = frame;
|
|
3789
|
-
logger3.info("Emotion label set", { label, mapped });
|
|
3353
|
+
* Update the ExpressionProfile at runtime (e.g., character switch).
|
|
3354
|
+
*/
|
|
3355
|
+
setProfile(profile) {
|
|
3356
|
+
this.profile = profile;
|
|
3790
3357
|
}
|
|
3791
3358
|
/**
|
|
3792
|
-
*
|
|
3793
|
-
*
|
|
3359
|
+
* Apply ExpressionProfile scaling to raw A2E blendshapes.
|
|
3360
|
+
*
|
|
3361
|
+
* For each blendshape:
|
|
3362
|
+
* 1. If an override exists for the blendshape name, use override as scaler
|
|
3363
|
+
* 2. Otherwise, use the group scaler (default 1.0)
|
|
3364
|
+
* 3. Clamp result to [0, 1]
|
|
3794
3365
|
*/
|
|
3795
|
-
|
|
3796
|
-
|
|
3366
|
+
applyProfile(raw) {
|
|
3367
|
+
const scaled = new Float32Array(52);
|
|
3368
|
+
for (let i = 0; i < 52; i++) {
|
|
3369
|
+
const name = LAM_BLENDSHAPES[i];
|
|
3370
|
+
let scaler;
|
|
3371
|
+
if (this.profile.overrides && this.profile.overrides[name] !== void 0) {
|
|
3372
|
+
scaler = this.profile.overrides[name];
|
|
3373
|
+
} else {
|
|
3374
|
+
const group = BLENDSHAPE_TO_GROUP.get(name);
|
|
3375
|
+
scaler = group ? this.profile[group] ?? 1 : 1;
|
|
3376
|
+
}
|
|
3377
|
+
scaled[i] = Math.min(1, Math.max(0, raw[i] * scaler));
|
|
3378
|
+
}
|
|
3379
|
+
return scaled;
|
|
3797
3380
|
}
|
|
3798
3381
|
/**
|
|
3799
3382
|
* Start a new playback session
|
|
@@ -3805,15 +3388,14 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3805
3388
|
this.stopMonitoring();
|
|
3806
3389
|
this.scheduler.reset();
|
|
3807
3390
|
this.coalescer.reset();
|
|
3808
|
-
this.
|
|
3391
|
+
this.processor.reset();
|
|
3809
3392
|
this.playbackStarted = false;
|
|
3810
|
-
this.lastEmotionFrame = null;
|
|
3811
|
-
this.currentAudioEnergy = 0;
|
|
3812
|
-
this.emotionMapper.reset();
|
|
3813
|
-
this.energyAnalyzer.reset();
|
|
3814
3393
|
this.lastNewFrameTime = 0;
|
|
3815
3394
|
this.lastKnownLamFrame = null;
|
|
3816
3395
|
this.staleWarningEmitted = false;
|
|
3396
|
+
this.lastFrameLoopTime = 0;
|
|
3397
|
+
this.frameLoopCount = 0;
|
|
3398
|
+
this.smoother.reset();
|
|
3817
3399
|
this.scheduler.warmup();
|
|
3818
3400
|
this.startFrameLoop();
|
|
3819
3401
|
this.startMonitoring();
|
|
@@ -3821,8 +3403,8 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3821
3403
|
/**
|
|
3822
3404
|
* Receive audio chunk from network
|
|
3823
3405
|
*
|
|
3824
|
-
* Audio-first design: schedules audio immediately,
|
|
3825
|
-
* This prevents
|
|
3406
|
+
* Audio-first design: schedules audio immediately, A2E runs in background.
|
|
3407
|
+
* This prevents A2E inference (50-300ms) from blocking audio scheduling.
|
|
3826
3408
|
*
|
|
3827
3409
|
* @param chunk - Uint8Array containing Int16 PCM audio
|
|
3828
3410
|
*/
|
|
@@ -3837,100 +3419,77 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3837
3419
|
this.playbackStarted = true;
|
|
3838
3420
|
this.emit("playback_start", scheduleTime);
|
|
3839
3421
|
}
|
|
3840
|
-
|
|
3841
|
-
|
|
3842
|
-
|
|
3843
|
-
this.
|
|
3422
|
+
logger4.info("onAudioChunk \u2192 pushAudio", {
|
|
3423
|
+
float32Samples: float32.length,
|
|
3424
|
+
scheduleTime: scheduleTime.toFixed(3),
|
|
3425
|
+
currentTime: this.scheduler.getCurrentTime().toFixed(3),
|
|
3426
|
+
deltaToPlayback: (scheduleTime - this.scheduler.getCurrentTime()).toFixed(3)
|
|
3844
3427
|
});
|
|
3845
|
-
|
|
3846
|
-
/**
|
|
3847
|
-
* Get emotion frame for current animation.
|
|
3848
|
-
*
|
|
3849
|
-
* Priority:
|
|
3850
|
-
* 1. Explicit emotion label from setEmotionLabel()
|
|
3851
|
-
* 2. Prosody fallback: subtle brow movement from audio energy
|
|
3852
|
-
*/
|
|
3853
|
-
getEmotionFrame() {
|
|
3854
|
-
if (this.lastEmotionFrame) {
|
|
3855
|
-
return { frame: this.lastEmotionFrame, energy: this.currentAudioEnergy };
|
|
3856
|
-
}
|
|
3857
|
-
return { frame: null, energy: this.currentAudioEnergy };
|
|
3858
|
-
}
|
|
3859
|
-
/**
|
|
3860
|
-
* Merge LAM blendshapes with emotion upper face blendshapes
|
|
3861
|
-
*/
|
|
3862
|
-
mergeBlendshapes(lamFrame, emotionFrame, audioEnergy) {
|
|
3863
|
-
const merged = new Float32Array(52);
|
|
3864
|
-
let emotionBlendshapes;
|
|
3865
|
-
if (emotionFrame) {
|
|
3866
|
-
this.emotionMapper.mapFrame(emotionFrame, audioEnergy);
|
|
3867
|
-
this.emotionMapper.update(33);
|
|
3868
|
-
emotionBlendshapes = this.emotionMapper.getCurrentBlendshapes();
|
|
3869
|
-
} else {
|
|
3870
|
-
emotionBlendshapes = {};
|
|
3871
|
-
for (const name of UPPER_FACE_BLENDSHAPES) {
|
|
3872
|
-
emotionBlendshapes[name] = 0;
|
|
3873
|
-
}
|
|
3874
|
-
}
|
|
3875
|
-
for (let i = 0; i < 52; i++) {
|
|
3876
|
-
const name = LAM_BLENDSHAPES[i];
|
|
3877
|
-
if (UPPER_FACE_SET.has(name)) {
|
|
3878
|
-
const emotionValue = emotionBlendshapes[name] ?? 0;
|
|
3879
|
-
const lamValue = lamFrame[i];
|
|
3880
|
-
merged[i] = emotionValue * this.emotionBlendFactor + lamValue * this.lamBlendFactor;
|
|
3881
|
-
} else {
|
|
3882
|
-
merged[i] = lamFrame[i];
|
|
3883
|
-
}
|
|
3884
|
-
}
|
|
3885
|
-
return { merged, emotionBlendshapes };
|
|
3428
|
+
this.processor.pushAudio(float32, scheduleTime);
|
|
3886
3429
|
}
|
|
3887
3430
|
/**
|
|
3888
3431
|
* Start frame animation loop
|
|
3432
|
+
*
|
|
3433
|
+
* Uses critically damped spring smoother to produce continuous output
|
|
3434
|
+
* at render rate (60fps), even between inference batches (~30fps bursts).
|
|
3435
|
+
* Springs interpolate toward the latest inference target, and decay
|
|
3436
|
+
* to neutral when inference stalls.
|
|
3889
3437
|
*/
|
|
3890
3438
|
startFrameLoop() {
|
|
3439
|
+
this.lastFrameLoopTime = 0;
|
|
3891
3440
|
const updateFrame = () => {
|
|
3441
|
+
const now = performance.now() / 1e3;
|
|
3442
|
+
const dt = this.lastFrameLoopTime > 0 ? now - this.lastFrameLoopTime : 1 / 60;
|
|
3443
|
+
this.lastFrameLoopTime = now;
|
|
3444
|
+
this.frameLoopCount++;
|
|
3892
3445
|
const currentTime = this.scheduler.getCurrentTime();
|
|
3893
|
-
const lamFrame = this.
|
|
3894
|
-
if (lamFrame) {
|
|
3895
|
-
|
|
3896
|
-
|
|
3897
|
-
|
|
3898
|
-
|
|
3446
|
+
const lamFrame = this.processor.getFrameForTime(currentTime);
|
|
3447
|
+
if (lamFrame && lamFrame !== this.lastKnownLamFrame) {
|
|
3448
|
+
this.smoother.setTarget(lamFrame);
|
|
3449
|
+
this.lastNewFrameTime = performance.now();
|
|
3450
|
+
this.lastKnownLamFrame = lamFrame;
|
|
3451
|
+
this.staleWarningEmitted = false;
|
|
3452
|
+
logger4.info("New A2E frame", {
|
|
3453
|
+
jawOpen: lamFrame[24]?.toFixed(3),
|
|
3454
|
+
mouthClose: lamFrame[26]?.toFixed(3),
|
|
3455
|
+
browInnerUp: lamFrame[2]?.toFixed(3),
|
|
3456
|
+
browDownL: lamFrame[0]?.toFixed(3),
|
|
3457
|
+
browOuterUpL: lamFrame[3]?.toFixed(3),
|
|
3458
|
+
currentTime: currentTime.toFixed(3),
|
|
3459
|
+
queuedFrames: this.processor.queuedFrameCount
|
|
3460
|
+
});
|
|
3461
|
+
}
|
|
3462
|
+
if (this.frameLoopCount % 60 === 0) {
|
|
3463
|
+
logger4.info("Frame loop heartbeat", {
|
|
3464
|
+
frameLoopCount: this.frameLoopCount,
|
|
3465
|
+
currentTime: currentTime.toFixed(3),
|
|
3466
|
+
playbackEndTime: this.scheduler.getPlaybackEndTime().toFixed(3),
|
|
3467
|
+
queuedFrames: this.processor.queuedFrameCount,
|
|
3468
|
+
hasTarget: this.smoother.hasTarget,
|
|
3469
|
+
playbackStarted: this.playbackStarted,
|
|
3470
|
+
msSinceNewFrame: this.lastNewFrameTime > 0 ? Math.round(performance.now() - this.lastNewFrameTime) : -1,
|
|
3471
|
+
processorFill: this.processor.fillLevel.toFixed(2)
|
|
3472
|
+
});
|
|
3473
|
+
}
|
|
3474
|
+
if (this.playbackStarted && this.lastNewFrameTime > 0 && performance.now() - this.lastNewFrameTime > this.staleThresholdMs) {
|
|
3475
|
+
this.smoother.decayToNeutral();
|
|
3476
|
+
if (!this.staleWarningEmitted) {
|
|
3477
|
+
this.staleWarningEmitted = true;
|
|
3478
|
+
logger4.warn("A2E stalled \u2014 decaying to neutral", {
|
|
3479
|
+
staleDurationMs: Math.round(performance.now() - this.lastNewFrameTime),
|
|
3480
|
+
queuedFrames: this.processor.queuedFrameCount
|
|
3481
|
+
});
|
|
3899
3482
|
}
|
|
3900
|
-
|
|
3901
|
-
|
|
3483
|
+
}
|
|
3484
|
+
if (lamFrame) {
|
|
3485
|
+
const scaled = this.applyProfile(lamFrame);
|
|
3902
3486
|
const fullFrame = {
|
|
3903
|
-
blendshapes:
|
|
3904
|
-
|
|
3905
|
-
emotionBlendshapes,
|
|
3906
|
-
emotion: emotionFrame,
|
|
3487
|
+
blendshapes: scaled,
|
|
3488
|
+
rawBlendshapes: lamFrame,
|
|
3907
3489
|
timestamp: currentTime
|
|
3908
3490
|
};
|
|
3909
3491
|
this.emit("full_frame_ready", fullFrame);
|
|
3910
3492
|
this.emit("lam_frame_ready", lamFrame);
|
|
3911
|
-
if (emotionFrame) {
|
|
3912
|
-
this.emit("emotion_frame_ready", emotionFrame);
|
|
3913
|
-
}
|
|
3914
|
-
} else if (this.playbackStarted && !this.lastKnownLamFrame) {
|
|
3915
|
-
const { frame: emotionFrame, energy } = this.getEmotionFrame();
|
|
3916
|
-
if (emotionFrame && energy > 0.05) {
|
|
3917
|
-
const startupFrame = new Float32Array(52);
|
|
3918
|
-
const { merged, emotionBlendshapes } = this.mergeBlendshapes(startupFrame, emotionFrame, energy);
|
|
3919
|
-
this.emit("full_frame_ready", {
|
|
3920
|
-
blendshapes: merged,
|
|
3921
|
-
lamBlendshapes: startupFrame,
|
|
3922
|
-
emotionBlendshapes,
|
|
3923
|
-
emotion: emotionFrame,
|
|
3924
|
-
timestamp: currentTime
|
|
3925
|
-
});
|
|
3926
|
-
}
|
|
3927
|
-
}
|
|
3928
|
-
if (this.playbackStarted && this.lastNewFrameTime > 0 && !this.staleWarningEmitted && performance.now() - this.lastNewFrameTime > _FullFacePipeline.STALE_FRAME_THRESHOLD_MS) {
|
|
3929
|
-
this.staleWarningEmitted = true;
|
|
3930
|
-
logger3.warn("LAM appears stalled \u2014 no new frames for 3+ seconds during playback", {
|
|
3931
|
-
staleDurationMs: Math.round(performance.now() - this.lastNewFrameTime),
|
|
3932
|
-
queuedFrames: this.lamPipeline.queuedFrameCount
|
|
3933
|
-
});
|
|
3934
3493
|
}
|
|
3935
3494
|
this.frameAnimationId = requestAnimationFrame(updateFrame);
|
|
3936
3495
|
};
|
|
@@ -3945,7 +3504,7 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3945
3504
|
const chunk = new Uint8Array(remaining);
|
|
3946
3505
|
await this.onAudioChunk(chunk);
|
|
3947
3506
|
}
|
|
3948
|
-
await this.
|
|
3507
|
+
await this.processor.flush();
|
|
3949
3508
|
}
|
|
3950
3509
|
/**
|
|
3951
3510
|
* Stop playback immediately with smooth fade-out
|
|
@@ -3954,15 +3513,13 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3954
3513
|
this.stopMonitoring();
|
|
3955
3514
|
await this.scheduler.cancelAll(fadeOutMs);
|
|
3956
3515
|
this.coalescer.reset();
|
|
3957
|
-
this.
|
|
3516
|
+
this.processor.reset();
|
|
3517
|
+
this.smoother.reset();
|
|
3958
3518
|
this.playbackStarted = false;
|
|
3959
|
-
this.lastEmotionFrame = null;
|
|
3960
|
-
this.currentAudioEnergy = 0;
|
|
3961
|
-
this.emotionMapper.reset();
|
|
3962
|
-
this.energyAnalyzer.reset();
|
|
3963
3519
|
this.lastNewFrameTime = 0;
|
|
3964
3520
|
this.lastKnownLamFrame = null;
|
|
3965
3521
|
this.staleWarningEmitted = false;
|
|
3522
|
+
this.lastFrameLoopTime = 0;
|
|
3966
3523
|
this.emit("playback_complete", void 0);
|
|
3967
3524
|
}
|
|
3968
3525
|
/**
|
|
@@ -3973,7 +3530,7 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3973
3530
|
clearInterval(this.monitorInterval);
|
|
3974
3531
|
}
|
|
3975
3532
|
this.monitorInterval = setInterval(() => {
|
|
3976
|
-
if (this.scheduler.isComplete() && this.
|
|
3533
|
+
if (this.scheduler.isComplete() && this.processor.queuedFrameCount === 0) {
|
|
3977
3534
|
this.emit("playback_complete", void 0);
|
|
3978
3535
|
this.stopMonitoring();
|
|
3979
3536
|
}
|
|
@@ -3999,20 +3556,12 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3999
3556
|
return {
|
|
4000
3557
|
playbackStarted: this.playbackStarted,
|
|
4001
3558
|
coalescerFill: this.coalescer.fillLevel,
|
|
4002
|
-
|
|
4003
|
-
|
|
4004
|
-
emotionLabel: this.lastEmotionFrame?.emotion ?? null,
|
|
4005
|
-
currentAudioEnergy: this.currentAudioEnergy,
|
|
3559
|
+
processorFill: this.processor.fillLevel,
|
|
3560
|
+
queuedFrames: this.processor.queuedFrameCount,
|
|
4006
3561
|
currentTime: this.scheduler.getCurrentTime(),
|
|
4007
3562
|
playbackEndTime: this.scheduler.getPlaybackEndTime()
|
|
4008
3563
|
};
|
|
4009
3564
|
}
|
|
4010
|
-
/**
|
|
4011
|
-
* Check if an explicit emotion label is currently set
|
|
4012
|
-
*/
|
|
4013
|
-
get hasEmotionLabel() {
|
|
4014
|
-
return this.lastEmotionFrame !== null;
|
|
4015
|
-
}
|
|
4016
3565
|
/**
|
|
4017
3566
|
* Cleanup resources
|
|
4018
3567
|
*/
|
|
@@ -4020,13 +3569,9 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
4020
3569
|
this.stopMonitoring();
|
|
4021
3570
|
this.scheduler.dispose();
|
|
4022
3571
|
this.coalescer.reset();
|
|
4023
|
-
this.
|
|
4024
|
-
this.lastEmotionFrame = null;
|
|
4025
|
-
this.currentAudioEnergy = 0;
|
|
3572
|
+
this.processor.dispose();
|
|
4026
3573
|
}
|
|
4027
3574
|
};
|
|
4028
|
-
_FullFacePipeline.STALE_FRAME_THRESHOLD_MS = 3e3;
|
|
4029
|
-
var FullFacePipeline = _FullFacePipeline;
|
|
4030
3575
|
|
|
4031
3576
|
// src/inference/kaldiFbank.ts
|
|
4032
3577
|
function fft(re, im) {
|
|
@@ -4313,7 +3858,7 @@ function ctcGreedyDecode(logits, seqLen, vocabSize, tokenMap) {
|
|
|
4313
3858
|
}
|
|
4314
3859
|
|
|
4315
3860
|
// src/inference/SenseVoiceInference.ts
|
|
4316
|
-
var
|
|
3861
|
+
var logger5 = createLogger("SenseVoice");
|
|
4317
3862
|
var _SenseVoiceInference = class _SenseVoiceInference {
|
|
4318
3863
|
constructor(config) {
|
|
4319
3864
|
this.session = null;
|
|
@@ -4366,26 +3911,26 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
4366
3911
|
"model.backend_requested": this.config.backend
|
|
4367
3912
|
});
|
|
4368
3913
|
try {
|
|
4369
|
-
|
|
3914
|
+
logger5.info("Loading ONNX Runtime...", { preference: this.config.backend });
|
|
4370
3915
|
const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
|
|
4371
3916
|
this.ort = ort;
|
|
4372
3917
|
this._backend = backend;
|
|
4373
|
-
|
|
4374
|
-
|
|
3918
|
+
logger5.info("ONNX Runtime loaded", { backend: this._backend });
|
|
3919
|
+
logger5.debug("Fetching tokens vocabulary", { tokensUrl: this.config.tokensUrl });
|
|
4375
3920
|
const tokensResponse = await fetch(this.config.tokensUrl);
|
|
4376
3921
|
if (!tokensResponse.ok) {
|
|
4377
3922
|
throw new Error(`Failed to fetch tokens.txt: ${tokensResponse.status} ${tokensResponse.statusText}`);
|
|
4378
3923
|
}
|
|
4379
3924
|
const tokensText = await tokensResponse.text();
|
|
4380
3925
|
this.tokenMap = parseTokensFile(tokensText);
|
|
4381
|
-
|
|
3926
|
+
logger5.debug("Tokens loaded", { vocabSize: this.tokenMap.size });
|
|
4382
3927
|
const sessionOptions = getSessionOptions(this._backend);
|
|
4383
3928
|
if (this._backend === "webgpu") {
|
|
4384
3929
|
sessionOptions.graphOptimizationLevel = "basic";
|
|
4385
3930
|
}
|
|
4386
3931
|
let isCached = false;
|
|
4387
3932
|
if (isIOS()) {
|
|
4388
|
-
|
|
3933
|
+
logger5.info("iOS: passing model URL directly to ORT (low-memory path)", {
|
|
4389
3934
|
modelUrl: this.config.modelUrl
|
|
4390
3935
|
});
|
|
4391
3936
|
this.session = await this.ort.InferenceSession.create(
|
|
@@ -4397,14 +3942,14 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
4397
3942
|
isCached = await cache.has(this.config.modelUrl);
|
|
4398
3943
|
let modelBuffer;
|
|
4399
3944
|
if (isCached) {
|
|
4400
|
-
|
|
3945
|
+
logger5.debug("Loading model from cache", { modelUrl: this.config.modelUrl });
|
|
4401
3946
|
modelBuffer = await cache.get(this.config.modelUrl);
|
|
4402
3947
|
onProgress?.(modelBuffer.byteLength, modelBuffer.byteLength);
|
|
4403
3948
|
} else {
|
|
4404
|
-
|
|
3949
|
+
logger5.debug("Fetching and caching model", { modelUrl: this.config.modelUrl });
|
|
4405
3950
|
modelBuffer = await fetchWithCache(this.config.modelUrl, onProgress);
|
|
4406
3951
|
}
|
|
4407
|
-
|
|
3952
|
+
logger5.debug("Creating ONNX session", {
|
|
4408
3953
|
size: formatBytes(modelBuffer.byteLength),
|
|
4409
3954
|
backend: this._backend
|
|
4410
3955
|
});
|
|
@@ -4417,15 +3962,15 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
4417
3962
|
const cmvn = parseCMVNFromMetadata(metadata.neg_mean, metadata.inv_stddev);
|
|
4418
3963
|
this.negMean = cmvn.negMean;
|
|
4419
3964
|
this.invStddev = cmvn.invStddev;
|
|
4420
|
-
|
|
3965
|
+
logger5.debug("CMVN loaded from model metadata", { dim: this.negMean.length });
|
|
4421
3966
|
} else {
|
|
4422
|
-
|
|
3967
|
+
logger5.warn("CMVN not found in model metadata \u2014 features will not be normalized");
|
|
4423
3968
|
}
|
|
4424
3969
|
} catch (cmvnErr) {
|
|
4425
|
-
|
|
3970
|
+
logger5.warn("Failed to read CMVN from model metadata", { error: cmvnErr });
|
|
4426
3971
|
}
|
|
4427
3972
|
const loadTimeMs = performance.now() - startTime;
|
|
4428
|
-
|
|
3973
|
+
logger5.info("SenseVoice model loaded", {
|
|
4429
3974
|
backend: this._backend,
|
|
4430
3975
|
loadTimeMs: Math.round(loadTimeMs),
|
|
4431
3976
|
vocabSize: this.tokenMap.size,
|
|
@@ -4536,7 +4081,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
4536
4081
|
const vocabSize = logitsDims[2];
|
|
4537
4082
|
const decoded = ctcGreedyDecode(logitsData, seqLen, vocabSize, this.tokenMap);
|
|
4538
4083
|
const inferenceTimeMs = performance.now() - startTime;
|
|
4539
|
-
|
|
4084
|
+
logger5.trace("Transcription complete", {
|
|
4540
4085
|
text: decoded.text.substring(0, 50),
|
|
4541
4086
|
language: decoded.language,
|
|
4542
4087
|
emotion: decoded.emotion,
|
|
@@ -4574,7 +4119,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
4574
4119
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
4575
4120
|
if (errMsg.includes("timed out")) {
|
|
4576
4121
|
this.poisoned = true;
|
|
4577
|
-
|
|
4122
|
+
logger5.error("CRITICAL: Inference session timed out \u2014 SenseVoice is dead. Page reload required.", {
|
|
4578
4123
|
backend: this._backend,
|
|
4579
4124
|
timeoutMs: _SenseVoiceInference.INFERENCE_TIMEOUT_MS
|
|
4580
4125
|
});
|
|
@@ -4582,7 +4127,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
4582
4127
|
const oomError = new Error(
|
|
4583
4128
|
`SenseVoice inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
|
|
4584
4129
|
);
|
|
4585
|
-
|
|
4130
|
+
logger5.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
4586
4131
|
pointer: `0x${err.toString(16)}`,
|
|
4587
4132
|
backend: this._backend
|
|
4588
4133
|
});
|
|
@@ -4595,7 +4140,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
4595
4140
|
reject(oomError);
|
|
4596
4141
|
return;
|
|
4597
4142
|
} else {
|
|
4598
|
-
|
|
4143
|
+
logger5.error("Inference failed", { error: errMsg, backend: this._backend });
|
|
4599
4144
|
}
|
|
4600
4145
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
4601
4146
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -4624,7 +4169,7 @@ _SenseVoiceInference.INFERENCE_TIMEOUT_MS = 1e4;
|
|
|
4624
4169
|
var SenseVoiceInference = _SenseVoiceInference;
|
|
4625
4170
|
|
|
4626
4171
|
// src/inference/SenseVoiceWorker.ts
|
|
4627
|
-
var
|
|
4172
|
+
var logger6 = createLogger("SenseVoiceWorker");
|
|
4628
4173
|
var WASM_CDN_PATH2 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
4629
4174
|
var LOAD_TIMEOUT_MS = 3e4;
|
|
4630
4175
|
var INFERENCE_TIMEOUT_MS = 1e4;
|
|
@@ -5357,7 +4902,7 @@ var SenseVoiceWorker = class {
|
|
|
5357
4902
|
this.handleWorkerMessage(event.data);
|
|
5358
4903
|
};
|
|
5359
4904
|
worker.onerror = (error) => {
|
|
5360
|
-
|
|
4905
|
+
logger6.error("Worker error", { error: error.message });
|
|
5361
4906
|
for (const [, resolver] of this.pendingResolvers) {
|
|
5362
4907
|
resolver.reject(new Error(`Worker error: ${error.message}`));
|
|
5363
4908
|
}
|
|
@@ -5437,9 +4982,9 @@ var SenseVoiceWorker = class {
|
|
|
5437
4982
|
"model.language": this.config.language
|
|
5438
4983
|
});
|
|
5439
4984
|
try {
|
|
5440
|
-
|
|
4985
|
+
logger6.info("Creating SenseVoice worker...");
|
|
5441
4986
|
this.worker = this.createWorker();
|
|
5442
|
-
|
|
4987
|
+
logger6.info("Loading model in worker...", {
|
|
5443
4988
|
modelUrl: this.config.modelUrl,
|
|
5444
4989
|
tokensUrl: this.config.tokensUrl,
|
|
5445
4990
|
language: this.config.language,
|
|
@@ -5461,7 +5006,7 @@ var SenseVoiceWorker = class {
|
|
|
5461
5006
|
this._isLoaded = true;
|
|
5462
5007
|
const loadTimeMs = performance.now() - startTime;
|
|
5463
5008
|
onProgress?.(1, 1);
|
|
5464
|
-
|
|
5009
|
+
logger6.info("SenseVoice worker loaded successfully", {
|
|
5465
5010
|
backend: "wasm",
|
|
5466
5011
|
loadTimeMs: Math.round(loadTimeMs),
|
|
5467
5012
|
workerLoadTimeMs: Math.round(result.loadTimeMs),
|
|
@@ -5540,7 +5085,7 @@ var SenseVoiceWorker = class {
|
|
|
5540
5085
|
INFERENCE_TIMEOUT_MS
|
|
5541
5086
|
);
|
|
5542
5087
|
const totalTimeMs = performance.now() - startTime;
|
|
5543
|
-
|
|
5088
|
+
logger6.trace("Worker transcription complete", {
|
|
5544
5089
|
text: result.text.substring(0, 50),
|
|
5545
5090
|
language: result.language,
|
|
5546
5091
|
emotion: result.emotion,
|
|
@@ -5576,11 +5121,11 @@ var SenseVoiceWorker = class {
|
|
|
5576
5121
|
} catch (err) {
|
|
5577
5122
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
5578
5123
|
if (errMsg.includes("timed out")) {
|
|
5579
|
-
|
|
5124
|
+
logger6.error("CRITICAL: Worker inference timed out \u2014 SenseVoice worker is dead. Page reload required.", {
|
|
5580
5125
|
timeoutMs: INFERENCE_TIMEOUT_MS
|
|
5581
5126
|
});
|
|
5582
5127
|
} else {
|
|
5583
|
-
|
|
5128
|
+
logger6.error("Worker inference failed", { error: errMsg });
|
|
5584
5129
|
}
|
|
5585
5130
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
5586
5131
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -5618,7 +5163,7 @@ var SenseVoiceWorker = class {
|
|
|
5618
5163
|
};
|
|
5619
5164
|
|
|
5620
5165
|
// src/inference/UnifiedInferenceWorker.ts
|
|
5621
|
-
var
|
|
5166
|
+
var logger7 = createLogger("UnifiedInferenceWorker");
|
|
5622
5167
|
var WASM_CDN_PATH3 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
5623
5168
|
var INIT_TIMEOUT_MS = 15e3;
|
|
5624
5169
|
var SV_LOAD_TIMEOUT_MS = 3e4;
|
|
@@ -6314,7 +5859,7 @@ var UnifiedInferenceWorker = class {
|
|
|
6314
5859
|
const telemetry = getTelemetry();
|
|
6315
5860
|
const span = telemetry?.startSpan("UnifiedInferenceWorker.init");
|
|
6316
5861
|
try {
|
|
6317
|
-
|
|
5862
|
+
logger7.info("Creating unified inference worker...");
|
|
6318
5863
|
this.worker = this.createWorker();
|
|
6319
5864
|
await this.sendMessage(
|
|
6320
5865
|
{ type: "init", wasmPaths: WASM_CDN_PATH3, isIOS: isIOS() },
|
|
@@ -6323,7 +5868,7 @@ var UnifiedInferenceWorker = class {
|
|
|
6323
5868
|
);
|
|
6324
5869
|
this.initialized = true;
|
|
6325
5870
|
const loadTimeMs = performance.now() - startTime;
|
|
6326
|
-
|
|
5871
|
+
logger7.info("Unified worker initialized", { loadTimeMs: Math.round(loadTimeMs) });
|
|
6327
5872
|
span?.setAttributes({ "worker.init_time_ms": loadTimeMs });
|
|
6328
5873
|
span?.end();
|
|
6329
5874
|
} catch (error) {
|
|
@@ -6377,8 +5922,8 @@ var UnifiedInferenceWorker = class {
|
|
|
6377
5922
|
if (!this.worker) return;
|
|
6378
5923
|
await this.sendMessage({ type: "sv:dispose" }, "sv:disposed", DISPOSE_TIMEOUT_MS);
|
|
6379
5924
|
}
|
|
6380
|
-
// ── Wav2ArkitCpu (
|
|
6381
|
-
async
|
|
5925
|
+
// ── Wav2ArkitCpu (A2E) ──────────────────────────────────────────────
|
|
5926
|
+
async loadA2E(config) {
|
|
6382
5927
|
this.assertReady();
|
|
6383
5928
|
const startTime = performance.now();
|
|
6384
5929
|
const result = await this.sendMessage(
|
|
@@ -6399,7 +5944,7 @@ var UnifiedInferenceWorker = class {
|
|
|
6399
5944
|
outputNames: result.outputNames
|
|
6400
5945
|
};
|
|
6401
5946
|
}
|
|
6402
|
-
async
|
|
5947
|
+
async inferA2E(audio) {
|
|
6403
5948
|
this.assertReady();
|
|
6404
5949
|
return this.sendMessage(
|
|
6405
5950
|
{ type: "cpu:infer", audio },
|
|
@@ -6407,7 +5952,7 @@ var UnifiedInferenceWorker = class {
|
|
|
6407
5952
|
CPU_INFER_TIMEOUT_MS
|
|
6408
5953
|
);
|
|
6409
5954
|
}
|
|
6410
|
-
async
|
|
5955
|
+
async disposeA2E() {
|
|
6411
5956
|
if (!this.worker) return;
|
|
6412
5957
|
await this.sendMessage({ type: "cpu:dispose" }, "cpu:disposed", DISPOSE_TIMEOUT_MS);
|
|
6413
5958
|
}
|
|
@@ -6497,7 +6042,7 @@ var UnifiedInferenceWorker = class {
|
|
|
6497
6042
|
this.handleWorkerMessage(event.data);
|
|
6498
6043
|
};
|
|
6499
6044
|
worker.onerror = (error) => {
|
|
6500
|
-
|
|
6045
|
+
logger7.error("Unified worker error", { error: error.message });
|
|
6501
6046
|
this.rejectAllPending(`Worker error: ${error.message}`);
|
|
6502
6047
|
};
|
|
6503
6048
|
return worker;
|
|
@@ -6511,7 +6056,7 @@ var UnifiedInferenceWorker = class {
|
|
|
6511
6056
|
this.pendingRequests.delete(requestId);
|
|
6512
6057
|
pending.reject(new Error(data.error));
|
|
6513
6058
|
} else {
|
|
6514
|
-
|
|
6059
|
+
logger7.error("Worker broadcast error", { error: data.error });
|
|
6515
6060
|
this.rejectAllPending(data.error);
|
|
6516
6061
|
}
|
|
6517
6062
|
return;
|
|
@@ -6533,7 +6078,7 @@ var UnifiedInferenceWorker = class {
|
|
|
6533
6078
|
const timeout = setTimeout(() => {
|
|
6534
6079
|
this.pendingRequests.delete(requestId);
|
|
6535
6080
|
this.poisoned = true;
|
|
6536
|
-
|
|
6081
|
+
logger7.error("CRITICAL: Worker operation timed out \u2014 worker is dead", {
|
|
6537
6082
|
type: message.type,
|
|
6538
6083
|
timeoutMs
|
|
6539
6084
|
});
|
|
@@ -6599,7 +6144,7 @@ var SenseVoiceUnifiedAdapter = class {
|
|
|
6599
6144
|
});
|
|
6600
6145
|
this._isLoaded = true;
|
|
6601
6146
|
onProgress?.(1, 1);
|
|
6602
|
-
|
|
6147
|
+
logger7.info("SenseVoice loaded via unified worker", {
|
|
6603
6148
|
backend: "wasm",
|
|
6604
6149
|
loadTimeMs: Math.round(result.loadTimeMs),
|
|
6605
6150
|
vocabSize: result.vocabSize
|
|
@@ -6640,6 +6185,7 @@ var SenseVoiceUnifiedAdapter = class {
|
|
|
6640
6185
|
var Wav2ArkitCpuUnifiedAdapter = class {
|
|
6641
6186
|
constructor(worker, config) {
|
|
6642
6187
|
this.modelId = "wav2arkit_cpu";
|
|
6188
|
+
this.chunkSize = 16e3;
|
|
6643
6189
|
this._isLoaded = false;
|
|
6644
6190
|
this.inferenceQueue = Promise.resolve();
|
|
6645
6191
|
this.worker = worker;
|
|
@@ -6658,12 +6204,12 @@ var Wav2ArkitCpuUnifiedAdapter = class {
|
|
|
6658
6204
|
});
|
|
6659
6205
|
try {
|
|
6660
6206
|
const externalDataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${this.config.modelUrl}.data` : null;
|
|
6661
|
-
const result = await this.worker.
|
|
6207
|
+
const result = await this.worker.loadA2E({
|
|
6662
6208
|
modelUrl: this.config.modelUrl,
|
|
6663
6209
|
externalDataUrl: externalDataUrl || null
|
|
6664
6210
|
});
|
|
6665
6211
|
this._isLoaded = true;
|
|
6666
|
-
|
|
6212
|
+
logger7.info("Wav2ArkitCpu loaded via unified worker", {
|
|
6667
6213
|
backend: "wasm",
|
|
6668
6214
|
loadTimeMs: Math.round(result.loadTimeMs)
|
|
6669
6215
|
});
|
|
@@ -6690,7 +6236,7 @@ var Wav2ArkitCpuUnifiedAdapter = class {
|
|
|
6690
6236
|
});
|
|
6691
6237
|
try {
|
|
6692
6238
|
const startTime = performance.now();
|
|
6693
|
-
const result = await this.worker.
|
|
6239
|
+
const result = await this.worker.inferA2E(audioCopy);
|
|
6694
6240
|
const inferenceTimeMs = performance.now() - startTime;
|
|
6695
6241
|
const flatBuffer = result.blendshapes;
|
|
6696
6242
|
const { numFrames, numBlendshapes } = result;
|
|
@@ -6713,7 +6259,7 @@ var Wav2ArkitCpuUnifiedAdapter = class {
|
|
|
6713
6259
|
}
|
|
6714
6260
|
async dispose() {
|
|
6715
6261
|
if (this._isLoaded) {
|
|
6716
|
-
await this.worker.
|
|
6262
|
+
await this.worker.disposeA2E();
|
|
6717
6263
|
this._isLoaded = false;
|
|
6718
6264
|
}
|
|
6719
6265
|
}
|
|
@@ -6769,7 +6315,7 @@ var SileroVADUnifiedAdapter = class {
|
|
|
6769
6315
|
sampleRate: this.config.sampleRate
|
|
6770
6316
|
});
|
|
6771
6317
|
this._isLoaded = true;
|
|
6772
|
-
|
|
6318
|
+
logger7.info("SileroVAD loaded via unified worker", {
|
|
6773
6319
|
backend: "wasm",
|
|
6774
6320
|
loadTimeMs: Math.round(result.loadTimeMs),
|
|
6775
6321
|
sampleRate: this.config.sampleRate,
|
|
@@ -6850,10 +6396,10 @@ var SileroVADUnifiedAdapter = class {
|
|
|
6850
6396
|
};
|
|
6851
6397
|
|
|
6852
6398
|
// src/inference/createSenseVoice.ts
|
|
6853
|
-
var
|
|
6399
|
+
var logger8 = createLogger("createSenseVoice");
|
|
6854
6400
|
function createSenseVoice(config) {
|
|
6855
6401
|
if (config.unifiedWorker) {
|
|
6856
|
-
|
|
6402
|
+
logger8.info("Creating SenseVoiceUnifiedAdapter (shared unified worker)");
|
|
6857
6403
|
return new SenseVoiceUnifiedAdapter(config.unifiedWorker, {
|
|
6858
6404
|
modelUrl: config.modelUrl,
|
|
6859
6405
|
tokensUrl: config.tokensUrl,
|
|
@@ -6866,7 +6412,7 @@ function createSenseVoice(config) {
|
|
|
6866
6412
|
if (!SenseVoiceWorker.isSupported()) {
|
|
6867
6413
|
throw new Error("Web Workers are not supported in this environment");
|
|
6868
6414
|
}
|
|
6869
|
-
|
|
6415
|
+
logger8.info("Creating SenseVoiceWorker (off-main-thread)");
|
|
6870
6416
|
return new SenseVoiceWorker({
|
|
6871
6417
|
modelUrl: config.modelUrl,
|
|
6872
6418
|
tokensUrl: config.tokensUrl,
|
|
@@ -6875,7 +6421,7 @@ function createSenseVoice(config) {
|
|
|
6875
6421
|
});
|
|
6876
6422
|
}
|
|
6877
6423
|
if (useWorker === false) {
|
|
6878
|
-
|
|
6424
|
+
logger8.info("Creating SenseVoiceInference (main thread)");
|
|
6879
6425
|
return new SenseVoiceInference({
|
|
6880
6426
|
modelUrl: config.modelUrl,
|
|
6881
6427
|
tokensUrl: config.tokensUrl,
|
|
@@ -6884,7 +6430,7 @@ function createSenseVoice(config) {
|
|
|
6884
6430
|
});
|
|
6885
6431
|
}
|
|
6886
6432
|
if (SenseVoiceWorker.isSupported() && !isIOS()) {
|
|
6887
|
-
|
|
6433
|
+
logger8.info("Auto-detected: creating SenseVoiceWorker (off-main-thread)");
|
|
6888
6434
|
return new SenseVoiceWorker({
|
|
6889
6435
|
modelUrl: config.modelUrl,
|
|
6890
6436
|
tokensUrl: config.tokensUrl,
|
|
@@ -6892,7 +6438,7 @@ function createSenseVoice(config) {
|
|
|
6892
6438
|
textNorm: config.textNorm
|
|
6893
6439
|
});
|
|
6894
6440
|
}
|
|
6895
|
-
|
|
6441
|
+
logger8.info("Auto-detected: creating SenseVoiceInference (main thread)", {
|
|
6896
6442
|
reason: isIOS() ? "iOS (shared ORT instance)" : "Worker unsupported"
|
|
6897
6443
|
});
|
|
6898
6444
|
return new SenseVoiceInference({
|
|
@@ -6904,10 +6450,11 @@ function createSenseVoice(config) {
|
|
|
6904
6450
|
}
|
|
6905
6451
|
|
|
6906
6452
|
// src/inference/Wav2ArkitCpuInference.ts
|
|
6907
|
-
var
|
|
6453
|
+
var logger9 = createLogger("Wav2ArkitCpu");
|
|
6908
6454
|
var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
6909
6455
|
constructor(config) {
|
|
6910
6456
|
this.modelId = "wav2arkit_cpu";
|
|
6457
|
+
this.chunkSize = 16e3;
|
|
6911
6458
|
this.session = null;
|
|
6912
6459
|
this.ort = null;
|
|
6913
6460
|
this._backend = "wasm";
|
|
@@ -6945,16 +6492,16 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6945
6492
|
});
|
|
6946
6493
|
try {
|
|
6947
6494
|
const preference = this.config.backend || "wasm";
|
|
6948
|
-
|
|
6495
|
+
logger9.info("Loading ONNX Runtime...", { preference });
|
|
6949
6496
|
const { ort, backend } = await getOnnxRuntimeForPreference(preference);
|
|
6950
6497
|
this.ort = ort;
|
|
6951
6498
|
this._backend = backend;
|
|
6952
|
-
|
|
6499
|
+
logger9.info("ONNX Runtime loaded", { backend: this._backend });
|
|
6953
6500
|
const modelUrl = this.config.modelUrl;
|
|
6954
6501
|
const dataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${modelUrl}.data` : null;
|
|
6955
6502
|
const sessionOptions = getSessionOptions(this._backend);
|
|
6956
6503
|
if (isIOS()) {
|
|
6957
|
-
|
|
6504
|
+
logger9.info("iOS: passing model URLs directly to ORT (low-memory path)", {
|
|
6958
6505
|
modelUrl,
|
|
6959
6506
|
dataUrl
|
|
6960
6507
|
});
|
|
@@ -6972,15 +6519,15 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6972
6519
|
const isCached = await cache.has(modelUrl);
|
|
6973
6520
|
let modelBuffer;
|
|
6974
6521
|
if (isCached) {
|
|
6975
|
-
|
|
6522
|
+
logger9.debug("Loading model from cache", { modelUrl });
|
|
6976
6523
|
modelBuffer = await cache.get(modelUrl);
|
|
6977
6524
|
if (!modelBuffer) {
|
|
6978
|
-
|
|
6525
|
+
logger9.warn("Cache corruption detected, clearing and retrying", { modelUrl });
|
|
6979
6526
|
await cache.delete(modelUrl);
|
|
6980
6527
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
6981
6528
|
}
|
|
6982
6529
|
} else {
|
|
6983
|
-
|
|
6530
|
+
logger9.debug("Fetching and caching model graph", { modelUrl });
|
|
6984
6531
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
6985
6532
|
}
|
|
6986
6533
|
if (!modelBuffer) {
|
|
@@ -6991,31 +6538,31 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6991
6538
|
try {
|
|
6992
6539
|
const isDataCached = await cache.has(dataUrl);
|
|
6993
6540
|
if (isDataCached) {
|
|
6994
|
-
|
|
6541
|
+
logger9.debug("Loading external data from cache", { dataUrl });
|
|
6995
6542
|
externalDataBuffer = await cache.get(dataUrl);
|
|
6996
6543
|
if (!externalDataBuffer) {
|
|
6997
|
-
|
|
6544
|
+
logger9.warn("Cache corruption for external data, retrying", { dataUrl });
|
|
6998
6545
|
await cache.delete(dataUrl);
|
|
6999
6546
|
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
7000
6547
|
}
|
|
7001
6548
|
} else {
|
|
7002
|
-
|
|
6549
|
+
logger9.info("Fetching external model data", {
|
|
7003
6550
|
dataUrl,
|
|
7004
6551
|
note: "This may be a large download (400MB+)"
|
|
7005
6552
|
});
|
|
7006
6553
|
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
7007
6554
|
}
|
|
7008
|
-
|
|
6555
|
+
logger9.info("External data loaded", {
|
|
7009
6556
|
size: formatBytes(externalDataBuffer.byteLength)
|
|
7010
6557
|
});
|
|
7011
6558
|
} catch (err) {
|
|
7012
|
-
|
|
6559
|
+
logger9.debug("No external data file found (single-file model)", {
|
|
7013
6560
|
dataUrl,
|
|
7014
6561
|
error: err.message
|
|
7015
6562
|
});
|
|
7016
6563
|
}
|
|
7017
6564
|
}
|
|
7018
|
-
|
|
6565
|
+
logger9.debug("Creating ONNX session", {
|
|
7019
6566
|
graphSize: formatBytes(modelBuffer.byteLength),
|
|
7020
6567
|
externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
|
|
7021
6568
|
backend: this._backend
|
|
@@ -7031,7 +6578,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
7031
6578
|
this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
|
|
7032
6579
|
}
|
|
7033
6580
|
const loadTimeMs = performance.now() - startTime;
|
|
7034
|
-
|
|
6581
|
+
logger9.info("Model loaded successfully", {
|
|
7035
6582
|
backend: this._backend,
|
|
7036
6583
|
loadTimeMs: Math.round(loadTimeMs),
|
|
7037
6584
|
inputs: this.session.inputNames,
|
|
@@ -7047,12 +6594,12 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
7047
6594
|
model: "wav2arkit_cpu",
|
|
7048
6595
|
backend: this._backend
|
|
7049
6596
|
});
|
|
7050
|
-
|
|
6597
|
+
logger9.debug("Running warmup inference");
|
|
7051
6598
|
const warmupStart = performance.now();
|
|
7052
6599
|
const silentAudio = new Float32Array(16e3);
|
|
7053
6600
|
await this.infer(silentAudio);
|
|
7054
6601
|
const warmupTimeMs = performance.now() - warmupStart;
|
|
7055
|
-
|
|
6602
|
+
logger9.info("Warmup inference complete", {
|
|
7056
6603
|
warmupTimeMs: Math.round(warmupTimeMs),
|
|
7057
6604
|
backend: this._backend
|
|
7058
6605
|
});
|
|
@@ -7139,7 +6686,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
7139
6686
|
const symmetrized = symmetrizeBlendshapes(rawFrame);
|
|
7140
6687
|
blendshapes.push(symmetrized);
|
|
7141
6688
|
}
|
|
7142
|
-
|
|
6689
|
+
logger9.trace("Inference completed", {
|
|
7143
6690
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
7144
6691
|
numFrames,
|
|
7145
6692
|
inputSamples
|
|
@@ -7167,7 +6714,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
7167
6714
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
7168
6715
|
if (errMsg.includes("timed out")) {
|
|
7169
6716
|
this.poisoned = true;
|
|
7170
|
-
|
|
6717
|
+
logger9.error("CRITICAL: Inference session timed out \u2014 Wav2ArkitCpu is dead. Page reload required.", {
|
|
7171
6718
|
backend: this._backend,
|
|
7172
6719
|
timeoutMs: _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS
|
|
7173
6720
|
});
|
|
@@ -7175,7 +6722,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
7175
6722
|
const oomError = new Error(
|
|
7176
6723
|
`Wav2ArkitCpu inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
|
|
7177
6724
|
);
|
|
7178
|
-
|
|
6725
|
+
logger9.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
7179
6726
|
pointer: `0x${err.toString(16)}`,
|
|
7180
6727
|
backend: this._backend
|
|
7181
6728
|
});
|
|
@@ -7188,7 +6735,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
7188
6735
|
reject(oomError);
|
|
7189
6736
|
return;
|
|
7190
6737
|
} else {
|
|
7191
|
-
|
|
6738
|
+
logger9.error("Inference failed", { error: errMsg, backend: this._backend });
|
|
7192
6739
|
}
|
|
7193
6740
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
7194
6741
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -7215,7 +6762,7 @@ _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS = 5e3;
|
|
|
7215
6762
|
var Wav2ArkitCpuInference = _Wav2ArkitCpuInference;
|
|
7216
6763
|
|
|
7217
6764
|
// src/inference/Wav2ArkitCpuWorker.ts
|
|
7218
|
-
var
|
|
6765
|
+
var logger10 = createLogger("Wav2ArkitCpuWorker");
|
|
7219
6766
|
var WASM_CDN_PATH4 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
7220
6767
|
var LOAD_TIMEOUT_MS2 = 6e4;
|
|
7221
6768
|
var INFERENCE_TIMEOUT_MS2 = 5e3;
|
|
@@ -7461,6 +7008,7 @@ self.onerror = function(err) {
|
|
|
7461
7008
|
var Wav2ArkitCpuWorker = class {
|
|
7462
7009
|
constructor(config) {
|
|
7463
7010
|
this.modelId = "wav2arkit_cpu";
|
|
7011
|
+
this.chunkSize = 16e3;
|
|
7464
7012
|
this.worker = null;
|
|
7465
7013
|
this.isLoading = false;
|
|
7466
7014
|
this._isLoaded = false;
|
|
@@ -7495,7 +7043,7 @@ var Wav2ArkitCpuWorker = class {
|
|
|
7495
7043
|
this.handleWorkerMessage(event.data);
|
|
7496
7044
|
};
|
|
7497
7045
|
worker.onerror = (error) => {
|
|
7498
|
-
|
|
7046
|
+
logger10.error("Worker error", { error: error.message });
|
|
7499
7047
|
for (const [, resolver] of this.pendingResolvers) {
|
|
7500
7048
|
resolver.reject(new Error(`Worker error: ${error.message}`));
|
|
7501
7049
|
}
|
|
@@ -7571,10 +7119,10 @@ var Wav2ArkitCpuWorker = class {
|
|
|
7571
7119
|
"model.backend_requested": "wasm"
|
|
7572
7120
|
});
|
|
7573
7121
|
try {
|
|
7574
|
-
|
|
7122
|
+
logger10.info("Creating wav2arkit_cpu worker...");
|
|
7575
7123
|
this.worker = this.createWorker();
|
|
7576
7124
|
const externalDataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${this.config.modelUrl}.data` : null;
|
|
7577
|
-
|
|
7125
|
+
logger10.info("Loading model in worker...", {
|
|
7578
7126
|
modelUrl: this.config.modelUrl,
|
|
7579
7127
|
externalDataUrl,
|
|
7580
7128
|
isIOS: isIOS()
|
|
@@ -7592,7 +7140,7 @@ var Wav2ArkitCpuWorker = class {
|
|
|
7592
7140
|
);
|
|
7593
7141
|
this._isLoaded = true;
|
|
7594
7142
|
const loadTimeMs = performance.now() - startTime;
|
|
7595
|
-
|
|
7143
|
+
logger10.info("Wav2ArkitCpu worker loaded successfully", {
|
|
7596
7144
|
backend: "wasm",
|
|
7597
7145
|
loadTimeMs: Math.round(loadTimeMs),
|
|
7598
7146
|
workerLoadTimeMs: Math.round(result.loadTimeMs),
|
|
@@ -7677,7 +7225,7 @@ var Wav2ArkitCpuWorker = class {
|
|
|
7677
7225
|
for (let f = 0; f < numFrames; f++) {
|
|
7678
7226
|
blendshapes.push(flatBuffer.slice(f * numBlendshapes, (f + 1) * numBlendshapes));
|
|
7679
7227
|
}
|
|
7680
|
-
|
|
7228
|
+
logger10.trace("Worker inference completed", {
|
|
7681
7229
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
7682
7230
|
workerTimeMs: Math.round(result.inferenceTimeMs * 100) / 100,
|
|
7683
7231
|
numFrames,
|
|
@@ -7707,12 +7255,12 @@ var Wav2ArkitCpuWorker = class {
|
|
|
7707
7255
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
7708
7256
|
if (errMsg.includes("timed out")) {
|
|
7709
7257
|
this.poisoned = true;
|
|
7710
|
-
|
|
7258
|
+
logger10.error("CRITICAL: Worker inference timed out \u2014 Wav2ArkitCpu worker is dead. Page reload required.", {
|
|
7711
7259
|
backend: "wasm",
|
|
7712
7260
|
timeoutMs: INFERENCE_TIMEOUT_MS2
|
|
7713
7261
|
});
|
|
7714
7262
|
} else {
|
|
7715
|
-
|
|
7263
|
+
logger10.error("Worker inference failed", { error: errMsg, backend: "wasm" });
|
|
7716
7264
|
}
|
|
7717
7265
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
7718
7266
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -7749,39 +7297,39 @@ var Wav2ArkitCpuWorker = class {
|
|
|
7749
7297
|
}
|
|
7750
7298
|
};
|
|
7751
7299
|
|
|
7752
|
-
// src/inference/
|
|
7753
|
-
var
|
|
7754
|
-
function
|
|
7300
|
+
// src/inference/createA2E.ts
|
|
7301
|
+
var logger11 = createLogger("createA2E");
|
|
7302
|
+
function createA2E(config) {
|
|
7755
7303
|
const mode = config.mode ?? "auto";
|
|
7756
7304
|
const fallbackOnError = config.fallbackOnError ?? true;
|
|
7757
7305
|
let useCpu;
|
|
7758
7306
|
if (mode === "cpu") {
|
|
7759
7307
|
useCpu = true;
|
|
7760
|
-
|
|
7308
|
+
logger11.info("Forcing CPU A2E model (wav2arkit_cpu)");
|
|
7761
7309
|
} else if (mode === "gpu") {
|
|
7762
7310
|
useCpu = false;
|
|
7763
|
-
|
|
7311
|
+
logger11.info("Forcing GPU A2E model (Wav2Vec2)");
|
|
7764
7312
|
} else {
|
|
7765
|
-
useCpu =
|
|
7766
|
-
|
|
7313
|
+
useCpu = shouldUseCpuA2E();
|
|
7314
|
+
logger11.info("Auto-detected A2E model", {
|
|
7767
7315
|
useCpu,
|
|
7768
7316
|
isSafari: isSafari()
|
|
7769
7317
|
});
|
|
7770
7318
|
}
|
|
7771
7319
|
if (useCpu) {
|
|
7772
7320
|
if (config.unifiedWorker) {
|
|
7773
|
-
|
|
7321
|
+
logger11.info("Creating Wav2ArkitCpuUnifiedAdapter (404MB, WASM, shared unified worker)");
|
|
7774
7322
|
return new Wav2ArkitCpuUnifiedAdapter(config.unifiedWorker, {
|
|
7775
7323
|
modelUrl: config.cpuModelUrl
|
|
7776
7324
|
});
|
|
7777
7325
|
}
|
|
7778
7326
|
if (config.useWorker && Wav2ArkitCpuWorker.isSupported() && !isIOS()) {
|
|
7779
|
-
|
|
7327
|
+
logger11.info("Creating Wav2ArkitCpuWorker (404MB, WASM, off-main-thread)");
|
|
7780
7328
|
return new Wav2ArkitCpuWorker({
|
|
7781
7329
|
modelUrl: config.cpuModelUrl
|
|
7782
7330
|
});
|
|
7783
7331
|
}
|
|
7784
|
-
|
|
7332
|
+
logger11.info("Creating Wav2ArkitCpuInference (404MB, WASM)");
|
|
7785
7333
|
return new Wav2ArkitCpuInference({
|
|
7786
7334
|
modelUrl: config.cpuModelUrl
|
|
7787
7335
|
});
|
|
@@ -7793,13 +7341,13 @@ function createLipSync(config) {
|
|
|
7793
7341
|
numIdentityClasses: config.numIdentityClasses
|
|
7794
7342
|
});
|
|
7795
7343
|
if (fallbackOnError) {
|
|
7796
|
-
|
|
7797
|
-
return new
|
|
7344
|
+
logger11.info("Creating Wav2Vec2Inference with CPU fallback");
|
|
7345
|
+
return new A2EWithFallback(gpuInstance, config);
|
|
7798
7346
|
}
|
|
7799
|
-
|
|
7347
|
+
logger11.info("Creating Wav2Vec2Inference (no fallback)");
|
|
7800
7348
|
return gpuInstance;
|
|
7801
7349
|
}
|
|
7802
|
-
var
|
|
7350
|
+
var A2EWithFallback = class {
|
|
7803
7351
|
constructor(gpuInstance, config) {
|
|
7804
7352
|
this.hasFallenBack = false;
|
|
7805
7353
|
this.implementation = gpuInstance;
|
|
@@ -7808,6 +7356,9 @@ var LipSyncWithFallback = class {
|
|
|
7808
7356
|
get modelId() {
|
|
7809
7357
|
return this.implementation.modelId;
|
|
7810
7358
|
}
|
|
7359
|
+
get chunkSize() {
|
|
7360
|
+
return this.implementation.chunkSize;
|
|
7361
|
+
}
|
|
7811
7362
|
get backend() {
|
|
7812
7363
|
return this.implementation.backend;
|
|
7813
7364
|
}
|
|
@@ -7822,7 +7373,7 @@ var LipSyncWithFallback = class {
|
|
|
7822
7373
|
}
|
|
7823
7374
|
}
|
|
7824
7375
|
async fallbackToCpu(reason) {
|
|
7825
|
-
|
|
7376
|
+
logger11.warn("GPU model load failed, falling back to CPU model", { reason });
|
|
7826
7377
|
try {
|
|
7827
7378
|
await this.implementation.dispose();
|
|
7828
7379
|
} catch {
|
|
@@ -7831,17 +7382,17 @@ var LipSyncWithFallback = class {
|
|
|
7831
7382
|
this.implementation = new Wav2ArkitCpuUnifiedAdapter(this.config.unifiedWorker, {
|
|
7832
7383
|
modelUrl: this.config.cpuModelUrl
|
|
7833
7384
|
});
|
|
7834
|
-
|
|
7385
|
+
logger11.info("Fallback to Wav2ArkitCpuUnifiedAdapter successful");
|
|
7835
7386
|
} else if (this.config.useWorker && Wav2ArkitCpuWorker.isSupported() && !isIOS()) {
|
|
7836
7387
|
this.implementation = new Wav2ArkitCpuWorker({
|
|
7837
7388
|
modelUrl: this.config.cpuModelUrl
|
|
7838
7389
|
});
|
|
7839
|
-
|
|
7390
|
+
logger11.info("Fallback to Wav2ArkitCpuWorker successful");
|
|
7840
7391
|
} else {
|
|
7841
7392
|
this.implementation = new Wav2ArkitCpuInference({
|
|
7842
7393
|
modelUrl: this.config.cpuModelUrl
|
|
7843
7394
|
});
|
|
7844
|
-
|
|
7395
|
+
logger11.info("Fallback to Wav2ArkitCpuInference successful");
|
|
7845
7396
|
}
|
|
7846
7397
|
this.hasFallenBack = true;
|
|
7847
7398
|
return await this.implementation.load();
|
|
@@ -7854,8 +7405,124 @@ var LipSyncWithFallback = class {
|
|
|
7854
7405
|
}
|
|
7855
7406
|
};
|
|
7856
7407
|
|
|
7408
|
+
// src/animation/audioEnergy.ts
|
|
7409
|
+
function calculateRMS(samples) {
|
|
7410
|
+
if (samples.length === 0) return 0;
|
|
7411
|
+
let sumSquares = 0;
|
|
7412
|
+
for (let i = 0; i < samples.length; i++) {
|
|
7413
|
+
sumSquares += samples[i] * samples[i];
|
|
7414
|
+
}
|
|
7415
|
+
return Math.sqrt(sumSquares / samples.length);
|
|
7416
|
+
}
|
|
7417
|
+
function calculatePeak(samples) {
|
|
7418
|
+
let peak = 0;
|
|
7419
|
+
for (let i = 0; i < samples.length; i++) {
|
|
7420
|
+
const abs = Math.abs(samples[i]);
|
|
7421
|
+
if (abs > peak) peak = abs;
|
|
7422
|
+
}
|
|
7423
|
+
return peak;
|
|
7424
|
+
}
|
|
7425
|
+
var AudioEnergyAnalyzer = class {
|
|
7426
|
+
/**
|
|
7427
|
+
* @param smoothingFactor How much to smooth (0 = no smoothing, 1 = infinite smoothing). Default 0.85
|
|
7428
|
+
* @param noiseFloor Minimum energy threshold to consider as signal. Default 0.01
|
|
7429
|
+
*/
|
|
7430
|
+
constructor(smoothingFactor = 0.85, noiseFloor = 0.01) {
|
|
7431
|
+
this.smoothedRMS = 0;
|
|
7432
|
+
this.smoothedPeak = 0;
|
|
7433
|
+
this.smoothingFactor = Math.max(0, Math.min(0.99, smoothingFactor));
|
|
7434
|
+
this.noiseFloor = noiseFloor;
|
|
7435
|
+
}
|
|
7436
|
+
/**
|
|
7437
|
+
* Process audio samples and return smoothed energy values
|
|
7438
|
+
* @param samples Audio samples (Float32Array)
|
|
7439
|
+
* @returns Object with rms and peak values
|
|
7440
|
+
*/
|
|
7441
|
+
process(samples) {
|
|
7442
|
+
const instantRMS = calculateRMS(samples);
|
|
7443
|
+
const instantPeak = calculatePeak(samples);
|
|
7444
|
+
const gatedRMS = instantRMS > this.noiseFloor ? instantRMS : 0;
|
|
7445
|
+
const gatedPeak = instantPeak > this.noiseFloor ? instantPeak : 0;
|
|
7446
|
+
if (gatedRMS > this.smoothedRMS) {
|
|
7447
|
+
this.smoothedRMS = this.smoothedRMS * 0.5 + gatedRMS * 0.5;
|
|
7448
|
+
} else {
|
|
7449
|
+
this.smoothedRMS = this.smoothedRMS * this.smoothingFactor + gatedRMS * (1 - this.smoothingFactor);
|
|
7450
|
+
}
|
|
7451
|
+
if (gatedPeak > this.smoothedPeak) {
|
|
7452
|
+
this.smoothedPeak = this.smoothedPeak * 0.3 + gatedPeak * 0.7;
|
|
7453
|
+
} else {
|
|
7454
|
+
this.smoothedPeak = this.smoothedPeak * this.smoothingFactor + gatedPeak * (1 - this.smoothingFactor);
|
|
7455
|
+
}
|
|
7456
|
+
const energy = this.smoothedRMS * 0.7 + this.smoothedPeak * 0.3;
|
|
7457
|
+
return {
|
|
7458
|
+
rms: this.smoothedRMS,
|
|
7459
|
+
peak: this.smoothedPeak,
|
|
7460
|
+
energy: Math.min(1, energy * 2)
|
|
7461
|
+
// Scale up and clamp
|
|
7462
|
+
};
|
|
7463
|
+
}
|
|
7464
|
+
/**
|
|
7465
|
+
* Reset analyzer state
|
|
7466
|
+
*/
|
|
7467
|
+
reset() {
|
|
7468
|
+
this.smoothedRMS = 0;
|
|
7469
|
+
this.smoothedPeak = 0;
|
|
7470
|
+
}
|
|
7471
|
+
/**
|
|
7472
|
+
* Get current smoothed RMS value
|
|
7473
|
+
*/
|
|
7474
|
+
get rms() {
|
|
7475
|
+
return this.smoothedRMS;
|
|
7476
|
+
}
|
|
7477
|
+
/**
|
|
7478
|
+
* Get current smoothed peak value
|
|
7479
|
+
*/
|
|
7480
|
+
get peak() {
|
|
7481
|
+
return this.smoothedPeak;
|
|
7482
|
+
}
|
|
7483
|
+
};
|
|
7484
|
+
var EmphasisDetector = class {
|
|
7485
|
+
/**
|
|
7486
|
+
* @param historySize Number of frames to track. Default 10
|
|
7487
|
+
* @param emphasisThreshold Minimum energy increase to count as emphasis. Default 0.15
|
|
7488
|
+
*/
|
|
7489
|
+
constructor(historySize = 10, emphasisThreshold = 0.15) {
|
|
7490
|
+
this.energyHistory = [];
|
|
7491
|
+
this.historySize = historySize;
|
|
7492
|
+
this.emphasisThreshold = emphasisThreshold;
|
|
7493
|
+
}
|
|
7494
|
+
/**
|
|
7495
|
+
* Process energy value and detect emphasis
|
|
7496
|
+
* @param energy Current energy value (0-1)
|
|
7497
|
+
* @returns Object with isEmphasis flag and emphasisStrength
|
|
7498
|
+
*/
|
|
7499
|
+
process(energy) {
|
|
7500
|
+
this.energyHistory.push(energy);
|
|
7501
|
+
if (this.energyHistory.length > this.historySize) {
|
|
7502
|
+
this.energyHistory.shift();
|
|
7503
|
+
}
|
|
7504
|
+
if (this.energyHistory.length < 3) {
|
|
7505
|
+
return { isEmphasis: false, emphasisStrength: 0 };
|
|
7506
|
+
}
|
|
7507
|
+
const prevFrames = this.energyHistory.slice(0, -1);
|
|
7508
|
+
const avgPrev = prevFrames.reduce((a, b) => a + b, 0) / prevFrames.length;
|
|
7509
|
+
const increase = energy - avgPrev;
|
|
7510
|
+
const isEmphasis = increase > this.emphasisThreshold;
|
|
7511
|
+
return {
|
|
7512
|
+
isEmphasis,
|
|
7513
|
+
emphasisStrength: isEmphasis ? Math.min(1, increase / 0.3) : 0
|
|
7514
|
+
};
|
|
7515
|
+
}
|
|
7516
|
+
/**
|
|
7517
|
+
* Reset detector state
|
|
7518
|
+
*/
|
|
7519
|
+
reset() {
|
|
7520
|
+
this.energyHistory = [];
|
|
7521
|
+
}
|
|
7522
|
+
};
|
|
7523
|
+
|
|
7857
7524
|
// src/inference/SileroVADInference.ts
|
|
7858
|
-
var
|
|
7525
|
+
var logger12 = createLogger("SileroVAD");
|
|
7859
7526
|
var SileroVADInference = class {
|
|
7860
7527
|
constructor(config) {
|
|
7861
7528
|
this.session = null;
|
|
@@ -7929,23 +7596,23 @@ var SileroVADInference = class {
|
|
|
7929
7596
|
"model.sample_rate": this.config.sampleRate
|
|
7930
7597
|
});
|
|
7931
7598
|
try {
|
|
7932
|
-
|
|
7599
|
+
logger12.info("Loading ONNX Runtime...", { preference: this.config.backend });
|
|
7933
7600
|
const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
|
|
7934
7601
|
this.ort = ort;
|
|
7935
7602
|
this._backend = backend;
|
|
7936
|
-
|
|
7603
|
+
logger12.info("ONNX Runtime loaded", { backend: this._backend });
|
|
7937
7604
|
const cache = getModelCache();
|
|
7938
7605
|
const modelUrl = this.config.modelUrl;
|
|
7939
7606
|
const isCached = await cache.has(modelUrl);
|
|
7940
7607
|
let modelBuffer;
|
|
7941
7608
|
if (isCached) {
|
|
7942
|
-
|
|
7609
|
+
logger12.debug("Loading model from cache", { modelUrl });
|
|
7943
7610
|
modelBuffer = await cache.get(modelUrl);
|
|
7944
7611
|
} else {
|
|
7945
|
-
|
|
7612
|
+
logger12.debug("Fetching and caching model", { modelUrl });
|
|
7946
7613
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
7947
7614
|
}
|
|
7948
|
-
|
|
7615
|
+
logger12.debug("Creating ONNX session", {
|
|
7949
7616
|
size: formatBytes(modelBuffer.byteLength),
|
|
7950
7617
|
backend: this._backend
|
|
7951
7618
|
});
|
|
@@ -7954,7 +7621,7 @@ var SileroVADInference = class {
|
|
|
7954
7621
|
this.session = await ort.InferenceSession.create(modelData, sessionOptions);
|
|
7955
7622
|
this.reset();
|
|
7956
7623
|
const loadTimeMs = performance.now() - startTime;
|
|
7957
|
-
|
|
7624
|
+
logger12.info("Model loaded successfully", {
|
|
7958
7625
|
backend: this._backend,
|
|
7959
7626
|
loadTimeMs: Math.round(loadTimeMs),
|
|
7960
7627
|
sampleRate: this.config.sampleRate,
|
|
@@ -8009,7 +7676,7 @@ var SileroVADInference = class {
|
|
|
8009
7676
|
[]
|
|
8010
7677
|
);
|
|
8011
7678
|
} catch (e) {
|
|
8012
|
-
|
|
7679
|
+
logger12.warn("BigInt64Array not available, using bigint array fallback", {
|
|
8013
7680
|
error: e instanceof Error ? e.message : String(e)
|
|
8014
7681
|
});
|
|
8015
7682
|
this.srTensor = new this.ort.Tensor(
|
|
@@ -8115,7 +7782,7 @@ var SileroVADInference = class {
|
|
|
8115
7782
|
this.preSpeechBuffer.shift();
|
|
8116
7783
|
}
|
|
8117
7784
|
}
|
|
8118
|
-
|
|
7785
|
+
logger12.trace("Skipping VAD inference - audio too quiet", {
|
|
8119
7786
|
rms: Math.round(rms * 1e4) / 1e4,
|
|
8120
7787
|
threshold: MIN_ENERGY_THRESHOLD
|
|
8121
7788
|
});
|
|
@@ -8169,7 +7836,7 @@ var SileroVADInference = class {
|
|
|
8169
7836
|
if (isSpeech && !this.wasSpeaking) {
|
|
8170
7837
|
preSpeechChunks = [...this.preSpeechBuffer];
|
|
8171
7838
|
this.preSpeechBuffer = [];
|
|
8172
|
-
|
|
7839
|
+
logger12.debug("Speech started with pre-speech buffer", {
|
|
8173
7840
|
preSpeechChunks: preSpeechChunks.length,
|
|
8174
7841
|
durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
|
|
8175
7842
|
});
|
|
@@ -8182,7 +7849,7 @@ var SileroVADInference = class {
|
|
|
8182
7849
|
this.preSpeechBuffer = [];
|
|
8183
7850
|
}
|
|
8184
7851
|
this.wasSpeaking = isSpeech;
|
|
8185
|
-
|
|
7852
|
+
logger12.trace("VAD inference completed", {
|
|
8186
7853
|
probability: Math.round(probability * 1e3) / 1e3,
|
|
8187
7854
|
isSpeech,
|
|
8188
7855
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100
|
|
@@ -8213,7 +7880,7 @@ var SileroVADInference = class {
|
|
|
8213
7880
|
const oomError = new Error(
|
|
8214
7881
|
`SileroVAD inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reducing concurrent model sessions or reloading the page.`
|
|
8215
7882
|
);
|
|
8216
|
-
|
|
7883
|
+
logger12.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
8217
7884
|
pointer: `0x${err.toString(16)}`,
|
|
8218
7885
|
backend: this._backend
|
|
8219
7886
|
});
|
|
@@ -8256,7 +7923,7 @@ var SileroVADInference = class {
|
|
|
8256
7923
|
SileroVADInference.isWebGPUAvailable = isWebGPUAvailable;
|
|
8257
7924
|
|
|
8258
7925
|
// src/inference/SileroVADWorker.ts
|
|
8259
|
-
var
|
|
7926
|
+
var logger13 = createLogger("SileroVADWorker");
|
|
8260
7927
|
var WASM_CDN_PATH5 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
8261
7928
|
var LOAD_TIMEOUT_MS3 = 1e4;
|
|
8262
7929
|
var INFERENCE_TIMEOUT_MS3 = 1e3;
|
|
@@ -8534,7 +8201,7 @@ var SileroVADWorker = class {
|
|
|
8534
8201
|
this.handleWorkerMessage(event.data);
|
|
8535
8202
|
};
|
|
8536
8203
|
worker.onerror = (error) => {
|
|
8537
|
-
|
|
8204
|
+
logger13.error("Worker error", { error: error.message });
|
|
8538
8205
|
for (const [, resolver] of this.pendingResolvers) {
|
|
8539
8206
|
resolver.reject(new Error(`Worker error: ${error.message}`));
|
|
8540
8207
|
}
|
|
@@ -8610,9 +8277,9 @@ var SileroVADWorker = class {
|
|
|
8610
8277
|
"model.sample_rate": this.config.sampleRate
|
|
8611
8278
|
});
|
|
8612
8279
|
try {
|
|
8613
|
-
|
|
8280
|
+
logger13.info("Creating VAD worker...");
|
|
8614
8281
|
this.worker = this.createWorker();
|
|
8615
|
-
|
|
8282
|
+
logger13.info("Loading model in worker...", {
|
|
8616
8283
|
modelUrl: this.config.modelUrl,
|
|
8617
8284
|
sampleRate: this.config.sampleRate
|
|
8618
8285
|
});
|
|
@@ -8628,7 +8295,7 @@ var SileroVADWorker = class {
|
|
|
8628
8295
|
);
|
|
8629
8296
|
this._isLoaded = true;
|
|
8630
8297
|
const loadTimeMs = performance.now() - startTime;
|
|
8631
|
-
|
|
8298
|
+
logger13.info("VAD worker loaded successfully", {
|
|
8632
8299
|
backend: "wasm",
|
|
8633
8300
|
loadTimeMs: Math.round(loadTimeMs),
|
|
8634
8301
|
workerLoadTimeMs: Math.round(result.loadTimeMs),
|
|
@@ -8735,7 +8402,7 @@ var SileroVADWorker = class {
|
|
|
8735
8402
|
if (isSpeech && !this.wasSpeaking) {
|
|
8736
8403
|
preSpeechChunks = [...this.preSpeechBuffer];
|
|
8737
8404
|
this.preSpeechBuffer = [];
|
|
8738
|
-
|
|
8405
|
+
logger13.debug("Speech started with pre-speech buffer", {
|
|
8739
8406
|
preSpeechChunks: preSpeechChunks.length,
|
|
8740
8407
|
durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
|
|
8741
8408
|
});
|
|
@@ -8748,7 +8415,7 @@ var SileroVADWorker = class {
|
|
|
8748
8415
|
this.preSpeechBuffer = [];
|
|
8749
8416
|
}
|
|
8750
8417
|
this.wasSpeaking = isSpeech;
|
|
8751
|
-
|
|
8418
|
+
logger13.trace("VAD worker inference completed", {
|
|
8752
8419
|
probability: Math.round(result.probability * 1e3) / 1e3,
|
|
8753
8420
|
isSpeech,
|
|
8754
8421
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
@@ -8816,44 +8483,44 @@ var SileroVADWorker = class {
|
|
|
8816
8483
|
};
|
|
8817
8484
|
|
|
8818
8485
|
// src/inference/createSileroVAD.ts
|
|
8819
|
-
var
|
|
8486
|
+
var logger14 = createLogger("createSileroVAD");
|
|
8820
8487
|
function supportsVADWorker() {
|
|
8821
8488
|
if (typeof Worker === "undefined") {
|
|
8822
|
-
|
|
8489
|
+
logger14.debug("Worker not supported: Worker constructor undefined");
|
|
8823
8490
|
return false;
|
|
8824
8491
|
}
|
|
8825
8492
|
if (typeof URL === "undefined" || typeof URL.createObjectURL === "undefined") {
|
|
8826
|
-
|
|
8493
|
+
logger14.debug("Worker not supported: URL.createObjectURL unavailable");
|
|
8827
8494
|
return false;
|
|
8828
8495
|
}
|
|
8829
8496
|
if (typeof Blob === "undefined") {
|
|
8830
|
-
|
|
8497
|
+
logger14.debug("Worker not supported: Blob constructor unavailable");
|
|
8831
8498
|
return false;
|
|
8832
8499
|
}
|
|
8833
8500
|
return true;
|
|
8834
8501
|
}
|
|
8835
8502
|
function createSileroVAD(config) {
|
|
8836
8503
|
if (config.unifiedWorker) {
|
|
8837
|
-
|
|
8504
|
+
logger14.info("Creating SileroVADUnifiedAdapter (shared unified worker)");
|
|
8838
8505
|
return new SileroVADUnifiedAdapter(config.unifiedWorker, config);
|
|
8839
8506
|
}
|
|
8840
8507
|
const fallbackOnError = config.fallbackOnError ?? true;
|
|
8841
8508
|
let useWorker;
|
|
8842
8509
|
if (config.useWorker !== void 0) {
|
|
8843
8510
|
useWorker = config.useWorker;
|
|
8844
|
-
|
|
8511
|
+
logger14.debug("Worker preference explicitly set", { useWorker });
|
|
8845
8512
|
} else {
|
|
8846
8513
|
const workerSupported = supportsVADWorker();
|
|
8847
8514
|
const onMobile = isMobile();
|
|
8848
8515
|
useWorker = workerSupported && !onMobile;
|
|
8849
|
-
|
|
8516
|
+
logger14.debug("Auto-detected Worker preference", {
|
|
8850
8517
|
useWorker,
|
|
8851
8518
|
workerSupported,
|
|
8852
8519
|
onMobile
|
|
8853
8520
|
});
|
|
8854
8521
|
}
|
|
8855
8522
|
if (useWorker) {
|
|
8856
|
-
|
|
8523
|
+
logger14.info("Creating SileroVADWorker (off-main-thread)");
|
|
8857
8524
|
const worker = new SileroVADWorker({
|
|
8858
8525
|
modelUrl: config.modelUrl,
|
|
8859
8526
|
sampleRate: config.sampleRate,
|
|
@@ -8865,7 +8532,7 @@ function createSileroVAD(config) {
|
|
|
8865
8532
|
}
|
|
8866
8533
|
return worker;
|
|
8867
8534
|
}
|
|
8868
|
-
|
|
8535
|
+
logger14.info("Creating SileroVADInference (main thread)");
|
|
8869
8536
|
return new SileroVADInference(config);
|
|
8870
8537
|
}
|
|
8871
8538
|
var VADWorkerWithFallback = class {
|
|
@@ -8891,7 +8558,7 @@ var VADWorkerWithFallback = class {
|
|
|
8891
8558
|
try {
|
|
8892
8559
|
return await this.implementation.load();
|
|
8893
8560
|
} catch (error) {
|
|
8894
|
-
|
|
8561
|
+
logger14.warn("Worker load failed, falling back to main thread", {
|
|
8895
8562
|
error: error instanceof Error ? error.message : String(error)
|
|
8896
8563
|
});
|
|
8897
8564
|
try {
|
|
@@ -8900,7 +8567,7 @@ var VADWorkerWithFallback = class {
|
|
|
8900
8567
|
}
|
|
8901
8568
|
this.implementation = new SileroVADInference(this.config);
|
|
8902
8569
|
this.hasFallenBack = true;
|
|
8903
|
-
|
|
8570
|
+
logger14.info("Fallback to SileroVADInference successful");
|
|
8904
8571
|
return await this.implementation.load();
|
|
8905
8572
|
}
|
|
8906
8573
|
}
|
|
@@ -8921,8 +8588,175 @@ var VADWorkerWithFallback = class {
|
|
|
8921
8588
|
}
|
|
8922
8589
|
};
|
|
8923
8590
|
|
|
8591
|
+
// src/inference/A2EOrchestrator.ts
|
|
8592
|
+
var logger15 = createLogger("A2EOrchestrator");
|
|
8593
|
+
var A2EOrchestrator = class {
|
|
8594
|
+
constructor(config) {
|
|
8595
|
+
this.a2e = null;
|
|
8596
|
+
this.processor = null;
|
|
8597
|
+
// Mic capture state (lightweight — no dependency on MicrophoneCapture class
|
|
8598
|
+
// which requires an external EventEmitter. We do raw Web Audio here.)
|
|
8599
|
+
this.stream = null;
|
|
8600
|
+
this.audioContext = null;
|
|
8601
|
+
this.scriptProcessor = null;
|
|
8602
|
+
this.nativeSampleRate = 0;
|
|
8603
|
+
this._isReady = false;
|
|
8604
|
+
this._isStreaming = false;
|
|
8605
|
+
this._backend = null;
|
|
8606
|
+
this.disposed = false;
|
|
8607
|
+
this.config = {
|
|
8608
|
+
sampleRate: 16e3,
|
|
8609
|
+
...config
|
|
8610
|
+
};
|
|
8611
|
+
}
|
|
8612
|
+
/** Latest blendshape weights from inference (null if none yet) */
|
|
8613
|
+
get latestWeights() {
|
|
8614
|
+
return this.processor?.latestFrame ?? null;
|
|
8615
|
+
}
|
|
8616
|
+
/** Whether the model is loaded and ready for inference */
|
|
8617
|
+
get isReady() {
|
|
8618
|
+
return this._isReady;
|
|
8619
|
+
}
|
|
8620
|
+
/** Whether mic is active and inference loop is running */
|
|
8621
|
+
get isStreaming() {
|
|
8622
|
+
return this._isStreaming;
|
|
8623
|
+
}
|
|
8624
|
+
/** Current backend type (webgpu, wasm, or null) */
|
|
8625
|
+
get backend() {
|
|
8626
|
+
return this._backend;
|
|
8627
|
+
}
|
|
8628
|
+
/**
|
|
8629
|
+
* Load the A2E model and create the processor
|
|
8630
|
+
*/
|
|
8631
|
+
async load() {
|
|
8632
|
+
if (this.disposed) throw new Error("A2EOrchestrator has been disposed");
|
|
8633
|
+
logger15.info("Loading A2E model...");
|
|
8634
|
+
this.a2e = createA2E({
|
|
8635
|
+
gpuModelUrl: this.config.gpuModelUrl,
|
|
8636
|
+
gpuExternalDataUrl: this.config.gpuExternalDataUrl,
|
|
8637
|
+
cpuModelUrl: this.config.cpuModelUrl ?? this.config.gpuModelUrl,
|
|
8638
|
+
...this.config.a2eConfig
|
|
8639
|
+
});
|
|
8640
|
+
const info = await this.a2e.load();
|
|
8641
|
+
this._backend = info.backend;
|
|
8642
|
+
this.processor = new A2EProcessor({
|
|
8643
|
+
backend: this.a2e,
|
|
8644
|
+
sampleRate: this.config.sampleRate,
|
|
8645
|
+
chunkSize: this.config.chunkSize,
|
|
8646
|
+
onFrame: this.config.onFrame,
|
|
8647
|
+
onError: this.config.onError
|
|
8648
|
+
});
|
|
8649
|
+
this._isReady = true;
|
|
8650
|
+
logger15.info("A2E model loaded", {
|
|
8651
|
+
backend: info.backend,
|
|
8652
|
+
loadTimeMs: info.loadTimeMs,
|
|
8653
|
+
modelId: this.a2e.modelId
|
|
8654
|
+
});
|
|
8655
|
+
this.config.onReady?.();
|
|
8656
|
+
}
|
|
8657
|
+
/**
|
|
8658
|
+
* Start mic capture and inference loop
|
|
8659
|
+
*/
|
|
8660
|
+
async start() {
|
|
8661
|
+
if (this.disposed) throw new Error("A2EOrchestrator has been disposed");
|
|
8662
|
+
if (!this._isReady || !this.processor) throw new Error("Model not loaded. Call load() first.");
|
|
8663
|
+
if (this._isStreaming) return;
|
|
8664
|
+
try {
|
|
8665
|
+
this.stream = await navigator.mediaDevices.getUserMedia({
|
|
8666
|
+
audio: {
|
|
8667
|
+
sampleRate: { ideal: this.config.sampleRate },
|
|
8668
|
+
channelCount: 1,
|
|
8669
|
+
echoCancellation: true,
|
|
8670
|
+
noiseSuppression: true,
|
|
8671
|
+
autoGainControl: true
|
|
8672
|
+
}
|
|
8673
|
+
});
|
|
8674
|
+
this.audioContext = new AudioContext({ sampleRate: this.config.sampleRate });
|
|
8675
|
+
if (this.audioContext.state === "suspended") {
|
|
8676
|
+
await this.audioContext.resume();
|
|
8677
|
+
}
|
|
8678
|
+
this.nativeSampleRate = this.audioContext.sampleRate;
|
|
8679
|
+
const source = this.audioContext.createMediaStreamSource(this.stream);
|
|
8680
|
+
this.scriptProcessor = this.audioContext.createScriptProcessor(4096, 1, 1);
|
|
8681
|
+
this.scriptProcessor.onaudioprocess = (e) => {
|
|
8682
|
+
if (!this._isStreaming || !this.processor) return;
|
|
8683
|
+
const input = e.inputBuffer.getChannelData(0);
|
|
8684
|
+
let samples;
|
|
8685
|
+
if (this.nativeSampleRate !== this.config.sampleRate) {
|
|
8686
|
+
const ratio = this.config.sampleRate / this.nativeSampleRate;
|
|
8687
|
+
const newLen = Math.round(input.length * ratio);
|
|
8688
|
+
samples = new Float32Array(newLen);
|
|
8689
|
+
for (let i = 0; i < newLen; i++) {
|
|
8690
|
+
const srcIdx = i / ratio;
|
|
8691
|
+
const lo = Math.floor(srcIdx);
|
|
8692
|
+
const hi = Math.min(lo + 1, input.length - 1);
|
|
8693
|
+
const frac = srcIdx - lo;
|
|
8694
|
+
samples[i] = input[lo] * (1 - frac) + input[hi] * frac;
|
|
8695
|
+
}
|
|
8696
|
+
} else {
|
|
8697
|
+
samples = new Float32Array(input);
|
|
8698
|
+
}
|
|
8699
|
+
this.processor.pushAudio(samples);
|
|
8700
|
+
};
|
|
8701
|
+
source.connect(this.scriptProcessor);
|
|
8702
|
+
this.scriptProcessor.connect(this.audioContext.destination);
|
|
8703
|
+
this._isStreaming = true;
|
|
8704
|
+
this.processor.startDrip();
|
|
8705
|
+
logger15.info("Mic capture started", { sampleRate: this.nativeSampleRate });
|
|
8706
|
+
} catch (err) {
|
|
8707
|
+
const error = err instanceof Error ? err : new Error(String(err));
|
|
8708
|
+
logger15.error("Failed to start mic capture", { error: error.message });
|
|
8709
|
+
this.config.onError?.(error);
|
|
8710
|
+
throw error;
|
|
8711
|
+
}
|
|
8712
|
+
}
|
|
8713
|
+
/**
|
|
8714
|
+
* Stop mic capture and inference loop
|
|
8715
|
+
*/
|
|
8716
|
+
stop() {
|
|
8717
|
+
this._isStreaming = false;
|
|
8718
|
+
if (this.processor) {
|
|
8719
|
+
this.processor.stopDrip();
|
|
8720
|
+
this.processor.reset();
|
|
8721
|
+
}
|
|
8722
|
+
if (this.scriptProcessor) {
|
|
8723
|
+
this.scriptProcessor.disconnect();
|
|
8724
|
+
this.scriptProcessor.onaudioprocess = null;
|
|
8725
|
+
this.scriptProcessor = null;
|
|
8726
|
+
}
|
|
8727
|
+
if (this.stream) {
|
|
8728
|
+
this.stream.getTracks().forEach((t) => t.stop());
|
|
8729
|
+
this.stream = null;
|
|
8730
|
+
}
|
|
8731
|
+
if (this.audioContext) {
|
|
8732
|
+
this.audioContext.close().catch(() => {
|
|
8733
|
+
});
|
|
8734
|
+
this.audioContext = null;
|
|
8735
|
+
}
|
|
8736
|
+
logger15.info("Mic capture stopped");
|
|
8737
|
+
}
|
|
8738
|
+
/**
|
|
8739
|
+
* Dispose of all resources
|
|
8740
|
+
*/
|
|
8741
|
+
async dispose() {
|
|
8742
|
+
if (this.disposed) return;
|
|
8743
|
+
this.disposed = true;
|
|
8744
|
+
this.stop();
|
|
8745
|
+
if (this.processor) {
|
|
8746
|
+
this.processor.dispose();
|
|
8747
|
+
this.processor = null;
|
|
8748
|
+
}
|
|
8749
|
+
if (this.a2e) {
|
|
8750
|
+
await this.a2e.dispose();
|
|
8751
|
+
this.a2e = null;
|
|
8752
|
+
}
|
|
8753
|
+
this._isReady = false;
|
|
8754
|
+
this._backend = null;
|
|
8755
|
+
}
|
|
8756
|
+
};
|
|
8757
|
+
|
|
8924
8758
|
// src/inference/SafariSpeechRecognition.ts
|
|
8925
|
-
var
|
|
8759
|
+
var logger16 = createLogger("SafariSpeech");
|
|
8926
8760
|
var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
8927
8761
|
constructor(config = {}) {
|
|
8928
8762
|
this.recognition = null;
|
|
@@ -8941,7 +8775,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8941
8775
|
interimResults: config.interimResults ?? true,
|
|
8942
8776
|
maxAlternatives: config.maxAlternatives ?? 1
|
|
8943
8777
|
};
|
|
8944
|
-
|
|
8778
|
+
logger16.debug("SafariSpeechRecognition created", {
|
|
8945
8779
|
language: this.config.language,
|
|
8946
8780
|
continuous: this.config.continuous
|
|
8947
8781
|
});
|
|
@@ -9002,7 +8836,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9002
8836
|
*/
|
|
9003
8837
|
async start() {
|
|
9004
8838
|
if (this.isListening) {
|
|
9005
|
-
|
|
8839
|
+
logger16.warn("Already listening");
|
|
9006
8840
|
return;
|
|
9007
8841
|
}
|
|
9008
8842
|
if (!_SafariSpeechRecognition.isAvailable()) {
|
|
@@ -9032,7 +8866,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9032
8866
|
this.isListening = true;
|
|
9033
8867
|
this.startTime = performance.now();
|
|
9034
8868
|
this.accumulatedText = "";
|
|
9035
|
-
|
|
8869
|
+
logger16.info("Speech recognition started", {
|
|
9036
8870
|
language: this.config.language
|
|
9037
8871
|
});
|
|
9038
8872
|
span?.end();
|
|
@@ -9047,7 +8881,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9047
8881
|
*/
|
|
9048
8882
|
async stop() {
|
|
9049
8883
|
if (!this.isListening || !this.recognition) {
|
|
9050
|
-
|
|
8884
|
+
logger16.warn("Not currently listening");
|
|
9051
8885
|
return {
|
|
9052
8886
|
text: this.accumulatedText,
|
|
9053
8887
|
language: this.config.language,
|
|
@@ -9076,7 +8910,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9076
8910
|
if (this.recognition && this.isListening) {
|
|
9077
8911
|
this.recognition.abort();
|
|
9078
8912
|
this.isListening = false;
|
|
9079
|
-
|
|
8913
|
+
logger16.info("Speech recognition aborted");
|
|
9080
8914
|
}
|
|
9081
8915
|
}
|
|
9082
8916
|
/**
|
|
@@ -9107,7 +8941,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9107
8941
|
this.isListening = false;
|
|
9108
8942
|
this.resultCallbacks = [];
|
|
9109
8943
|
this.errorCallbacks = [];
|
|
9110
|
-
|
|
8944
|
+
logger16.debug("SafariSpeechRecognition disposed");
|
|
9111
8945
|
}
|
|
9112
8946
|
/**
|
|
9113
8947
|
* Set up event handlers for the recognition instance
|
|
@@ -9135,7 +8969,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9135
8969
|
confidence: alternative.confidence
|
|
9136
8970
|
};
|
|
9137
8971
|
this.emitResult(speechResult);
|
|
9138
|
-
|
|
8972
|
+
logger16.trace("Speech result", {
|
|
9139
8973
|
text: text.substring(0, 50),
|
|
9140
8974
|
isFinal,
|
|
9141
8975
|
confidence: alternative.confidence
|
|
@@ -9145,12 +8979,12 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9145
8979
|
span?.end();
|
|
9146
8980
|
} catch (error) {
|
|
9147
8981
|
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
9148
|
-
|
|
8982
|
+
logger16.error("Error processing speech result", { error });
|
|
9149
8983
|
}
|
|
9150
8984
|
};
|
|
9151
8985
|
this.recognition.onerror = (event) => {
|
|
9152
8986
|
const error = new Error(`Speech recognition error: ${event.error} - ${event.message}`);
|
|
9153
|
-
|
|
8987
|
+
logger16.error("Speech recognition error", { error: event.error, message: event.message });
|
|
9154
8988
|
this.emitError(error);
|
|
9155
8989
|
if (this.stopRejecter) {
|
|
9156
8990
|
this.stopRejecter(error);
|
|
@@ -9160,7 +8994,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9160
8994
|
};
|
|
9161
8995
|
this.recognition.onend = () => {
|
|
9162
8996
|
this.isListening = false;
|
|
9163
|
-
|
|
8997
|
+
logger16.info("Speech recognition ended", {
|
|
9164
8998
|
totalText: this.accumulatedText.length,
|
|
9165
8999
|
durationMs: performance.now() - this.startTime
|
|
9166
9000
|
});
|
|
@@ -9177,13 +9011,13 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9177
9011
|
}
|
|
9178
9012
|
};
|
|
9179
9013
|
this.recognition.onstart = () => {
|
|
9180
|
-
|
|
9014
|
+
logger16.debug("Speech recognition started by browser");
|
|
9181
9015
|
};
|
|
9182
9016
|
this.recognition.onspeechstart = () => {
|
|
9183
|
-
|
|
9017
|
+
logger16.debug("Speech detected");
|
|
9184
9018
|
};
|
|
9185
9019
|
this.recognition.onspeechend = () => {
|
|
9186
|
-
|
|
9020
|
+
logger16.debug("Speech ended");
|
|
9187
9021
|
};
|
|
9188
9022
|
}
|
|
9189
9023
|
/**
|
|
@@ -9194,7 +9028,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9194
9028
|
try {
|
|
9195
9029
|
callback(result);
|
|
9196
9030
|
} catch (error) {
|
|
9197
|
-
|
|
9031
|
+
logger16.error("Error in result callback", { error });
|
|
9198
9032
|
}
|
|
9199
9033
|
}
|
|
9200
9034
|
}
|
|
@@ -9206,7 +9040,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9206
9040
|
try {
|
|
9207
9041
|
callback(error);
|
|
9208
9042
|
} catch (callbackError) {
|
|
9209
|
-
|
|
9043
|
+
logger16.error("Error in error callback", { error: callbackError });
|
|
9210
9044
|
}
|
|
9211
9045
|
}
|
|
9212
9046
|
}
|
|
@@ -9619,13 +9453,14 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
9619
9453
|
if (!this.lam) {
|
|
9620
9454
|
throw new Error("LAM must be initialized before pipeline");
|
|
9621
9455
|
}
|
|
9622
|
-
this.pipeline = new
|
|
9456
|
+
this.pipeline = new FullFacePipeline({
|
|
9623
9457
|
lam: this.lam,
|
|
9624
9458
|
sampleRate: 16e3,
|
|
9625
9459
|
chunkTargetMs: 200
|
|
9626
9460
|
});
|
|
9627
9461
|
await this.pipeline.initialize();
|
|
9628
|
-
this.pipeline.on("
|
|
9462
|
+
this.pipeline.on("full_frame_ready", (fullFrame) => {
|
|
9463
|
+
const frame = fullFrame.blendshapes;
|
|
9629
9464
|
this.emit("animation", {
|
|
9630
9465
|
blendshapes: frame,
|
|
9631
9466
|
get: (name) => {
|
|
@@ -9804,9 +9639,9 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
9804
9639
|
});
|
|
9805
9640
|
}
|
|
9806
9641
|
}
|
|
9807
|
-
// REMOVED: processAudioForAnimation() - now handled by
|
|
9642
|
+
// REMOVED: processAudioForAnimation() - now handled by FullFacePipeline
|
|
9808
9643
|
// The pipeline manages audio scheduling, LAM inference, and frame synchronization
|
|
9809
|
-
// Frames are emitted via pipeline.on('
|
|
9644
|
+
// Frames are emitted via pipeline.on('full_frame_ready') event (see initPipeline())
|
|
9810
9645
|
/**
|
|
9811
9646
|
* Detect voice activity using Silero VAD
|
|
9812
9647
|
* Falls back to simple RMS if VAD not available
|