@omote/core 0.9.1 → 0.9.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ErrorCodes-AX3ADZri.d.mts +266 -0
- package/dist/ErrorCodes-AX3ADZri.d.ts +266 -0
- package/dist/chunk-CYBTTLG7.mjs +927 -0
- package/dist/chunk-CYBTTLG7.mjs.map +1 -0
- package/dist/chunk-X5OTUOE6.mjs +927 -0
- package/dist/chunk-X5OTUOE6.mjs.map +1 -0
- package/dist/chunk-Y3DTP5P3.mjs +927 -0
- package/dist/chunk-Y3DTP5P3.mjs.map +1 -0
- package/dist/index.d.mts +214 -3
- package/dist/index.d.ts +214 -3
- package/dist/index.js +713 -233
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +638 -225
- package/dist/index.mjs.map +1 -1
- package/dist/logging/index.d.mts +2 -2
- package/dist/logging/index.d.ts +2 -2
- package/dist/logging/index.js +75 -1
- package/dist/logging/index.js.map +1 -1
- package/dist/logging/index.mjs +9 -1
- package/package.json +3 -1
package/dist/index.js
CHANGED
|
@@ -52,10 +52,12 @@ __export(index_exports, {
|
|
|
52
52
|
EMOTION_TO_AU: () => EMOTION_TO_AU,
|
|
53
53
|
EMOTION_VECTOR_SIZE: () => EMOTION_VECTOR_SIZE,
|
|
54
54
|
EXPLICIT_EMOTION_COUNT: () => EXPLICIT_EMOTION_COUNT,
|
|
55
|
+
ElevenLabsTTSBackend: () => ElevenLabsTTSBackend,
|
|
55
56
|
EmotionController: () => EmotionController,
|
|
56
57
|
EmotionPresets: () => EmotionPresets,
|
|
57
58
|
EmotionResolver: () => EmotionResolver,
|
|
58
59
|
EmphasisDetector: () => EmphasisDetector,
|
|
60
|
+
ErrorCodes: () => ErrorCodes,
|
|
59
61
|
ErrorTypes: () => ErrorTypes,
|
|
60
62
|
EventEmitter: () => EventEmitter,
|
|
61
63
|
FaceCompositor: () => FaceCompositor,
|
|
@@ -79,6 +81,7 @@ __export(index_exports, {
|
|
|
79
81
|
PRESERVE_POSITION_BONES: () => PRESERVE_POSITION_BONES,
|
|
80
82
|
PROTOCOL_VERSION: () => PROTOCOL_VERSION,
|
|
81
83
|
PlaybackPipeline: () => PlaybackPipeline,
|
|
84
|
+
PollyTTSBackend: () => PollyTTSBackend,
|
|
82
85
|
ProceduralLifeLayer: () => ProceduralLifeLayer,
|
|
83
86
|
RingBuffer: () => RingBuffer,
|
|
84
87
|
SafariSpeechRecognition: () => SafariSpeechRecognition,
|
|
@@ -102,6 +105,7 @@ __export(index_exports, {
|
|
|
102
105
|
calculatePeak: () => calculatePeak,
|
|
103
106
|
calculateRMS: () => calculateRMS,
|
|
104
107
|
configureCacheLimit: () => configureCacheLimit,
|
|
108
|
+
configureClock: () => configureClock,
|
|
105
109
|
configureLogging: () => configureLogging,
|
|
106
110
|
configureModelUrls: () => configureModelUrls,
|
|
107
111
|
configureOrtCdn: () => configureOrtCdn,
|
|
@@ -118,6 +122,7 @@ __export(index_exports, {
|
|
|
118
122
|
formatBytes: () => formatBytes,
|
|
119
123
|
getCacheConfig: () => getCacheConfig,
|
|
120
124
|
getCacheKey: () => getCacheKey,
|
|
125
|
+
getClock: () => getClock,
|
|
121
126
|
getEmotionPreset: () => getEmotionPreset,
|
|
122
127
|
getLoggingConfig: () => getLoggingConfig,
|
|
123
128
|
getModelCache: () => getModelCache,
|
|
@@ -654,6 +659,19 @@ var OTLPExporter = class {
|
|
|
654
659
|
}
|
|
655
660
|
};
|
|
656
661
|
|
|
662
|
+
// src/logging/Clock.ts
|
|
663
|
+
var defaultClock = {
|
|
664
|
+
now: () => performance.now(),
|
|
665
|
+
timestamp: () => Date.now()
|
|
666
|
+
};
|
|
667
|
+
var activeClock = defaultClock;
|
|
668
|
+
function configureClock(clock) {
|
|
669
|
+
activeClock = clock;
|
|
670
|
+
}
|
|
671
|
+
function getClock() {
|
|
672
|
+
return activeClock;
|
|
673
|
+
}
|
|
674
|
+
|
|
657
675
|
// src/telemetry/OmoteTelemetry.ts
|
|
658
676
|
function generateId(length = 16) {
|
|
659
677
|
const bytes = new Uint8Array(length);
|
|
@@ -762,7 +780,7 @@ var OmoteTelemetry = class {
|
|
|
762
780
|
const traceId = parentContext?.traceId ?? this.activeTraceId ?? generateId(16);
|
|
763
781
|
const spanId = generateId(8);
|
|
764
782
|
const parentSpanId = parentContext?.spanId;
|
|
765
|
-
const startTime =
|
|
783
|
+
const startTime = getClock().now();
|
|
766
784
|
if (!parentContext && !this.activeTraceId) {
|
|
767
785
|
this.activeTraceId = traceId;
|
|
768
786
|
}
|
|
@@ -776,7 +794,7 @@ var OmoteTelemetry = class {
|
|
|
776
794
|
ended = true;
|
|
777
795
|
const idx = this.spanStack.findIndex((s) => s.spanId === spanId);
|
|
778
796
|
if (idx !== -1) this.spanStack.splice(idx, 1);
|
|
779
|
-
const endTime =
|
|
797
|
+
const endTime = getClock().now();
|
|
780
798
|
const durationMs = endTime - startTime;
|
|
781
799
|
if (status === "error" && !sampled) {
|
|
782
800
|
sampled = this.shouldSample(true);
|
|
@@ -891,7 +909,7 @@ var OmoteTelemetry = class {
|
|
|
891
909
|
*/
|
|
892
910
|
flushMetrics() {
|
|
893
911
|
if (!this.exporter) return;
|
|
894
|
-
const timestamp =
|
|
912
|
+
const timestamp = getClock().now();
|
|
895
913
|
for (const [key, data] of this.counters) {
|
|
896
914
|
if (data.value === 0) continue;
|
|
897
915
|
const name = key.split("|")[0];
|
|
@@ -1012,7 +1030,7 @@ var Logger = class _Logger {
|
|
|
1012
1030
|
log(level, message, data) {
|
|
1013
1031
|
if (!shouldLog(level)) return;
|
|
1014
1032
|
const entry = {
|
|
1015
|
-
timestamp:
|
|
1033
|
+
timestamp: getClock().timestamp(),
|
|
1016
1034
|
level,
|
|
1017
1035
|
module: this.module,
|
|
1018
1036
|
message,
|
|
@@ -1054,12 +1072,12 @@ var Logger = class _Logger {
|
|
|
1054
1072
|
};
|
|
1055
1073
|
var loggerCache = /* @__PURE__ */ new Map();
|
|
1056
1074
|
function createLogger(module2) {
|
|
1057
|
-
let
|
|
1058
|
-
if (!
|
|
1059
|
-
|
|
1060
|
-
loggerCache.set(module2,
|
|
1075
|
+
let logger45 = loggerCache.get(module2);
|
|
1076
|
+
if (!logger45) {
|
|
1077
|
+
logger45 = new Logger(module2);
|
|
1078
|
+
loggerCache.set(module2, logger45);
|
|
1061
1079
|
}
|
|
1062
|
-
return
|
|
1080
|
+
return logger45;
|
|
1063
1081
|
}
|
|
1064
1082
|
var noopLogger = {
|
|
1065
1083
|
module: "noop",
|
|
@@ -1078,6 +1096,63 @@ var noopLogger = {
|
|
|
1078
1096
|
child: () => noopLogger
|
|
1079
1097
|
};
|
|
1080
1098
|
|
|
1099
|
+
// src/logging/ErrorCodes.ts
|
|
1100
|
+
var ErrorCodes = {
|
|
1101
|
+
// ── Inference ──────────────────────────────────────────────────────────
|
|
1102
|
+
/** Model failed to load (file not found, corrupted, unsupported format) */
|
|
1103
|
+
INF_LOAD_FAILED: "OMOTE_INF_001",
|
|
1104
|
+
/** ORT session poisoned after WebGPU device loss — must reload tab */
|
|
1105
|
+
INF_SESSION_POISON: "OMOTE_INF_002",
|
|
1106
|
+
/** Inference exceeded timeout threshold */
|
|
1107
|
+
INF_TIMEOUT: "OMOTE_INF_003",
|
|
1108
|
+
/** Out-of-memory during inference or model loading */
|
|
1109
|
+
INF_OOM: "OMOTE_INF_004",
|
|
1110
|
+
/** WebGPU unavailable, fell back to WASM */
|
|
1111
|
+
INF_WEBGPU_FALLBACK: "OMOTE_INF_005",
|
|
1112
|
+
/** Input tensor shape does not match model expectations */
|
|
1113
|
+
INF_SHAPE_MISMATCH: "OMOTE_INF_006",
|
|
1114
|
+
// ── Audio ──────────────────────────────────────────────────────────────
|
|
1115
|
+
/** AudioContext creation or resume failed */
|
|
1116
|
+
AUD_CONTEXT_FAILED: "OMOTE_AUD_001",
|
|
1117
|
+
/** Gap detected in audio scheduling (buffer underrun) */
|
|
1118
|
+
AUD_SCHEDULE_GAP: "OMOTE_AUD_002",
|
|
1119
|
+
/** Audio buffer decoding failed */
|
|
1120
|
+
AUD_DECODE_FAILED: "OMOTE_AUD_003",
|
|
1121
|
+
// ── Speech ─────────────────────────────────────────────────────────────
|
|
1122
|
+
/** Voice activity detection error */
|
|
1123
|
+
SPH_VAD_ERROR: "OMOTE_SPH_001",
|
|
1124
|
+
/** Automatic speech recognition error */
|
|
1125
|
+
SPH_ASR_ERROR: "OMOTE_SPH_002",
|
|
1126
|
+
/** Microphone access denied or unavailable */
|
|
1127
|
+
SPH_MIC_DENIED: "OMOTE_SPH_003",
|
|
1128
|
+
// ── TTS ────────────────────────────────────────────────────────────────
|
|
1129
|
+
/** TTS synthesis failed */
|
|
1130
|
+
TTS_SYNTH_FAILED: "OMOTE_TTS_001",
|
|
1131
|
+
/** TTS streaming error (chunk delivery failure) */
|
|
1132
|
+
TTS_STREAM_ERROR: "OMOTE_TTS_002",
|
|
1133
|
+
/** Phonemizer (eSpeak-NG WASM) ran out of memory */
|
|
1134
|
+
TTS_PHONEMIZER_OOM: "OMOTE_TTS_003",
|
|
1135
|
+
// ── Pipeline ───────────────────────────────────────────────────────────
|
|
1136
|
+
/** Invalid state transition in pipeline state machine */
|
|
1137
|
+
PIP_STATE_ERROR: "OMOTE_PIP_001",
|
|
1138
|
+
/** Pipeline operation aborted (user interrupt or signal) */
|
|
1139
|
+
PIP_ABORT: "OMOTE_PIP_002",
|
|
1140
|
+
// ── Cache ──────────────────────────────────────────────────────────────
|
|
1141
|
+
/** IndexedDB storage quota exceeded */
|
|
1142
|
+
CAC_QUOTA_EXCEEDED: "OMOTE_CAC_001",
|
|
1143
|
+
/** Cache entry evicted (LRU or manual) */
|
|
1144
|
+
CAC_EVICTION: "OMOTE_CAC_002",
|
|
1145
|
+
/** Cached model is stale (version mismatch) */
|
|
1146
|
+
CAC_STALE: "OMOTE_CAC_003",
|
|
1147
|
+
// ── Network ────────────────────────────────────────────────────────────
|
|
1148
|
+
/** HTTP fetch failed (model download, CDN) */
|
|
1149
|
+
NET_FETCH_FAILED: "OMOTE_NET_001",
|
|
1150
|
+
/** Network request timed out */
|
|
1151
|
+
NET_TIMEOUT: "OMOTE_NET_002",
|
|
1152
|
+
/** WebSocket connection error */
|
|
1153
|
+
NET_WEBSOCKET_ERROR: "OMOTE_NET_003"
|
|
1154
|
+
};
|
|
1155
|
+
|
|
1081
1156
|
// src/audio/MicrophoneCapture.ts
|
|
1082
1157
|
var logger = createLogger("MicrophoneCapture");
|
|
1083
1158
|
var MicrophoneCapture = class {
|
|
@@ -1175,7 +1250,7 @@ var MicrophoneCapture = class {
|
|
|
1175
1250
|
const pcm = this.floatToPCM16(chunk);
|
|
1176
1251
|
this.events.emit("audio.chunk", {
|
|
1177
1252
|
pcm,
|
|
1178
|
-
timestamp:
|
|
1253
|
+
timestamp: getClock().now()
|
|
1179
1254
|
});
|
|
1180
1255
|
chunkCount++;
|
|
1181
1256
|
}
|
|
@@ -1406,11 +1481,23 @@ var AudioScheduler = class {
|
|
|
1406
1481
|
source.connect(gainNode);
|
|
1407
1482
|
const scheduleTime = this.nextPlayTime;
|
|
1408
1483
|
if (scheduleTime < ctx.currentTime) {
|
|
1409
|
-
|
|
1410
|
-
|
|
1411
|
-
|
|
1412
|
-
|
|
1413
|
-
|
|
1484
|
+
const gap = ctx.currentTime - scheduleTime;
|
|
1485
|
+
const gapMs = gap * 1e3;
|
|
1486
|
+
if (gap > 0.5) {
|
|
1487
|
+
logger2.error("Critical audio scheduling gap", {
|
|
1488
|
+
code: ErrorCodes.AUD_SCHEDULE_GAP,
|
|
1489
|
+
scheduleTime,
|
|
1490
|
+
currentTime: ctx.currentTime,
|
|
1491
|
+
gapMs: Math.round(gapMs)
|
|
1492
|
+
});
|
|
1493
|
+
this.options.onError?.(new Error(`Audio scheduling gap: ${gap.toFixed(3)}s`));
|
|
1494
|
+
} else {
|
|
1495
|
+
logger2.warn("Audio gap detected", {
|
|
1496
|
+
scheduleTime,
|
|
1497
|
+
currentTime: ctx.currentTime,
|
|
1498
|
+
gapMs: Math.round(gapMs)
|
|
1499
|
+
});
|
|
1500
|
+
}
|
|
1414
1501
|
}
|
|
1415
1502
|
source.start(scheduleTime);
|
|
1416
1503
|
const entry = { source, gainNode };
|
|
@@ -1604,8 +1691,8 @@ var AudioChunkCoalescer = class {
|
|
|
1604
1691
|
var logger4 = createLogger("A2EProcessor");
|
|
1605
1692
|
var FRAME_RATE = 30;
|
|
1606
1693
|
var DRIP_INTERVAL_MS = 33;
|
|
1607
|
-
var HOLD_DURATION_MS =
|
|
1608
|
-
var DECAY_DURATION_MS =
|
|
1694
|
+
var HOLD_DURATION_MS = 400;
|
|
1695
|
+
var DECAY_DURATION_MS = 300;
|
|
1609
1696
|
var _A2EProcessor = class _A2EProcessor {
|
|
1610
1697
|
constructor(config) {
|
|
1611
1698
|
this.writeOffset = 0;
|
|
@@ -1769,7 +1856,7 @@ var _A2EProcessor = class _A2EProcessor {
|
|
|
1769
1856
|
if (this.timestampedQueue.length > 0 && this.timestampedQueue[0].timestamp <= currentTime) {
|
|
1770
1857
|
const { frame } = this.timestampedQueue.shift();
|
|
1771
1858
|
this.lastPulledFrame = frame;
|
|
1772
|
-
this.lastDequeuedTime =
|
|
1859
|
+
this.lastDequeuedTime = getClock().now();
|
|
1773
1860
|
return frame;
|
|
1774
1861
|
}
|
|
1775
1862
|
if (this.timestampedQueue.length > 0 && this.getFrameCallCount % 60 === 0) {
|
|
@@ -1781,7 +1868,7 @@ var _A2EProcessor = class _A2EProcessor {
|
|
|
1781
1868
|
});
|
|
1782
1869
|
}
|
|
1783
1870
|
if (this.lastPulledFrame) {
|
|
1784
|
-
const elapsed =
|
|
1871
|
+
const elapsed = getClock().now() - this.lastDequeuedTime;
|
|
1785
1872
|
if (elapsed < HOLD_DURATION_MS) {
|
|
1786
1873
|
return this.lastPulledFrame;
|
|
1787
1874
|
}
|
|
@@ -1866,9 +1953,9 @@ var _A2EProcessor = class _A2EProcessor {
|
|
|
1866
1953
|
while (this.pendingChunks.length > 0 && !this.disposed) {
|
|
1867
1954
|
const { chunk, timestamp } = this.pendingChunks.shift();
|
|
1868
1955
|
try {
|
|
1869
|
-
const t0 =
|
|
1956
|
+
const t0 = getClock().now();
|
|
1870
1957
|
const result = await this.backend.infer(chunk, this.identityIndex);
|
|
1871
|
-
const inferMs = Math.round(
|
|
1958
|
+
const inferMs = Math.round(getClock().now() - t0);
|
|
1872
1959
|
const actualDuration = chunk.length / this.sampleRate;
|
|
1873
1960
|
const actualFrameCount = Math.ceil(actualDuration * FRAME_RATE);
|
|
1874
1961
|
const framesToQueue = Math.min(actualFrameCount, result.blendshapes.length);
|
|
@@ -1907,7 +1994,11 @@ var _A2EProcessor = class _A2EProcessor {
|
|
|
1907
1994
|
}
|
|
1908
1995
|
handleError(err) {
|
|
1909
1996
|
const error = err instanceof Error ? err : new Error(String(err));
|
|
1910
|
-
|
|
1997
|
+
const isOOM = typeof err === "number" || error.message && /out of memory|oom|alloc/i.test(error.message);
|
|
1998
|
+
logger4.warn("A2EProcessor inference error", {
|
|
1999
|
+
error: error.message,
|
|
2000
|
+
code: isOOM ? ErrorCodes.INF_OOM : ErrorCodes.INF_SESSION_POISON
|
|
2001
|
+
});
|
|
1911
2002
|
this.onError?.(error);
|
|
1912
2003
|
}
|
|
1913
2004
|
};
|
|
@@ -1929,6 +2020,12 @@ var MetricNames = {
|
|
|
1929
2020
|
CACHE_HITS: "omote.cache.hits",
|
|
1930
2021
|
/** Counter: Cache misses */
|
|
1931
2022
|
CACHE_MISSES: "omote.cache.misses",
|
|
2023
|
+
/** Counter: Cache stale (version/etag mismatch) */
|
|
2024
|
+
CACHE_STALE: "omote.cache.stale",
|
|
2025
|
+
/** Counter: Cache quota warning (>90% used) */
|
|
2026
|
+
CACHE_QUOTA_WARNING: "omote.cache.quota_warning",
|
|
2027
|
+
/** Counter: Cache eviction (LRU) */
|
|
2028
|
+
CACHE_EVICTION: "omote.cache.eviction",
|
|
1932
2029
|
// --- Pipeline ---
|
|
1933
2030
|
/** Histogram: VoicePipeline turn latency (speech end → transcript ready, excludes playback) */
|
|
1934
2031
|
VOICE_TURN_LATENCY: "omote.voice.turn.latency",
|
|
@@ -2228,14 +2325,14 @@ var PlaybackPipeline = class extends EventEmitter {
|
|
|
2228
2325
|
this._currentRawFrame = null;
|
|
2229
2326
|
this.cancelNeutralTransition();
|
|
2230
2327
|
this.scheduler.warmup();
|
|
2231
|
-
this.sessionStartTime =
|
|
2328
|
+
this.sessionStartTime = getClock().now();
|
|
2232
2329
|
this.startFrameLoop();
|
|
2233
2330
|
this.startMonitoring();
|
|
2234
2331
|
this.setState("playing");
|
|
2235
2332
|
}
|
|
2236
2333
|
/** Feed a streaming audio chunk (PCM16 Uint8Array) */
|
|
2237
2334
|
async onAudioChunk(chunk) {
|
|
2238
|
-
const chunkStart =
|
|
2335
|
+
const chunkStart = getClock().now();
|
|
2239
2336
|
const combined = this.coalescer.add(chunk);
|
|
2240
2337
|
if (!combined) return;
|
|
2241
2338
|
const float32 = pcm16ToFloat32(combined);
|
|
@@ -2245,7 +2342,7 @@ var PlaybackPipeline = class extends EventEmitter {
|
|
|
2245
2342
|
this.emit("playback:start", { time: scheduleTime });
|
|
2246
2343
|
}
|
|
2247
2344
|
this.processor.pushAudio(float32, scheduleTime);
|
|
2248
|
-
getTelemetry()?.recordHistogram(MetricNames.PLAYBACK_CHUNK_LATENCY,
|
|
2345
|
+
getTelemetry()?.recordHistogram(MetricNames.PLAYBACK_CHUNK_LATENCY, getClock().now() - chunkStart);
|
|
2249
2346
|
}
|
|
2250
2347
|
/** Signal end of audio stream (flushes remaining audio) */
|
|
2251
2348
|
async end() {
|
|
@@ -2348,15 +2445,15 @@ var PlaybackPipeline = class extends EventEmitter {
|
|
|
2348
2445
|
const currentTime = this.scheduler.getCurrentTime();
|
|
2349
2446
|
const lamFrame = this.processor.getFrameForTime(currentTime);
|
|
2350
2447
|
if (lamFrame && lamFrame !== this.lastKnownLamFrame) {
|
|
2351
|
-
this.lastNewFrameTime =
|
|
2448
|
+
this.lastNewFrameTime = getClock().now();
|
|
2352
2449
|
this.lastKnownLamFrame = lamFrame;
|
|
2353
2450
|
this.staleWarningEmitted = false;
|
|
2354
2451
|
}
|
|
2355
|
-
if (this.playbackStarted && this.lastNewFrameTime > 0 &&
|
|
2452
|
+
if (this.playbackStarted && this.lastNewFrameTime > 0 && getClock().now() - this.lastNewFrameTime > this.staleThresholdMs) {
|
|
2356
2453
|
if (!this.staleWarningEmitted) {
|
|
2357
2454
|
this.staleWarningEmitted = true;
|
|
2358
2455
|
logger5.warn("A2E stalled \u2014 no new inference frames", {
|
|
2359
|
-
staleDurationMs: Math.round(
|
|
2456
|
+
staleDurationMs: Math.round(getClock().now() - this.lastNewFrameTime),
|
|
2360
2457
|
queuedFrames: this.processor.queuedFrameCount
|
|
2361
2458
|
});
|
|
2362
2459
|
}
|
|
@@ -2396,7 +2493,7 @@ var PlaybackPipeline = class extends EventEmitter {
|
|
|
2396
2493
|
if (this.sessionStartTime > 0) {
|
|
2397
2494
|
getTelemetry()?.recordHistogram(
|
|
2398
2495
|
MetricNames.PLAYBACK_SESSION_DURATION,
|
|
2399
|
-
|
|
2496
|
+
getClock().now() - this.sessionStartTime
|
|
2400
2497
|
);
|
|
2401
2498
|
}
|
|
2402
2499
|
this.stopInternal();
|
|
@@ -2414,9 +2511,9 @@ var PlaybackPipeline = class extends EventEmitter {
|
|
|
2414
2511
|
// ---------------------------------------------------------------------------
|
|
2415
2512
|
startNeutralTransition(fromFrame) {
|
|
2416
2513
|
this.neutralTransitionFrame = new Float32Array(fromFrame);
|
|
2417
|
-
this.neutralTransitionStart =
|
|
2514
|
+
this.neutralTransitionStart = getClock().now();
|
|
2418
2515
|
const animate = () => {
|
|
2419
|
-
const elapsed =
|
|
2516
|
+
const elapsed = getClock().now() - this.neutralTransitionStart;
|
|
2420
2517
|
const t = Math.min(1, elapsed / this.neutralTransitionMs);
|
|
2421
2518
|
const eased = 1 - Math.pow(1 - t, 3);
|
|
2422
2519
|
logger5.trace("neutral transition", { t: Math.round(t * 1e3) / 1e3, eased: Math.round(eased * 1e3) / 1e3 });
|
|
@@ -2429,7 +2526,7 @@ var PlaybackPipeline = class extends EventEmitter {
|
|
|
2429
2526
|
blendshapes,
|
|
2430
2527
|
rawBlendshapes: blendshapes,
|
|
2431
2528
|
// raw = scaled during transition
|
|
2432
|
-
timestamp:
|
|
2529
|
+
timestamp: getClock().now() / 1e3,
|
|
2433
2530
|
emotion: this._emotion ?? void 0
|
|
2434
2531
|
};
|
|
2435
2532
|
this.emit("frame", frame);
|
|
@@ -2660,7 +2757,7 @@ var ModelCache = class {
|
|
|
2660
2757
|
logger7.warn("Failed to request persistent storage", { error: String(err) });
|
|
2661
2758
|
}
|
|
2662
2759
|
}
|
|
2663
|
-
const dbOpenStart =
|
|
2760
|
+
const dbOpenStart = getClock().now();
|
|
2664
2761
|
this.dbPromise = new Promise((resolve, reject) => {
|
|
2665
2762
|
const request = indexedDB.open(DB_NAME, DB_VERSION);
|
|
2666
2763
|
request.onerror = () => {
|
|
@@ -2669,7 +2766,7 @@ var ModelCache = class {
|
|
|
2669
2766
|
};
|
|
2670
2767
|
request.onsuccess = () => {
|
|
2671
2768
|
this.db = request.result;
|
|
2672
|
-
logger7.debug("IndexedDB opened", { durationMs: Math.round(
|
|
2769
|
+
logger7.debug("IndexedDB opened", { durationMs: Math.round(getClock().now() - dbOpenStart) });
|
|
2673
2770
|
resolve(this.db);
|
|
2674
2771
|
};
|
|
2675
2772
|
request.onupgradeneeded = (event) => {
|
|
@@ -2743,16 +2840,16 @@ var ModelCache = class {
|
|
|
2743
2840
|
}
|
|
2744
2841
|
span?.end();
|
|
2745
2842
|
if (hit) {
|
|
2746
|
-
telemetry?.incrementCounter(
|
|
2843
|
+
telemetry?.incrementCounter(MetricNames.CACHE_HITS, 1, {});
|
|
2747
2844
|
} else {
|
|
2748
|
-
telemetry?.incrementCounter(
|
|
2845
|
+
telemetry?.incrementCounter(MetricNames.CACHE_MISSES, 1, {});
|
|
2749
2846
|
}
|
|
2750
2847
|
resolve(cached?.data ?? null);
|
|
2751
2848
|
};
|
|
2752
2849
|
request.onerror = () => {
|
|
2753
2850
|
span?.setAttributes({ "cache.hit": false });
|
|
2754
2851
|
span?.end();
|
|
2755
|
-
telemetry?.incrementCounter(
|
|
2852
|
+
telemetry?.incrementCounter(MetricNames.CACHE_MISSES, 1, {});
|
|
2756
2853
|
resolve(null);
|
|
2757
2854
|
};
|
|
2758
2855
|
});
|
|
@@ -2796,14 +2893,14 @@ var ModelCache = class {
|
|
|
2796
2893
|
if (!cached?.data) {
|
|
2797
2894
|
span?.setAttributes({ "cache.hit": false });
|
|
2798
2895
|
span?.end();
|
|
2799
|
-
telemetry?.incrementCounter(
|
|
2896
|
+
telemetry?.incrementCounter(MetricNames.CACHE_MISSES, 1, {});
|
|
2800
2897
|
return { data: null, stale: false };
|
|
2801
2898
|
}
|
|
2802
2899
|
span?.setAttributes({ "cache.hit": true, "cache.size_bytes": cached.size });
|
|
2803
2900
|
if (!cached.etag) {
|
|
2804
2901
|
span?.setAttributes({ "cache.validated": false, "cache.stale": false });
|
|
2805
2902
|
span?.end();
|
|
2806
|
-
telemetry?.incrementCounter(
|
|
2903
|
+
telemetry?.incrementCounter(MetricNames.CACHE_HITS, 1, {});
|
|
2807
2904
|
return { data: cached.data, stale: false };
|
|
2808
2905
|
}
|
|
2809
2906
|
const fetchUrl = originalUrl || url;
|
|
@@ -2812,7 +2909,7 @@ var ModelCache = class {
|
|
|
2812
2909
|
if (!response.ok) {
|
|
2813
2910
|
span?.setAttributes({ "cache.validated": false, "cache.stale": false });
|
|
2814
2911
|
span?.end();
|
|
2815
|
-
telemetry?.incrementCounter(
|
|
2912
|
+
telemetry?.incrementCounter(MetricNames.CACHE_HITS, 1, {});
|
|
2816
2913
|
return { data: cached.data, stale: false };
|
|
2817
2914
|
}
|
|
2818
2915
|
const serverEtag = response.headers.get("etag");
|
|
@@ -2825,17 +2922,17 @@ var ModelCache = class {
|
|
|
2825
2922
|
});
|
|
2826
2923
|
span?.end();
|
|
2827
2924
|
if (isStale) {
|
|
2828
|
-
telemetry?.incrementCounter(
|
|
2925
|
+
telemetry?.incrementCounter(MetricNames.CACHE_STALE, 1, {});
|
|
2829
2926
|
logger7.debug("Stale cache detected", { url });
|
|
2830
2927
|
} else {
|
|
2831
|
-
telemetry?.incrementCounter(
|
|
2928
|
+
telemetry?.incrementCounter(MetricNames.CACHE_HITS, 1, {});
|
|
2832
2929
|
}
|
|
2833
2930
|
return { data: cached.data, stale: isStale };
|
|
2834
2931
|
} catch (fetchError) {
|
|
2835
2932
|
logger7.warn("HEAD validation failed, using cached data", { error: String(fetchError) });
|
|
2836
2933
|
span?.setAttributes({ "cache.validated": false, "cache.stale": false });
|
|
2837
2934
|
span?.end();
|
|
2838
|
-
telemetry?.incrementCounter(
|
|
2935
|
+
telemetry?.incrementCounter(MetricNames.CACHE_HITS, 1, {});
|
|
2839
2936
|
return { data: cached.data, stale: false };
|
|
2840
2937
|
}
|
|
2841
2938
|
} catch {
|
|
@@ -2916,7 +3013,7 @@ var ModelCache = class {
|
|
|
2916
3013
|
const telemetry = getTelemetry();
|
|
2917
3014
|
if (quota.percentUsed > 90) {
|
|
2918
3015
|
logger7.warn("Storage quota warning", { percentUsed: quota.percentUsed.toFixed(1), used: formatBytes(quota.usedBytes), quota: formatBytes(quota.quotaBytes) });
|
|
2919
|
-
telemetry?.incrementCounter(
|
|
3016
|
+
telemetry?.incrementCounter(MetricNames.CACHE_QUOTA_WARNING, 1, {
|
|
2920
3017
|
percent_used: String(Math.round(quota.percentUsed))
|
|
2921
3018
|
});
|
|
2922
3019
|
if (config.onQuotaWarning) {
|
|
@@ -3058,7 +3155,7 @@ var ModelCache = class {
|
|
|
3058
3155
|
});
|
|
3059
3156
|
span?.end();
|
|
3060
3157
|
if (freedBytes > 0) {
|
|
3061
|
-
telemetry?.incrementCounter(
|
|
3158
|
+
telemetry?.incrementCounter(MetricNames.CACHE_EVICTION, evictedUrls.length, {
|
|
3062
3159
|
bytes_freed: String(freedBytes)
|
|
3063
3160
|
});
|
|
3064
3161
|
}
|
|
@@ -3578,7 +3675,7 @@ var _A2EInference = class _A2EInference {
|
|
|
3578
3675
|
throw new Error("Model already loaded. Call dispose() first.");
|
|
3579
3676
|
}
|
|
3580
3677
|
this.isLoading = true;
|
|
3581
|
-
const startTime =
|
|
3678
|
+
const startTime = getClock().now();
|
|
3582
3679
|
const telemetry = getTelemetry();
|
|
3583
3680
|
const span = telemetry?.startSpan("A2EInference.load", {
|
|
3584
3681
|
"model.url": this.config.modelUrl,
|
|
@@ -3674,7 +3771,7 @@ var _A2EInference = class _A2EInference {
|
|
|
3674
3771
|
executionProvider: this._backend,
|
|
3675
3772
|
backend: this._backend
|
|
3676
3773
|
});
|
|
3677
|
-
const loadTimeMs =
|
|
3774
|
+
const loadTimeMs = getClock().now() - startTime;
|
|
3678
3775
|
logger10.info("Model loaded successfully", {
|
|
3679
3776
|
backend: this._backend,
|
|
3680
3777
|
loadTimeMs: Math.round(loadTimeMs),
|
|
@@ -3693,7 +3790,7 @@ var _A2EInference = class _A2EInference {
|
|
|
3693
3790
|
});
|
|
3694
3791
|
await new Promise((r) => setTimeout(r, 0));
|
|
3695
3792
|
logger10.debug("Running warmup inference to initialize GPU context");
|
|
3696
|
-
const warmupStart =
|
|
3793
|
+
const warmupStart = getClock().now();
|
|
3697
3794
|
const warmupAudio = new Float32Array(this.chunkSize);
|
|
3698
3795
|
const warmupIdentity = new Float32Array(this.numIdentityClasses);
|
|
3699
3796
|
warmupIdentity[0] = 1;
|
|
@@ -3706,7 +3803,7 @@ var _A2EInference = class _A2EInference {
|
|
|
3706
3803
|
this.session.run(warmupFeeds).then(() => "ok"),
|
|
3707
3804
|
new Promise((r) => setTimeout(() => r("timeout"), WARMUP_TIMEOUT_MS))
|
|
3708
3805
|
]);
|
|
3709
|
-
const warmupTimeMs =
|
|
3806
|
+
const warmupTimeMs = getClock().now() - warmupStart;
|
|
3710
3807
|
if (warmupResult === "timeout") {
|
|
3711
3808
|
logger10.warn("Warmup inference timed out \u2014 GPU may be unresponsive. Continuing without warmup.", {
|
|
3712
3809
|
timeoutMs: WARMUP_TIMEOUT_MS,
|
|
@@ -3786,7 +3883,7 @@ var _A2EInference = class _A2EInference {
|
|
|
3786
3883
|
"inference.input_samples": this.chunkSize
|
|
3787
3884
|
});
|
|
3788
3885
|
try {
|
|
3789
|
-
const startTime =
|
|
3886
|
+
const startTime = getClock().now();
|
|
3790
3887
|
let timeoutId;
|
|
3791
3888
|
const results = await Promise.race([
|
|
3792
3889
|
this.session.run(feeds).then((r) => {
|
|
@@ -3800,7 +3897,7 @@ var _A2EInference = class _A2EInference {
|
|
|
3800
3897
|
);
|
|
3801
3898
|
})
|
|
3802
3899
|
]);
|
|
3803
|
-
const inferenceTimeMs =
|
|
3900
|
+
const inferenceTimeMs = getClock().now() - startTime;
|
|
3804
3901
|
const blendshapeOutput = results["blendshapes"];
|
|
3805
3902
|
if (!blendshapeOutput) {
|
|
3806
3903
|
throw new Error("Missing blendshapes output from model");
|
|
@@ -4207,9 +4304,9 @@ var A2EUnifiedAdapter = class {
|
|
|
4207
4304
|
"inference.input_samples": audio.length
|
|
4208
4305
|
});
|
|
4209
4306
|
try {
|
|
4210
|
-
const startTime =
|
|
4307
|
+
const startTime = getClock().now();
|
|
4211
4308
|
const result = await this.worker.inferLAM(audio, identityIndex);
|
|
4212
|
-
const inferenceTimeMs =
|
|
4309
|
+
const inferenceTimeMs = getClock().now() - startTime;
|
|
4213
4310
|
const flatBuffer = result.blendshapes;
|
|
4214
4311
|
const { numFrames, numBlendshapes } = result;
|
|
4215
4312
|
const blendshapes = [];
|
|
@@ -4860,7 +4957,7 @@ var KokoroTTSInference = class {
|
|
|
4860
4957
|
throw new Error("KokoroTTS is already loading");
|
|
4861
4958
|
}
|
|
4862
4959
|
this.isLoading = true;
|
|
4863
|
-
const startTime =
|
|
4960
|
+
const startTime = getClock().now();
|
|
4864
4961
|
try {
|
|
4865
4962
|
const backendPref = this.config.backend ?? "wasm";
|
|
4866
4963
|
const ortResult = await getOnnxRuntimeForPreference(backendPref);
|
|
@@ -4884,7 +4981,7 @@ var KokoroTTSInference = class {
|
|
|
4884
4981
|
"KokoroTTS InferenceSession.create"
|
|
4885
4982
|
);
|
|
4886
4983
|
}
|
|
4887
|
-
const loadTimeMs =
|
|
4984
|
+
const loadTimeMs = getClock().now() - startTime;
|
|
4888
4985
|
logger17.info("Kokoro TTS loaded", {
|
|
4889
4986
|
backend: this._backend,
|
|
4890
4987
|
loadTimeMs: Math.round(loadTimeMs),
|
|
@@ -4971,7 +5068,18 @@ var KokoroTTSInference = class {
|
|
|
4971
5068
|
logger17.debug("stream aborted");
|
|
4972
5069
|
return;
|
|
4973
5070
|
}
|
|
4974
|
-
|
|
5071
|
+
let phonemes;
|
|
5072
|
+
try {
|
|
5073
|
+
phonemes = await phonemize(sentence, language);
|
|
5074
|
+
} catch (phonErr) {
|
|
5075
|
+
logger17.error("Phonemizer failed (possible OOM)", {
|
|
5076
|
+
code: ErrorCodes.TTS_PHONEMIZER_OOM,
|
|
5077
|
+
error: String(phonErr),
|
|
5078
|
+
textLength: sentence.length
|
|
5079
|
+
});
|
|
5080
|
+
yield { audio: new Float32Array(0), text: sentence, phonemes: "", duration: 0 };
|
|
5081
|
+
continue;
|
|
5082
|
+
}
|
|
4975
5083
|
const tokens = tokenize(phonemes);
|
|
4976
5084
|
const voiceData = await this.ensureVoice(voiceName);
|
|
4977
5085
|
const style = getStyleForTokenCount(voiceData, tokens.length);
|
|
@@ -5031,16 +5139,27 @@ var KokoroTTSInference = class {
|
|
|
5031
5139
|
"tts.speed": speed
|
|
5032
5140
|
});
|
|
5033
5141
|
try {
|
|
5034
|
-
const startTime =
|
|
5142
|
+
const startTime = getClock().now();
|
|
5035
5143
|
const language = getVoiceLanguage(voiceName);
|
|
5036
|
-
|
|
5144
|
+
let phonemes;
|
|
5145
|
+
try {
|
|
5146
|
+
phonemes = await phonemize(text, language);
|
|
5147
|
+
} catch (phonErr) {
|
|
5148
|
+
logger17.error("Phonemizer failed (possible OOM)", {
|
|
5149
|
+
code: ErrorCodes.TTS_PHONEMIZER_OOM,
|
|
5150
|
+
error: String(phonErr),
|
|
5151
|
+
textLength: text.length
|
|
5152
|
+
});
|
|
5153
|
+
resolve({ audio: new Float32Array(0), duration: 0, inferenceTimeMs: 0 });
|
|
5154
|
+
return;
|
|
5155
|
+
}
|
|
5037
5156
|
logger17.trace("Phonemized", { text: text.substring(0, 50), phonemes: phonemes.substring(0, 50) });
|
|
5038
5157
|
const tokens = tokenize(phonemes);
|
|
5039
5158
|
logger17.trace("Tokenized", { numTokens: tokens.length });
|
|
5040
5159
|
const voiceData = await this.ensureVoice(voiceName);
|
|
5041
5160
|
const style = getStyleForTokenCount(voiceData, tokens.length);
|
|
5042
5161
|
const audio = await this.runInference(tokens, style, speed);
|
|
5043
|
-
const inferenceTimeMs =
|
|
5162
|
+
const inferenceTimeMs = getClock().now() - startTime;
|
|
5044
5163
|
const duration = audio.length / SAMPLE_RATE;
|
|
5045
5164
|
logger17.trace("Synthesis complete", {
|
|
5046
5165
|
duration: `${duration.toFixed(2)}s`,
|
|
@@ -5159,11 +5278,11 @@ var KokoroTTSUnifiedAdapter = class {
|
|
|
5159
5278
|
"model.url": this.modelUrl
|
|
5160
5279
|
});
|
|
5161
5280
|
try {
|
|
5162
|
-
const startTime =
|
|
5281
|
+
const startTime = getClock().now();
|
|
5163
5282
|
await this.worker.loadKokoro({ modelUrl: this.modelUrl });
|
|
5164
5283
|
this._isLoaded = true;
|
|
5165
5284
|
this.loadedGeneration = this.worker.workerGeneration;
|
|
5166
|
-
const loadTimeMs =
|
|
5285
|
+
const loadTimeMs = getClock().now() - startTime;
|
|
5167
5286
|
logger18.info("Kokoro TTS loaded via unified worker", {
|
|
5168
5287
|
backend: "wasm",
|
|
5169
5288
|
loadTimeMs: Math.round(loadTimeMs),
|
|
@@ -5238,11 +5357,11 @@ var KokoroTTSUnifiedAdapter = class {
|
|
|
5238
5357
|
runWorkerInference(tokens, style, speed) {
|
|
5239
5358
|
return new Promise((resolve, reject) => {
|
|
5240
5359
|
this.inferenceQueue = this.inferenceQueue.then(async () => {
|
|
5241
|
-
const startTime =
|
|
5360
|
+
const startTime = getClock().now();
|
|
5242
5361
|
const telemetry = getTelemetry();
|
|
5243
5362
|
try {
|
|
5244
5363
|
const result = await this.worker.inferKokoro(tokens, style, speed);
|
|
5245
|
-
const latencyMs =
|
|
5364
|
+
const latencyMs = getClock().now() - startTime;
|
|
5246
5365
|
telemetry?.recordHistogram("omote.inference.latency", latencyMs, {
|
|
5247
5366
|
model: "kokoro-tts-unified",
|
|
5248
5367
|
backend: "wasm"
|
|
@@ -5357,11 +5476,11 @@ var SileroVADUnifiedAdapter = class {
|
|
|
5357
5476
|
return new Promise((resolve, reject) => {
|
|
5358
5477
|
this.inferenceQueue = this.inferenceQueue.then(async () => {
|
|
5359
5478
|
try {
|
|
5360
|
-
const startTime =
|
|
5479
|
+
const startTime = getClock().now();
|
|
5361
5480
|
const result = await this.worker.processVAD(audioChunkCopy, this.state, this.context);
|
|
5362
5481
|
this.state = result.state;
|
|
5363
5482
|
this.context = audioChunkCopy.slice(-this.contextSize);
|
|
5364
|
-
const inferenceTimeMs =
|
|
5483
|
+
const inferenceTimeMs = getClock().now() - startTime;
|
|
5365
5484
|
const isSpeech = result.probability > this.config.threshold;
|
|
5366
5485
|
let preSpeechChunks;
|
|
5367
5486
|
if (isSpeech && !this.wasSpeaking) {
|
|
@@ -5419,17 +5538,20 @@ var SileroVADUnifiedAdapter = class {
|
|
|
5419
5538
|
var logger20 = createLogger("createA2E");
|
|
5420
5539
|
function createA2E(config = {}) {
|
|
5421
5540
|
const modelUrl = config.modelUrl ?? DEFAULT_MODEL_URLS.lam;
|
|
5541
|
+
const platformInfo = {
|
|
5542
|
+
modelUrl,
|
|
5543
|
+
isIOS: isIOS(),
|
|
5544
|
+
webgpu: typeof navigator !== "undefined" && "gpu" in navigator
|
|
5545
|
+
};
|
|
5422
5546
|
if (config.unifiedWorker) {
|
|
5423
|
-
logger20.info("Creating A2EUnifiedAdapter (via unified worker)",
|
|
5424
|
-
modelUrl
|
|
5425
|
-
});
|
|
5547
|
+
logger20.info("Creating A2EUnifiedAdapter (via unified worker)", platformInfo);
|
|
5426
5548
|
return new A2EUnifiedAdapter(config.unifiedWorker, {
|
|
5427
5549
|
modelUrl,
|
|
5428
5550
|
externalDataUrl: config.externalDataUrl,
|
|
5429
5551
|
numIdentityClasses: config.numIdentityClasses
|
|
5430
5552
|
});
|
|
5431
5553
|
}
|
|
5432
|
-
logger20.info("Creating A2EInference",
|
|
5554
|
+
logger20.info("Creating A2EInference", platformInfo);
|
|
5433
5555
|
return new A2EInference({
|
|
5434
5556
|
modelUrl,
|
|
5435
5557
|
externalDataUrl: config.externalDataUrl,
|
|
@@ -5805,16 +5927,28 @@ async function loadOrt(wasmPaths, isIOSDevice) {
|
|
|
5805
5927
|
// ort.webgpu.min.js crashes WebKit's JIT compiler.
|
|
5806
5928
|
var isSafariWorker = typeof navigator !== 'undefined' && /safari/i.test(navigator.userAgent) && !/chrome|crios|fxios|chromium|edg/i.test(navigator.userAgent);
|
|
5807
5929
|
var hasWebGPU = false;
|
|
5808
|
-
|
|
5930
|
+
var webgpuReason = '';
|
|
5931
|
+
if (isIOSDevice) {
|
|
5932
|
+
webgpuReason = 'iOS device';
|
|
5933
|
+
} else if (isSafariWorker) {
|
|
5934
|
+
webgpuReason = 'Safari (JSEP/ASYNCIFY crash)';
|
|
5935
|
+
} else if (typeof navigator === 'undefined' || !navigator.gpu) {
|
|
5936
|
+
webgpuReason = 'navigator.gpu unavailable';
|
|
5937
|
+
} else {
|
|
5809
5938
|
try {
|
|
5810
5939
|
var adapter = await navigator.gpu.requestAdapter();
|
|
5811
5940
|
if (adapter) {
|
|
5812
5941
|
hasWebGPU = true;
|
|
5942
|
+
} else {
|
|
5943
|
+
webgpuReason = 'requestAdapter returned null';
|
|
5813
5944
|
}
|
|
5814
5945
|
} catch (e) {
|
|
5815
|
-
|
|
5946
|
+
webgpuReason = 'requestAdapter failed: ' + String(e);
|
|
5816
5947
|
}
|
|
5817
5948
|
}
|
|
5949
|
+
if (!hasWebGPU && webgpuReason) {
|
|
5950
|
+
console.warn('[UnifiedWorker] WebGPU unavailable: ' + webgpuReason + ', falling back to WASM');
|
|
5951
|
+
}
|
|
5818
5952
|
|
|
5819
5953
|
var ortUrl;
|
|
5820
5954
|
if (hasWebGPU) {
|
|
@@ -6299,7 +6433,12 @@ var UnifiedInferenceWorker = class {
|
|
|
6299
6433
|
span?.setAttributes({ "worker.init_time_ms": loadTimeMs, "worker.backend": this._workerBackend });
|
|
6300
6434
|
span?.end();
|
|
6301
6435
|
} catch (error) {
|
|
6302
|
-
|
|
6436
|
+
const err = error instanceof Error ? error : new Error(String(error));
|
|
6437
|
+
const isTimeout = err.message.includes("timed out");
|
|
6438
|
+
if (isTimeout) {
|
|
6439
|
+
logger21.error("Worker init timed out", { code: "OMOTE_INF_003", timeoutMs: INIT_TIMEOUT_MS });
|
|
6440
|
+
}
|
|
6441
|
+
span?.endWithError(err);
|
|
6303
6442
|
this.cleanup();
|
|
6304
6443
|
throw error;
|
|
6305
6444
|
}
|
|
@@ -6683,7 +6822,7 @@ var TTSSpeaker = class {
|
|
|
6683
6822
|
async connect(tts, config) {
|
|
6684
6823
|
logger22.info("Connecting TTS...");
|
|
6685
6824
|
const span = getTelemetry()?.startSpan("TTSSpeaker.connect");
|
|
6686
|
-
const connectStart =
|
|
6825
|
+
const connectStart = getClock().now();
|
|
6687
6826
|
this.tts = tts;
|
|
6688
6827
|
if (!tts.isLoaded) {
|
|
6689
6828
|
await tts.load();
|
|
@@ -6692,7 +6831,7 @@ var TTSSpeaker = class {
|
|
|
6692
6831
|
if (!hasLam) {
|
|
6693
6832
|
this._audioOnly = true;
|
|
6694
6833
|
this.scheduler = new AudioScheduler({ sampleRate: tts.sampleRate });
|
|
6695
|
-
getTelemetry()?.recordHistogram(MetricNames.TTS_CONNECT_LATENCY,
|
|
6834
|
+
getTelemetry()?.recordHistogram(MetricNames.TTS_CONNECT_LATENCY, getClock().now() - connectStart);
|
|
6696
6835
|
span?.end();
|
|
6697
6836
|
logger22.info("TTS connected (audio-only mode)");
|
|
6698
6837
|
return;
|
|
@@ -6726,7 +6865,7 @@ var TTSSpeaker = class {
|
|
|
6726
6865
|
neutralTransitionMs: config?.neutralTransitionMs
|
|
6727
6866
|
});
|
|
6728
6867
|
await this.ttsPlayback.initialize();
|
|
6729
|
-
getTelemetry()?.recordHistogram(MetricNames.TTS_CONNECT_LATENCY,
|
|
6868
|
+
getTelemetry()?.recordHistogram(MetricNames.TTS_CONNECT_LATENCY, getClock().now() - connectStart);
|
|
6730
6869
|
span?.end();
|
|
6731
6870
|
logger22.info("TTS connected (lip sync mode)");
|
|
6732
6871
|
}
|
|
@@ -6761,7 +6900,7 @@ var TTSSpeaker = class {
|
|
|
6761
6900
|
const span = getTelemetry()?.startSpan("TTSSpeaker.speak", {
|
|
6762
6901
|
"text.length": text.length
|
|
6763
6902
|
});
|
|
6764
|
-
const speakStart =
|
|
6903
|
+
const speakStart = getClock().now();
|
|
6765
6904
|
try {
|
|
6766
6905
|
if (this._audioOnly) {
|
|
6767
6906
|
await this.speakAudioOnly(text, abort, options?.voice);
|
|
@@ -6771,7 +6910,7 @@ var TTSSpeaker = class {
|
|
|
6771
6910
|
voice: options?.voice
|
|
6772
6911
|
});
|
|
6773
6912
|
}
|
|
6774
|
-
getTelemetry()?.recordHistogram(MetricNames.TTS_SPEAK_LATENCY,
|
|
6913
|
+
getTelemetry()?.recordHistogram(MetricNames.TTS_SPEAK_LATENCY, getClock().now() - speakStart);
|
|
6775
6914
|
span?.end();
|
|
6776
6915
|
} catch (err) {
|
|
6777
6916
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
@@ -6901,42 +7040,42 @@ var TTSSpeaker = class {
|
|
|
6901
7040
|
end: async () => {
|
|
6902
7041
|
if (ended) return;
|
|
6903
7042
|
ended = true;
|
|
6904
|
-
|
|
6905
|
-
|
|
6906
|
-
if (this.currentAbort === abort) this.currentAbort = null;
|
|
6907
|
-
return;
|
|
6908
|
-
}
|
|
6909
|
-
if (buffer.trim()) {
|
|
6910
|
-
enqueueSentence(buffer.trim());
|
|
6911
|
-
buffer = "";
|
|
6912
|
-
}
|
|
6913
|
-
await processChain;
|
|
6914
|
-
if (abort.signal.aborted) {
|
|
6915
|
-
this._isSpeaking = false;
|
|
6916
|
-
if (this.currentAbort === abort) this.currentAbort = null;
|
|
6917
|
-
return;
|
|
6918
|
-
}
|
|
6919
|
-
await pipeline.end();
|
|
6920
|
-
await new Promise((resolve) => {
|
|
6921
|
-
let resolved = false;
|
|
6922
|
-
const done = () => {
|
|
6923
|
-
if (resolved) return;
|
|
6924
|
-
resolved = true;
|
|
6925
|
-
unsubC();
|
|
6926
|
-
unsubS();
|
|
6927
|
-
abort.signal.removeEventListener("abort", done);
|
|
6928
|
-
resolve();
|
|
6929
|
-
};
|
|
7043
|
+
const unsubs = [];
|
|
7044
|
+
try {
|
|
6930
7045
|
if (abort.signal.aborted) {
|
|
6931
|
-
resolve();
|
|
6932
7046
|
return;
|
|
6933
7047
|
}
|
|
6934
|
-
|
|
6935
|
-
|
|
6936
|
-
|
|
6937
|
-
|
|
6938
|
-
|
|
6939
|
-
|
|
7048
|
+
if (buffer.trim()) {
|
|
7049
|
+
enqueueSentence(buffer.trim());
|
|
7050
|
+
buffer = "";
|
|
7051
|
+
}
|
|
7052
|
+
await processChain;
|
|
7053
|
+
if (abort.signal.aborted) {
|
|
7054
|
+
return;
|
|
7055
|
+
}
|
|
7056
|
+
await pipeline.end();
|
|
7057
|
+
await new Promise((resolve) => {
|
|
7058
|
+
let resolved = false;
|
|
7059
|
+
const done = () => {
|
|
7060
|
+
if (resolved) return;
|
|
7061
|
+
resolved = true;
|
|
7062
|
+
resolve();
|
|
7063
|
+
};
|
|
7064
|
+
if (abort.signal.aborted) {
|
|
7065
|
+
resolve();
|
|
7066
|
+
return;
|
|
7067
|
+
}
|
|
7068
|
+
unsubs.push(pipeline.on("playback:complete", done));
|
|
7069
|
+
unsubs.push(pipeline.on("playback:stop", done));
|
|
7070
|
+
const onAbort = () => done();
|
|
7071
|
+
abort.signal.addEventListener("abort", onAbort);
|
|
7072
|
+
unsubs.push(() => abort.signal.removeEventListener("abort", onAbort));
|
|
7073
|
+
});
|
|
7074
|
+
} finally {
|
|
7075
|
+
unsubs.forEach((fn) => fn());
|
|
7076
|
+
this._isSpeaking = false;
|
|
7077
|
+
if (this.currentAbort === abort) this.currentAbort = null;
|
|
7078
|
+
}
|
|
6940
7079
|
}
|
|
6941
7080
|
};
|
|
6942
7081
|
}
|
|
@@ -7627,14 +7766,14 @@ function createKokoroTTS(config = {}) {
|
|
|
7627
7766
|
logger24.info("iOS + unified worker: creating KokoroTTSUnifiedAdapter (off-main-thread ONNX)");
|
|
7628
7767
|
return new KokoroTTSUnifiedAdapter(config.unifiedWorker, config);
|
|
7629
7768
|
}
|
|
7630
|
-
logger24.info("iOS
|
|
7769
|
+
logger24.info("iOS: creating KokoroTTSInference (main thread, shared ORT)");
|
|
7631
7770
|
return new KokoroTTSInference(config);
|
|
7632
7771
|
}
|
|
7633
7772
|
if (!KokoroTTSWorker.isSupported()) {
|
|
7634
7773
|
logger24.info("Worker not supported: creating KokoroTTSInference (main thread)");
|
|
7635
7774
|
return new KokoroTTSInference(config);
|
|
7636
7775
|
}
|
|
7637
|
-
logger24.info("Auto
|
|
7776
|
+
logger24.info("Auto: creating KokoroTTSWorker (off-main-thread)");
|
|
7638
7777
|
return new KokoroTTSWorker(config);
|
|
7639
7778
|
}
|
|
7640
7779
|
|
|
@@ -7868,6 +8007,9 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
7868
8007
|
// so all future transcribe() calls reject immediately to prevent concurrent access.
|
|
7869
8008
|
this.poisoned = false;
|
|
7870
8009
|
// 10s for SenseVoice (heavier preprocessing)
|
|
8010
|
+
// WebGPU shape change tracking (for dynamic shape warning)
|
|
8011
|
+
this.lastLfrFrames = 0;
|
|
8012
|
+
this.webgpuShapeWarned = false;
|
|
7871
8013
|
// Preprocessing state (loaded once)
|
|
7872
8014
|
this.tokenMap = null;
|
|
7873
8015
|
this.negMean = null;
|
|
@@ -7902,7 +8044,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
7902
8044
|
throw new Error("Model already loaded. Call dispose() first.");
|
|
7903
8045
|
}
|
|
7904
8046
|
this.isLoading = true;
|
|
7905
|
-
const startTime =
|
|
8047
|
+
const startTime = getClock().now();
|
|
7906
8048
|
const telemetry = getTelemetry();
|
|
7907
8049
|
const span = telemetry?.startSpan("SenseVoice.load", {
|
|
7908
8050
|
"model.url": this.config.modelUrl,
|
|
@@ -7969,7 +8111,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
7969
8111
|
} catch (cmvnErr) {
|
|
7970
8112
|
logger25.warn("Failed to read CMVN from model metadata", { error: cmvnErr });
|
|
7971
8113
|
}
|
|
7972
|
-
const loadTimeMs =
|
|
8114
|
+
const loadTimeMs = getClock().now() - startTime;
|
|
7973
8115
|
logger25.info("SenseVoice model loaded", {
|
|
7974
8116
|
backend: this._backend,
|
|
7975
8117
|
loadTimeMs: Math.round(loadTimeMs),
|
|
@@ -8034,24 +8176,35 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
8034
8176
|
"inference.input_samples": audio.length
|
|
8035
8177
|
});
|
|
8036
8178
|
try {
|
|
8037
|
-
const startTime =
|
|
8038
|
-
const preprocessStart =
|
|
8179
|
+
const startTime = getClock().now();
|
|
8180
|
+
const preprocessStart = getClock().now();
|
|
8039
8181
|
const fbank = computeKaldiFbank(audio, 16e3, 80);
|
|
8040
8182
|
const numFrames = fbank.length / 80;
|
|
8041
8183
|
if (numFrames === 0) {
|
|
8042
8184
|
resolve({
|
|
8043
8185
|
text: "",
|
|
8044
|
-
inferenceTimeMs:
|
|
8045
|
-
preprocessTimeMs:
|
|
8186
|
+
inferenceTimeMs: getClock().now() - startTime,
|
|
8187
|
+
preprocessTimeMs: getClock().now() - preprocessStart
|
|
8046
8188
|
});
|
|
8047
8189
|
return;
|
|
8048
8190
|
}
|
|
8049
8191
|
const lfrFeatures = applyLFR(fbank, 80, 7, 6);
|
|
8050
8192
|
const numLfrFrames = lfrFeatures.length / 560;
|
|
8193
|
+
if (this._backend === "webgpu" && this.lastLfrFrames !== 0 && numLfrFrames !== this.lastLfrFrames) {
|
|
8194
|
+
if (!this.webgpuShapeWarned) {
|
|
8195
|
+
this.webgpuShapeWarned = true;
|
|
8196
|
+
logger25.warn("SenseVoice running on WebGPU with variable audio shapes \u2014 risk of kernel crash", {
|
|
8197
|
+
code: ErrorCodes.INF_SHAPE_MISMATCH,
|
|
8198
|
+
previousFrames: this.lastLfrFrames,
|
|
8199
|
+
currentFrames: numLfrFrames
|
|
8200
|
+
});
|
|
8201
|
+
}
|
|
8202
|
+
}
|
|
8203
|
+
this.lastLfrFrames = numLfrFrames;
|
|
8051
8204
|
if (this.negMean && this.invStddev) {
|
|
8052
8205
|
applyCMVN(lfrFeatures, 560, this.negMean, this.invStddev);
|
|
8053
8206
|
}
|
|
8054
|
-
const preprocessTimeMs =
|
|
8207
|
+
const preprocessTimeMs = getClock().now() - preprocessStart;
|
|
8055
8208
|
const ort = this.ort;
|
|
8056
8209
|
const feeds = {
|
|
8057
8210
|
x: new ort.Tensor("float32", lfrFeatures, [1, numLfrFrames, 560]),
|
|
@@ -8081,7 +8234,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
8081
8234
|
const seqLen = logitsDims[1];
|
|
8082
8235
|
const vocabSize = logitsDims[2];
|
|
8083
8236
|
const decoded = ctcGreedyDecode(logitsData, seqLen, vocabSize, this.tokenMap);
|
|
8084
|
-
const inferenceTimeMs =
|
|
8237
|
+
const inferenceTimeMs = getClock().now() - startTime;
|
|
8085
8238
|
logger25.trace("Transcription complete", {
|
|
8086
8239
|
text: decoded.text.substring(0, 50),
|
|
8087
8240
|
language: decoded.language,
|
|
@@ -9420,7 +9573,7 @@ var SileroVADInference = class {
|
|
|
9420
9573
|
throw new Error("Model already loaded. Call dispose() first.");
|
|
9421
9574
|
}
|
|
9422
9575
|
this.isLoading = true;
|
|
9423
|
-
const startTime =
|
|
9576
|
+
const startTime = getClock().now();
|
|
9424
9577
|
const telemetry = getTelemetry();
|
|
9425
9578
|
const span = telemetry?.startSpan("SileroVAD.load", {
|
|
9426
9579
|
"model.url": this.config.modelUrl,
|
|
@@ -9452,7 +9605,7 @@ var SileroVADInference = class {
|
|
|
9452
9605
|
const modelData = new Uint8Array(modelBuffer);
|
|
9453
9606
|
this.session = await ort.InferenceSession.create(modelData, sessionOptions);
|
|
9454
9607
|
this.reset();
|
|
9455
|
-
const loadTimeMs =
|
|
9608
|
+
const loadTimeMs = getClock().now() - startTime;
|
|
9456
9609
|
logger28.info("Model loaded successfully", {
|
|
9457
9610
|
backend: this._backend,
|
|
9458
9611
|
loadTimeMs: Math.round(loadTimeMs),
|
|
@@ -9632,7 +9785,7 @@ var SileroVADInference = class {
|
|
|
9632
9785
|
"inference.chunk_size": this.chunkSize
|
|
9633
9786
|
});
|
|
9634
9787
|
try {
|
|
9635
|
-
const startTime =
|
|
9788
|
+
const startTime = getClock().now();
|
|
9636
9789
|
const inputSize = this.contextSize + this.chunkSize;
|
|
9637
9790
|
const inputBuffer = new Float32Array(inputSize);
|
|
9638
9791
|
inputBuffer.set(this.context, 0);
|
|
@@ -9662,7 +9815,7 @@ var SileroVADInference = class {
|
|
|
9662
9815
|
);
|
|
9663
9816
|
}
|
|
9664
9817
|
this.context = audioChunkCopy.slice(-this.contextSize);
|
|
9665
|
-
const inferenceTimeMs =
|
|
9818
|
+
const inferenceTimeMs = getClock().now() - startTime;
|
|
9666
9819
|
const isSpeech = probability > this.config.threshold;
|
|
9667
9820
|
let preSpeechChunks;
|
|
9668
9821
|
if (isSpeech && !this.wasSpeaking) {
|
|
@@ -10477,6 +10630,7 @@ var _SpeechListener = class _SpeechListener extends EventEmitter {
|
|
|
10477
10630
|
this.lastProgressiveSamples = 0;
|
|
10478
10631
|
// ASR error recovery
|
|
10479
10632
|
this.asrErrorCount = 0;
|
|
10633
|
+
this.progressiveErrorCount = 0;
|
|
10480
10634
|
this.config = config ?? {};
|
|
10481
10635
|
}
|
|
10482
10636
|
/** Current listener state */
|
|
@@ -10669,7 +10823,7 @@ var _SpeechListener = class _SpeechListener extends EventEmitter {
|
|
|
10669
10823
|
if (result.isSpeech) {
|
|
10670
10824
|
if (!wasSpeaking) {
|
|
10671
10825
|
this.isSpeechActive = true;
|
|
10672
|
-
this.speechStartTime =
|
|
10826
|
+
this.speechStartTime = getClock().now();
|
|
10673
10827
|
this.audioBuffer = [];
|
|
10674
10828
|
this.audioBufferSamples = 0;
|
|
10675
10829
|
this.lastProgressiveResult = null;
|
|
@@ -10708,13 +10862,13 @@ var _SpeechListener = class _SpeechListener extends EventEmitter {
|
|
|
10708
10862
|
const extended = this.config.silenceTimeoutExtendedMs ?? 700;
|
|
10709
10863
|
const adaptive = this.config.adaptiveTimeout ?? true;
|
|
10710
10864
|
if (!adaptive) return base;
|
|
10711
|
-
const speechDurationMs =
|
|
10865
|
+
const speechDurationMs = getClock().now() - this.speechStartTime;
|
|
10712
10866
|
return speechDurationMs > 3e3 ? extended : base;
|
|
10713
10867
|
}
|
|
10714
10868
|
onSilenceDetected() {
|
|
10715
10869
|
const capturedEpoch = this.epoch;
|
|
10716
10870
|
this.isSpeechActive = false;
|
|
10717
|
-
const durationMs =
|
|
10871
|
+
const durationMs = getClock().now() - this.speechStartTime;
|
|
10718
10872
|
logger31.debug("Speech end", { durationMs: Math.round(durationMs) });
|
|
10719
10873
|
this.emit("speech:end", { durationMs });
|
|
10720
10874
|
this.clearSilenceTimer();
|
|
@@ -10811,7 +10965,15 @@ var _SpeechListener = class _SpeechListener extends EventEmitter {
|
|
|
10811
10965
|
this.lastProgressiveSamples = snapshotSamples;
|
|
10812
10966
|
this.emit("transcript", { ...result, isFinal: false });
|
|
10813
10967
|
}
|
|
10814
|
-
} catch {
|
|
10968
|
+
} catch (err) {
|
|
10969
|
+
this.progressiveErrorCount = (this.progressiveErrorCount ?? 0) + 1;
|
|
10970
|
+
if (this.progressiveErrorCount % 10 === 1) {
|
|
10971
|
+
logger31.warn("Progressive transcription error", {
|
|
10972
|
+
code: ErrorCodes.SPH_ASR_ERROR,
|
|
10973
|
+
error: String(err),
|
|
10974
|
+
count: this.progressiveErrorCount
|
|
10975
|
+
});
|
|
10976
|
+
}
|
|
10815
10977
|
}
|
|
10816
10978
|
})();
|
|
10817
10979
|
}, intervalMs);
|
|
@@ -10828,7 +10990,7 @@ var _SpeechListener = class _SpeechListener extends EventEmitter {
|
|
|
10828
10990
|
async transcribeWithTimeout(audio) {
|
|
10829
10991
|
if (!this.asr) return null;
|
|
10830
10992
|
const timeoutMs = this.config.transcriptionTimeoutMs ?? 1e4;
|
|
10831
|
-
const startTime =
|
|
10993
|
+
const startTime = getClock().now();
|
|
10832
10994
|
const span = getTelemetry()?.startSpan("SpeechListener.transcribe", {
|
|
10833
10995
|
"inference.input_samples": audio.length,
|
|
10834
10996
|
"inference.input_duration_ms": audio.length / 16e3 * 1e3
|
|
@@ -10842,7 +11004,7 @@ var _SpeechListener = class _SpeechListener extends EventEmitter {
|
|
|
10842
11004
|
})
|
|
10843
11005
|
]);
|
|
10844
11006
|
clearTimeout(timeoutId);
|
|
10845
|
-
const latency =
|
|
11007
|
+
const latency = getClock().now() - startTime;
|
|
10846
11008
|
this.asrErrorCount = 0;
|
|
10847
11009
|
getTelemetry()?.recordHistogram(MetricNames.VOICE_TRANSCRIPTION_LATENCY, latency);
|
|
10848
11010
|
getTelemetry()?.incrementCounter(MetricNames.VOICE_TRANSCRIPTIONS);
|
|
@@ -11016,11 +11178,11 @@ var InterruptionHandler = class extends EventEmitter {
|
|
|
11016
11178
|
getState() {
|
|
11017
11179
|
return {
|
|
11018
11180
|
isSpeaking: this.isSpeaking,
|
|
11019
|
-
speechDurationMs: this.isSpeaking ?
|
|
11181
|
+
speechDurationMs: this.isSpeaking ? getClock().now() - this.speechStartTime : 0
|
|
11020
11182
|
};
|
|
11021
11183
|
}
|
|
11022
11184
|
onSpeechDetected(rms) {
|
|
11023
|
-
const now =
|
|
11185
|
+
const now = getClock().now();
|
|
11024
11186
|
this.lastSpeechTime = now;
|
|
11025
11187
|
if (this.silenceTimer) {
|
|
11026
11188
|
clearTimeout(this.silenceTimer);
|
|
@@ -11237,7 +11399,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
11237
11399
|
this.setupEventHandlers();
|
|
11238
11400
|
this.recognition.start();
|
|
11239
11401
|
this.isListening = true;
|
|
11240
|
-
this.startTime =
|
|
11402
|
+
this.startTime = getClock().now();
|
|
11241
11403
|
this.accumulatedText = "";
|
|
11242
11404
|
logger33.info("Speech recognition started", {
|
|
11243
11405
|
language: this.config.language
|
|
@@ -11338,7 +11500,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
11338
11500
|
const speechResult = {
|
|
11339
11501
|
text: isFinal ? this.accumulatedText.trim() : text,
|
|
11340
11502
|
language: this.config.language,
|
|
11341
|
-
inferenceTimeMs:
|
|
11503
|
+
inferenceTimeMs: getClock().now() - this.startTime,
|
|
11342
11504
|
isFinal,
|
|
11343
11505
|
confidence: alternative.confidence
|
|
11344
11506
|
};
|
|
@@ -11370,13 +11532,13 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
11370
11532
|
this.isListening = false;
|
|
11371
11533
|
logger33.info("Speech recognition ended", {
|
|
11372
11534
|
totalText: this.accumulatedText.length,
|
|
11373
|
-
durationMs:
|
|
11535
|
+
durationMs: getClock().now() - this.startTime
|
|
11374
11536
|
});
|
|
11375
11537
|
if (this.stopResolver) {
|
|
11376
11538
|
const result = {
|
|
11377
11539
|
text: this.accumulatedText.trim(),
|
|
11378
11540
|
language: this.config.language,
|
|
11379
|
-
inferenceTimeMs:
|
|
11541
|
+
inferenceTimeMs: getClock().now() - this.startTime,
|
|
11380
11542
|
isFinal: true
|
|
11381
11543
|
};
|
|
11382
11544
|
this.stopResolver(result);
|
|
@@ -11420,6 +11582,303 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
11420
11582
|
}
|
|
11421
11583
|
};
|
|
11422
11584
|
|
|
11585
|
+
// src/inference/ElevenLabsTTSBackend.ts
var logger34 = createLogger("ElevenLabsTTS");
var DEFAULT_MODEL = "eleven_multilingual_v2";
var DEFAULT_OUTPUT_FORMAT = "pcm_16000";
var DEFAULT_STABILITY = 0.5;
var DEFAULT_SIMILARITY_BOOST = 0.75;
var DEFAULT_BASE_URL = "https://api.elevenlabs.io";
// Supported raw-PCM output formats mapped to their sample rates (Hz).
var FORMAT_TO_SAMPLE_RATE = {
  pcm_16000: 16e3,
  pcm_22050: 22050,
  pcm_24000: 24e3,
  pcm_44100: 44100
};
/**
 * Cloud TTS backend backed by the ElevenLabs text-to-speech streaming API.
 *
 * Implements the same backend contract as the local TTS engines:
 * `load()` / `isLoaded` / `sampleRate` / `stream()` / `dispose()`.
 * Audio is requested as raw PCM16 and converted to Float32 chunks.
 */
var ElevenLabsTTSBackend = class {
  /**
   * @param config - requires `apiKey` and `voiceId`; optional `model`,
   *   `outputFormat` (one of FORMAT_TO_SAMPLE_RATE keys), `stability`,
   *   `similarityBoost`, `baseUrl`.
   * @throws Error when apiKey/voiceId are missing or outputFormat is unsupported.
   */
  constructor(config) {
    this._isLoaded = false;
    if (!config.apiKey) throw new Error("ElevenLabsTTS: apiKey is required");
    if (!config.voiceId) throw new Error("ElevenLabsTTS: voiceId is required");
    this.apiKey = config.apiKey;
    this.voiceId = config.voiceId;
    this.model = config.model ?? DEFAULT_MODEL;
    this.outputFormat = config.outputFormat ?? DEFAULT_OUTPUT_FORMAT;
    this.stability = config.stability ?? DEFAULT_STABILITY;
    this.similarityBoost = config.similarityBoost ?? DEFAULT_SIMILARITY_BOOST;
    this.baseUrl = config.baseUrl ?? DEFAULT_BASE_URL;
    const rate = FORMAT_TO_SAMPLE_RATE[this.outputFormat];
    if (!rate) {
      throw new Error(
        `ElevenLabsTTS: unsupported outputFormat "${this.outputFormat}". Supported: ${Object.keys(FORMAT_TO_SAMPLE_RATE).join(", ")}`
      );
    }
    this._sampleRate = rate;
  }
  /** Sample rate (Hz) implied by the configured PCM output format. */
  get sampleRate() {
    return this._sampleRate;
  }
  /** True once load() has been called and dispose() has not. */
  get isLoaded() {
    return this._isLoaded;
  }
  // ─── Load ───────────────────────────────────────────────────────────────
  /**
   * No-op for cloud TTS (no model to load).
   * Marks backend as ready.
   */
  async load() {
    this._isLoaded = true;
    logger34.info("ElevenLabs TTS ready", { voiceId: this.voiceId, model: this.model });
  }
  // ─── Stream ─────────────────────────────────────────────────────────────
  /**
   * Stream audio from ElevenLabs for the given text.
   *
   * Uses the streaming endpoint. Yields a single chunk for non-streaming
   * or multiple chunks as response data arrives.
   *
   * @param text - text to synthesize (must be non-empty after trim)
   * @param options - optional `{ signal }` AbortSignal for cancellation
   * @yields `{ audio: Float32Array, duration: number, text: string }`
   * @throws Error on HTTP failure or empty text; AbortError is swallowed.
   */
  async *stream(text, options) {
    if (!this._isLoaded) {
      throw new Error("ElevenLabsTTS: not loaded. Call load() first.");
    }
    const trimmed = text.trim();
    if (trimmed.length === 0) {
      throw new Error("ElevenLabsTTS: text must not be empty");
    }
    const startTime = getClock().now();
    const telemetry = getTelemetry();
    const span = telemetry?.startSpan("ElevenLabsTTS.stream", {
      "tts.text_length": trimmed.length,
      "tts.voice_id": this.voiceId,
      "tts.model": this.model
    });
    const url = `${this.baseUrl}/v1/text-to-speech/${this.voiceId}?output_format=${this.outputFormat}`;
    try {
      const response = await fetch(url, {
        method: "POST",
        headers: {
          "xi-api-key": this.apiKey,
          "Content-Type": "application/json",
          Accept: "audio/pcm"
        },
        body: JSON.stringify({
          text: trimmed,
          model_id: this.model,
          voice_settings: {
            stability: this.stability,
            similarity_boost: this.similarityBoost
          }
        }),
        signal: options?.signal
      });
      if (!response.ok) {
        const errorText = await response.text().catch(() => "unknown");
        const msg = `ElevenLabsTTS: HTTP ${response.status} \u2014 ${this.getHttpErrorMessage(response.status, errorText)}`;
        logger34.error(msg);
        throw new Error(msg);
      }
      if (!response.body) {
        // Non-streaming environments: buffer the whole response and yield once.
        const buffer = await response.arrayBuffer();
        const audio = pcm16ToFloat32(buffer);
        const duration = audio.length / this._sampleRate;
        const latency2 = getClock().now() - startTime;
        span?.setAttributes({ "tts.duration_s": duration, "tts.latency_ms": latency2 });
        span?.end();
        telemetry?.recordHistogram("omote.inference.latency", latency2, {
          model: "elevenlabs-tts",
          backend: "cloud"
        });
        yield { audio, duration, text: trimmed };
        return;
      }
      const reader = response.body.getReader();
      let totalSamples = 0;
      // PCM16 samples are 2 bytes and network reads can split a sample across
      // chunks. Carry the odd trailing byte into the next chunk instead of
      // dropping it (dropping would lose data AND desync every later sample).
      let carry = null;
      try {
        while (true) {
          if (options?.signal?.aborted) {
            reader.cancel();
            logger34.debug("Stream aborted by signal");
            // FIX: end the span on this manual-abort path too (it was only
            // ended on the fetch AbortError path, leaking the span here).
            span?.end();
            return;
          }
          const { done, value } = await reader.read();
          if (done) break;
          if (value && value.byteLength > 0) {
            let chunk = value;
            if (carry) {
              const merged = new Uint8Array(carry.byteLength + value.byteLength);
              merged.set(carry, 0);
              merged.set(value, carry.byteLength);
              chunk = merged;
              carry = null;
            }
            const usableBytes = chunk.byteLength & ~1;
            if (chunk.byteLength > usableBytes) {
              // Keep the unpaired trailing byte for the next iteration.
              carry = chunk.slice(usableBytes);
            }
            if (usableBytes === 0) continue;
            const audio = pcm16ToFloat32(chunk.buffer.slice(chunk.byteOffset, chunk.byteOffset + usableBytes));
            const duration = audio.length / this._sampleRate;
            totalSamples += audio.length;
            yield { audio, duration, text: trimmed };
          }
        }
      } finally {
        reader.releaseLock();
      }
      const latency = getClock().now() - startTime;
      const totalDuration = totalSamples / this._sampleRate;
      logger34.debug("Stream complete", {
        totalDuration: `${totalDuration.toFixed(2)}s`,
        latencyMs: Math.round(latency),
        totalSamples
      });
      span?.setAttributes({ "tts.duration_s": totalDuration, "tts.latency_ms": latency });
      span?.end();
      telemetry?.recordHistogram("omote.inference.latency", latency, {
        model: "elevenlabs-tts",
        backend: "cloud"
      });
      telemetry?.incrementCounter("omote.inference.total", 1, {
        model: "elevenlabs-tts",
        backend: "cloud",
        status: "success"
      });
    } catch (err) {
      if (err instanceof DOMException && err.name === "AbortError") {
        logger34.debug("Stream aborted");
        span?.end();
        return;
      }
      const errMsg = err instanceof Error ? err.message : String(err);
      logger34.error("Stream failed", { error: errMsg });
      span?.endWithError(err instanceof Error ? err : new Error(String(err)));
      telemetry?.incrementCounter("omote.inference.total", 1, {
        model: "elevenlabs-tts",
        backend: "cloud",
        status: "error"
      });
      throw err;
    }
  }
  // ─── Dispose ────────────────────────────────────────────────────────────
  /** Marks the backend unusable; no network resources to release. */
  async dispose() {
    this._isLoaded = false;
    logger34.info("ElevenLabs TTS disposed");
  }
  // ─── Private ────────────────────────────────────────────────────────────
  /** Maps an HTTP status (and response body) to a human-readable error hint. */
  getHttpErrorMessage(status, body) {
    switch (status) {
      case 401:
        return "Unauthorized \u2014 check your API key";
      case 403:
        return "Forbidden \u2014 API key lacks required permissions";
      case 429:
        return "Rate limited \u2014 too many requests";
      case 400:
        return `Bad request \u2014 ${body}`;
      default:
        return body || `HTTP error ${status}`;
    }
  }
};
|
|
11773
|
+
|
|
11774
|
+
// src/inference/PollyTTSBackend.ts
var logger35 = createLogger("PollyTTS");
var DEFAULT_VOICE = "Joanna";
var DEFAULT_SAMPLE_RATE = 16e3;
/**
 * Cloud TTS backend delegating to a consumer-supplied Amazon Polly
 * synthesize function. Conforms to the shared backend contract:
 * `load()` / `isLoaded` / `sampleRate` / `stream()` / `dispose()`.
 */
var PollyTTSBackend = class {
  /**
   * @param config - requires `synthesizeFn(text, voice, sampleRate)`;
   *   optional `voice`, `sampleRate`, `engine`.
   * @throws Error when synthesizeFn is missing.
   */
  constructor(config) {
    this._isLoaded = false;
    if (!config.synthesizeFn) {
      throw new Error("PollyTTS: synthesizeFn is required");
    }
    const { synthesizeFn, voice, sampleRate, engine } = config;
    this.synthesizeFn = synthesizeFn;
    this.voice = voice ?? DEFAULT_VOICE;
    this._sampleRate = sampleRate ?? DEFAULT_SAMPLE_RATE;
    this.engine = engine ?? "neural";
  }
  /** Configured output sample rate (Hz). */
  get sampleRate() {
    return this._sampleRate;
  }
  /** True once load() has been called and dispose() has not. */
  get isLoaded() {
    return this._isLoaded;
  }
  // ─── Load ───────────────────────────────────────────────────────────────
  /**
   * No-op for cloud TTS (no model to load).
   * Marks backend as ready.
   */
  async load() {
    this._isLoaded = true;
    logger35.info("Polly TTS ready", { voice: this.voice, engine: this.engine, sampleRate: this._sampleRate });
  }
  // ─── Stream ─────────────────────────────────────────────────────────────
  /**
   * Synthesize audio via consumer's Polly function.
   *
   * Polly's SynthesizeSpeech is request/response (not streaming for PCM),
   * so this yields a single chunk per call. For long text, consider splitting
   * into sentences on the consumer side.
   *
   * @param text - text to synthesize (must be non-empty after trim)
   * @param options - optional `{ signal, voice }`
   * @yields one `{ audio: Float32Array, duration: number, text: string }`
   * @throws Error on synthesis failure; aborts return silently.
   */
  async *stream(text, options) {
    if (!this._isLoaded) {
      throw new Error("PollyTTS: not loaded. Call load() first.");
    }
    const normalized = text.trim();
    if (normalized.length === 0) {
      throw new Error("PollyTTS: text must not be empty");
    }
    // Bail out before doing any work if the caller already cancelled.
    if (options?.signal?.aborted) {
      return;
    }
    const voiceName = options?.voice ?? this.voice;
    const startTime = getClock().now();
    const telemetry = getTelemetry();
    const span = telemetry?.startSpan("PollyTTS.stream", {
      "tts.text_length": normalized.length,
      "tts.voice": voiceName,
      "tts.engine": this.engine
    });
    try {
      const synthesis = await this.synthesizeFn(normalized, voiceName, this._sampleRate);
      // The caller may have aborted while the request was in flight;
      // discard the result rather than yielding stale audio.
      if (options?.signal?.aborted) {
        span?.end();
        return;
      }
      const audio = pcm16ToFloat32(synthesis.audio);
      const duration = audio.length / this._sampleRate;
      const latency = getClock().now() - startTime;
      logger35.debug("Synthesis complete", {
        voice: voiceName,
        duration: `${duration.toFixed(2)}s`,
        latencyMs: Math.round(latency),
        numSamples: audio.length
      });
      span?.setAttributes({ "tts.duration_s": duration, "tts.latency_ms": latency });
      span?.end();
      telemetry?.recordHistogram("omote.inference.latency", latency, {
        model: "polly-tts",
        backend: "cloud"
      });
      telemetry?.incrementCounter("omote.inference.total", 1, {
        model: "polly-tts",
        backend: "cloud",
        status: "success"
      });
      yield { audio, duration, text: normalized };
    } catch (err) {
      const aborted = err instanceof DOMException && err.name === "AbortError";
      if (aborted) {
        logger35.debug("Synthesis aborted");
        span?.end();
        return;
      }
      const failure = err instanceof Error ? err : new Error(String(err));
      logger35.error("Synthesis failed", { error: err instanceof Error ? err.message : String(err) });
      span?.endWithError(failure);
      telemetry?.incrementCounter("omote.inference.total", 1, {
        model: "polly-tts",
        backend: "cloud",
        status: "error"
      });
      throw err;
    }
  }
  // ─── Dispose ────────────────────────────────────────────────────────────
  /** Marks the backend unusable; nothing external to release. */
  async dispose() {
    this._isLoaded = false;
    logger35.info("Polly TTS disposed");
  }
};
|
|
11881
|
+
|
|
11423
11882
|
// src/inference/ortConfig.ts
|
|
11424
11883
|
var ortCdnBase = null;
|
|
11425
11884
|
function configureOrtCdn(cdnPath) {
|
|
@@ -11433,7 +11892,7 @@ function getOrtCdnBase() {
|
|
|
11433
11892
|
}
|
|
11434
11893
|
|
|
11435
11894
|
// src/emotion/Emotion.ts
|
|
11436
|
-
var
|
|
11895
|
+
var logger36 = createLogger("EmotionController");
|
|
11437
11896
|
var EMOTION_NAMES = [
|
|
11438
11897
|
"amazement",
|
|
11439
11898
|
"anger",
|
|
@@ -11455,7 +11914,7 @@ function createEmotionVector(weights = {}) {
|
|
|
11455
11914
|
if (idx >= 0) {
|
|
11456
11915
|
vector[idx] = Math.max(0, Math.min(1, value));
|
|
11457
11916
|
} else {
|
|
11458
|
-
|
|
11917
|
+
logger36.warn(`Invalid emotion name in createEmotionVector: "${name}"`);
|
|
11459
11918
|
}
|
|
11460
11919
|
}
|
|
11461
11920
|
return vector;
|
|
@@ -11538,7 +11997,7 @@ var EmotionController = class {
|
|
|
11538
11997
|
this.targetEmotion.set(newEmotion);
|
|
11539
11998
|
this.currentEmotion.set(newEmotion);
|
|
11540
11999
|
this.transitionProgress = 1;
|
|
11541
|
-
|
|
12000
|
+
logger36.debug("set", { weights });
|
|
11542
12001
|
}
|
|
11543
12002
|
/**
|
|
11544
12003
|
* Set emotion from preset immediately
|
|
@@ -11548,7 +12007,7 @@ var EmotionController = class {
|
|
|
11548
12007
|
this.targetEmotion.set(newEmotion);
|
|
11549
12008
|
this.currentEmotion.set(newEmotion);
|
|
11550
12009
|
this.transitionProgress = 1;
|
|
11551
|
-
|
|
12010
|
+
logger36.debug("setPreset", { preset });
|
|
11552
12011
|
}
|
|
11553
12012
|
/**
|
|
11554
12013
|
* Transition to new emotion over time
|
|
@@ -11560,9 +12019,9 @@ var EmotionController = class {
|
|
|
11560
12019
|
this.currentEmotion.set(this.emotion);
|
|
11561
12020
|
this.targetEmotion.set(createEmotionVector(weights));
|
|
11562
12021
|
this.transitionDuration = durationMs;
|
|
11563
|
-
this.transitionStartTime =
|
|
12022
|
+
this.transitionStartTime = getClock().now();
|
|
11564
12023
|
this.transitionProgress = 0;
|
|
11565
|
-
|
|
12024
|
+
logger36.debug("transitionTo", { weights, durationMs });
|
|
11566
12025
|
}
|
|
11567
12026
|
/**
|
|
11568
12027
|
* Transition to preset over time
|
|
@@ -11571,7 +12030,7 @@ var EmotionController = class {
|
|
|
11571
12030
|
this.currentEmotion.set(this.emotion);
|
|
11572
12031
|
this.targetEmotion.set(getEmotionPreset(preset));
|
|
11573
12032
|
this.transitionDuration = durationMs;
|
|
11574
|
-
this.transitionStartTime =
|
|
12033
|
+
this.transitionStartTime = getClock().now();
|
|
11575
12034
|
this.transitionProgress = 0;
|
|
11576
12035
|
}
|
|
11577
12036
|
/**
|
|
@@ -11579,7 +12038,7 @@ var EmotionController = class {
|
|
|
11579
12038
|
*/
|
|
11580
12039
|
update() {
|
|
11581
12040
|
if (this.transitionProgress >= 1) return;
|
|
11582
|
-
const elapsed =
|
|
12041
|
+
const elapsed = getClock().now() - this.transitionStartTime;
|
|
11583
12042
|
this.transitionProgress = Math.min(1, elapsed / this.transitionDuration);
|
|
11584
12043
|
}
|
|
11585
12044
|
/**
|
|
@@ -11595,7 +12054,7 @@ var EmotionController = class {
|
|
|
11595
12054
|
this.currentEmotion.fill(0);
|
|
11596
12055
|
this.targetEmotion.fill(0);
|
|
11597
12056
|
this.transitionProgress = 1;
|
|
11598
|
-
|
|
12057
|
+
logger36.debug("reset");
|
|
11599
12058
|
}
|
|
11600
12059
|
};
|
|
11601
12060
|
|
|
@@ -11676,7 +12135,7 @@ var DEFAULT_ANIMATION_CONFIG = {
|
|
|
11676
12135
|
};
|
|
11677
12136
|
|
|
11678
12137
|
// src/animation/AnimationGraph.ts
|
|
11679
|
-
var
|
|
12138
|
+
var logger37 = createLogger("AnimationGraph");
|
|
11680
12139
|
var AnimationGraph = class extends EventEmitter {
|
|
11681
12140
|
constructor(config = {}) {
|
|
11682
12141
|
super();
|
|
@@ -11709,7 +12168,7 @@ var AnimationGraph = class extends EventEmitter {
|
|
|
11709
12168
|
this.stateEnterTime = Date.now();
|
|
11710
12169
|
this.lastUpdateTime = Date.now();
|
|
11711
12170
|
this.cachedOutput = this.computeOutput();
|
|
11712
|
-
|
|
12171
|
+
logger37.info("constructor", {
|
|
11713
12172
|
initialState: this.config.initialState,
|
|
11714
12173
|
stateCount: this.config.states.length,
|
|
11715
12174
|
transitionCount: this.config.transitions.length
|
|
@@ -11780,7 +12239,7 @@ var AnimationGraph = class extends EventEmitter {
|
|
|
11780
12239
|
setState(stateName, blendDuration = 300) {
|
|
11781
12240
|
const targetState = this.config.states.find((s) => s.name === stateName);
|
|
11782
12241
|
if (!targetState) {
|
|
11783
|
-
|
|
12242
|
+
logger37.warn(`State '${stateName}' not found`);
|
|
11784
12243
|
return;
|
|
11785
12244
|
}
|
|
11786
12245
|
if (targetState.name === this.currentState.name && !this.isTransitioning) {
|
|
@@ -11858,7 +12317,7 @@ var AnimationGraph = class extends EventEmitter {
|
|
|
11858
12317
|
(s) => s.name === transition.to
|
|
11859
12318
|
);
|
|
11860
12319
|
if (!targetState) {
|
|
11861
|
-
|
|
12320
|
+
logger37.warn(`Target state '${transition.to}' not found`);
|
|
11862
12321
|
return;
|
|
11863
12322
|
}
|
|
11864
12323
|
const fromState = this.currentState.name;
|
|
@@ -11872,7 +12331,7 @@ var AnimationGraph = class extends EventEmitter {
|
|
|
11872
12331
|
if (!this.currentState.emotionBlendEnabled) {
|
|
11873
12332
|
this.targetEmotionWeight = 0;
|
|
11874
12333
|
}
|
|
11875
|
-
|
|
12334
|
+
logger37.debug("state transition", {
|
|
11876
12335
|
from: fromState,
|
|
11877
12336
|
to: targetState.name,
|
|
11878
12337
|
trigger: event,
|
|
@@ -11909,7 +12368,7 @@ var AnimationGraph = class extends EventEmitter {
|
|
|
11909
12368
|
if (this.currentState.timeout <= 0) return;
|
|
11910
12369
|
const elapsed = now - this.stateEnterTime;
|
|
11911
12370
|
if (elapsed >= this.currentState.timeout) {
|
|
11912
|
-
|
|
12371
|
+
logger37.debug("timeout transition", {
|
|
11913
12372
|
state: this.currentState.name,
|
|
11914
12373
|
elapsed,
|
|
11915
12374
|
timeout: this.currentState.timeout
|
|
@@ -12023,7 +12482,7 @@ var AnimationGraph = class extends EventEmitter {
|
|
|
12023
12482
|
|
|
12024
12483
|
// src/animation/ProceduralLifeLayer.ts
|
|
12025
12484
|
var import_simplex_noise = require("simplex-noise");
|
|
12026
|
-
var
|
|
12485
|
+
var logger38 = createLogger("ProceduralLifeLayer");
|
|
12027
12486
|
var simplex2d = (0, import_simplex_noise.createNoise2D)();
|
|
12028
12487
|
var LIFE_BS_INDEX = /* @__PURE__ */ new Map();
|
|
12029
12488
|
for (let i = 0; i < LAM_BLENDSHAPES.length; i++) {
|
|
@@ -12129,7 +12588,7 @@ var ProceduralLifeLayer = class {
|
|
|
12129
12588
|
}
|
|
12130
12589
|
this.blinkInterval = this.nextBlinkInterval();
|
|
12131
12590
|
this.gazeBreakInterval = randomRange(...this.gazeBreakIntervalRange);
|
|
12132
|
-
|
|
12591
|
+
logger38.debug("constructor", {
|
|
12133
12592
|
blinkIntervalRange: this.blinkIntervalRange,
|
|
12134
12593
|
useLogNormalBlinks: this.useLogNormalBlinks,
|
|
12135
12594
|
gazeBreakIntervalRange: this.gazeBreakIntervalRange,
|
|
@@ -12233,7 +12692,7 @@ var ProceduralLifeLayer = class {
|
|
|
12233
12692
|
* Reset all internal state to initial values.
|
|
12234
12693
|
*/
|
|
12235
12694
|
reset() {
|
|
12236
|
-
|
|
12695
|
+
logger38.debug("reset");
|
|
12237
12696
|
this.blinkTimer = 0;
|
|
12238
12697
|
this.blinkInterval = this.nextBlinkInterval();
|
|
12239
12698
|
this.blinkPhase = PHASE_OPEN;
|
|
@@ -12285,7 +12744,7 @@ var ProceduralLifeLayer = class {
|
|
|
12285
12744
|
this.blinkTimer = 0;
|
|
12286
12745
|
this.blinkInterval = this.nextBlinkInterval();
|
|
12287
12746
|
this.asymmetryRight = 0.95 + Math.random() * 0.08;
|
|
12288
|
-
|
|
12747
|
+
logger38.trace("blink", { nextInterval: this.blinkInterval });
|
|
12289
12748
|
}
|
|
12290
12749
|
if (this.blinkPhase > PHASE_OPEN) {
|
|
12291
12750
|
this.blinkProgress += delta;
|
|
@@ -12366,7 +12825,7 @@ var ProceduralLifeLayer = class {
|
|
|
12366
12825
|
this.gazeBreakTargetX = (Math.random() - 0.5) * 2 * amp;
|
|
12367
12826
|
this.gazeBreakTargetY = (Math.random() - 0.5) * amp * 0.4;
|
|
12368
12827
|
this.gazeBreakInterval = randomRange(...params.interval);
|
|
12369
|
-
|
|
12828
|
+
logger38.trace("gaze break", {
|
|
12370
12829
|
targetX: this.gazeBreakTargetX.toFixed(3),
|
|
12371
12830
|
targetY: this.gazeBreakTargetY.toFixed(3),
|
|
12372
12831
|
nextInterval: this.gazeBreakInterval.toFixed(2),
|
|
@@ -12609,7 +13068,7 @@ var ALL_AUS = [...new Set(
|
|
|
12609
13068
|
)];
|
|
12610
13069
|
|
|
12611
13070
|
// src/face/EmotionResolver.ts
|
|
12612
|
-
var
|
|
13071
|
+
var logger39 = createLogger("EmotionResolver");
|
|
12613
13072
|
var BS_INDEX = /* @__PURE__ */ new Map();
|
|
12614
13073
|
for (let i = 0; i < LAM_BLENDSHAPES.length; i++) {
|
|
12615
13074
|
BS_INDEX.set(LAM_BLENDSHAPES[i], i);
|
|
@@ -12636,7 +13095,7 @@ var EmotionResolver = class {
|
|
|
12636
13095
|
if (!emotionWeight || emotionWeight < 0.01) continue;
|
|
12637
13096
|
const auActivations = EMOTION_TO_AU[emotionName];
|
|
12638
13097
|
if (!auActivations) {
|
|
12639
|
-
|
|
13098
|
+
logger39.warn(`Unknown emotion name with no AU mapping: "${emotionName}"`);
|
|
12640
13099
|
continue;
|
|
12641
13100
|
}
|
|
12642
13101
|
for (const activation of auActivations) {
|
|
@@ -12661,7 +13120,7 @@ var EmotionResolver = class {
|
|
|
12661
13120
|
};
|
|
12662
13121
|
|
|
12663
13122
|
// src/face/FaceCompositor.ts
|
|
12664
|
-
var
|
|
13123
|
+
var logger40 = createLogger("FaceCompositor");
|
|
12665
13124
|
function smoothstep(t) {
|
|
12666
13125
|
return t * t * (3 - 2 * t);
|
|
12667
13126
|
}
|
|
@@ -12692,7 +13151,7 @@ var FaceCompositor = class {
|
|
|
12692
13151
|
if (config?.profile) {
|
|
12693
13152
|
this.applyProfileArrays(config.profile);
|
|
12694
13153
|
}
|
|
12695
|
-
|
|
13154
|
+
logger40.debug("constructor", {
|
|
12696
13155
|
emotionSmoothing: this.emotionSmoothing,
|
|
12697
13156
|
hasProfile: !!config?.profile,
|
|
12698
13157
|
hasLifeLayer: !!config?.lifeLayer
|
|
@@ -12708,7 +13167,7 @@ var FaceCompositor = class {
|
|
|
12708
13167
|
* @returns Blendshapes (Float32Array[52] clamped [0,1]) and head rotation deltas
|
|
12709
13168
|
*/
|
|
12710
13169
|
compose(base, input, target) {
|
|
12711
|
-
const composeStart =
|
|
13170
|
+
const composeStart = getClock().now();
|
|
12712
13171
|
const out = target ?? this.outputBuffer;
|
|
12713
13172
|
out.set(base);
|
|
12714
13173
|
const emotion = input.emotion ?? this.stickyEmotion;
|
|
@@ -12755,7 +13214,7 @@ var FaceCompositor = class {
|
|
|
12755
13214
|
}
|
|
12756
13215
|
getTelemetry()?.recordHistogram(
|
|
12757
13216
|
MetricNames.COMPOSITOR_COMPOSE_LATENCY,
|
|
12758
|
-
(
|
|
13217
|
+
(getClock().now() - composeStart) * 1e3
|
|
12759
13218
|
// µs
|
|
12760
13219
|
);
|
|
12761
13220
|
return { blendshapes: out, headDelta: lifeResult.headDelta };
|
|
@@ -12765,7 +13224,7 @@ var FaceCompositor = class {
|
|
|
12765
13224
|
*/
|
|
12766
13225
|
setEmotion(weights) {
|
|
12767
13226
|
this.stickyEmotion = weights;
|
|
12768
|
-
|
|
13227
|
+
logger40.debug("setEmotion", { weights });
|
|
12769
13228
|
}
|
|
12770
13229
|
/**
|
|
12771
13230
|
* Update character profile at runtime.
|
|
@@ -12774,7 +13233,7 @@ var FaceCompositor = class {
|
|
|
12774
13233
|
this.multiplier.fill(1);
|
|
12775
13234
|
this.offset.fill(0);
|
|
12776
13235
|
this.applyProfileArrays(profile);
|
|
12777
|
-
|
|
13236
|
+
logger40.debug("setProfile", {
|
|
12778
13237
|
multiplierKeys: profile.multiplier ? Object.keys(profile.multiplier).length : 0,
|
|
12779
13238
|
offsetKeys: profile.offset ? Object.keys(profile.offset).length : 0
|
|
12780
13239
|
});
|
|
@@ -12788,7 +13247,7 @@ var FaceCompositor = class {
|
|
|
12788
13247
|
this.lifeBuffer.fill(0);
|
|
12789
13248
|
this.stickyEmotion = void 0;
|
|
12790
13249
|
this.lifeLayer.reset();
|
|
12791
|
-
|
|
13250
|
+
logger40.debug("reset");
|
|
12792
13251
|
}
|
|
12793
13252
|
/** Expand partial profile maps into dense Float32Arrays */
|
|
12794
13253
|
applyProfileArrays(profile) {
|
|
@@ -12873,7 +13332,7 @@ function parseEmotionTags(text) {
|
|
|
12873
13332
|
}
|
|
12874
13333
|
|
|
12875
13334
|
// src/character/CharacterController.ts
|
|
12876
|
-
var
|
|
13335
|
+
var logger41 = createLogger("CharacterController");
|
|
12877
13336
|
var FRAME_BUDGET_US = 33e3;
|
|
12878
13337
|
var EMOTION_MAP = {
|
|
12879
13338
|
// Synced with EmotionPresets (packages/core/src/emotion/Emotion.ts)
|
|
@@ -12943,7 +13402,7 @@ var CharacterController = class {
|
|
|
12943
13402
|
this.gazeYawInfluence = config?.gaze?.yawInfluence ?? 0.4;
|
|
12944
13403
|
this.gazePitchInfluence = config?.gaze?.pitchInfluence ?? 0.3;
|
|
12945
13404
|
this.gazeSmoothing = config?.gaze?.smoothing ?? 5;
|
|
12946
|
-
|
|
13405
|
+
logger41.debug("constructor", {
|
|
12947
13406
|
gazeEnabled: this.gazeEnabled,
|
|
12948
13407
|
gazeYawInfluence: this.gazeYawInfluence,
|
|
12949
13408
|
gazePitchInfluence: this.gazePitchInfluence,
|
|
@@ -12958,7 +13417,7 @@ var CharacterController = class {
|
|
|
12958
13417
|
* into a single output frame.
|
|
12959
13418
|
*/
|
|
12960
13419
|
update(input) {
|
|
12961
|
-
const frameStart =
|
|
13420
|
+
const frameStart = getClock().now();
|
|
12962
13421
|
const base = input.baseBlendshapes ?? this.zeroBase;
|
|
12963
13422
|
const eyeTargets = this.computeEyeTargets(
|
|
12964
13423
|
input.cameraWorldPos,
|
|
@@ -12985,7 +13444,7 @@ var CharacterController = class {
|
|
|
12985
13444
|
lifeHeadDelta,
|
|
12986
13445
|
input.avatarRotationY ?? 0
|
|
12987
13446
|
);
|
|
12988
|
-
const frameUs = (
|
|
13447
|
+
const frameUs = (getClock().now() - frameStart) * 1e3;
|
|
12989
13448
|
this.frameTimes[this.frameTimeIdx] = frameUs;
|
|
12990
13449
|
this.frameTimeIdx = (this.frameTimeIdx + 1) % this.frameTimes.length;
|
|
12991
13450
|
if (this.frameTimeFill < this.frameTimes.length) this.frameTimeFill++;
|
|
@@ -13007,13 +13466,13 @@ var CharacterController = class {
|
|
|
13007
13466
|
const resolved = resolveEmotion(emotion);
|
|
13008
13467
|
if (resolved) {
|
|
13009
13468
|
this._compositor.setEmotion(resolved);
|
|
13010
|
-
|
|
13469
|
+
logger41.debug("setEmotion", { emotion, resolved });
|
|
13011
13470
|
}
|
|
13012
13471
|
}
|
|
13013
13472
|
/** Update character profile at runtime. */
|
|
13014
13473
|
setProfile(profile) {
|
|
13015
13474
|
this._compositor.setProfile(profile);
|
|
13016
|
-
|
|
13475
|
+
logger41.debug("setProfile", {
|
|
13017
13476
|
multiplierKeys: profile.multiplier ? Object.keys(profile.multiplier).length : 0,
|
|
13018
13477
|
offsetKeys: profile.offset ? Object.keys(profile.offset).length : 0
|
|
13019
13478
|
});
|
|
@@ -13048,11 +13507,11 @@ var CharacterController = class {
|
|
|
13048
13507
|
this._compositor.reset();
|
|
13049
13508
|
this.gazeHeadYaw = 0;
|
|
13050
13509
|
this.gazeHeadPitch = -0.1;
|
|
13051
|
-
|
|
13510
|
+
logger41.debug("reset");
|
|
13052
13511
|
}
|
|
13053
13512
|
dispose() {
|
|
13054
13513
|
this.reset();
|
|
13055
|
-
|
|
13514
|
+
logger41.debug("dispose");
|
|
13056
13515
|
}
|
|
13057
13516
|
// ---------------------------------------------------------------------------
|
|
13058
13517
|
// Eye angle math (extracted from r3f useGazeTracking.computeEyeTargets)
|
|
@@ -13134,7 +13593,7 @@ var CharacterController = class {
|
|
|
13134
13593
|
};
|
|
13135
13594
|
|
|
13136
13595
|
// src/orchestration/MicLipSync.ts
|
|
13137
|
-
var
|
|
13596
|
+
var logger42 = createLogger("MicLipSync");
|
|
13138
13597
|
var MicLipSync = class extends EventEmitter {
|
|
13139
13598
|
constructor(config) {
|
|
13140
13599
|
super();
|
|
@@ -13153,7 +13612,7 @@ var MicLipSync = class extends EventEmitter {
|
|
|
13153
13612
|
this.vadChunkSize = 0;
|
|
13154
13613
|
this.vadBuffer = null;
|
|
13155
13614
|
this.vadBufferOffset = 0;
|
|
13156
|
-
|
|
13615
|
+
logger42.info("MicLipSync created", {
|
|
13157
13616
|
sampleRate: config.sampleRate ?? 16e3,
|
|
13158
13617
|
micChunkSize: config.micChunkSize ?? 512,
|
|
13159
13618
|
hasVAD: !!config.vad,
|
|
@@ -13175,12 +13634,12 @@ var MicLipSync = class extends EventEmitter {
|
|
|
13175
13634
|
this._currentFrame = scaled;
|
|
13176
13635
|
if (!this._firstFrameEmitted) {
|
|
13177
13636
|
this._firstFrameEmitted = true;
|
|
13178
|
-
|
|
13637
|
+
logger42.trace("First blendshape frame emitted");
|
|
13179
13638
|
}
|
|
13180
13639
|
this.emit("frame", { blendshapes: scaled, rawBlendshapes: raw });
|
|
13181
13640
|
},
|
|
13182
13641
|
onError: (error) => {
|
|
13183
|
-
|
|
13642
|
+
logger42.error("A2E inference error", { message: error.message });
|
|
13184
13643
|
this.emit("error", error);
|
|
13185
13644
|
}
|
|
13186
13645
|
});
|
|
@@ -13188,7 +13647,9 @@ var MicLipSync = class extends EventEmitter {
|
|
|
13188
13647
|
const float32 = int16ToFloat32(pcm);
|
|
13189
13648
|
this.processor.pushAudio(float32);
|
|
13190
13649
|
if (this.vad) {
|
|
13191
|
-
this.vadQueue = this.vadQueue.then(() => this.processVAD(float32)).catch(() => {
|
|
13650
|
+
this.vadQueue = this.vadQueue.then(() => this.processVAD(float32)).catch((err) => {
|
|
13651
|
+
logger42.warn("VAD processing error", { error: String(err), code: ErrorCodes.SPH_VAD_ERROR });
|
|
13652
|
+
this.emit("error", err instanceof Error ? err : new Error(String(err)));
|
|
13192
13653
|
});
|
|
13193
13654
|
}
|
|
13194
13655
|
});
|
|
@@ -13223,7 +13684,7 @@ var MicLipSync = class extends EventEmitter {
|
|
|
13223
13684
|
/** Start microphone capture and inference loop */
|
|
13224
13685
|
async start() {
|
|
13225
13686
|
if (this._state === "active") return;
|
|
13226
|
-
|
|
13687
|
+
logger42.info("Starting MicLipSync");
|
|
13227
13688
|
getTelemetry()?.incrementCounter(MetricNames.MIC_SESSIONS);
|
|
13228
13689
|
await this.mic.start();
|
|
13229
13690
|
this.processor.startDrip();
|
|
@@ -13233,7 +13694,7 @@ var MicLipSync = class extends EventEmitter {
|
|
|
13233
13694
|
/** Stop microphone and inference */
|
|
13234
13695
|
stop() {
|
|
13235
13696
|
if (this._state === "idle") return;
|
|
13236
|
-
|
|
13697
|
+
logger42.info("Stopping MicLipSync");
|
|
13237
13698
|
this.processor.stopDrip();
|
|
13238
13699
|
this.mic.stop();
|
|
13239
13700
|
this._isSpeaking = false;
|
|
@@ -13275,14 +13736,15 @@ var MicLipSync = class extends EventEmitter {
|
|
|
13275
13736
|
const wasSpeaking = this._isSpeaking;
|
|
13276
13737
|
this._isSpeaking = result.isSpeech;
|
|
13277
13738
|
if (!wasSpeaking && result.isSpeech) {
|
|
13278
|
-
this.speechStartTime =
|
|
13739
|
+
this.speechStartTime = getClock().now();
|
|
13279
13740
|
this.emit("speech:start");
|
|
13280
13741
|
} else if (wasSpeaking && !result.isSpeech) {
|
|
13281
|
-
const durationMs =
|
|
13742
|
+
const durationMs = getClock().now() - this.speechStartTime;
|
|
13282
13743
|
this.emit("speech:end", { durationMs });
|
|
13283
13744
|
}
|
|
13284
13745
|
} catch (err) {
|
|
13285
|
-
|
|
13746
|
+
logger42.warn("VAD process error", { error: String(err), code: ErrorCodes.SPH_VAD_ERROR });
|
|
13747
|
+
this.emit("error", err instanceof Error ? err : new Error(String(err)));
|
|
13286
13748
|
}
|
|
13287
13749
|
this.vadBufferOffset = 0;
|
|
13288
13750
|
}
|
|
@@ -13299,7 +13761,7 @@ var MicLipSync = class extends EventEmitter {
|
|
|
13299
13761
|
};
|
|
13300
13762
|
|
|
13301
13763
|
// src/orchestration/VoicePipeline.ts
|
|
13302
|
-
var
|
|
13764
|
+
var logger43 = createLogger("VoicePipeline");
|
|
13303
13765
|
var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
13304
13766
|
constructor(config) {
|
|
13305
13767
|
super();
|
|
@@ -13331,6 +13793,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13331
13793
|
this.lastProgressiveSamples = 0;
|
|
13332
13794
|
// ASR error recovery
|
|
13333
13795
|
this.asrErrorCount = 0;
|
|
13796
|
+
this.progressiveErrorCount = 0;
|
|
13334
13797
|
// Response abort
|
|
13335
13798
|
this.responseAbortController = null;
|
|
13336
13799
|
// Listener cleanup
|
|
@@ -13374,7 +13837,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13374
13837
|
if (typeof requestAnimationFrame !== "undefined") {
|
|
13375
13838
|
await new Promise((r) => requestAnimationFrame(() => r()));
|
|
13376
13839
|
}
|
|
13377
|
-
|
|
13840
|
+
logger43.debug("Creating PlaybackPipeline", {
|
|
13378
13841
|
neutralTransitionEnabled: this.config.neutralTransitionEnabled ?? true,
|
|
13379
13842
|
audioDelayMs: this.config.audioDelayMs,
|
|
13380
13843
|
chunkTargetMs: this.config.chunkTargetMs
|
|
@@ -13414,8 +13877,9 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13414
13877
|
this.setState("ready");
|
|
13415
13878
|
} catch (error) {
|
|
13416
13879
|
const err = error instanceof Error ? error : new Error(String(error));
|
|
13880
|
+
span?.setAttributes({ "error.type": ErrorTypes.MODEL });
|
|
13417
13881
|
span?.endWithError(err);
|
|
13418
|
-
|
|
13882
|
+
logger43.error("Model loading failed", { message: err.message });
|
|
13419
13883
|
this.emit("error", err);
|
|
13420
13884
|
this.setState("error");
|
|
13421
13885
|
throw err;
|
|
@@ -13429,7 +13893,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13429
13893
|
const { backends } = this.config;
|
|
13430
13894
|
if (!backends) throw new Error("No backends config");
|
|
13431
13895
|
this.emitProgress("Loading models", 0, 1, 0);
|
|
13432
|
-
|
|
13896
|
+
logger43.info("Loading from pre-built backends");
|
|
13433
13897
|
const toLoad = [];
|
|
13434
13898
|
if (!backends.asr.isLoaded) toLoad.push(backends.asr.load());
|
|
13435
13899
|
if (!backends.lam.isLoaded) toLoad.push(backends.lam.load());
|
|
@@ -13463,7 +13927,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13463
13927
|
} else if (UnifiedInferenceWorker.isSupported()) {
|
|
13464
13928
|
this.unifiedWorker = new UnifiedInferenceWorker();
|
|
13465
13929
|
await this.unifiedWorker.init();
|
|
13466
|
-
|
|
13930
|
+
logger43.info("Created internal unified worker", { backend: this.unifiedWorker.backend });
|
|
13467
13931
|
}
|
|
13468
13932
|
}
|
|
13469
13933
|
this.emitProgress("Loading models", 0, 3, 0);
|
|
@@ -13500,17 +13964,17 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13500
13964
|
throw asrResult.reason;
|
|
13501
13965
|
}
|
|
13502
13966
|
this.asr = asr;
|
|
13503
|
-
|
|
13967
|
+
logger43.info("SenseVoice loaded");
|
|
13504
13968
|
if (vadResult.status === "rejected") {
|
|
13505
13969
|
throw vadResult.reason;
|
|
13506
13970
|
}
|
|
13507
13971
|
this.vad = vad;
|
|
13508
|
-
|
|
13972
|
+
logger43.info("Silero VAD loaded");
|
|
13509
13973
|
if (lamResult.status === "rejected") {
|
|
13510
13974
|
throw lamResult.reason;
|
|
13511
13975
|
}
|
|
13512
13976
|
this.lam = lam;
|
|
13513
|
-
|
|
13977
|
+
logger43.info("LAM loaded");
|
|
13514
13978
|
} finally {
|
|
13515
13979
|
clearInterval(progressInterval);
|
|
13516
13980
|
}
|
|
@@ -13518,7 +13982,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13518
13982
|
if (this.isLocalMode) {
|
|
13519
13983
|
const localConfig = this.config;
|
|
13520
13984
|
if (localConfig.ttsConfig && !localConfig.tts) {
|
|
13521
|
-
|
|
13985
|
+
logger43.info("Creating Kokoro TTS from config", {
|
|
13522
13986
|
hasUnifiedWorker: !!this.unifiedWorker,
|
|
13523
13987
|
voice: localConfig.ttsConfig.defaultVoice
|
|
13524
13988
|
});
|
|
@@ -13528,7 +13992,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13528
13992
|
});
|
|
13529
13993
|
}
|
|
13530
13994
|
if (localConfig.tts && !localConfig.ttsConfig && isIOS()) {
|
|
13531
|
-
|
|
13995
|
+
logger43.warn(
|
|
13532
13996
|
"External TTS on iOS creates a separate ORT WASM instance, risking OOM. Prefer ttsConfig for automatic unified worker integration."
|
|
13533
13997
|
);
|
|
13534
13998
|
}
|
|
@@ -13536,9 +14000,9 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13536
14000
|
throw new Error("VoicePipeline local mode requires either tts or ttsConfig");
|
|
13537
14001
|
}
|
|
13538
14002
|
if (!localConfig.tts.isLoaded) {
|
|
13539
|
-
|
|
14003
|
+
logger43.info("Loading local TTS model...");
|
|
13540
14004
|
await localConfig.tts.load();
|
|
13541
|
-
|
|
14005
|
+
logger43.info("Local TTS model loaded");
|
|
13542
14006
|
}
|
|
13543
14007
|
}
|
|
13544
14008
|
this.emitProgress("Loading models", 100, 3, 3);
|
|
@@ -13554,8 +14018,8 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13554
14018
|
this.epoch++;
|
|
13555
14019
|
this._sessionId = crypto.randomUUID();
|
|
13556
14020
|
this.asrErrorCount = 0;
|
|
13557
|
-
|
|
13558
|
-
|
|
14021
|
+
logger43.info("Starting voice pipeline", { sessionId: this._sessionId });
|
|
14022
|
+
logger43.debug("Pipeline mode", { mode: this.isLocalMode ? "local" : "cloud" });
|
|
13559
14023
|
this.mic = new MicrophoneCapture(this.omoteEvents, {
|
|
13560
14024
|
sampleRate: 16e3,
|
|
13561
14025
|
chunkSize: 512
|
|
@@ -13568,11 +14032,11 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13568
14032
|
this.emit("audio:level", level);
|
|
13569
14033
|
});
|
|
13570
14034
|
await this.mic.start();
|
|
13571
|
-
|
|
14035
|
+
logger43.debug("Microphone started");
|
|
13572
14036
|
this.setState("listening");
|
|
13573
14037
|
}
|
|
13574
14038
|
stop() {
|
|
13575
|
-
|
|
14039
|
+
logger43.info("Stopping voice pipeline", { sessionId: this._sessionId });
|
|
13576
14040
|
this.stopped = true;
|
|
13577
14041
|
this.epoch++;
|
|
13578
14042
|
this.clearSilenceTimer();
|
|
@@ -13601,7 +14065,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13601
14065
|
this.playback?.setProfile(profile);
|
|
13602
14066
|
}
|
|
13603
14067
|
async dispose() {
|
|
13604
|
-
|
|
14068
|
+
logger43.debug("Disposing VoicePipeline");
|
|
13605
14069
|
this.stop();
|
|
13606
14070
|
this.epoch++;
|
|
13607
14071
|
await Promise.allSettled([
|
|
@@ -13635,19 +14099,20 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13635
14099
|
if (result.isSpeech) {
|
|
13636
14100
|
if (!wasSpeaking) {
|
|
13637
14101
|
this.isSpeaking = true;
|
|
13638
|
-
this.speechStartTime =
|
|
14102
|
+
this.speechStartTime = getClock().now();
|
|
13639
14103
|
this.audioBuffer = [];
|
|
13640
14104
|
this.audioBufferSamples = 0;
|
|
13641
14105
|
this.lastProgressiveResult = null;
|
|
13642
14106
|
this.lastProgressiveSamples = 0;
|
|
13643
|
-
|
|
14107
|
+
this.progressiveErrorCount = 0;
|
|
14108
|
+
logger43.debug("VAD speech start");
|
|
13644
14109
|
this.emit("speech:start");
|
|
13645
14110
|
this.startProgressiveTranscription();
|
|
13646
14111
|
}
|
|
13647
14112
|
this.audioBuffer.push(new Float32Array(samples));
|
|
13648
14113
|
this.audioBufferSamples += samples.length;
|
|
13649
14114
|
if (this.audioBufferSamples >= _VoicePipeline.MAX_AUDIO_BUFFER_SAMPLES) {
|
|
13650
|
-
|
|
14115
|
+
logger43.warn("Audio buffer exceeded max, forcing transcription flush");
|
|
13651
14116
|
this.onSilenceDetected();
|
|
13652
14117
|
return;
|
|
13653
14118
|
}
|
|
@@ -13663,7 +14128,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13663
14128
|
}
|
|
13664
14129
|
}
|
|
13665
14130
|
} catch (err) {
|
|
13666
|
-
|
|
14131
|
+
logger43.warn("VAD error", { error: String(err) });
|
|
13667
14132
|
}
|
|
13668
14133
|
}
|
|
13669
14134
|
// ---------------------------------------------------------------------------
|
|
@@ -13674,18 +14139,18 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13674
14139
|
const extended = this.config.silenceTimeoutExtendedMs ?? 700;
|
|
13675
14140
|
const adaptive = this.config.adaptiveTimeout ?? true;
|
|
13676
14141
|
if (!adaptive) return base;
|
|
13677
|
-
const speechDurationMs =
|
|
14142
|
+
const speechDurationMs = getClock().now() - this.speechStartTime;
|
|
13678
14143
|
return speechDurationMs > 3e3 ? extended : base;
|
|
13679
14144
|
}
|
|
13680
14145
|
onSilenceDetected() {
|
|
13681
14146
|
const capturedEpoch = this.epoch;
|
|
13682
14147
|
this.isSpeaking = false;
|
|
13683
|
-
const durationMs =
|
|
13684
|
-
|
|
14148
|
+
const durationMs = getClock().now() - this.speechStartTime;
|
|
14149
|
+
logger43.debug("VAD speech end", { durationMs: Math.round(durationMs) });
|
|
13685
14150
|
this.emit("speech:end", { durationMs });
|
|
13686
14151
|
this.clearSilenceTimer();
|
|
13687
14152
|
this.processEndOfSpeech(capturedEpoch).catch((err) => {
|
|
13688
|
-
|
|
14153
|
+
logger43.error("End of speech processing failed", { error: String(err) });
|
|
13689
14154
|
if (this.epoch === capturedEpoch && !this.stopped) {
|
|
13690
14155
|
this.emit("error", err instanceof Error ? err : new Error(String(err)));
|
|
13691
14156
|
this.setState("listening");
|
|
@@ -13699,7 +14164,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13699
14164
|
const turnSpan = getTelemetry()?.startSpan("VoicePipeline.turn", {
|
|
13700
14165
|
"session.id": this._sessionId ?? ""
|
|
13701
14166
|
});
|
|
13702
|
-
const turnStart =
|
|
14167
|
+
const turnStart = getClock().now();
|
|
13703
14168
|
if (this.progressivePromise) {
|
|
13704
14169
|
try {
|
|
13705
14170
|
await this.progressivePromise;
|
|
@@ -13724,7 +14189,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13724
14189
|
const minEnergy = this.config.minAudioEnergy ?? 0.02;
|
|
13725
14190
|
const durationSec = totalSamples / 16e3;
|
|
13726
14191
|
if (durationSec < minDuration) {
|
|
13727
|
-
|
|
14192
|
+
logger43.info("Audio too short, discarding", { durationSec });
|
|
13728
14193
|
turnSpan?.end();
|
|
13729
14194
|
this.setState("listening");
|
|
13730
14195
|
return;
|
|
@@ -13735,7 +14200,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13735
14200
|
}
|
|
13736
14201
|
rms = Math.sqrt(rms / fullAudio.length);
|
|
13737
14202
|
if (rms < minEnergy) {
|
|
13738
|
-
|
|
14203
|
+
logger43.info("Audio too quiet, discarding", { rms });
|
|
13739
14204
|
turnSpan?.end();
|
|
13740
14205
|
this.setState("listening");
|
|
13741
14206
|
return;
|
|
@@ -13746,12 +14211,12 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13746
14211
|
const coverageThreshold = this.config.progressiveCoverageThreshold ?? 0.8;
|
|
13747
14212
|
if (this.lastProgressiveResult && this.lastProgressiveResult.text.trim().length > 0 && this.lastProgressiveSamples >= totalSamples * coverageThreshold) {
|
|
13748
14213
|
transcript = { ...this.lastProgressiveResult, isFinal: true };
|
|
13749
|
-
|
|
14214
|
+
logger43.info("Using progressive result", {
|
|
13750
14215
|
coverage: (this.lastProgressiveSamples / totalSamples).toFixed(2),
|
|
13751
14216
|
text: transcript.text
|
|
13752
14217
|
});
|
|
13753
14218
|
} else {
|
|
13754
|
-
|
|
14219
|
+
logger43.debug("Progressive result insufficient, running final transcription", {
|
|
13755
14220
|
samples: totalSamples,
|
|
13756
14221
|
hadProgressive: !!this.lastProgressiveResult
|
|
13757
14222
|
});
|
|
@@ -13766,7 +14231,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13766
14231
|
return;
|
|
13767
14232
|
}
|
|
13768
14233
|
if (!transcript || !transcript.text.trim()) {
|
|
13769
|
-
|
|
14234
|
+
logger43.info("No transcript, resuming listening");
|
|
13770
14235
|
turnSpan?.end();
|
|
13771
14236
|
this.setState("listening");
|
|
13772
14237
|
return;
|
|
@@ -13774,7 +14239,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13774
14239
|
this.emit("transcript", transcript);
|
|
13775
14240
|
getTelemetry()?.recordHistogram(
|
|
13776
14241
|
MetricNames.VOICE_TURN_LATENCY,
|
|
13777
|
-
|
|
14242
|
+
getClock().now() - turnStart,
|
|
13778
14243
|
{ mode: this.isLocalMode ? "local" : "cloud" }
|
|
13779
14244
|
);
|
|
13780
14245
|
await this.callResponseHandler(transcript, capturedEpoch, turnSpan?.getContext());
|
|
@@ -13788,7 +14253,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13788
14253
|
const span = getTelemetry()?.startSpan("VoicePipeline.response", {
|
|
13789
14254
|
"text.length": transcript.text.length
|
|
13790
14255
|
}, parentContext);
|
|
13791
|
-
const responseStart =
|
|
14256
|
+
const responseStart = getClock().now();
|
|
13792
14257
|
this.setState("speaking");
|
|
13793
14258
|
this.interruption?.setAISpeaking(true);
|
|
13794
14259
|
if (transcript.emotion) {
|
|
@@ -13805,7 +14270,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13805
14270
|
}
|
|
13806
14271
|
getTelemetry()?.recordHistogram(
|
|
13807
14272
|
MetricNames.VOICE_RESPONSE_LATENCY,
|
|
13808
|
-
|
|
14273
|
+
getClock().now() - responseStart,
|
|
13809
14274
|
{ mode: this.isLocalMode ? "local" : "cloud" }
|
|
13810
14275
|
);
|
|
13811
14276
|
span?.end();
|
|
@@ -13815,8 +14280,9 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13815
14280
|
return;
|
|
13816
14281
|
}
|
|
13817
14282
|
const err = error instanceof Error ? error : new Error(String(error));
|
|
14283
|
+
span?.setAttributes({ "error.type": ErrorTypes.RUNTIME });
|
|
13818
14284
|
span?.endWithError(err);
|
|
13819
|
-
|
|
14285
|
+
logger43.error("Response handler error", { message: err.message });
|
|
13820
14286
|
this.emit("error", err);
|
|
13821
14287
|
if (this.epoch === capturedEpoch && !this.stopped) {
|
|
13822
14288
|
this.interruption?.setAISpeaking(false);
|
|
@@ -13887,11 +14353,11 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13887
14353
|
// ---------------------------------------------------------------------------
|
|
13888
14354
|
handleInterruption() {
|
|
13889
14355
|
if (this._state !== "speaking") return;
|
|
13890
|
-
|
|
14356
|
+
logger43.info("Interruption triggered");
|
|
13891
14357
|
getTelemetry()?.incrementCounter(MetricNames.VOICE_INTERRUPTIONS);
|
|
13892
14358
|
this.epoch++;
|
|
13893
14359
|
if (this.responseAbortController) {
|
|
13894
|
-
|
|
14360
|
+
logger43.debug("Aborting in-flight response");
|
|
13895
14361
|
}
|
|
13896
14362
|
this.responseAbortController?.abort();
|
|
13897
14363
|
this.playback?.stop();
|
|
@@ -13929,7 +14395,15 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13929
14395
|
this.lastProgressiveSamples = snapshotSamples;
|
|
13930
14396
|
this.emit("transcript", { ...result, isFinal: false });
|
|
13931
14397
|
}
|
|
13932
|
-
} catch {
|
|
14398
|
+
} catch (err) {
|
|
14399
|
+
this.progressiveErrorCount++;
|
|
14400
|
+
if (this.progressiveErrorCount % 10 === 1) {
|
|
14401
|
+
logger43.warn("Progressive transcription error", {
|
|
14402
|
+
code: ErrorCodes.SPH_ASR_ERROR,
|
|
14403
|
+
count: this.progressiveErrorCount,
|
|
14404
|
+
error: String(err)
|
|
14405
|
+
});
|
|
14406
|
+
}
|
|
13933
14407
|
}
|
|
13934
14408
|
})();
|
|
13935
14409
|
}, intervalMs);
|
|
@@ -13945,8 +14419,9 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13945
14419
|
// ---------------------------------------------------------------------------
|
|
13946
14420
|
async transcribeWithTimeout(audio) {
|
|
13947
14421
|
if (!this.asr) return null;
|
|
14422
|
+
const currentEpoch = this.epoch;
|
|
13948
14423
|
const timeoutMs = this.config.transcriptionTimeoutMs ?? 1e4;
|
|
13949
|
-
const startTime =
|
|
14424
|
+
const startTime = getClock().now();
|
|
13950
14425
|
const span = getTelemetry()?.startSpan("VoicePipeline.transcribe", {
|
|
13951
14426
|
"inference.input_samples": audio.length,
|
|
13952
14427
|
"inference.input_duration_ms": audio.length / 16e3 * 1e3
|
|
@@ -13960,7 +14435,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13960
14435
|
})
|
|
13961
14436
|
]);
|
|
13962
14437
|
clearTimeout(timeoutId);
|
|
13963
|
-
const latency =
|
|
14438
|
+
const latency = getClock().now() - startTime;
|
|
13964
14439
|
this.asrErrorCount = 0;
|
|
13965
14440
|
getTelemetry()?.recordHistogram(MetricNames.VOICE_TRANSCRIPTION_LATENCY, latency);
|
|
13966
14441
|
getTelemetry()?.incrementCounter(MetricNames.VOICE_TRANSCRIPTIONS);
|
|
@@ -13974,14 +14449,18 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13974
14449
|
inferenceTimeMs: latency
|
|
13975
14450
|
};
|
|
13976
14451
|
} catch (error) {
|
|
14452
|
+
span?.setAttributes({ "error.type": ErrorTypes.INFERENCE });
|
|
13977
14453
|
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
13978
14454
|
this.asrErrorCount++;
|
|
13979
|
-
|
|
14455
|
+
logger43.warn("Transcription failed", {
|
|
13980
14456
|
attempt: this.asrErrorCount,
|
|
13981
14457
|
error: String(error)
|
|
13982
14458
|
});
|
|
13983
14459
|
if (this.asrErrorCount >= 3 && this.config.models) {
|
|
13984
|
-
|
|
14460
|
+
if (this.epoch !== currentEpoch) return null;
|
|
14461
|
+
logger43.warn("3 consecutive ASR errors, recreating session", {
|
|
14462
|
+
code: ErrorCodes.SPH_ASR_ERROR
|
|
14463
|
+
});
|
|
13985
14464
|
try {
|
|
13986
14465
|
await this.asr.dispose();
|
|
13987
14466
|
this.asr = createSenseVoice({
|
|
@@ -13991,9 +14470,10 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13991
14470
|
unifiedWorker: this.unifiedWorker ?? void 0
|
|
13992
14471
|
});
|
|
13993
14472
|
await this.asr.load();
|
|
14473
|
+
if (this.epoch !== currentEpoch) return null;
|
|
13994
14474
|
this.asrErrorCount = 0;
|
|
13995
14475
|
} catch (recreateErr) {
|
|
13996
|
-
|
|
14476
|
+
logger43.error("ASR session recreation failed", { error: String(recreateErr) });
|
|
13997
14477
|
}
|
|
13998
14478
|
}
|
|
13999
14479
|
return null;
|
|
@@ -14022,7 +14502,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
14022
14502
|
// ---------------------------------------------------------------------------
|
|
14023
14503
|
setState(state) {
|
|
14024
14504
|
if (this._state === state) return;
|
|
14025
|
-
|
|
14505
|
+
logger43.info("State transition", { from: this._state, to: state });
|
|
14026
14506
|
this._state = state;
|
|
14027
14507
|
this.emit("state", state);
|
|
14028
14508
|
}
|
|
@@ -14041,7 +14521,7 @@ _VoicePipeline.MAX_AUDIO_BUFFER_SAMPLES = 16e3 * 30;
|
|
|
14041
14521
|
var VoicePipeline = _VoicePipeline;
|
|
14042
14522
|
|
|
14043
14523
|
// src/orchestration/VoiceOrchestrator.ts
|
|
14044
|
-
var
|
|
14524
|
+
var logger44 = createLogger("VoiceOrchestrator");
|
|
14045
14525
|
var VoiceOrchestrator = class extends EventEmitter {
|
|
14046
14526
|
constructor() {
|
|
14047
14527
|
super(...arguments);
|
|
@@ -14091,7 +14571,7 @@ var VoiceOrchestrator = class extends EventEmitter {
|
|
|
14091
14571
|
const epoch = ++this.connectEpoch;
|
|
14092
14572
|
this._mode = config.mode ?? "local";
|
|
14093
14573
|
this._sessionId = crypto.randomUUID();
|
|
14094
|
-
|
|
14574
|
+
logger44.info("Connecting voice orchestrator", { mode: this._mode });
|
|
14095
14575
|
if (this._mode === "local") {
|
|
14096
14576
|
const localCfg = config;
|
|
14097
14577
|
this.ttsSpeaker = new TTSSpeaker();
|
|
@@ -14144,7 +14624,7 @@ var VoiceOrchestrator = class extends EventEmitter {
|
|
|
14144
14624
|
} else {
|
|
14145
14625
|
this.wireCloudTranscript(config);
|
|
14146
14626
|
}
|
|
14147
|
-
|
|
14627
|
+
logger44.info("Voice orchestrator connected", { mode: this._mode });
|
|
14148
14628
|
}
|
|
14149
14629
|
async disconnect() {
|
|
14150
14630
|
this.connectEpoch++;
|
|
@@ -14258,7 +14738,7 @@ var VoiceOrchestrator = class extends EventEmitter {
|
|
|
14258
14738
|
await this.speak(text);
|
|
14259
14739
|
}
|
|
14260
14740
|
} catch (e) {
|
|
14261
|
-
|
|
14741
|
+
logger44.error("Voice transcript handler error", { error: String(e) });
|
|
14262
14742
|
} finally {
|
|
14263
14743
|
this.interruption?.setAISpeaking(false);
|
|
14264
14744
|
this.speechListener?.resume();
|
|
@@ -14299,7 +14779,7 @@ var VoiceOrchestrator = class extends EventEmitter {
|
|
|
14299
14779
|
});
|
|
14300
14780
|
} catch (e) {
|
|
14301
14781
|
if (!abortController.signal.aborted) {
|
|
14302
|
-
|
|
14782
|
+
logger44.error("Cloud response handler error", { error: String(e) });
|
|
14303
14783
|
}
|
|
14304
14784
|
} finally {
|
|
14305
14785
|
this.responseAbortController = null;
|
|
@@ -14313,7 +14793,7 @@ var VoiceOrchestrator = class extends EventEmitter {
|
|
|
14313
14793
|
// -------------------------------------------------------------------------
|
|
14314
14794
|
handleInterruption() {
|
|
14315
14795
|
if (this._state !== "speaking") return;
|
|
14316
|
-
|
|
14796
|
+
logger44.info("Interruption triggered");
|
|
14317
14797
|
this.stopSpeaking();
|
|
14318
14798
|
this.speechListener?.resume();
|
|
14319
14799
|
this.setState("listening");
|