@omote/core 0.9.1 → 0.9.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ErrorCodes-AX3ADZri.d.mts +266 -0
- package/dist/ErrorCodes-AX3ADZri.d.ts +266 -0
- package/dist/chunk-CYBTTLG7.mjs +927 -0
- package/dist/chunk-CYBTTLG7.mjs.map +1 -0
- package/dist/chunk-X5OTUOE6.mjs +927 -0
- package/dist/chunk-X5OTUOE6.mjs.map +1 -0
- package/dist/chunk-Y3DTP5P3.mjs +927 -0
- package/dist/chunk-Y3DTP5P3.mjs.map +1 -0
- package/dist/index.d.mts +214 -3
- package/dist/index.d.ts +214 -3
- package/dist/index.js +713 -233
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +638 -225
- package/dist/index.mjs.map +1 -1
- package/dist/logging/index.d.mts +2 -2
- package/dist/logging/index.d.ts +2 -2
- package/dist/logging/index.js +75 -1
- package/dist/logging/index.js.map +1 -1
- package/dist/logging/index.mjs +9 -1
- package/package.json +3 -1
package/dist/index.mjs
CHANGED
|
@@ -4,19 +4,22 @@ import {
|
|
|
4
4
|
import {
|
|
5
5
|
ConsoleExporter,
|
|
6
6
|
DEFAULT_LOGGING_CONFIG,
|
|
7
|
+
ErrorCodes,
|
|
7
8
|
LOG_LEVEL_PRIORITY,
|
|
8
9
|
OTLPExporter,
|
|
9
10
|
OmoteTelemetry,
|
|
11
|
+
configureClock,
|
|
10
12
|
configureLogging,
|
|
11
13
|
configureTelemetry,
|
|
12
14
|
createLogger,
|
|
15
|
+
getClock,
|
|
13
16
|
getLoggingConfig,
|
|
14
17
|
getTelemetry,
|
|
15
18
|
noopLogger,
|
|
16
19
|
resetLoggingConfig,
|
|
17
20
|
setLogLevel,
|
|
18
21
|
setLoggingEnabled
|
|
19
|
-
} from "./chunk-
|
|
22
|
+
} from "./chunk-X5OTUOE6.mjs";
|
|
20
23
|
|
|
21
24
|
// src/audio/audioConvert.ts
|
|
22
25
|
function float32ToPcm16(samples) {
|
|
@@ -168,7 +171,7 @@ var MicrophoneCapture = class {
|
|
|
168
171
|
const pcm = this.floatToPCM16(chunk);
|
|
169
172
|
this.events.emit("audio.chunk", {
|
|
170
173
|
pcm,
|
|
171
|
-
timestamp:
|
|
174
|
+
timestamp: getClock().now()
|
|
172
175
|
});
|
|
173
176
|
chunkCount++;
|
|
174
177
|
}
|
|
@@ -399,11 +402,23 @@ var AudioScheduler = class {
|
|
|
399
402
|
source.connect(gainNode);
|
|
400
403
|
const scheduleTime = this.nextPlayTime;
|
|
401
404
|
if (scheduleTime < ctx.currentTime) {
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
405
|
+
const gap = ctx.currentTime - scheduleTime;
|
|
406
|
+
const gapMs = gap * 1e3;
|
|
407
|
+
if (gap > 0.5) {
|
|
408
|
+
logger2.error("Critical audio scheduling gap", {
|
|
409
|
+
code: ErrorCodes.AUD_SCHEDULE_GAP,
|
|
410
|
+
scheduleTime,
|
|
411
|
+
currentTime: ctx.currentTime,
|
|
412
|
+
gapMs: Math.round(gapMs)
|
|
413
|
+
});
|
|
414
|
+
this.options.onError?.(new Error(`Audio scheduling gap: ${gap.toFixed(3)}s`));
|
|
415
|
+
} else {
|
|
416
|
+
logger2.warn("Audio gap detected", {
|
|
417
|
+
scheduleTime,
|
|
418
|
+
currentTime: ctx.currentTime,
|
|
419
|
+
gapMs: Math.round(gapMs)
|
|
420
|
+
});
|
|
421
|
+
}
|
|
407
422
|
}
|
|
408
423
|
source.start(scheduleTime);
|
|
409
424
|
const entry = { source, gainNode };
|
|
@@ -597,8 +612,8 @@ var AudioChunkCoalescer = class {
|
|
|
597
612
|
var logger4 = createLogger("A2EProcessor");
|
|
598
613
|
var FRAME_RATE = 30;
|
|
599
614
|
var DRIP_INTERVAL_MS = 33;
|
|
600
|
-
var HOLD_DURATION_MS =
|
|
601
|
-
var DECAY_DURATION_MS =
|
|
615
|
+
var HOLD_DURATION_MS = 400;
|
|
616
|
+
var DECAY_DURATION_MS = 300;
|
|
602
617
|
var _A2EProcessor = class _A2EProcessor {
|
|
603
618
|
constructor(config) {
|
|
604
619
|
this.writeOffset = 0;
|
|
@@ -762,7 +777,7 @@ var _A2EProcessor = class _A2EProcessor {
|
|
|
762
777
|
if (this.timestampedQueue.length > 0 && this.timestampedQueue[0].timestamp <= currentTime) {
|
|
763
778
|
const { frame } = this.timestampedQueue.shift();
|
|
764
779
|
this.lastPulledFrame = frame;
|
|
765
|
-
this.lastDequeuedTime =
|
|
780
|
+
this.lastDequeuedTime = getClock().now();
|
|
766
781
|
return frame;
|
|
767
782
|
}
|
|
768
783
|
if (this.timestampedQueue.length > 0 && this.getFrameCallCount % 60 === 0) {
|
|
@@ -774,7 +789,7 @@ var _A2EProcessor = class _A2EProcessor {
|
|
|
774
789
|
});
|
|
775
790
|
}
|
|
776
791
|
if (this.lastPulledFrame) {
|
|
777
|
-
const elapsed =
|
|
792
|
+
const elapsed = getClock().now() - this.lastDequeuedTime;
|
|
778
793
|
if (elapsed < HOLD_DURATION_MS) {
|
|
779
794
|
return this.lastPulledFrame;
|
|
780
795
|
}
|
|
@@ -859,9 +874,9 @@ var _A2EProcessor = class _A2EProcessor {
|
|
|
859
874
|
while (this.pendingChunks.length > 0 && !this.disposed) {
|
|
860
875
|
const { chunk, timestamp } = this.pendingChunks.shift();
|
|
861
876
|
try {
|
|
862
|
-
const t0 =
|
|
877
|
+
const t0 = getClock().now();
|
|
863
878
|
const result = await this.backend.infer(chunk, this.identityIndex);
|
|
864
|
-
const inferMs = Math.round(
|
|
879
|
+
const inferMs = Math.round(getClock().now() - t0);
|
|
865
880
|
const actualDuration = chunk.length / this.sampleRate;
|
|
866
881
|
const actualFrameCount = Math.ceil(actualDuration * FRAME_RATE);
|
|
867
882
|
const framesToQueue = Math.min(actualFrameCount, result.blendshapes.length);
|
|
@@ -900,7 +915,11 @@ var _A2EProcessor = class _A2EProcessor {
|
|
|
900
915
|
}
|
|
901
916
|
handleError(err) {
|
|
902
917
|
const error = err instanceof Error ? err : new Error(String(err));
|
|
903
|
-
|
|
918
|
+
const isOOM = typeof err === "number" || error.message && /out of memory|oom|alloc/i.test(error.message);
|
|
919
|
+
logger4.warn("A2EProcessor inference error", {
|
|
920
|
+
error: error.message,
|
|
921
|
+
code: isOOM ? ErrorCodes.INF_OOM : ErrorCodes.INF_SESSION_POISON
|
|
922
|
+
});
|
|
904
923
|
this.onError?.(error);
|
|
905
924
|
}
|
|
906
925
|
};
|
|
@@ -922,6 +941,12 @@ var MetricNames = {
|
|
|
922
941
|
CACHE_HITS: "omote.cache.hits",
|
|
923
942
|
/** Counter: Cache misses */
|
|
924
943
|
CACHE_MISSES: "omote.cache.misses",
|
|
944
|
+
/** Counter: Cache stale (version/etag mismatch) */
|
|
945
|
+
CACHE_STALE: "omote.cache.stale",
|
|
946
|
+
/** Counter: Cache quota warning (>90% used) */
|
|
947
|
+
CACHE_QUOTA_WARNING: "omote.cache.quota_warning",
|
|
948
|
+
/** Counter: Cache eviction (LRU) */
|
|
949
|
+
CACHE_EVICTION: "omote.cache.eviction",
|
|
925
950
|
// --- Pipeline ---
|
|
926
951
|
/** Histogram: VoicePipeline turn latency (speech end → transcript ready, excludes playback) */
|
|
927
952
|
VOICE_TURN_LATENCY: "omote.voice.turn.latency",
|
|
@@ -1221,14 +1246,14 @@ var PlaybackPipeline = class extends EventEmitter {
|
|
|
1221
1246
|
this._currentRawFrame = null;
|
|
1222
1247
|
this.cancelNeutralTransition();
|
|
1223
1248
|
this.scheduler.warmup();
|
|
1224
|
-
this.sessionStartTime =
|
|
1249
|
+
this.sessionStartTime = getClock().now();
|
|
1225
1250
|
this.startFrameLoop();
|
|
1226
1251
|
this.startMonitoring();
|
|
1227
1252
|
this.setState("playing");
|
|
1228
1253
|
}
|
|
1229
1254
|
/** Feed a streaming audio chunk (PCM16 Uint8Array) */
|
|
1230
1255
|
async onAudioChunk(chunk) {
|
|
1231
|
-
const chunkStart =
|
|
1256
|
+
const chunkStart = getClock().now();
|
|
1232
1257
|
const combined = this.coalescer.add(chunk);
|
|
1233
1258
|
if (!combined) return;
|
|
1234
1259
|
const float32 = pcm16ToFloat32(combined);
|
|
@@ -1238,7 +1263,7 @@ var PlaybackPipeline = class extends EventEmitter {
|
|
|
1238
1263
|
this.emit("playback:start", { time: scheduleTime });
|
|
1239
1264
|
}
|
|
1240
1265
|
this.processor.pushAudio(float32, scheduleTime);
|
|
1241
|
-
getTelemetry()?.recordHistogram(MetricNames.PLAYBACK_CHUNK_LATENCY,
|
|
1266
|
+
getTelemetry()?.recordHistogram(MetricNames.PLAYBACK_CHUNK_LATENCY, getClock().now() - chunkStart);
|
|
1242
1267
|
}
|
|
1243
1268
|
/** Signal end of audio stream (flushes remaining audio) */
|
|
1244
1269
|
async end() {
|
|
@@ -1341,15 +1366,15 @@ var PlaybackPipeline = class extends EventEmitter {
|
|
|
1341
1366
|
const currentTime = this.scheduler.getCurrentTime();
|
|
1342
1367
|
const lamFrame = this.processor.getFrameForTime(currentTime);
|
|
1343
1368
|
if (lamFrame && lamFrame !== this.lastKnownLamFrame) {
|
|
1344
|
-
this.lastNewFrameTime =
|
|
1369
|
+
this.lastNewFrameTime = getClock().now();
|
|
1345
1370
|
this.lastKnownLamFrame = lamFrame;
|
|
1346
1371
|
this.staleWarningEmitted = false;
|
|
1347
1372
|
}
|
|
1348
|
-
if (this.playbackStarted && this.lastNewFrameTime > 0 &&
|
|
1373
|
+
if (this.playbackStarted && this.lastNewFrameTime > 0 && getClock().now() - this.lastNewFrameTime > this.staleThresholdMs) {
|
|
1349
1374
|
if (!this.staleWarningEmitted) {
|
|
1350
1375
|
this.staleWarningEmitted = true;
|
|
1351
1376
|
logger5.warn("A2E stalled \u2014 no new inference frames", {
|
|
1352
|
-
staleDurationMs: Math.round(
|
|
1377
|
+
staleDurationMs: Math.round(getClock().now() - this.lastNewFrameTime),
|
|
1353
1378
|
queuedFrames: this.processor.queuedFrameCount
|
|
1354
1379
|
});
|
|
1355
1380
|
}
|
|
@@ -1389,7 +1414,7 @@ var PlaybackPipeline = class extends EventEmitter {
|
|
|
1389
1414
|
if (this.sessionStartTime > 0) {
|
|
1390
1415
|
getTelemetry()?.recordHistogram(
|
|
1391
1416
|
MetricNames.PLAYBACK_SESSION_DURATION,
|
|
1392
|
-
|
|
1417
|
+
getClock().now() - this.sessionStartTime
|
|
1393
1418
|
);
|
|
1394
1419
|
}
|
|
1395
1420
|
this.stopInternal();
|
|
@@ -1407,9 +1432,9 @@ var PlaybackPipeline = class extends EventEmitter {
|
|
|
1407
1432
|
// ---------------------------------------------------------------------------
|
|
1408
1433
|
startNeutralTransition(fromFrame) {
|
|
1409
1434
|
this.neutralTransitionFrame = new Float32Array(fromFrame);
|
|
1410
|
-
this.neutralTransitionStart =
|
|
1435
|
+
this.neutralTransitionStart = getClock().now();
|
|
1411
1436
|
const animate = () => {
|
|
1412
|
-
const elapsed =
|
|
1437
|
+
const elapsed = getClock().now() - this.neutralTransitionStart;
|
|
1413
1438
|
const t = Math.min(1, elapsed / this.neutralTransitionMs);
|
|
1414
1439
|
const eased = 1 - Math.pow(1 - t, 3);
|
|
1415
1440
|
logger5.trace("neutral transition", { t: Math.round(t * 1e3) / 1e3, eased: Math.round(eased * 1e3) / 1e3 });
|
|
@@ -1422,7 +1447,7 @@ var PlaybackPipeline = class extends EventEmitter {
|
|
|
1422
1447
|
blendshapes,
|
|
1423
1448
|
rawBlendshapes: blendshapes,
|
|
1424
1449
|
// raw = scaled during transition
|
|
1425
|
-
timestamp:
|
|
1450
|
+
timestamp: getClock().now() / 1e3,
|
|
1426
1451
|
emotion: this._emotion ?? void 0
|
|
1427
1452
|
};
|
|
1428
1453
|
this.emit("frame", frame);
|
|
@@ -1653,7 +1678,7 @@ var ModelCache = class {
|
|
|
1653
1678
|
logger7.warn("Failed to request persistent storage", { error: String(err) });
|
|
1654
1679
|
}
|
|
1655
1680
|
}
|
|
1656
|
-
const dbOpenStart =
|
|
1681
|
+
const dbOpenStart = getClock().now();
|
|
1657
1682
|
this.dbPromise = new Promise((resolve, reject) => {
|
|
1658
1683
|
const request = indexedDB.open(DB_NAME, DB_VERSION);
|
|
1659
1684
|
request.onerror = () => {
|
|
@@ -1662,7 +1687,7 @@ var ModelCache = class {
|
|
|
1662
1687
|
};
|
|
1663
1688
|
request.onsuccess = () => {
|
|
1664
1689
|
this.db = request.result;
|
|
1665
|
-
logger7.debug("IndexedDB opened", { durationMs: Math.round(
|
|
1690
|
+
logger7.debug("IndexedDB opened", { durationMs: Math.round(getClock().now() - dbOpenStart) });
|
|
1666
1691
|
resolve(this.db);
|
|
1667
1692
|
};
|
|
1668
1693
|
request.onupgradeneeded = (event) => {
|
|
@@ -1736,16 +1761,16 @@ var ModelCache = class {
|
|
|
1736
1761
|
}
|
|
1737
1762
|
span?.end();
|
|
1738
1763
|
if (hit) {
|
|
1739
|
-
telemetry?.incrementCounter(
|
|
1764
|
+
telemetry?.incrementCounter(MetricNames.CACHE_HITS, 1, {});
|
|
1740
1765
|
} else {
|
|
1741
|
-
telemetry?.incrementCounter(
|
|
1766
|
+
telemetry?.incrementCounter(MetricNames.CACHE_MISSES, 1, {});
|
|
1742
1767
|
}
|
|
1743
1768
|
resolve(cached?.data ?? null);
|
|
1744
1769
|
};
|
|
1745
1770
|
request.onerror = () => {
|
|
1746
1771
|
span?.setAttributes({ "cache.hit": false });
|
|
1747
1772
|
span?.end();
|
|
1748
|
-
telemetry?.incrementCounter(
|
|
1773
|
+
telemetry?.incrementCounter(MetricNames.CACHE_MISSES, 1, {});
|
|
1749
1774
|
resolve(null);
|
|
1750
1775
|
};
|
|
1751
1776
|
});
|
|
@@ -1789,14 +1814,14 @@ var ModelCache = class {
|
|
|
1789
1814
|
if (!cached?.data) {
|
|
1790
1815
|
span?.setAttributes({ "cache.hit": false });
|
|
1791
1816
|
span?.end();
|
|
1792
|
-
telemetry?.incrementCounter(
|
|
1817
|
+
telemetry?.incrementCounter(MetricNames.CACHE_MISSES, 1, {});
|
|
1793
1818
|
return { data: null, stale: false };
|
|
1794
1819
|
}
|
|
1795
1820
|
span?.setAttributes({ "cache.hit": true, "cache.size_bytes": cached.size });
|
|
1796
1821
|
if (!cached.etag) {
|
|
1797
1822
|
span?.setAttributes({ "cache.validated": false, "cache.stale": false });
|
|
1798
1823
|
span?.end();
|
|
1799
|
-
telemetry?.incrementCounter(
|
|
1824
|
+
telemetry?.incrementCounter(MetricNames.CACHE_HITS, 1, {});
|
|
1800
1825
|
return { data: cached.data, stale: false };
|
|
1801
1826
|
}
|
|
1802
1827
|
const fetchUrl = originalUrl || url;
|
|
@@ -1805,7 +1830,7 @@ var ModelCache = class {
|
|
|
1805
1830
|
if (!response.ok) {
|
|
1806
1831
|
span?.setAttributes({ "cache.validated": false, "cache.stale": false });
|
|
1807
1832
|
span?.end();
|
|
1808
|
-
telemetry?.incrementCounter(
|
|
1833
|
+
telemetry?.incrementCounter(MetricNames.CACHE_HITS, 1, {});
|
|
1809
1834
|
return { data: cached.data, stale: false };
|
|
1810
1835
|
}
|
|
1811
1836
|
const serverEtag = response.headers.get("etag");
|
|
@@ -1818,17 +1843,17 @@ var ModelCache = class {
|
|
|
1818
1843
|
});
|
|
1819
1844
|
span?.end();
|
|
1820
1845
|
if (isStale) {
|
|
1821
|
-
telemetry?.incrementCounter(
|
|
1846
|
+
telemetry?.incrementCounter(MetricNames.CACHE_STALE, 1, {});
|
|
1822
1847
|
logger7.debug("Stale cache detected", { url });
|
|
1823
1848
|
} else {
|
|
1824
|
-
telemetry?.incrementCounter(
|
|
1849
|
+
telemetry?.incrementCounter(MetricNames.CACHE_HITS, 1, {});
|
|
1825
1850
|
}
|
|
1826
1851
|
return { data: cached.data, stale: isStale };
|
|
1827
1852
|
} catch (fetchError) {
|
|
1828
1853
|
logger7.warn("HEAD validation failed, using cached data", { error: String(fetchError) });
|
|
1829
1854
|
span?.setAttributes({ "cache.validated": false, "cache.stale": false });
|
|
1830
1855
|
span?.end();
|
|
1831
|
-
telemetry?.incrementCounter(
|
|
1856
|
+
telemetry?.incrementCounter(MetricNames.CACHE_HITS, 1, {});
|
|
1832
1857
|
return { data: cached.data, stale: false };
|
|
1833
1858
|
}
|
|
1834
1859
|
} catch {
|
|
@@ -1909,7 +1934,7 @@ var ModelCache = class {
|
|
|
1909
1934
|
const telemetry = getTelemetry();
|
|
1910
1935
|
if (quota.percentUsed > 90) {
|
|
1911
1936
|
logger7.warn("Storage quota warning", { percentUsed: quota.percentUsed.toFixed(1), used: formatBytes(quota.usedBytes), quota: formatBytes(quota.quotaBytes) });
|
|
1912
|
-
telemetry?.incrementCounter(
|
|
1937
|
+
telemetry?.incrementCounter(MetricNames.CACHE_QUOTA_WARNING, 1, {
|
|
1913
1938
|
percent_used: String(Math.round(quota.percentUsed))
|
|
1914
1939
|
});
|
|
1915
1940
|
if (config.onQuotaWarning) {
|
|
@@ -2051,7 +2076,7 @@ var ModelCache = class {
|
|
|
2051
2076
|
});
|
|
2052
2077
|
span?.end();
|
|
2053
2078
|
if (freedBytes > 0) {
|
|
2054
|
-
telemetry?.incrementCounter(
|
|
2079
|
+
telemetry?.incrementCounter(MetricNames.CACHE_EVICTION, evictedUrls.length, {
|
|
2055
2080
|
bytes_freed: String(freedBytes)
|
|
2056
2081
|
});
|
|
2057
2082
|
}
|
|
@@ -2571,7 +2596,7 @@ var _A2EInference = class _A2EInference {
|
|
|
2571
2596
|
throw new Error("Model already loaded. Call dispose() first.");
|
|
2572
2597
|
}
|
|
2573
2598
|
this.isLoading = true;
|
|
2574
|
-
const startTime =
|
|
2599
|
+
const startTime = getClock().now();
|
|
2575
2600
|
const telemetry = getTelemetry();
|
|
2576
2601
|
const span = telemetry?.startSpan("A2EInference.load", {
|
|
2577
2602
|
"model.url": this.config.modelUrl,
|
|
@@ -2667,7 +2692,7 @@ var _A2EInference = class _A2EInference {
|
|
|
2667
2692
|
executionProvider: this._backend,
|
|
2668
2693
|
backend: this._backend
|
|
2669
2694
|
});
|
|
2670
|
-
const loadTimeMs =
|
|
2695
|
+
const loadTimeMs = getClock().now() - startTime;
|
|
2671
2696
|
logger10.info("Model loaded successfully", {
|
|
2672
2697
|
backend: this._backend,
|
|
2673
2698
|
loadTimeMs: Math.round(loadTimeMs),
|
|
@@ -2686,7 +2711,7 @@ var _A2EInference = class _A2EInference {
|
|
|
2686
2711
|
});
|
|
2687
2712
|
await new Promise((r) => setTimeout(r, 0));
|
|
2688
2713
|
logger10.debug("Running warmup inference to initialize GPU context");
|
|
2689
|
-
const warmupStart =
|
|
2714
|
+
const warmupStart = getClock().now();
|
|
2690
2715
|
const warmupAudio = new Float32Array(this.chunkSize);
|
|
2691
2716
|
const warmupIdentity = new Float32Array(this.numIdentityClasses);
|
|
2692
2717
|
warmupIdentity[0] = 1;
|
|
@@ -2699,7 +2724,7 @@ var _A2EInference = class _A2EInference {
|
|
|
2699
2724
|
this.session.run(warmupFeeds).then(() => "ok"),
|
|
2700
2725
|
new Promise((r) => setTimeout(() => r("timeout"), WARMUP_TIMEOUT_MS))
|
|
2701
2726
|
]);
|
|
2702
|
-
const warmupTimeMs =
|
|
2727
|
+
const warmupTimeMs = getClock().now() - warmupStart;
|
|
2703
2728
|
if (warmupResult === "timeout") {
|
|
2704
2729
|
logger10.warn("Warmup inference timed out \u2014 GPU may be unresponsive. Continuing without warmup.", {
|
|
2705
2730
|
timeoutMs: WARMUP_TIMEOUT_MS,
|
|
@@ -2779,7 +2804,7 @@ var _A2EInference = class _A2EInference {
|
|
|
2779
2804
|
"inference.input_samples": this.chunkSize
|
|
2780
2805
|
});
|
|
2781
2806
|
try {
|
|
2782
|
-
const startTime =
|
|
2807
|
+
const startTime = getClock().now();
|
|
2783
2808
|
let timeoutId;
|
|
2784
2809
|
const results = await Promise.race([
|
|
2785
2810
|
this.session.run(feeds).then((r) => {
|
|
@@ -2793,7 +2818,7 @@ var _A2EInference = class _A2EInference {
|
|
|
2793
2818
|
);
|
|
2794
2819
|
})
|
|
2795
2820
|
]);
|
|
2796
|
-
const inferenceTimeMs =
|
|
2821
|
+
const inferenceTimeMs = getClock().now() - startTime;
|
|
2797
2822
|
const blendshapeOutput = results["blendshapes"];
|
|
2798
2823
|
if (!blendshapeOutput) {
|
|
2799
2824
|
throw new Error("Missing blendshapes output from model");
|
|
@@ -3200,9 +3225,9 @@ var A2EUnifiedAdapter = class {
|
|
|
3200
3225
|
"inference.input_samples": audio.length
|
|
3201
3226
|
});
|
|
3202
3227
|
try {
|
|
3203
|
-
const startTime =
|
|
3228
|
+
const startTime = getClock().now();
|
|
3204
3229
|
const result = await this.worker.inferLAM(audio, identityIndex);
|
|
3205
|
-
const inferenceTimeMs =
|
|
3230
|
+
const inferenceTimeMs = getClock().now() - startTime;
|
|
3206
3231
|
const flatBuffer = result.blendshapes;
|
|
3207
3232
|
const { numFrames, numBlendshapes } = result;
|
|
3208
3233
|
const blendshapes = [];
|
|
@@ -3853,7 +3878,7 @@ var KokoroTTSInference = class {
|
|
|
3853
3878
|
throw new Error("KokoroTTS is already loading");
|
|
3854
3879
|
}
|
|
3855
3880
|
this.isLoading = true;
|
|
3856
|
-
const startTime =
|
|
3881
|
+
const startTime = getClock().now();
|
|
3857
3882
|
try {
|
|
3858
3883
|
const backendPref = this.config.backend ?? "wasm";
|
|
3859
3884
|
const ortResult = await getOnnxRuntimeForPreference(backendPref);
|
|
@@ -3877,7 +3902,7 @@ var KokoroTTSInference = class {
|
|
|
3877
3902
|
"KokoroTTS InferenceSession.create"
|
|
3878
3903
|
);
|
|
3879
3904
|
}
|
|
3880
|
-
const loadTimeMs =
|
|
3905
|
+
const loadTimeMs = getClock().now() - startTime;
|
|
3881
3906
|
logger17.info("Kokoro TTS loaded", {
|
|
3882
3907
|
backend: this._backend,
|
|
3883
3908
|
loadTimeMs: Math.round(loadTimeMs),
|
|
@@ -3964,7 +3989,18 @@ var KokoroTTSInference = class {
|
|
|
3964
3989
|
logger17.debug("stream aborted");
|
|
3965
3990
|
return;
|
|
3966
3991
|
}
|
|
3967
|
-
|
|
3992
|
+
let phonemes;
|
|
3993
|
+
try {
|
|
3994
|
+
phonemes = await phonemize(sentence, language);
|
|
3995
|
+
} catch (phonErr) {
|
|
3996
|
+
logger17.error("Phonemizer failed (possible OOM)", {
|
|
3997
|
+
code: ErrorCodes.TTS_PHONEMIZER_OOM,
|
|
3998
|
+
error: String(phonErr),
|
|
3999
|
+
textLength: sentence.length
|
|
4000
|
+
});
|
|
4001
|
+
yield { audio: new Float32Array(0), text: sentence, phonemes: "", duration: 0 };
|
|
4002
|
+
continue;
|
|
4003
|
+
}
|
|
3968
4004
|
const tokens = tokenize(phonemes);
|
|
3969
4005
|
const voiceData = await this.ensureVoice(voiceName);
|
|
3970
4006
|
const style = getStyleForTokenCount(voiceData, tokens.length);
|
|
@@ -4024,16 +4060,27 @@ var KokoroTTSInference = class {
|
|
|
4024
4060
|
"tts.speed": speed
|
|
4025
4061
|
});
|
|
4026
4062
|
try {
|
|
4027
|
-
const startTime =
|
|
4063
|
+
const startTime = getClock().now();
|
|
4028
4064
|
const language = getVoiceLanguage(voiceName);
|
|
4029
|
-
|
|
4065
|
+
let phonemes;
|
|
4066
|
+
try {
|
|
4067
|
+
phonemes = await phonemize(text, language);
|
|
4068
|
+
} catch (phonErr) {
|
|
4069
|
+
logger17.error("Phonemizer failed (possible OOM)", {
|
|
4070
|
+
code: ErrorCodes.TTS_PHONEMIZER_OOM,
|
|
4071
|
+
error: String(phonErr),
|
|
4072
|
+
textLength: text.length
|
|
4073
|
+
});
|
|
4074
|
+
resolve({ audio: new Float32Array(0), duration: 0, inferenceTimeMs: 0 });
|
|
4075
|
+
return;
|
|
4076
|
+
}
|
|
4030
4077
|
logger17.trace("Phonemized", { text: text.substring(0, 50), phonemes: phonemes.substring(0, 50) });
|
|
4031
4078
|
const tokens = tokenize(phonemes);
|
|
4032
4079
|
logger17.trace("Tokenized", { numTokens: tokens.length });
|
|
4033
4080
|
const voiceData = await this.ensureVoice(voiceName);
|
|
4034
4081
|
const style = getStyleForTokenCount(voiceData, tokens.length);
|
|
4035
4082
|
const audio = await this.runInference(tokens, style, speed);
|
|
4036
|
-
const inferenceTimeMs =
|
|
4083
|
+
const inferenceTimeMs = getClock().now() - startTime;
|
|
4037
4084
|
const duration = audio.length / SAMPLE_RATE;
|
|
4038
4085
|
logger17.trace("Synthesis complete", {
|
|
4039
4086
|
duration: `${duration.toFixed(2)}s`,
|
|
@@ -4152,11 +4199,11 @@ var KokoroTTSUnifiedAdapter = class {
|
|
|
4152
4199
|
"model.url": this.modelUrl
|
|
4153
4200
|
});
|
|
4154
4201
|
try {
|
|
4155
|
-
const startTime =
|
|
4202
|
+
const startTime = getClock().now();
|
|
4156
4203
|
await this.worker.loadKokoro({ modelUrl: this.modelUrl });
|
|
4157
4204
|
this._isLoaded = true;
|
|
4158
4205
|
this.loadedGeneration = this.worker.workerGeneration;
|
|
4159
|
-
const loadTimeMs =
|
|
4206
|
+
const loadTimeMs = getClock().now() - startTime;
|
|
4160
4207
|
logger18.info("Kokoro TTS loaded via unified worker", {
|
|
4161
4208
|
backend: "wasm",
|
|
4162
4209
|
loadTimeMs: Math.round(loadTimeMs),
|
|
@@ -4231,11 +4278,11 @@ var KokoroTTSUnifiedAdapter = class {
|
|
|
4231
4278
|
runWorkerInference(tokens, style, speed) {
|
|
4232
4279
|
return new Promise((resolve, reject) => {
|
|
4233
4280
|
this.inferenceQueue = this.inferenceQueue.then(async () => {
|
|
4234
|
-
const startTime =
|
|
4281
|
+
const startTime = getClock().now();
|
|
4235
4282
|
const telemetry = getTelemetry();
|
|
4236
4283
|
try {
|
|
4237
4284
|
const result = await this.worker.inferKokoro(tokens, style, speed);
|
|
4238
|
-
const latencyMs =
|
|
4285
|
+
const latencyMs = getClock().now() - startTime;
|
|
4239
4286
|
telemetry?.recordHistogram("omote.inference.latency", latencyMs, {
|
|
4240
4287
|
model: "kokoro-tts-unified",
|
|
4241
4288
|
backend: "wasm"
|
|
@@ -4350,11 +4397,11 @@ var SileroVADUnifiedAdapter = class {
|
|
|
4350
4397
|
return new Promise((resolve, reject) => {
|
|
4351
4398
|
this.inferenceQueue = this.inferenceQueue.then(async () => {
|
|
4352
4399
|
try {
|
|
4353
|
-
const startTime =
|
|
4400
|
+
const startTime = getClock().now();
|
|
4354
4401
|
const result = await this.worker.processVAD(audioChunkCopy, this.state, this.context);
|
|
4355
4402
|
this.state = result.state;
|
|
4356
4403
|
this.context = audioChunkCopy.slice(-this.contextSize);
|
|
4357
|
-
const inferenceTimeMs =
|
|
4404
|
+
const inferenceTimeMs = getClock().now() - startTime;
|
|
4358
4405
|
const isSpeech = result.probability > this.config.threshold;
|
|
4359
4406
|
let preSpeechChunks;
|
|
4360
4407
|
if (isSpeech && !this.wasSpeaking) {
|
|
@@ -4412,17 +4459,20 @@ var SileroVADUnifiedAdapter = class {
|
|
|
4412
4459
|
var logger20 = createLogger("createA2E");
|
|
4413
4460
|
function createA2E(config = {}) {
|
|
4414
4461
|
const modelUrl = config.modelUrl ?? DEFAULT_MODEL_URLS.lam;
|
|
4462
|
+
const platformInfo = {
|
|
4463
|
+
modelUrl,
|
|
4464
|
+
isIOS: isIOS(),
|
|
4465
|
+
webgpu: typeof navigator !== "undefined" && "gpu" in navigator
|
|
4466
|
+
};
|
|
4415
4467
|
if (config.unifiedWorker) {
|
|
4416
|
-
logger20.info("Creating A2EUnifiedAdapter (via unified worker)",
|
|
4417
|
-
modelUrl
|
|
4418
|
-
});
|
|
4468
|
+
logger20.info("Creating A2EUnifiedAdapter (via unified worker)", platformInfo);
|
|
4419
4469
|
return new A2EUnifiedAdapter(config.unifiedWorker, {
|
|
4420
4470
|
modelUrl,
|
|
4421
4471
|
externalDataUrl: config.externalDataUrl,
|
|
4422
4472
|
numIdentityClasses: config.numIdentityClasses
|
|
4423
4473
|
});
|
|
4424
4474
|
}
|
|
4425
|
-
logger20.info("Creating A2EInference",
|
|
4475
|
+
logger20.info("Creating A2EInference", platformInfo);
|
|
4426
4476
|
return new A2EInference({
|
|
4427
4477
|
modelUrl,
|
|
4428
4478
|
externalDataUrl: config.externalDataUrl,
|
|
@@ -4798,16 +4848,28 @@ async function loadOrt(wasmPaths, isIOSDevice) {
|
|
|
4798
4848
|
// ort.webgpu.min.js crashes WebKit's JIT compiler.
|
|
4799
4849
|
var isSafariWorker = typeof navigator !== 'undefined' && /safari/i.test(navigator.userAgent) && !/chrome|crios|fxios|chromium|edg/i.test(navigator.userAgent);
|
|
4800
4850
|
var hasWebGPU = false;
|
|
4801
|
-
|
|
4851
|
+
var webgpuReason = '';
|
|
4852
|
+
if (isIOSDevice) {
|
|
4853
|
+
webgpuReason = 'iOS device';
|
|
4854
|
+
} else if (isSafariWorker) {
|
|
4855
|
+
webgpuReason = 'Safari (JSEP/ASYNCIFY crash)';
|
|
4856
|
+
} else if (typeof navigator === 'undefined' || !navigator.gpu) {
|
|
4857
|
+
webgpuReason = 'navigator.gpu unavailable';
|
|
4858
|
+
} else {
|
|
4802
4859
|
try {
|
|
4803
4860
|
var adapter = await navigator.gpu.requestAdapter();
|
|
4804
4861
|
if (adapter) {
|
|
4805
4862
|
hasWebGPU = true;
|
|
4863
|
+
} else {
|
|
4864
|
+
webgpuReason = 'requestAdapter returned null';
|
|
4806
4865
|
}
|
|
4807
4866
|
} catch (e) {
|
|
4808
|
-
|
|
4867
|
+
webgpuReason = 'requestAdapter failed: ' + String(e);
|
|
4809
4868
|
}
|
|
4810
4869
|
}
|
|
4870
|
+
if (!hasWebGPU && webgpuReason) {
|
|
4871
|
+
console.warn('[UnifiedWorker] WebGPU unavailable: ' + webgpuReason + ', falling back to WASM');
|
|
4872
|
+
}
|
|
4811
4873
|
|
|
4812
4874
|
var ortUrl;
|
|
4813
4875
|
if (hasWebGPU) {
|
|
@@ -5292,7 +5354,12 @@ var UnifiedInferenceWorker = class {
|
|
|
5292
5354
|
span?.setAttributes({ "worker.init_time_ms": loadTimeMs, "worker.backend": this._workerBackend });
|
|
5293
5355
|
span?.end();
|
|
5294
5356
|
} catch (error) {
|
|
5295
|
-
|
|
5357
|
+
const err = error instanceof Error ? error : new Error(String(error));
|
|
5358
|
+
const isTimeout = err.message.includes("timed out");
|
|
5359
|
+
if (isTimeout) {
|
|
5360
|
+
logger21.error("Worker init timed out", { code: "OMOTE_INF_003", timeoutMs: INIT_TIMEOUT_MS });
|
|
5361
|
+
}
|
|
5362
|
+
span?.endWithError(err);
|
|
5296
5363
|
this.cleanup();
|
|
5297
5364
|
throw error;
|
|
5298
5365
|
}
|
|
@@ -5676,7 +5743,7 @@ var TTSSpeaker = class {
|
|
|
5676
5743
|
async connect(tts, config) {
|
|
5677
5744
|
logger22.info("Connecting TTS...");
|
|
5678
5745
|
const span = getTelemetry()?.startSpan("TTSSpeaker.connect");
|
|
5679
|
-
const connectStart =
|
|
5746
|
+
const connectStart = getClock().now();
|
|
5680
5747
|
this.tts = tts;
|
|
5681
5748
|
if (!tts.isLoaded) {
|
|
5682
5749
|
await tts.load();
|
|
@@ -5685,7 +5752,7 @@ var TTSSpeaker = class {
|
|
|
5685
5752
|
if (!hasLam) {
|
|
5686
5753
|
this._audioOnly = true;
|
|
5687
5754
|
this.scheduler = new AudioScheduler({ sampleRate: tts.sampleRate });
|
|
5688
|
-
getTelemetry()?.recordHistogram(MetricNames.TTS_CONNECT_LATENCY,
|
|
5755
|
+
getTelemetry()?.recordHistogram(MetricNames.TTS_CONNECT_LATENCY, getClock().now() - connectStart);
|
|
5689
5756
|
span?.end();
|
|
5690
5757
|
logger22.info("TTS connected (audio-only mode)");
|
|
5691
5758
|
return;
|
|
@@ -5719,7 +5786,7 @@ var TTSSpeaker = class {
|
|
|
5719
5786
|
neutralTransitionMs: config?.neutralTransitionMs
|
|
5720
5787
|
});
|
|
5721
5788
|
await this.ttsPlayback.initialize();
|
|
5722
|
-
getTelemetry()?.recordHistogram(MetricNames.TTS_CONNECT_LATENCY,
|
|
5789
|
+
getTelemetry()?.recordHistogram(MetricNames.TTS_CONNECT_LATENCY, getClock().now() - connectStart);
|
|
5723
5790
|
span?.end();
|
|
5724
5791
|
logger22.info("TTS connected (lip sync mode)");
|
|
5725
5792
|
}
|
|
@@ -5754,7 +5821,7 @@ var TTSSpeaker = class {
|
|
|
5754
5821
|
const span = getTelemetry()?.startSpan("TTSSpeaker.speak", {
|
|
5755
5822
|
"text.length": text.length
|
|
5756
5823
|
});
|
|
5757
|
-
const speakStart =
|
|
5824
|
+
const speakStart = getClock().now();
|
|
5758
5825
|
try {
|
|
5759
5826
|
if (this._audioOnly) {
|
|
5760
5827
|
await this.speakAudioOnly(text, abort, options?.voice);
|
|
@@ -5764,7 +5831,7 @@ var TTSSpeaker = class {
|
|
|
5764
5831
|
voice: options?.voice
|
|
5765
5832
|
});
|
|
5766
5833
|
}
|
|
5767
|
-
getTelemetry()?.recordHistogram(MetricNames.TTS_SPEAK_LATENCY,
|
|
5834
|
+
getTelemetry()?.recordHistogram(MetricNames.TTS_SPEAK_LATENCY, getClock().now() - speakStart);
|
|
5768
5835
|
span?.end();
|
|
5769
5836
|
} catch (err) {
|
|
5770
5837
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
@@ -5894,42 +5961,42 @@ var TTSSpeaker = class {
|
|
|
5894
5961
|
end: async () => {
|
|
5895
5962
|
if (ended) return;
|
|
5896
5963
|
ended = true;
|
|
5897
|
-
|
|
5898
|
-
|
|
5899
|
-
if (this.currentAbort === abort) this.currentAbort = null;
|
|
5900
|
-
return;
|
|
5901
|
-
}
|
|
5902
|
-
if (buffer.trim()) {
|
|
5903
|
-
enqueueSentence(buffer.trim());
|
|
5904
|
-
buffer = "";
|
|
5905
|
-
}
|
|
5906
|
-
await processChain;
|
|
5907
|
-
if (abort.signal.aborted) {
|
|
5908
|
-
this._isSpeaking = false;
|
|
5909
|
-
if (this.currentAbort === abort) this.currentAbort = null;
|
|
5910
|
-
return;
|
|
5911
|
-
}
|
|
5912
|
-
await pipeline.end();
|
|
5913
|
-
await new Promise((resolve) => {
|
|
5914
|
-
let resolved = false;
|
|
5915
|
-
const done = () => {
|
|
5916
|
-
if (resolved) return;
|
|
5917
|
-
resolved = true;
|
|
5918
|
-
unsubC();
|
|
5919
|
-
unsubS();
|
|
5920
|
-
abort.signal.removeEventListener("abort", done);
|
|
5921
|
-
resolve();
|
|
5922
|
-
};
|
|
5964
|
+
const unsubs = [];
|
|
5965
|
+
try {
|
|
5923
5966
|
if (abort.signal.aborted) {
|
|
5924
|
-
resolve();
|
|
5925
5967
|
return;
|
|
5926
5968
|
}
|
|
5927
|
-
|
|
5928
|
-
|
|
5929
|
-
|
|
5930
|
-
|
|
5931
|
-
|
|
5932
|
-
|
|
5969
|
+
if (buffer.trim()) {
|
|
5970
|
+
enqueueSentence(buffer.trim());
|
|
5971
|
+
buffer = "";
|
|
5972
|
+
}
|
|
5973
|
+
await processChain;
|
|
5974
|
+
if (abort.signal.aborted) {
|
|
5975
|
+
return;
|
|
5976
|
+
}
|
|
5977
|
+
await pipeline.end();
|
|
5978
|
+
await new Promise((resolve) => {
|
|
5979
|
+
let resolved = false;
|
|
5980
|
+
const done = () => {
|
|
5981
|
+
if (resolved) return;
|
|
5982
|
+
resolved = true;
|
|
5983
|
+
resolve();
|
|
5984
|
+
};
|
|
5985
|
+
if (abort.signal.aborted) {
|
|
5986
|
+
resolve();
|
|
5987
|
+
return;
|
|
5988
|
+
}
|
|
5989
|
+
unsubs.push(pipeline.on("playback:complete", done));
|
|
5990
|
+
unsubs.push(pipeline.on("playback:stop", done));
|
|
5991
|
+
const onAbort = () => done();
|
|
5992
|
+
abort.signal.addEventListener("abort", onAbort);
|
|
5993
|
+
unsubs.push(() => abort.signal.removeEventListener("abort", onAbort));
|
|
5994
|
+
});
|
|
5995
|
+
} finally {
|
|
5996
|
+
unsubs.forEach((fn) => fn());
|
|
5997
|
+
this._isSpeaking = false;
|
|
5998
|
+
if (this.currentAbort === abort) this.currentAbort = null;
|
|
5999
|
+
}
|
|
5933
6000
|
}
|
|
5934
6001
|
};
|
|
5935
6002
|
}
|
|
@@ -6620,14 +6687,14 @@ function createKokoroTTS(config = {}) {
|
|
|
6620
6687
|
logger24.info("iOS + unified worker: creating KokoroTTSUnifiedAdapter (off-main-thread ONNX)");
|
|
6621
6688
|
return new KokoroTTSUnifiedAdapter(config.unifiedWorker, config);
|
|
6622
6689
|
}
|
|
6623
|
-
logger24.info("iOS
|
|
6690
|
+
logger24.info("iOS: creating KokoroTTSInference (main thread, shared ORT)");
|
|
6624
6691
|
return new KokoroTTSInference(config);
|
|
6625
6692
|
}
|
|
6626
6693
|
if (!KokoroTTSWorker.isSupported()) {
|
|
6627
6694
|
logger24.info("Worker not supported: creating KokoroTTSInference (main thread)");
|
|
6628
6695
|
return new KokoroTTSInference(config);
|
|
6629
6696
|
}
|
|
6630
|
-
logger24.info("Auto
|
|
6697
|
+
logger24.info("Auto: creating KokoroTTSWorker (off-main-thread)");
|
|
6631
6698
|
return new KokoroTTSWorker(config);
|
|
6632
6699
|
}
|
|
6633
6700
|
|
|
@@ -6861,6 +6928,9 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
6861
6928
|
// so all future transcribe() calls reject immediately to prevent concurrent access.
|
|
6862
6929
|
this.poisoned = false;
|
|
6863
6930
|
// 10s for SenseVoice (heavier preprocessing)
|
|
6931
|
+
// WebGPU shape change tracking (for dynamic shape warning)
|
|
6932
|
+
this.lastLfrFrames = 0;
|
|
6933
|
+
this.webgpuShapeWarned = false;
|
|
6864
6934
|
// Preprocessing state (loaded once)
|
|
6865
6935
|
this.tokenMap = null;
|
|
6866
6936
|
this.negMean = null;
|
|
@@ -6895,7 +6965,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
6895
6965
|
throw new Error("Model already loaded. Call dispose() first.");
|
|
6896
6966
|
}
|
|
6897
6967
|
this.isLoading = true;
|
|
6898
|
-
const startTime =
|
|
6968
|
+
const startTime = getClock().now();
|
|
6899
6969
|
const telemetry = getTelemetry();
|
|
6900
6970
|
const span = telemetry?.startSpan("SenseVoice.load", {
|
|
6901
6971
|
"model.url": this.config.modelUrl,
|
|
@@ -6962,7 +7032,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
6962
7032
|
} catch (cmvnErr) {
|
|
6963
7033
|
logger25.warn("Failed to read CMVN from model metadata", { error: cmvnErr });
|
|
6964
7034
|
}
|
|
6965
|
-
const loadTimeMs =
|
|
7035
|
+
const loadTimeMs = getClock().now() - startTime;
|
|
6966
7036
|
logger25.info("SenseVoice model loaded", {
|
|
6967
7037
|
backend: this._backend,
|
|
6968
7038
|
loadTimeMs: Math.round(loadTimeMs),
|
|
@@ -7027,24 +7097,35 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
7027
7097
|
"inference.input_samples": audio.length
|
|
7028
7098
|
});
|
|
7029
7099
|
try {
|
|
7030
|
-
const startTime =
|
|
7031
|
-
const preprocessStart =
|
|
7100
|
+
const startTime = getClock().now();
|
|
7101
|
+
const preprocessStart = getClock().now();
|
|
7032
7102
|
const fbank = computeKaldiFbank(audio, 16e3, 80);
|
|
7033
7103
|
const numFrames = fbank.length / 80;
|
|
7034
7104
|
if (numFrames === 0) {
|
|
7035
7105
|
resolve({
|
|
7036
7106
|
text: "",
|
|
7037
|
-
inferenceTimeMs:
|
|
7038
|
-
preprocessTimeMs:
|
|
7107
|
+
inferenceTimeMs: getClock().now() - startTime,
|
|
7108
|
+
preprocessTimeMs: getClock().now() - preprocessStart
|
|
7039
7109
|
});
|
|
7040
7110
|
return;
|
|
7041
7111
|
}
|
|
7042
7112
|
const lfrFeatures = applyLFR(fbank, 80, 7, 6);
|
|
7043
7113
|
const numLfrFrames = lfrFeatures.length / 560;
|
|
7114
|
+
if (this._backend === "webgpu" && this.lastLfrFrames !== 0 && numLfrFrames !== this.lastLfrFrames) {
|
|
7115
|
+
if (!this.webgpuShapeWarned) {
|
|
7116
|
+
this.webgpuShapeWarned = true;
|
|
7117
|
+
logger25.warn("SenseVoice running on WebGPU with variable audio shapes \u2014 risk of kernel crash", {
|
|
7118
|
+
code: ErrorCodes.INF_SHAPE_MISMATCH,
|
|
7119
|
+
previousFrames: this.lastLfrFrames,
|
|
7120
|
+
currentFrames: numLfrFrames
|
|
7121
|
+
});
|
|
7122
|
+
}
|
|
7123
|
+
}
|
|
7124
|
+
this.lastLfrFrames = numLfrFrames;
|
|
7044
7125
|
if (this.negMean && this.invStddev) {
|
|
7045
7126
|
applyCMVN(lfrFeatures, 560, this.negMean, this.invStddev);
|
|
7046
7127
|
}
|
|
7047
|
-
const preprocessTimeMs =
|
|
7128
|
+
const preprocessTimeMs = getClock().now() - preprocessStart;
|
|
7048
7129
|
const ort = this.ort;
|
|
7049
7130
|
const feeds = {
|
|
7050
7131
|
x: new ort.Tensor("float32", lfrFeatures, [1, numLfrFrames, 560]),
|
|
@@ -7074,7 +7155,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
7074
7155
|
const seqLen = logitsDims[1];
|
|
7075
7156
|
const vocabSize = logitsDims[2];
|
|
7076
7157
|
const decoded = ctcGreedyDecode(logitsData, seqLen, vocabSize, this.tokenMap);
|
|
7077
|
-
const inferenceTimeMs =
|
|
7158
|
+
const inferenceTimeMs = getClock().now() - startTime;
|
|
7078
7159
|
logger25.trace("Transcription complete", {
|
|
7079
7160
|
text: decoded.text.substring(0, 50),
|
|
7080
7161
|
language: decoded.language,
|
|
@@ -8413,7 +8494,7 @@ var SileroVADInference = class {
|
|
|
8413
8494
|
throw new Error("Model already loaded. Call dispose() first.");
|
|
8414
8495
|
}
|
|
8415
8496
|
this.isLoading = true;
|
|
8416
|
-
const startTime =
|
|
8497
|
+
const startTime = getClock().now();
|
|
8417
8498
|
const telemetry = getTelemetry();
|
|
8418
8499
|
const span = telemetry?.startSpan("SileroVAD.load", {
|
|
8419
8500
|
"model.url": this.config.modelUrl,
|
|
@@ -8445,7 +8526,7 @@ var SileroVADInference = class {
|
|
|
8445
8526
|
const modelData = new Uint8Array(modelBuffer);
|
|
8446
8527
|
this.session = await ort.InferenceSession.create(modelData, sessionOptions);
|
|
8447
8528
|
this.reset();
|
|
8448
|
-
const loadTimeMs =
|
|
8529
|
+
const loadTimeMs = getClock().now() - startTime;
|
|
8449
8530
|
logger28.info("Model loaded successfully", {
|
|
8450
8531
|
backend: this._backend,
|
|
8451
8532
|
loadTimeMs: Math.round(loadTimeMs),
|
|
@@ -8625,7 +8706,7 @@ var SileroVADInference = class {
|
|
|
8625
8706
|
"inference.chunk_size": this.chunkSize
|
|
8626
8707
|
});
|
|
8627
8708
|
try {
|
|
8628
|
-
const startTime =
|
|
8709
|
+
const startTime = getClock().now();
|
|
8629
8710
|
const inputSize = this.contextSize + this.chunkSize;
|
|
8630
8711
|
const inputBuffer = new Float32Array(inputSize);
|
|
8631
8712
|
inputBuffer.set(this.context, 0);
|
|
@@ -8655,7 +8736,7 @@ var SileroVADInference = class {
|
|
|
8655
8736
|
);
|
|
8656
8737
|
}
|
|
8657
8738
|
this.context = audioChunkCopy.slice(-this.contextSize);
|
|
8658
|
-
const inferenceTimeMs =
|
|
8739
|
+
const inferenceTimeMs = getClock().now() - startTime;
|
|
8659
8740
|
const isSpeech = probability > this.config.threshold;
|
|
8660
8741
|
let preSpeechChunks;
|
|
8661
8742
|
if (isSpeech && !this.wasSpeaking) {
|
|
@@ -9470,6 +9551,7 @@ var _SpeechListener = class _SpeechListener extends EventEmitter {
|
|
|
9470
9551
|
this.lastProgressiveSamples = 0;
|
|
9471
9552
|
// ASR error recovery
|
|
9472
9553
|
this.asrErrorCount = 0;
|
|
9554
|
+
this.progressiveErrorCount = 0;
|
|
9473
9555
|
this.config = config ?? {};
|
|
9474
9556
|
}
|
|
9475
9557
|
/** Current listener state */
|
|
@@ -9662,7 +9744,7 @@ var _SpeechListener = class _SpeechListener extends EventEmitter {
|
|
|
9662
9744
|
if (result.isSpeech) {
|
|
9663
9745
|
if (!wasSpeaking) {
|
|
9664
9746
|
this.isSpeechActive = true;
|
|
9665
|
-
this.speechStartTime =
|
|
9747
|
+
this.speechStartTime = getClock().now();
|
|
9666
9748
|
this.audioBuffer = [];
|
|
9667
9749
|
this.audioBufferSamples = 0;
|
|
9668
9750
|
this.lastProgressiveResult = null;
|
|
@@ -9701,13 +9783,13 @@ var _SpeechListener = class _SpeechListener extends EventEmitter {
|
|
|
9701
9783
|
const extended = this.config.silenceTimeoutExtendedMs ?? 700;
|
|
9702
9784
|
const adaptive = this.config.adaptiveTimeout ?? true;
|
|
9703
9785
|
if (!adaptive) return base;
|
|
9704
|
-
const speechDurationMs =
|
|
9786
|
+
const speechDurationMs = getClock().now() - this.speechStartTime;
|
|
9705
9787
|
return speechDurationMs > 3e3 ? extended : base;
|
|
9706
9788
|
}
|
|
9707
9789
|
onSilenceDetected() {
|
|
9708
9790
|
const capturedEpoch = this.epoch;
|
|
9709
9791
|
this.isSpeechActive = false;
|
|
9710
|
-
const durationMs =
|
|
9792
|
+
const durationMs = getClock().now() - this.speechStartTime;
|
|
9711
9793
|
logger31.debug("Speech end", { durationMs: Math.round(durationMs) });
|
|
9712
9794
|
this.emit("speech:end", { durationMs });
|
|
9713
9795
|
this.clearSilenceTimer();
|
|
@@ -9804,7 +9886,15 @@ var _SpeechListener = class _SpeechListener extends EventEmitter {
|
|
|
9804
9886
|
this.lastProgressiveSamples = snapshotSamples;
|
|
9805
9887
|
this.emit("transcript", { ...result, isFinal: false });
|
|
9806
9888
|
}
|
|
9807
|
-
} catch {
|
|
9889
|
+
} catch (err) {
|
|
9890
|
+
this.progressiveErrorCount = (this.progressiveErrorCount ?? 0) + 1;
|
|
9891
|
+
if (this.progressiveErrorCount % 10 === 1) {
|
|
9892
|
+
logger31.warn("Progressive transcription error", {
|
|
9893
|
+
code: ErrorCodes.SPH_ASR_ERROR,
|
|
9894
|
+
error: String(err),
|
|
9895
|
+
count: this.progressiveErrorCount
|
|
9896
|
+
});
|
|
9897
|
+
}
|
|
9808
9898
|
}
|
|
9809
9899
|
})();
|
|
9810
9900
|
}, intervalMs);
|
|
@@ -9821,7 +9911,7 @@ var _SpeechListener = class _SpeechListener extends EventEmitter {
|
|
|
9821
9911
|
async transcribeWithTimeout(audio) {
|
|
9822
9912
|
if (!this.asr) return null;
|
|
9823
9913
|
const timeoutMs = this.config.transcriptionTimeoutMs ?? 1e4;
|
|
9824
|
-
const startTime =
|
|
9914
|
+
const startTime = getClock().now();
|
|
9825
9915
|
const span = getTelemetry()?.startSpan("SpeechListener.transcribe", {
|
|
9826
9916
|
"inference.input_samples": audio.length,
|
|
9827
9917
|
"inference.input_duration_ms": audio.length / 16e3 * 1e3
|
|
@@ -9835,7 +9925,7 @@ var _SpeechListener = class _SpeechListener extends EventEmitter {
|
|
|
9835
9925
|
})
|
|
9836
9926
|
]);
|
|
9837
9927
|
clearTimeout(timeoutId);
|
|
9838
|
-
const latency =
|
|
9928
|
+
const latency = getClock().now() - startTime;
|
|
9839
9929
|
this.asrErrorCount = 0;
|
|
9840
9930
|
getTelemetry()?.recordHistogram(MetricNames.VOICE_TRANSCRIPTION_LATENCY, latency);
|
|
9841
9931
|
getTelemetry()?.incrementCounter(MetricNames.VOICE_TRANSCRIPTIONS);
|
|
@@ -10009,11 +10099,11 @@ var InterruptionHandler = class extends EventEmitter {
|
|
|
10009
10099
|
getState() {
|
|
10010
10100
|
return {
|
|
10011
10101
|
isSpeaking: this.isSpeaking,
|
|
10012
|
-
speechDurationMs: this.isSpeaking ?
|
|
10102
|
+
speechDurationMs: this.isSpeaking ? getClock().now() - this.speechStartTime : 0
|
|
10013
10103
|
};
|
|
10014
10104
|
}
|
|
10015
10105
|
onSpeechDetected(rms) {
|
|
10016
|
-
const now =
|
|
10106
|
+
const now = getClock().now();
|
|
10017
10107
|
this.lastSpeechTime = now;
|
|
10018
10108
|
if (this.silenceTimer) {
|
|
10019
10109
|
clearTimeout(this.silenceTimer);
|
|
@@ -10230,7 +10320,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
10230
10320
|
this.setupEventHandlers();
|
|
10231
10321
|
this.recognition.start();
|
|
10232
10322
|
this.isListening = true;
|
|
10233
|
-
this.startTime =
|
|
10323
|
+
this.startTime = getClock().now();
|
|
10234
10324
|
this.accumulatedText = "";
|
|
10235
10325
|
logger33.info("Speech recognition started", {
|
|
10236
10326
|
language: this.config.language
|
|
@@ -10331,7 +10421,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
10331
10421
|
const speechResult = {
|
|
10332
10422
|
text: isFinal ? this.accumulatedText.trim() : text,
|
|
10333
10423
|
language: this.config.language,
|
|
10334
|
-
inferenceTimeMs:
|
|
10424
|
+
inferenceTimeMs: getClock().now() - this.startTime,
|
|
10335
10425
|
isFinal,
|
|
10336
10426
|
confidence: alternative.confidence
|
|
10337
10427
|
};
|
|
@@ -10363,13 +10453,13 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
10363
10453
|
this.isListening = false;
|
|
10364
10454
|
logger33.info("Speech recognition ended", {
|
|
10365
10455
|
totalText: this.accumulatedText.length,
|
|
10366
|
-
durationMs:
|
|
10456
|
+
durationMs: getClock().now() - this.startTime
|
|
10367
10457
|
});
|
|
10368
10458
|
if (this.stopResolver) {
|
|
10369
10459
|
const result = {
|
|
10370
10460
|
text: this.accumulatedText.trim(),
|
|
10371
10461
|
language: this.config.language,
|
|
10372
|
-
inferenceTimeMs:
|
|
10462
|
+
inferenceTimeMs: getClock().now() - this.startTime,
|
|
10373
10463
|
isFinal: true
|
|
10374
10464
|
};
|
|
10375
10465
|
this.stopResolver(result);
|
|
@@ -10413,6 +10503,303 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
10413
10503
|
}
|
|
10414
10504
|
};
|
|
10415
10505
|
|
|
10506
|
+
// src/inference/ElevenLabsTTSBackend.ts
|
|
10507
|
+
var logger34 = createLogger("ElevenLabsTTS");
|
|
10508
|
+
var DEFAULT_MODEL = "eleven_multilingual_v2";
|
|
10509
|
+
var DEFAULT_OUTPUT_FORMAT = "pcm_16000";
|
|
10510
|
+
var DEFAULT_STABILITY = 0.5;
|
|
10511
|
+
var DEFAULT_SIMILARITY_BOOST = 0.75;
|
|
10512
|
+
var DEFAULT_BASE_URL = "https://api.elevenlabs.io";
|
|
10513
|
+
var FORMAT_TO_SAMPLE_RATE = {
|
|
10514
|
+
pcm_16000: 16e3,
|
|
10515
|
+
pcm_22050: 22050,
|
|
10516
|
+
pcm_24000: 24e3,
|
|
10517
|
+
pcm_44100: 44100
|
|
10518
|
+
};
|
|
10519
|
+
var ElevenLabsTTSBackend = class {
|
|
10520
|
+
constructor(config) {
|
|
10521
|
+
this._isLoaded = false;
|
|
10522
|
+
if (!config.apiKey) throw new Error("ElevenLabsTTS: apiKey is required");
|
|
10523
|
+
if (!config.voiceId) throw new Error("ElevenLabsTTS: voiceId is required");
|
|
10524
|
+
this.apiKey = config.apiKey;
|
|
10525
|
+
this.voiceId = config.voiceId;
|
|
10526
|
+
this.model = config.model ?? DEFAULT_MODEL;
|
|
10527
|
+
this.outputFormat = config.outputFormat ?? DEFAULT_OUTPUT_FORMAT;
|
|
10528
|
+
this.stability = config.stability ?? DEFAULT_STABILITY;
|
|
10529
|
+
this.similarityBoost = config.similarityBoost ?? DEFAULT_SIMILARITY_BOOST;
|
|
10530
|
+
this.baseUrl = config.baseUrl ?? DEFAULT_BASE_URL;
|
|
10531
|
+
const rate = FORMAT_TO_SAMPLE_RATE[this.outputFormat];
|
|
10532
|
+
if (!rate) {
|
|
10533
|
+
throw new Error(
|
|
10534
|
+
`ElevenLabsTTS: unsupported outputFormat "${this.outputFormat}". Supported: ${Object.keys(FORMAT_TO_SAMPLE_RATE).join(", ")}`
|
|
10535
|
+
);
|
|
10536
|
+
}
|
|
10537
|
+
this._sampleRate = rate;
|
|
10538
|
+
}
|
|
10539
|
+
get sampleRate() {
|
|
10540
|
+
return this._sampleRate;
|
|
10541
|
+
}
|
|
10542
|
+
get isLoaded() {
|
|
10543
|
+
return this._isLoaded;
|
|
10544
|
+
}
|
|
10545
|
+
// ─── Load ───────────────────────────────────────────────────────────────
|
|
10546
|
+
/**
|
|
10547
|
+
* No-op for cloud TTS (no model to load).
|
|
10548
|
+
* Marks backend as ready.
|
|
10549
|
+
*/
|
|
10550
|
+
async load() {
|
|
10551
|
+
this._isLoaded = true;
|
|
10552
|
+
logger34.info("ElevenLabs TTS ready", { voiceId: this.voiceId, model: this.model });
|
|
10553
|
+
}
|
|
10554
|
+
// ─── Stream ─────────────────────────────────────────────────────────────
|
|
10555
|
+
/**
|
|
10556
|
+
* Stream audio from ElevenLabs for the given text.
|
|
10557
|
+
*
|
|
10558
|
+
* Uses the streaming endpoint. Yields a single chunk for non-streaming
|
|
10559
|
+
* or multiple chunks as response data arrives.
|
|
10560
|
+
*/
|
|
10561
|
+
async *stream(text, options) {
|
|
10562
|
+
if (!this._isLoaded) {
|
|
10563
|
+
throw new Error("ElevenLabsTTS: not loaded. Call load() first.");
|
|
10564
|
+
}
|
|
10565
|
+
const trimmed = text.trim();
|
|
10566
|
+
if (trimmed.length === 0) {
|
|
10567
|
+
throw new Error("ElevenLabsTTS: text must not be empty");
|
|
10568
|
+
}
|
|
10569
|
+
const startTime = getClock().now();
|
|
10570
|
+
const telemetry = getTelemetry();
|
|
10571
|
+
const span = telemetry?.startSpan("ElevenLabsTTS.stream", {
|
|
10572
|
+
"tts.text_length": trimmed.length,
|
|
10573
|
+
"tts.voice_id": this.voiceId,
|
|
10574
|
+
"tts.model": this.model
|
|
10575
|
+
});
|
|
10576
|
+
const url = `${this.baseUrl}/v1/text-to-speech/${this.voiceId}?output_format=${this.outputFormat}`;
|
|
10577
|
+
try {
|
|
10578
|
+
const response = await fetch(url, {
|
|
10579
|
+
method: "POST",
|
|
10580
|
+
headers: {
|
|
10581
|
+
"xi-api-key": this.apiKey,
|
|
10582
|
+
"Content-Type": "application/json",
|
|
10583
|
+
Accept: "audio/pcm"
|
|
10584
|
+
},
|
|
10585
|
+
body: JSON.stringify({
|
|
10586
|
+
text: trimmed,
|
|
10587
|
+
model_id: this.model,
|
|
10588
|
+
voice_settings: {
|
|
10589
|
+
stability: this.stability,
|
|
10590
|
+
similarity_boost: this.similarityBoost
|
|
10591
|
+
}
|
|
10592
|
+
}),
|
|
10593
|
+
signal: options?.signal
|
|
10594
|
+
});
|
|
10595
|
+
if (!response.ok) {
|
|
10596
|
+
const errorText = await response.text().catch(() => "unknown");
|
|
10597
|
+
const msg = `ElevenLabsTTS: HTTP ${response.status} \u2014 ${this.getHttpErrorMessage(response.status, errorText)}`;
|
|
10598
|
+
logger34.error(msg);
|
|
10599
|
+
throw new Error(msg);
|
|
10600
|
+
}
|
|
10601
|
+
if (!response.body) {
|
|
10602
|
+
const buffer = await response.arrayBuffer();
|
|
10603
|
+
const audio = pcm16ToFloat32(buffer);
|
|
10604
|
+
const duration = audio.length / this._sampleRate;
|
|
10605
|
+
const latency2 = getClock().now() - startTime;
|
|
10606
|
+
span?.setAttributes({ "tts.duration_s": duration, "tts.latency_ms": latency2 });
|
|
10607
|
+
span?.end();
|
|
10608
|
+
telemetry?.recordHistogram("omote.inference.latency", latency2, {
|
|
10609
|
+
model: "elevenlabs-tts",
|
|
10610
|
+
backend: "cloud"
|
|
10611
|
+
});
|
|
10612
|
+
yield { audio, duration, text: trimmed };
|
|
10613
|
+
return;
|
|
10614
|
+
}
|
|
10615
|
+
const reader = response.body.getReader();
|
|
10616
|
+
let totalSamples = 0;
|
|
10617
|
+
try {
|
|
10618
|
+
while (true) {
|
|
10619
|
+
if (options?.signal?.aborted) {
|
|
10620
|
+
reader.cancel();
|
|
10621
|
+
logger34.debug("Stream aborted by signal");
|
|
10622
|
+
return;
|
|
10623
|
+
}
|
|
10624
|
+
const { done, value } = await reader.read();
|
|
10625
|
+
if (done) break;
|
|
10626
|
+
if (value && value.byteLength > 0) {
|
|
10627
|
+
const usableBytes = value.byteLength & ~1;
|
|
10628
|
+
if (usableBytes === 0) continue;
|
|
10629
|
+
const audio = pcm16ToFloat32(value.buffer.slice(value.byteOffset, value.byteOffset + usableBytes));
|
|
10630
|
+
const duration = audio.length / this._sampleRate;
|
|
10631
|
+
totalSamples += audio.length;
|
|
10632
|
+
yield { audio, duration, text: trimmed };
|
|
10633
|
+
}
|
|
10634
|
+
}
|
|
10635
|
+
} finally {
|
|
10636
|
+
reader.releaseLock();
|
|
10637
|
+
}
|
|
10638
|
+
const latency = getClock().now() - startTime;
|
|
10639
|
+
const totalDuration = totalSamples / this._sampleRate;
|
|
10640
|
+
logger34.debug("Stream complete", {
|
|
10641
|
+
totalDuration: `${totalDuration.toFixed(2)}s`,
|
|
10642
|
+
latencyMs: Math.round(latency),
|
|
10643
|
+
totalSamples
|
|
10644
|
+
});
|
|
10645
|
+
span?.setAttributes({ "tts.duration_s": totalDuration, "tts.latency_ms": latency });
|
|
10646
|
+
span?.end();
|
|
10647
|
+
telemetry?.recordHistogram("omote.inference.latency", latency, {
|
|
10648
|
+
model: "elevenlabs-tts",
|
|
10649
|
+
backend: "cloud"
|
|
10650
|
+
});
|
|
10651
|
+
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
10652
|
+
model: "elevenlabs-tts",
|
|
10653
|
+
backend: "cloud",
|
|
10654
|
+
status: "success"
|
|
10655
|
+
});
|
|
10656
|
+
} catch (err) {
|
|
10657
|
+
if (err instanceof DOMException && err.name === "AbortError") {
|
|
10658
|
+
logger34.debug("Stream aborted");
|
|
10659
|
+
span?.end();
|
|
10660
|
+
return;
|
|
10661
|
+
}
|
|
10662
|
+
const errMsg = err instanceof Error ? err.message : String(err);
|
|
10663
|
+
logger34.error("Stream failed", { error: errMsg });
|
|
10664
|
+
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
10665
|
+
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
10666
|
+
model: "elevenlabs-tts",
|
|
10667
|
+
backend: "cloud",
|
|
10668
|
+
status: "error"
|
|
10669
|
+
});
|
|
10670
|
+
throw err;
|
|
10671
|
+
}
|
|
10672
|
+
}
|
|
10673
|
+
// ─── Dispose ────────────────────────────────────────────────────────────
|
|
10674
|
+
async dispose() {
|
|
10675
|
+
this._isLoaded = false;
|
|
10676
|
+
logger34.info("ElevenLabs TTS disposed");
|
|
10677
|
+
}
|
|
10678
|
+
// ─── Private ────────────────────────────────────────────────────────────
|
|
10679
|
+
getHttpErrorMessage(status, body) {
|
|
10680
|
+
switch (status) {
|
|
10681
|
+
case 401:
|
|
10682
|
+
return "Unauthorized \u2014 check your API key";
|
|
10683
|
+
case 403:
|
|
10684
|
+
return "Forbidden \u2014 API key lacks required permissions";
|
|
10685
|
+
case 429:
|
|
10686
|
+
return "Rate limited \u2014 too many requests";
|
|
10687
|
+
case 400:
|
|
10688
|
+
return `Bad request \u2014 ${body}`;
|
|
10689
|
+
default:
|
|
10690
|
+
return body || `HTTP error ${status}`;
|
|
10691
|
+
}
|
|
10692
|
+
}
|
|
10693
|
+
};
|
|
10694
|
+
|
|
10695
|
+
// src/inference/PollyTTSBackend.ts
|
|
10696
|
+
var logger35 = createLogger("PollyTTS");
|
|
10697
|
+
var DEFAULT_VOICE = "Joanna";
|
|
10698
|
+
var DEFAULT_SAMPLE_RATE = 16e3;
|
|
10699
|
+
var PollyTTSBackend = class {
|
|
10700
|
+
constructor(config) {
|
|
10701
|
+
this._isLoaded = false;
|
|
10702
|
+
if (!config.synthesizeFn) {
|
|
10703
|
+
throw new Error("PollyTTS: synthesizeFn is required");
|
|
10704
|
+
}
|
|
10705
|
+
this.synthesizeFn = config.synthesizeFn;
|
|
10706
|
+
this.voice = config.voice ?? DEFAULT_VOICE;
|
|
10707
|
+
this._sampleRate = config.sampleRate ?? DEFAULT_SAMPLE_RATE;
|
|
10708
|
+
this.engine = config.engine ?? "neural";
|
|
10709
|
+
}
|
|
10710
|
+
get sampleRate() {
|
|
10711
|
+
return this._sampleRate;
|
|
10712
|
+
}
|
|
10713
|
+
get isLoaded() {
|
|
10714
|
+
return this._isLoaded;
|
|
10715
|
+
}
|
|
10716
|
+
// ─── Load ───────────────────────────────────────────────────────────────
|
|
10717
|
+
/**
|
|
10718
|
+
* No-op for cloud TTS (no model to load).
|
|
10719
|
+
* Marks backend as ready.
|
|
10720
|
+
*/
|
|
10721
|
+
async load() {
|
|
10722
|
+
this._isLoaded = true;
|
|
10723
|
+
logger35.info("Polly TTS ready", { voice: this.voice, engine: this.engine, sampleRate: this._sampleRate });
|
|
10724
|
+
}
|
|
10725
|
+
// ─── Stream ─────────────────────────────────────────────────────────────
|
|
10726
|
+
/**
|
|
10727
|
+
* Synthesize audio via consumer's Polly function.
|
|
10728
|
+
*
|
|
10729
|
+
* Polly's SynthesizeSpeech is request/response (not streaming for PCM),
|
|
10730
|
+
* so this yields a single chunk per call. For long text, consider splitting
|
|
10731
|
+
* into sentences on the consumer side.
|
|
10732
|
+
*/
|
|
10733
|
+
async *stream(text, options) {
|
|
10734
|
+
if (!this._isLoaded) {
|
|
10735
|
+
throw new Error("PollyTTS: not loaded. Call load() first.");
|
|
10736
|
+
}
|
|
10737
|
+
const trimmed = text.trim();
|
|
10738
|
+
if (trimmed.length === 0) {
|
|
10739
|
+
throw new Error("PollyTTS: text must not be empty");
|
|
10740
|
+
}
|
|
10741
|
+
if (options?.signal?.aborted) {
|
|
10742
|
+
return;
|
|
10743
|
+
}
|
|
10744
|
+
const voiceName = options?.voice ?? this.voice;
|
|
10745
|
+
const startTime = getClock().now();
|
|
10746
|
+
const telemetry = getTelemetry();
|
|
10747
|
+
const span = telemetry?.startSpan("PollyTTS.stream", {
|
|
10748
|
+
"tts.text_length": trimmed.length,
|
|
10749
|
+
"tts.voice": voiceName,
|
|
10750
|
+
"tts.engine": this.engine
|
|
10751
|
+
});
|
|
10752
|
+
try {
|
|
10753
|
+
const result = await this.synthesizeFn(trimmed, voiceName, this._sampleRate);
|
|
10754
|
+
if (options?.signal?.aborted) {
|
|
10755
|
+
span?.end();
|
|
10756
|
+
return;
|
|
10757
|
+
}
|
|
10758
|
+
const audio = pcm16ToFloat32(result.audio);
|
|
10759
|
+
const duration = audio.length / this._sampleRate;
|
|
10760
|
+
const latency = getClock().now() - startTime;
|
|
10761
|
+
logger35.debug("Synthesis complete", {
|
|
10762
|
+
voice: voiceName,
|
|
10763
|
+
duration: `${duration.toFixed(2)}s`,
|
|
10764
|
+
latencyMs: Math.round(latency),
|
|
10765
|
+
numSamples: audio.length
|
|
10766
|
+
});
|
|
10767
|
+
span?.setAttributes({ "tts.duration_s": duration, "tts.latency_ms": latency });
|
|
10768
|
+
span?.end();
|
|
10769
|
+
telemetry?.recordHistogram("omote.inference.latency", latency, {
|
|
10770
|
+
model: "polly-tts",
|
|
10771
|
+
backend: "cloud"
|
|
10772
|
+
});
|
|
10773
|
+
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
10774
|
+
model: "polly-tts",
|
|
10775
|
+
backend: "cloud",
|
|
10776
|
+
status: "success"
|
|
10777
|
+
});
|
|
10778
|
+
yield { audio, duration, text: trimmed };
|
|
10779
|
+
} catch (err) {
|
|
10780
|
+
if (err instanceof DOMException && err.name === "AbortError") {
|
|
10781
|
+
logger35.debug("Synthesis aborted");
|
|
10782
|
+
span?.end();
|
|
10783
|
+
return;
|
|
10784
|
+
}
|
|
10785
|
+
const errMsg = err instanceof Error ? err.message : String(err);
|
|
10786
|
+
logger35.error("Synthesis failed", { error: errMsg });
|
|
10787
|
+
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
10788
|
+
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
10789
|
+
model: "polly-tts",
|
|
10790
|
+
backend: "cloud",
|
|
10791
|
+
status: "error"
|
|
10792
|
+
});
|
|
10793
|
+
throw err;
|
|
10794
|
+
}
|
|
10795
|
+
}
|
|
10796
|
+
// ─── Dispose ────────────────────────────────────────────────────────────
|
|
10797
|
+
async dispose() {
|
|
10798
|
+
this._isLoaded = false;
|
|
10799
|
+
logger35.info("Polly TTS disposed");
|
|
10800
|
+
}
|
|
10801
|
+
};
|
|
10802
|
+
|
|
10416
10803
|
// src/inference/ortConfig.ts
|
|
10417
10804
|
var ortCdnBase = null;
|
|
10418
10805
|
function configureOrtCdn(cdnPath) {
|
|
@@ -10426,7 +10813,7 @@ function getOrtCdnBase() {
|
|
|
10426
10813
|
}
|
|
10427
10814
|
|
|
10428
10815
|
// src/emotion/Emotion.ts
|
|
10429
|
-
var
|
|
10816
|
+
var logger36 = createLogger("EmotionController");
|
|
10430
10817
|
var EMOTION_NAMES = [
|
|
10431
10818
|
"amazement",
|
|
10432
10819
|
"anger",
|
|
@@ -10448,7 +10835,7 @@ function createEmotionVector(weights = {}) {
|
|
|
10448
10835
|
if (idx >= 0) {
|
|
10449
10836
|
vector[idx] = Math.max(0, Math.min(1, value));
|
|
10450
10837
|
} else {
|
|
10451
|
-
|
|
10838
|
+
logger36.warn(`Invalid emotion name in createEmotionVector: "${name}"`);
|
|
10452
10839
|
}
|
|
10453
10840
|
}
|
|
10454
10841
|
return vector;
|
|
@@ -10531,7 +10918,7 @@ var EmotionController = class {
|
|
|
10531
10918
|
this.targetEmotion.set(newEmotion);
|
|
10532
10919
|
this.currentEmotion.set(newEmotion);
|
|
10533
10920
|
this.transitionProgress = 1;
|
|
10534
|
-
|
|
10921
|
+
logger36.debug("set", { weights });
|
|
10535
10922
|
}
|
|
10536
10923
|
/**
|
|
10537
10924
|
* Set emotion from preset immediately
|
|
@@ -10541,7 +10928,7 @@ var EmotionController = class {
|
|
|
10541
10928
|
this.targetEmotion.set(newEmotion);
|
|
10542
10929
|
this.currentEmotion.set(newEmotion);
|
|
10543
10930
|
this.transitionProgress = 1;
|
|
10544
|
-
|
|
10931
|
+
logger36.debug("setPreset", { preset });
|
|
10545
10932
|
}
|
|
10546
10933
|
/**
|
|
10547
10934
|
* Transition to new emotion over time
|
|
@@ -10553,9 +10940,9 @@ var EmotionController = class {
|
|
|
10553
10940
|
this.currentEmotion.set(this.emotion);
|
|
10554
10941
|
this.targetEmotion.set(createEmotionVector(weights));
|
|
10555
10942
|
this.transitionDuration = durationMs;
|
|
10556
|
-
this.transitionStartTime =
|
|
10943
|
+
this.transitionStartTime = getClock().now();
|
|
10557
10944
|
this.transitionProgress = 0;
|
|
10558
|
-
|
|
10945
|
+
logger36.debug("transitionTo", { weights, durationMs });
|
|
10559
10946
|
}
|
|
10560
10947
|
/**
|
|
10561
10948
|
* Transition to preset over time
|
|
@@ -10564,7 +10951,7 @@ var EmotionController = class {
|
|
|
10564
10951
|
this.currentEmotion.set(this.emotion);
|
|
10565
10952
|
this.targetEmotion.set(getEmotionPreset(preset));
|
|
10566
10953
|
this.transitionDuration = durationMs;
|
|
10567
|
-
this.transitionStartTime =
|
|
10954
|
+
this.transitionStartTime = getClock().now();
|
|
10568
10955
|
this.transitionProgress = 0;
|
|
10569
10956
|
}
|
|
10570
10957
|
/**
|
|
@@ -10572,7 +10959,7 @@ var EmotionController = class {
|
|
|
10572
10959
|
*/
|
|
10573
10960
|
update() {
|
|
10574
10961
|
if (this.transitionProgress >= 1) return;
|
|
10575
|
-
const elapsed =
|
|
10962
|
+
const elapsed = getClock().now() - this.transitionStartTime;
|
|
10576
10963
|
this.transitionProgress = Math.min(1, elapsed / this.transitionDuration);
|
|
10577
10964
|
}
|
|
10578
10965
|
/**
|
|
@@ -10588,7 +10975,7 @@ var EmotionController = class {
|
|
|
10588
10975
|
this.currentEmotion.fill(0);
|
|
10589
10976
|
this.targetEmotion.fill(0);
|
|
10590
10977
|
this.transitionProgress = 1;
|
|
10591
|
-
|
|
10978
|
+
logger36.debug("reset");
|
|
10592
10979
|
}
|
|
10593
10980
|
};
|
|
10594
10981
|
|
|
@@ -10669,7 +11056,7 @@ var DEFAULT_ANIMATION_CONFIG = {
|
|
|
10669
11056
|
};
|
|
10670
11057
|
|
|
10671
11058
|
// src/animation/AnimationGraph.ts
|
|
10672
|
-
var
|
|
11059
|
+
var logger37 = createLogger("AnimationGraph");
|
|
10673
11060
|
var AnimationGraph = class extends EventEmitter {
|
|
10674
11061
|
constructor(config = {}) {
|
|
10675
11062
|
super();
|
|
@@ -10702,7 +11089,7 @@ var AnimationGraph = class extends EventEmitter {
|
|
|
10702
11089
|
this.stateEnterTime = Date.now();
|
|
10703
11090
|
this.lastUpdateTime = Date.now();
|
|
10704
11091
|
this.cachedOutput = this.computeOutput();
|
|
10705
|
-
|
|
11092
|
+
logger37.info("constructor", {
|
|
10706
11093
|
initialState: this.config.initialState,
|
|
10707
11094
|
stateCount: this.config.states.length,
|
|
10708
11095
|
transitionCount: this.config.transitions.length
|
|
@@ -10773,7 +11160,7 @@ var AnimationGraph = class extends EventEmitter {
|
|
|
10773
11160
|
setState(stateName, blendDuration = 300) {
|
|
10774
11161
|
const targetState = this.config.states.find((s) => s.name === stateName);
|
|
10775
11162
|
if (!targetState) {
|
|
10776
|
-
|
|
11163
|
+
logger37.warn(`State '${stateName}' not found`);
|
|
10777
11164
|
return;
|
|
10778
11165
|
}
|
|
10779
11166
|
if (targetState.name === this.currentState.name && !this.isTransitioning) {
|
|
@@ -10851,7 +11238,7 @@ var AnimationGraph = class extends EventEmitter {
|
|
|
10851
11238
|
(s) => s.name === transition.to
|
|
10852
11239
|
);
|
|
10853
11240
|
if (!targetState) {
|
|
10854
|
-
|
|
11241
|
+
logger37.warn(`Target state '${transition.to}' not found`);
|
|
10855
11242
|
return;
|
|
10856
11243
|
}
|
|
10857
11244
|
const fromState = this.currentState.name;
|
|
@@ -10865,7 +11252,7 @@ var AnimationGraph = class extends EventEmitter {
|
|
|
10865
11252
|
if (!this.currentState.emotionBlendEnabled) {
|
|
10866
11253
|
this.targetEmotionWeight = 0;
|
|
10867
11254
|
}
|
|
10868
|
-
|
|
11255
|
+
logger37.debug("state transition", {
|
|
10869
11256
|
from: fromState,
|
|
10870
11257
|
to: targetState.name,
|
|
10871
11258
|
trigger: event,
|
|
@@ -10902,7 +11289,7 @@ var AnimationGraph = class extends EventEmitter {
|
|
|
10902
11289
|
if (this.currentState.timeout <= 0) return;
|
|
10903
11290
|
const elapsed = now - this.stateEnterTime;
|
|
10904
11291
|
if (elapsed >= this.currentState.timeout) {
|
|
10905
|
-
|
|
11292
|
+
logger37.debug("timeout transition", {
|
|
10906
11293
|
state: this.currentState.name,
|
|
10907
11294
|
elapsed,
|
|
10908
11295
|
timeout: this.currentState.timeout
|
|
@@ -11016,7 +11403,7 @@ var AnimationGraph = class extends EventEmitter {
|
|
|
11016
11403
|
|
|
11017
11404
|
// src/animation/ProceduralLifeLayer.ts
|
|
11018
11405
|
import { createNoise2D } from "simplex-noise";
|
|
11019
|
-
var
|
|
11406
|
+
var logger38 = createLogger("ProceduralLifeLayer");
|
|
11020
11407
|
var simplex2d = createNoise2D();
|
|
11021
11408
|
var LIFE_BS_INDEX = /* @__PURE__ */ new Map();
|
|
11022
11409
|
for (let i = 0; i < LAM_BLENDSHAPES.length; i++) {
|
|
@@ -11122,7 +11509,7 @@ var ProceduralLifeLayer = class {
|
|
|
11122
11509
|
}
|
|
11123
11510
|
this.blinkInterval = this.nextBlinkInterval();
|
|
11124
11511
|
this.gazeBreakInterval = randomRange(...this.gazeBreakIntervalRange);
|
|
11125
|
-
|
|
11512
|
+
logger38.debug("constructor", {
|
|
11126
11513
|
blinkIntervalRange: this.blinkIntervalRange,
|
|
11127
11514
|
useLogNormalBlinks: this.useLogNormalBlinks,
|
|
11128
11515
|
gazeBreakIntervalRange: this.gazeBreakIntervalRange,
|
|
@@ -11226,7 +11613,7 @@ var ProceduralLifeLayer = class {
|
|
|
11226
11613
|
* Reset all internal state to initial values.
|
|
11227
11614
|
*/
|
|
11228
11615
|
reset() {
|
|
11229
|
-
|
|
11616
|
+
logger38.debug("reset");
|
|
11230
11617
|
this.blinkTimer = 0;
|
|
11231
11618
|
this.blinkInterval = this.nextBlinkInterval();
|
|
11232
11619
|
this.blinkPhase = PHASE_OPEN;
|
|
@@ -11278,7 +11665,7 @@ var ProceduralLifeLayer = class {
|
|
|
11278
11665
|
this.blinkTimer = 0;
|
|
11279
11666
|
this.blinkInterval = this.nextBlinkInterval();
|
|
11280
11667
|
this.asymmetryRight = 0.95 + Math.random() * 0.08;
|
|
11281
|
-
|
|
11668
|
+
logger38.trace("blink", { nextInterval: this.blinkInterval });
|
|
11282
11669
|
}
|
|
11283
11670
|
if (this.blinkPhase > PHASE_OPEN) {
|
|
11284
11671
|
this.blinkProgress += delta;
|
|
@@ -11359,7 +11746,7 @@ var ProceduralLifeLayer = class {
|
|
|
11359
11746
|
this.gazeBreakTargetX = (Math.random() - 0.5) * 2 * amp;
|
|
11360
11747
|
this.gazeBreakTargetY = (Math.random() - 0.5) * amp * 0.4;
|
|
11361
11748
|
this.gazeBreakInterval = randomRange(...params.interval);
|
|
11362
|
-
|
|
11749
|
+
logger38.trace("gaze break", {
|
|
11363
11750
|
targetX: this.gazeBreakTargetX.toFixed(3),
|
|
11364
11751
|
targetY: this.gazeBreakTargetY.toFixed(3),
|
|
11365
11752
|
nextInterval: this.gazeBreakInterval.toFixed(2),
|
|
@@ -11602,7 +11989,7 @@ var ALL_AUS = [...new Set(
|
|
|
11602
11989
|
)];
|
|
11603
11990
|
|
|
11604
11991
|
// src/face/EmotionResolver.ts
|
|
11605
|
-
var
|
|
11992
|
+
var logger39 = createLogger("EmotionResolver");
|
|
11606
11993
|
var BS_INDEX = /* @__PURE__ */ new Map();
|
|
11607
11994
|
for (let i = 0; i < LAM_BLENDSHAPES.length; i++) {
|
|
11608
11995
|
BS_INDEX.set(LAM_BLENDSHAPES[i], i);
|
|
@@ -11629,7 +12016,7 @@ var EmotionResolver = class {
|
|
|
11629
12016
|
if (!emotionWeight || emotionWeight < 0.01) continue;
|
|
11630
12017
|
const auActivations = EMOTION_TO_AU[emotionName];
|
|
11631
12018
|
if (!auActivations) {
|
|
11632
|
-
|
|
12019
|
+
logger39.warn(`Unknown emotion name with no AU mapping: "${emotionName}"`);
|
|
11633
12020
|
continue;
|
|
11634
12021
|
}
|
|
11635
12022
|
for (const activation of auActivations) {
|
|
@@ -11654,7 +12041,7 @@ var EmotionResolver = class {
|
|
|
11654
12041
|
};
|
|
11655
12042
|
|
|
11656
12043
|
// src/face/FaceCompositor.ts
|
|
11657
|
-
var
|
|
12044
|
+
var logger40 = createLogger("FaceCompositor");
|
|
11658
12045
|
function smoothstep(t) {
|
|
11659
12046
|
return t * t * (3 - 2 * t);
|
|
11660
12047
|
}
|
|
@@ -11685,7 +12072,7 @@ var FaceCompositor = class {
|
|
|
11685
12072
|
if (config?.profile) {
|
|
11686
12073
|
this.applyProfileArrays(config.profile);
|
|
11687
12074
|
}
|
|
11688
|
-
|
|
12075
|
+
logger40.debug("constructor", {
|
|
11689
12076
|
emotionSmoothing: this.emotionSmoothing,
|
|
11690
12077
|
hasProfile: !!config?.profile,
|
|
11691
12078
|
hasLifeLayer: !!config?.lifeLayer
|
|
@@ -11701,7 +12088,7 @@ var FaceCompositor = class {
|
|
|
11701
12088
|
* @returns Blendshapes (Float32Array[52] clamped [0,1]) and head rotation deltas
|
|
11702
12089
|
*/
|
|
11703
12090
|
compose(base, input, target) {
|
|
11704
|
-
const composeStart =
|
|
12091
|
+
const composeStart = getClock().now();
|
|
11705
12092
|
const out = target ?? this.outputBuffer;
|
|
11706
12093
|
out.set(base);
|
|
11707
12094
|
const emotion = input.emotion ?? this.stickyEmotion;
|
|
@@ -11748,7 +12135,7 @@ var FaceCompositor = class {
|
|
|
11748
12135
|
}
|
|
11749
12136
|
getTelemetry()?.recordHistogram(
|
|
11750
12137
|
MetricNames.COMPOSITOR_COMPOSE_LATENCY,
|
|
11751
|
-
(
|
|
12138
|
+
(getClock().now() - composeStart) * 1e3
|
|
11752
12139
|
// µs
|
|
11753
12140
|
);
|
|
11754
12141
|
return { blendshapes: out, headDelta: lifeResult.headDelta };
|
|
@@ -11758,7 +12145,7 @@ var FaceCompositor = class {
|
|
|
11758
12145
|
*/
|
|
11759
12146
|
setEmotion(weights) {
|
|
11760
12147
|
this.stickyEmotion = weights;
|
|
11761
|
-
|
|
12148
|
+
logger40.debug("setEmotion", { weights });
|
|
11762
12149
|
}
|
|
11763
12150
|
/**
|
|
11764
12151
|
* Update character profile at runtime.
|
|
@@ -11767,7 +12154,7 @@ var FaceCompositor = class {
|
|
|
11767
12154
|
this.multiplier.fill(1);
|
|
11768
12155
|
this.offset.fill(0);
|
|
11769
12156
|
this.applyProfileArrays(profile);
|
|
11770
|
-
|
|
12157
|
+
logger40.debug("setProfile", {
|
|
11771
12158
|
multiplierKeys: profile.multiplier ? Object.keys(profile.multiplier).length : 0,
|
|
11772
12159
|
offsetKeys: profile.offset ? Object.keys(profile.offset).length : 0
|
|
11773
12160
|
});
|
|
@@ -11781,7 +12168,7 @@ var FaceCompositor = class {
|
|
|
11781
12168
|
this.lifeBuffer.fill(0);
|
|
11782
12169
|
this.stickyEmotion = void 0;
|
|
11783
12170
|
this.lifeLayer.reset();
|
|
11784
|
-
|
|
12171
|
+
logger40.debug("reset");
|
|
11785
12172
|
}
|
|
11786
12173
|
/** Expand partial profile maps into dense Float32Arrays */
|
|
11787
12174
|
applyProfileArrays(profile) {
|
|
@@ -11866,7 +12253,7 @@ function parseEmotionTags(text) {
|
|
|
11866
12253
|
}
|
|
11867
12254
|
|
|
11868
12255
|
// src/character/CharacterController.ts
|
|
11869
|
-
var
|
|
12256
|
+
var logger41 = createLogger("CharacterController");
|
|
11870
12257
|
var FRAME_BUDGET_US = 33e3;
|
|
11871
12258
|
var EMOTION_MAP = {
|
|
11872
12259
|
// Synced with EmotionPresets (packages/core/src/emotion/Emotion.ts)
|
|
@@ -11936,7 +12323,7 @@ var CharacterController = class {
|
|
|
11936
12323
|
this.gazeYawInfluence = config?.gaze?.yawInfluence ?? 0.4;
|
|
11937
12324
|
this.gazePitchInfluence = config?.gaze?.pitchInfluence ?? 0.3;
|
|
11938
12325
|
this.gazeSmoothing = config?.gaze?.smoothing ?? 5;
|
|
11939
|
-
|
|
12326
|
+
logger41.debug("constructor", {
|
|
11940
12327
|
gazeEnabled: this.gazeEnabled,
|
|
11941
12328
|
gazeYawInfluence: this.gazeYawInfluence,
|
|
11942
12329
|
gazePitchInfluence: this.gazePitchInfluence,
|
|
@@ -11951,7 +12338,7 @@ var CharacterController = class {
|
|
|
11951
12338
|
* into a single output frame.
|
|
11952
12339
|
*/
|
|
11953
12340
|
update(input) {
|
|
11954
|
-
const frameStart =
|
|
12341
|
+
const frameStart = getClock().now();
|
|
11955
12342
|
const base = input.baseBlendshapes ?? this.zeroBase;
|
|
11956
12343
|
const eyeTargets = this.computeEyeTargets(
|
|
11957
12344
|
input.cameraWorldPos,
|
|
@@ -11978,7 +12365,7 @@ var CharacterController = class {
|
|
|
11978
12365
|
lifeHeadDelta,
|
|
11979
12366
|
input.avatarRotationY ?? 0
|
|
11980
12367
|
);
|
|
11981
|
-
const frameUs = (
|
|
12368
|
+
const frameUs = (getClock().now() - frameStart) * 1e3;
|
|
11982
12369
|
this.frameTimes[this.frameTimeIdx] = frameUs;
|
|
11983
12370
|
this.frameTimeIdx = (this.frameTimeIdx + 1) % this.frameTimes.length;
|
|
11984
12371
|
if (this.frameTimeFill < this.frameTimes.length) this.frameTimeFill++;
|
|
@@ -12000,13 +12387,13 @@ var CharacterController = class {
|
|
|
12000
12387
|
const resolved = resolveEmotion(emotion);
|
|
12001
12388
|
if (resolved) {
|
|
12002
12389
|
this._compositor.setEmotion(resolved);
|
|
12003
|
-
|
|
12390
|
+
logger41.debug("setEmotion", { emotion, resolved });
|
|
12004
12391
|
}
|
|
12005
12392
|
}
|
|
12006
12393
|
/** Update character profile at runtime. */
|
|
12007
12394
|
setProfile(profile) {
|
|
12008
12395
|
this._compositor.setProfile(profile);
|
|
12009
|
-
|
|
12396
|
+
logger41.debug("setProfile", {
|
|
12010
12397
|
multiplierKeys: profile.multiplier ? Object.keys(profile.multiplier).length : 0,
|
|
12011
12398
|
offsetKeys: profile.offset ? Object.keys(profile.offset).length : 0
|
|
12012
12399
|
});
|
|
@@ -12041,11 +12428,11 @@ var CharacterController = class {
|
|
|
12041
12428
|
this._compositor.reset();
|
|
12042
12429
|
this.gazeHeadYaw = 0;
|
|
12043
12430
|
this.gazeHeadPitch = -0.1;
|
|
12044
|
-
|
|
12431
|
+
logger41.debug("reset");
|
|
12045
12432
|
}
|
|
12046
12433
|
dispose() {
|
|
12047
12434
|
this.reset();
|
|
12048
|
-
|
|
12435
|
+
logger41.debug("dispose");
|
|
12049
12436
|
}
|
|
12050
12437
|
// ---------------------------------------------------------------------------
|
|
12051
12438
|
// Eye angle math (extracted from r3f useGazeTracking.computeEyeTargets)
|
|
@@ -12127,7 +12514,7 @@ var CharacterController = class {
|
|
|
12127
12514
|
};
|
|
12128
12515
|
|
|
12129
12516
|
// src/orchestration/MicLipSync.ts
|
|
12130
|
-
var
|
|
12517
|
+
var logger42 = createLogger("MicLipSync");
|
|
12131
12518
|
var MicLipSync = class extends EventEmitter {
|
|
12132
12519
|
constructor(config) {
|
|
12133
12520
|
super();
|
|
@@ -12146,7 +12533,7 @@ var MicLipSync = class extends EventEmitter {
|
|
|
12146
12533
|
this.vadChunkSize = 0;
|
|
12147
12534
|
this.vadBuffer = null;
|
|
12148
12535
|
this.vadBufferOffset = 0;
|
|
12149
|
-
|
|
12536
|
+
logger42.info("MicLipSync created", {
|
|
12150
12537
|
sampleRate: config.sampleRate ?? 16e3,
|
|
12151
12538
|
micChunkSize: config.micChunkSize ?? 512,
|
|
12152
12539
|
hasVAD: !!config.vad,
|
|
@@ -12168,12 +12555,12 @@ var MicLipSync = class extends EventEmitter {
|
|
|
12168
12555
|
this._currentFrame = scaled;
|
|
12169
12556
|
if (!this._firstFrameEmitted) {
|
|
12170
12557
|
this._firstFrameEmitted = true;
|
|
12171
|
-
|
|
12558
|
+
logger42.trace("First blendshape frame emitted");
|
|
12172
12559
|
}
|
|
12173
12560
|
this.emit("frame", { blendshapes: scaled, rawBlendshapes: raw });
|
|
12174
12561
|
},
|
|
12175
12562
|
onError: (error) => {
|
|
12176
|
-
|
|
12563
|
+
logger42.error("A2E inference error", { message: error.message });
|
|
12177
12564
|
this.emit("error", error);
|
|
12178
12565
|
}
|
|
12179
12566
|
});
|
|
@@ -12181,7 +12568,9 @@ var MicLipSync = class extends EventEmitter {
|
|
|
12181
12568
|
const float32 = int16ToFloat32(pcm);
|
|
12182
12569
|
this.processor.pushAudio(float32);
|
|
12183
12570
|
if (this.vad) {
|
|
12184
|
-
this.vadQueue = this.vadQueue.then(() => this.processVAD(float32)).catch(() => {
|
|
12571
|
+
this.vadQueue = this.vadQueue.then(() => this.processVAD(float32)).catch((err) => {
|
|
12572
|
+
logger42.warn("VAD processing error", { error: String(err), code: ErrorCodes.SPH_VAD_ERROR });
|
|
12573
|
+
this.emit("error", err instanceof Error ? err : new Error(String(err)));
|
|
12185
12574
|
});
|
|
12186
12575
|
}
|
|
12187
12576
|
});
|
|
@@ -12216,7 +12605,7 @@ var MicLipSync = class extends EventEmitter {
|
|
|
12216
12605
|
/** Start microphone capture and inference loop */
|
|
12217
12606
|
async start() {
|
|
12218
12607
|
if (this._state === "active") return;
|
|
12219
|
-
|
|
12608
|
+
logger42.info("Starting MicLipSync");
|
|
12220
12609
|
getTelemetry()?.incrementCounter(MetricNames.MIC_SESSIONS);
|
|
12221
12610
|
await this.mic.start();
|
|
12222
12611
|
this.processor.startDrip();
|
|
@@ -12226,7 +12615,7 @@ var MicLipSync = class extends EventEmitter {
|
|
|
12226
12615
|
/** Stop microphone and inference */
|
|
12227
12616
|
stop() {
|
|
12228
12617
|
if (this._state === "idle") return;
|
|
12229
|
-
|
|
12618
|
+
logger42.info("Stopping MicLipSync");
|
|
12230
12619
|
this.processor.stopDrip();
|
|
12231
12620
|
this.mic.stop();
|
|
12232
12621
|
this._isSpeaking = false;
|
|
@@ -12268,14 +12657,15 @@ var MicLipSync = class extends EventEmitter {
|
|
|
12268
12657
|
const wasSpeaking = this._isSpeaking;
|
|
12269
12658
|
this._isSpeaking = result.isSpeech;
|
|
12270
12659
|
if (!wasSpeaking && result.isSpeech) {
|
|
12271
|
-
this.speechStartTime =
|
|
12660
|
+
this.speechStartTime = getClock().now();
|
|
12272
12661
|
this.emit("speech:start");
|
|
12273
12662
|
} else if (wasSpeaking && !result.isSpeech) {
|
|
12274
|
-
const durationMs =
|
|
12663
|
+
const durationMs = getClock().now() - this.speechStartTime;
|
|
12275
12664
|
this.emit("speech:end", { durationMs });
|
|
12276
12665
|
}
|
|
12277
12666
|
} catch (err) {
|
|
12278
|
-
|
|
12667
|
+
logger42.warn("VAD process error", { error: String(err), code: ErrorCodes.SPH_VAD_ERROR });
|
|
12668
|
+
this.emit("error", err instanceof Error ? err : new Error(String(err)));
|
|
12279
12669
|
}
|
|
12280
12670
|
this.vadBufferOffset = 0;
|
|
12281
12671
|
}
|
|
@@ -12292,7 +12682,7 @@ var MicLipSync = class extends EventEmitter {
|
|
|
12292
12682
|
};
|
|
12293
12683
|
|
|
12294
12684
|
// src/orchestration/VoicePipeline.ts
|
|
12295
|
-
var
|
|
12685
|
+
var logger43 = createLogger("VoicePipeline");
|
|
12296
12686
|
var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
12297
12687
|
constructor(config) {
|
|
12298
12688
|
super();
|
|
@@ -12324,6 +12714,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
12324
12714
|
this.lastProgressiveSamples = 0;
|
|
12325
12715
|
// ASR error recovery
|
|
12326
12716
|
this.asrErrorCount = 0;
|
|
12717
|
+
this.progressiveErrorCount = 0;
|
|
12327
12718
|
// Response abort
|
|
12328
12719
|
this.responseAbortController = null;
|
|
12329
12720
|
// Listener cleanup
|
|
@@ -12367,7 +12758,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
12367
12758
|
if (typeof requestAnimationFrame !== "undefined") {
|
|
12368
12759
|
await new Promise((r) => requestAnimationFrame(() => r()));
|
|
12369
12760
|
}
|
|
12370
|
-
|
|
12761
|
+
logger43.debug("Creating PlaybackPipeline", {
|
|
12371
12762
|
neutralTransitionEnabled: this.config.neutralTransitionEnabled ?? true,
|
|
12372
12763
|
audioDelayMs: this.config.audioDelayMs,
|
|
12373
12764
|
chunkTargetMs: this.config.chunkTargetMs
|
|
@@ -12407,8 +12798,9 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
12407
12798
|
this.setState("ready");
|
|
12408
12799
|
} catch (error) {
|
|
12409
12800
|
const err = error instanceof Error ? error : new Error(String(error));
|
|
12801
|
+
span?.setAttributes({ "error.type": ErrorTypes.MODEL });
|
|
12410
12802
|
span?.endWithError(err);
|
|
12411
|
-
|
|
12803
|
+
logger43.error("Model loading failed", { message: err.message });
|
|
12412
12804
|
this.emit("error", err);
|
|
12413
12805
|
this.setState("error");
|
|
12414
12806
|
throw err;
|
|
@@ -12422,7 +12814,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
12422
12814
|
const { backends } = this.config;
|
|
12423
12815
|
if (!backends) throw new Error("No backends config");
|
|
12424
12816
|
this.emitProgress("Loading models", 0, 1, 0);
|
|
12425
|
-
|
|
12817
|
+
logger43.info("Loading from pre-built backends");
|
|
12426
12818
|
const toLoad = [];
|
|
12427
12819
|
if (!backends.asr.isLoaded) toLoad.push(backends.asr.load());
|
|
12428
12820
|
if (!backends.lam.isLoaded) toLoad.push(backends.lam.load());
|
|
@@ -12456,7 +12848,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
12456
12848
|
} else if (UnifiedInferenceWorker.isSupported()) {
|
|
12457
12849
|
this.unifiedWorker = new UnifiedInferenceWorker();
|
|
12458
12850
|
await this.unifiedWorker.init();
|
|
12459
|
-
|
|
12851
|
+
logger43.info("Created internal unified worker", { backend: this.unifiedWorker.backend });
|
|
12460
12852
|
}
|
|
12461
12853
|
}
|
|
12462
12854
|
this.emitProgress("Loading models", 0, 3, 0);
|
|
@@ -12493,17 +12885,17 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
12493
12885
|
throw asrResult.reason;
|
|
12494
12886
|
}
|
|
12495
12887
|
this.asr = asr;
|
|
12496
|
-
|
|
12888
|
+
logger43.info("SenseVoice loaded");
|
|
12497
12889
|
if (vadResult.status === "rejected") {
|
|
12498
12890
|
throw vadResult.reason;
|
|
12499
12891
|
}
|
|
12500
12892
|
this.vad = vad;
|
|
12501
|
-
|
|
12893
|
+
logger43.info("Silero VAD loaded");
|
|
12502
12894
|
if (lamResult.status === "rejected") {
|
|
12503
12895
|
throw lamResult.reason;
|
|
12504
12896
|
}
|
|
12505
12897
|
this.lam = lam;
|
|
12506
|
-
|
|
12898
|
+
logger43.info("LAM loaded");
|
|
12507
12899
|
} finally {
|
|
12508
12900
|
clearInterval(progressInterval);
|
|
12509
12901
|
}
|
|
@@ -12511,7 +12903,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
12511
12903
|
if (this.isLocalMode) {
|
|
12512
12904
|
const localConfig = this.config;
|
|
12513
12905
|
if (localConfig.ttsConfig && !localConfig.tts) {
|
|
12514
|
-
|
|
12906
|
+
logger43.info("Creating Kokoro TTS from config", {
|
|
12515
12907
|
hasUnifiedWorker: !!this.unifiedWorker,
|
|
12516
12908
|
voice: localConfig.ttsConfig.defaultVoice
|
|
12517
12909
|
});
|
|
@@ -12521,7 +12913,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
12521
12913
|
});
|
|
12522
12914
|
}
|
|
12523
12915
|
if (localConfig.tts && !localConfig.ttsConfig && isIOS()) {
|
|
12524
|
-
|
|
12916
|
+
logger43.warn(
|
|
12525
12917
|
"External TTS on iOS creates a separate ORT WASM instance, risking OOM. Prefer ttsConfig for automatic unified worker integration."
|
|
12526
12918
|
);
|
|
12527
12919
|
}
|
|
@@ -12529,9 +12921,9 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
12529
12921
|
throw new Error("VoicePipeline local mode requires either tts or ttsConfig");
|
|
12530
12922
|
}
|
|
12531
12923
|
if (!localConfig.tts.isLoaded) {
|
|
12532
|
-
|
|
12924
|
+
logger43.info("Loading local TTS model...");
|
|
12533
12925
|
await localConfig.tts.load();
|
|
12534
|
-
|
|
12926
|
+
logger43.info("Local TTS model loaded");
|
|
12535
12927
|
}
|
|
12536
12928
|
}
|
|
12537
12929
|
this.emitProgress("Loading models", 100, 3, 3);
|
|
@@ -12547,8 +12939,8 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
12547
12939
|
this.epoch++;
|
|
12548
12940
|
this._sessionId = crypto.randomUUID();
|
|
12549
12941
|
this.asrErrorCount = 0;
|
|
12550
|
-
|
|
12551
|
-
|
|
12942
|
+
logger43.info("Starting voice pipeline", { sessionId: this._sessionId });
|
|
12943
|
+
logger43.debug("Pipeline mode", { mode: this.isLocalMode ? "local" : "cloud" });
|
|
12552
12944
|
this.mic = new MicrophoneCapture(this.omoteEvents, {
|
|
12553
12945
|
sampleRate: 16e3,
|
|
12554
12946
|
chunkSize: 512
|
|
@@ -12561,11 +12953,11 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
12561
12953
|
this.emit("audio:level", level);
|
|
12562
12954
|
});
|
|
12563
12955
|
await this.mic.start();
|
|
12564
|
-
|
|
12956
|
+
logger43.debug("Microphone started");
|
|
12565
12957
|
this.setState("listening");
|
|
12566
12958
|
}
|
|
12567
12959
|
stop() {
|
|
12568
|
-
|
|
12960
|
+
logger43.info("Stopping voice pipeline", { sessionId: this._sessionId });
|
|
12569
12961
|
this.stopped = true;
|
|
12570
12962
|
this.epoch++;
|
|
12571
12963
|
this.clearSilenceTimer();
|
|
@@ -12594,7 +12986,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
12594
12986
|
this.playback?.setProfile(profile);
|
|
12595
12987
|
}
|
|
12596
12988
|
async dispose() {
|
|
12597
|
-
|
|
12989
|
+
logger43.debug("Disposing VoicePipeline");
|
|
12598
12990
|
this.stop();
|
|
12599
12991
|
this.epoch++;
|
|
12600
12992
|
await Promise.allSettled([
|
|
@@ -12628,19 +13020,20 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
12628
13020
|
if (result.isSpeech) {
|
|
12629
13021
|
if (!wasSpeaking) {
|
|
12630
13022
|
this.isSpeaking = true;
|
|
12631
|
-
this.speechStartTime =
|
|
13023
|
+
this.speechStartTime = getClock().now();
|
|
12632
13024
|
this.audioBuffer = [];
|
|
12633
13025
|
this.audioBufferSamples = 0;
|
|
12634
13026
|
this.lastProgressiveResult = null;
|
|
12635
13027
|
this.lastProgressiveSamples = 0;
|
|
12636
|
-
|
|
13028
|
+
this.progressiveErrorCount = 0;
|
|
13029
|
+
logger43.debug("VAD speech start");
|
|
12637
13030
|
this.emit("speech:start");
|
|
12638
13031
|
this.startProgressiveTranscription();
|
|
12639
13032
|
}
|
|
12640
13033
|
this.audioBuffer.push(new Float32Array(samples));
|
|
12641
13034
|
this.audioBufferSamples += samples.length;
|
|
12642
13035
|
if (this.audioBufferSamples >= _VoicePipeline.MAX_AUDIO_BUFFER_SAMPLES) {
|
|
12643
|
-
|
|
13036
|
+
logger43.warn("Audio buffer exceeded max, forcing transcription flush");
|
|
12644
13037
|
this.onSilenceDetected();
|
|
12645
13038
|
return;
|
|
12646
13039
|
}
|
|
@@ -12656,7 +13049,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
12656
13049
|
}
|
|
12657
13050
|
}
|
|
12658
13051
|
} catch (err) {
|
|
12659
|
-
|
|
13052
|
+
logger43.warn("VAD error", { error: String(err) });
|
|
12660
13053
|
}
|
|
12661
13054
|
}
|
|
12662
13055
|
// ---------------------------------------------------------------------------
|
|
@@ -12667,18 +13060,18 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
12667
13060
|
const extended = this.config.silenceTimeoutExtendedMs ?? 700;
|
|
12668
13061
|
const adaptive = this.config.adaptiveTimeout ?? true;
|
|
12669
13062
|
if (!adaptive) return base;
|
|
12670
|
-
const speechDurationMs =
|
|
13063
|
+
const speechDurationMs = getClock().now() - this.speechStartTime;
|
|
12671
13064
|
return speechDurationMs > 3e3 ? extended : base;
|
|
12672
13065
|
}
|
|
12673
13066
|
onSilenceDetected() {
|
|
12674
13067
|
const capturedEpoch = this.epoch;
|
|
12675
13068
|
this.isSpeaking = false;
|
|
12676
|
-
const durationMs =
|
|
12677
|
-
|
|
13069
|
+
const durationMs = getClock().now() - this.speechStartTime;
|
|
13070
|
+
logger43.debug("VAD speech end", { durationMs: Math.round(durationMs) });
|
|
12678
13071
|
this.emit("speech:end", { durationMs });
|
|
12679
13072
|
this.clearSilenceTimer();
|
|
12680
13073
|
this.processEndOfSpeech(capturedEpoch).catch((err) => {
|
|
12681
|
-
|
|
13074
|
+
logger43.error("End of speech processing failed", { error: String(err) });
|
|
12682
13075
|
if (this.epoch === capturedEpoch && !this.stopped) {
|
|
12683
13076
|
this.emit("error", err instanceof Error ? err : new Error(String(err)));
|
|
12684
13077
|
this.setState("listening");
|
|
@@ -12692,7 +13085,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
12692
13085
|
const turnSpan = getTelemetry()?.startSpan("VoicePipeline.turn", {
|
|
12693
13086
|
"session.id": this._sessionId ?? ""
|
|
12694
13087
|
});
|
|
12695
|
-
const turnStart =
|
|
13088
|
+
const turnStart = getClock().now();
|
|
12696
13089
|
if (this.progressivePromise) {
|
|
12697
13090
|
try {
|
|
12698
13091
|
await this.progressivePromise;
|
|
@@ -12717,7 +13110,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
12717
13110
|
const minEnergy = this.config.minAudioEnergy ?? 0.02;
|
|
12718
13111
|
const durationSec = totalSamples / 16e3;
|
|
12719
13112
|
if (durationSec < minDuration) {
|
|
12720
|
-
|
|
13113
|
+
logger43.info("Audio too short, discarding", { durationSec });
|
|
12721
13114
|
turnSpan?.end();
|
|
12722
13115
|
this.setState("listening");
|
|
12723
13116
|
return;
|
|
@@ -12728,7 +13121,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
12728
13121
|
}
|
|
12729
13122
|
rms = Math.sqrt(rms / fullAudio.length);
|
|
12730
13123
|
if (rms < minEnergy) {
|
|
12731
|
-
|
|
13124
|
+
logger43.info("Audio too quiet, discarding", { rms });
|
|
12732
13125
|
turnSpan?.end();
|
|
12733
13126
|
this.setState("listening");
|
|
12734
13127
|
return;
|
|
@@ -12739,12 +13132,12 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
12739
13132
|
const coverageThreshold = this.config.progressiveCoverageThreshold ?? 0.8;
|
|
12740
13133
|
if (this.lastProgressiveResult && this.lastProgressiveResult.text.trim().length > 0 && this.lastProgressiveSamples >= totalSamples * coverageThreshold) {
|
|
12741
13134
|
transcript = { ...this.lastProgressiveResult, isFinal: true };
|
|
12742
|
-
|
|
13135
|
+
logger43.info("Using progressive result", {
|
|
12743
13136
|
coverage: (this.lastProgressiveSamples / totalSamples).toFixed(2),
|
|
12744
13137
|
text: transcript.text
|
|
12745
13138
|
});
|
|
12746
13139
|
} else {
|
|
12747
|
-
|
|
13140
|
+
logger43.debug("Progressive result insufficient, running final transcription", {
|
|
12748
13141
|
samples: totalSamples,
|
|
12749
13142
|
hadProgressive: !!this.lastProgressiveResult
|
|
12750
13143
|
});
|
|
@@ -12759,7 +13152,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
12759
13152
|
return;
|
|
12760
13153
|
}
|
|
12761
13154
|
if (!transcript || !transcript.text.trim()) {
|
|
12762
|
-
|
|
13155
|
+
logger43.info("No transcript, resuming listening");
|
|
12763
13156
|
turnSpan?.end();
|
|
12764
13157
|
this.setState("listening");
|
|
12765
13158
|
return;
|
|
@@ -12767,7 +13160,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
12767
13160
|
this.emit("transcript", transcript);
|
|
12768
13161
|
getTelemetry()?.recordHistogram(
|
|
12769
13162
|
MetricNames.VOICE_TURN_LATENCY,
|
|
12770
|
-
|
|
13163
|
+
getClock().now() - turnStart,
|
|
12771
13164
|
{ mode: this.isLocalMode ? "local" : "cloud" }
|
|
12772
13165
|
);
|
|
12773
13166
|
await this.callResponseHandler(transcript, capturedEpoch, turnSpan?.getContext());
|
|
@@ -12781,7 +13174,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
12781
13174
|
const span = getTelemetry()?.startSpan("VoicePipeline.response", {
|
|
12782
13175
|
"text.length": transcript.text.length
|
|
12783
13176
|
}, parentContext);
|
|
12784
|
-
const responseStart =
|
|
13177
|
+
const responseStart = getClock().now();
|
|
12785
13178
|
this.setState("speaking");
|
|
12786
13179
|
this.interruption?.setAISpeaking(true);
|
|
12787
13180
|
if (transcript.emotion) {
|
|
@@ -12798,7 +13191,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
12798
13191
|
}
|
|
12799
13192
|
getTelemetry()?.recordHistogram(
|
|
12800
13193
|
MetricNames.VOICE_RESPONSE_LATENCY,
|
|
12801
|
-
|
|
13194
|
+
getClock().now() - responseStart,
|
|
12802
13195
|
{ mode: this.isLocalMode ? "local" : "cloud" }
|
|
12803
13196
|
);
|
|
12804
13197
|
span?.end();
|
|
@@ -12808,8 +13201,9 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
12808
13201
|
return;
|
|
12809
13202
|
}
|
|
12810
13203
|
const err = error instanceof Error ? error : new Error(String(error));
|
|
13204
|
+
span?.setAttributes({ "error.type": ErrorTypes.RUNTIME });
|
|
12811
13205
|
span?.endWithError(err);
|
|
12812
|
-
|
|
13206
|
+
logger43.error("Response handler error", { message: err.message });
|
|
12813
13207
|
this.emit("error", err);
|
|
12814
13208
|
if (this.epoch === capturedEpoch && !this.stopped) {
|
|
12815
13209
|
this.interruption?.setAISpeaking(false);
|
|
@@ -12880,11 +13274,11 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
12880
13274
|
// ---------------------------------------------------------------------------
|
|
12881
13275
|
handleInterruption() {
|
|
12882
13276
|
if (this._state !== "speaking") return;
|
|
12883
|
-
|
|
13277
|
+
logger43.info("Interruption triggered");
|
|
12884
13278
|
getTelemetry()?.incrementCounter(MetricNames.VOICE_INTERRUPTIONS);
|
|
12885
13279
|
this.epoch++;
|
|
12886
13280
|
if (this.responseAbortController) {
|
|
12887
|
-
|
|
13281
|
+
logger43.debug("Aborting in-flight response");
|
|
12888
13282
|
}
|
|
12889
13283
|
this.responseAbortController?.abort();
|
|
12890
13284
|
this.playback?.stop();
|
|
@@ -12922,7 +13316,15 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
12922
13316
|
this.lastProgressiveSamples = snapshotSamples;
|
|
12923
13317
|
this.emit("transcript", { ...result, isFinal: false });
|
|
12924
13318
|
}
|
|
12925
|
-
} catch {
|
|
13319
|
+
} catch (err) {
|
|
13320
|
+
this.progressiveErrorCount++;
|
|
13321
|
+
if (this.progressiveErrorCount % 10 === 1) {
|
|
13322
|
+
logger43.warn("Progressive transcription error", {
|
|
13323
|
+
code: ErrorCodes.SPH_ASR_ERROR,
|
|
13324
|
+
count: this.progressiveErrorCount,
|
|
13325
|
+
error: String(err)
|
|
13326
|
+
});
|
|
13327
|
+
}
|
|
12926
13328
|
}
|
|
12927
13329
|
})();
|
|
12928
13330
|
}, intervalMs);
|
|
@@ -12938,8 +13340,9 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
12938
13340
|
// ---------------------------------------------------------------------------
|
|
12939
13341
|
async transcribeWithTimeout(audio) {
|
|
12940
13342
|
if (!this.asr) return null;
|
|
13343
|
+
const currentEpoch = this.epoch;
|
|
12941
13344
|
const timeoutMs = this.config.transcriptionTimeoutMs ?? 1e4;
|
|
12942
|
-
const startTime =
|
|
13345
|
+
const startTime = getClock().now();
|
|
12943
13346
|
const span = getTelemetry()?.startSpan("VoicePipeline.transcribe", {
|
|
12944
13347
|
"inference.input_samples": audio.length,
|
|
12945
13348
|
"inference.input_duration_ms": audio.length / 16e3 * 1e3
|
|
@@ -12953,7 +13356,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
12953
13356
|
})
|
|
12954
13357
|
]);
|
|
12955
13358
|
clearTimeout(timeoutId);
|
|
12956
|
-
const latency =
|
|
13359
|
+
const latency = getClock().now() - startTime;
|
|
12957
13360
|
this.asrErrorCount = 0;
|
|
12958
13361
|
getTelemetry()?.recordHistogram(MetricNames.VOICE_TRANSCRIPTION_LATENCY, latency);
|
|
12959
13362
|
getTelemetry()?.incrementCounter(MetricNames.VOICE_TRANSCRIPTIONS);
|
|
@@ -12967,14 +13370,18 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
12967
13370
|
inferenceTimeMs: latency
|
|
12968
13371
|
};
|
|
12969
13372
|
} catch (error) {
|
|
13373
|
+
span?.setAttributes({ "error.type": ErrorTypes.INFERENCE });
|
|
12970
13374
|
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
12971
13375
|
this.asrErrorCount++;
|
|
12972
|
-
|
|
13376
|
+
logger43.warn("Transcription failed", {
|
|
12973
13377
|
attempt: this.asrErrorCount,
|
|
12974
13378
|
error: String(error)
|
|
12975
13379
|
});
|
|
12976
13380
|
if (this.asrErrorCount >= 3 && this.config.models) {
|
|
12977
|
-
|
|
13381
|
+
if (this.epoch !== currentEpoch) return null;
|
|
13382
|
+
logger43.warn("3 consecutive ASR errors, recreating session", {
|
|
13383
|
+
code: ErrorCodes.SPH_ASR_ERROR
|
|
13384
|
+
});
|
|
12978
13385
|
try {
|
|
12979
13386
|
await this.asr.dispose();
|
|
12980
13387
|
this.asr = createSenseVoice({
|
|
@@ -12984,9 +13391,10 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
12984
13391
|
unifiedWorker: this.unifiedWorker ?? void 0
|
|
12985
13392
|
});
|
|
12986
13393
|
await this.asr.load();
|
|
13394
|
+
if (this.epoch !== currentEpoch) return null;
|
|
12987
13395
|
this.asrErrorCount = 0;
|
|
12988
13396
|
} catch (recreateErr) {
|
|
12989
|
-
|
|
13397
|
+
logger43.error("ASR session recreation failed", { error: String(recreateErr) });
|
|
12990
13398
|
}
|
|
12991
13399
|
}
|
|
12992
13400
|
return null;
|
|
@@ -13015,7 +13423,7 @@ var _VoicePipeline = class _VoicePipeline extends EventEmitter {
|
|
|
13015
13423
|
// ---------------------------------------------------------------------------
|
|
13016
13424
|
setState(state) {
|
|
13017
13425
|
if (this._state === state) return;
|
|
13018
|
-
|
|
13426
|
+
logger43.info("State transition", { from: this._state, to: state });
|
|
13019
13427
|
this._state = state;
|
|
13020
13428
|
this.emit("state", state);
|
|
13021
13429
|
}
|
|
@@ -13034,7 +13442,7 @@ _VoicePipeline.MAX_AUDIO_BUFFER_SAMPLES = 16e3 * 30;
|
|
|
13034
13442
|
var VoicePipeline = _VoicePipeline;
|
|
13035
13443
|
|
|
13036
13444
|
// src/orchestration/VoiceOrchestrator.ts
|
|
13037
|
-
var
|
|
13445
|
+
var logger44 = createLogger("VoiceOrchestrator");
|
|
13038
13446
|
var VoiceOrchestrator = class extends EventEmitter {
|
|
13039
13447
|
constructor() {
|
|
13040
13448
|
super(...arguments);
|
|
@@ -13084,7 +13492,7 @@ var VoiceOrchestrator = class extends EventEmitter {
|
|
|
13084
13492
|
const epoch = ++this.connectEpoch;
|
|
13085
13493
|
this._mode = config.mode ?? "local";
|
|
13086
13494
|
this._sessionId = crypto.randomUUID();
|
|
13087
|
-
|
|
13495
|
+
logger44.info("Connecting voice orchestrator", { mode: this._mode });
|
|
13088
13496
|
if (this._mode === "local") {
|
|
13089
13497
|
const localCfg = config;
|
|
13090
13498
|
this.ttsSpeaker = new TTSSpeaker();
|
|
@@ -13137,7 +13545,7 @@ var VoiceOrchestrator = class extends EventEmitter {
|
|
|
13137
13545
|
} else {
|
|
13138
13546
|
this.wireCloudTranscript(config);
|
|
13139
13547
|
}
|
|
13140
|
-
|
|
13548
|
+
logger44.info("Voice orchestrator connected", { mode: this._mode });
|
|
13141
13549
|
}
|
|
13142
13550
|
async disconnect() {
|
|
13143
13551
|
this.connectEpoch++;
|
|
@@ -13251,7 +13659,7 @@ var VoiceOrchestrator = class extends EventEmitter {
|
|
|
13251
13659
|
await this.speak(text);
|
|
13252
13660
|
}
|
|
13253
13661
|
} catch (e) {
|
|
13254
|
-
|
|
13662
|
+
logger44.error("Voice transcript handler error", { error: String(e) });
|
|
13255
13663
|
} finally {
|
|
13256
13664
|
this.interruption?.setAISpeaking(false);
|
|
13257
13665
|
this.speechListener?.resume();
|
|
@@ -13292,7 +13700,7 @@ var VoiceOrchestrator = class extends EventEmitter {
|
|
|
13292
13700
|
});
|
|
13293
13701
|
} catch (e) {
|
|
13294
13702
|
if (!abortController.signal.aborted) {
|
|
13295
|
-
|
|
13703
|
+
logger44.error("Cloud response handler error", { error: String(e) });
|
|
13296
13704
|
}
|
|
13297
13705
|
} finally {
|
|
13298
13706
|
this.responseAbortController = null;
|
|
@@ -13306,7 +13714,7 @@ var VoiceOrchestrator = class extends EventEmitter {
|
|
|
13306
13714
|
// -------------------------------------------------------------------------
|
|
13307
13715
|
handleInterruption() {
|
|
13308
13716
|
if (this._state !== "speaking") return;
|
|
13309
|
-
|
|
13717
|
+
logger44.info("Interruption triggered");
|
|
13310
13718
|
this.stopSpeaking();
|
|
13311
13719
|
this.speechListener?.resume();
|
|
13312
13720
|
this.setState("listening");
|
|
@@ -13352,10 +13760,12 @@ export {
|
|
|
13352
13760
|
EMOTION_TO_AU,
|
|
13353
13761
|
EMOTION_VECTOR_SIZE,
|
|
13354
13762
|
EXPLICIT_EMOTION_COUNT,
|
|
13763
|
+
ElevenLabsTTSBackend,
|
|
13355
13764
|
EmotionController,
|
|
13356
13765
|
EmotionPresets,
|
|
13357
13766
|
EmotionResolver,
|
|
13358
13767
|
EmphasisDetector,
|
|
13768
|
+
ErrorCodes,
|
|
13359
13769
|
ErrorTypes,
|
|
13360
13770
|
EventEmitter,
|
|
13361
13771
|
FaceCompositor,
|
|
@@ -13379,6 +13789,7 @@ export {
|
|
|
13379
13789
|
PRESERVE_POSITION_BONES,
|
|
13380
13790
|
PROTOCOL_VERSION,
|
|
13381
13791
|
PlaybackPipeline,
|
|
13792
|
+
PollyTTSBackend,
|
|
13382
13793
|
ProceduralLifeLayer,
|
|
13383
13794
|
RingBuffer,
|
|
13384
13795
|
SafariSpeechRecognition,
|
|
@@ -13402,6 +13813,7 @@ export {
|
|
|
13402
13813
|
calculatePeak,
|
|
13403
13814
|
calculateRMS,
|
|
13404
13815
|
configureCacheLimit,
|
|
13816
|
+
configureClock,
|
|
13405
13817
|
configureLogging,
|
|
13406
13818
|
configureModelUrls,
|
|
13407
13819
|
configureOrtCdn,
|
|
@@ -13418,6 +13830,7 @@ export {
|
|
|
13418
13830
|
formatBytes,
|
|
13419
13831
|
getCacheConfig,
|
|
13420
13832
|
getCacheKey,
|
|
13833
|
+
getClock,
|
|
13421
13834
|
getEmotionPreset,
|
|
13422
13835
|
getLoggingConfig,
|
|
13423
13836
|
getModelCache,
|