@omote/core 0.10.5 → 0.10.6
This diff shows the published contents of the two package versions as they appear in their public registries, and is provided for informational purposes only.
- package/README.md +76 -34
- package/dist/chunk-3FILA2CD.mjs +785 -0
- package/dist/chunk-3FILA2CD.mjs.map +1 -0
- package/dist/chunk-5WIOGMJA.mjs +785 -0
- package/dist/chunk-5WIOGMJA.mjs.map +1 -0
- package/dist/chunk-NWZMIQK4.mjs +782 -0
- package/dist/chunk-NWZMIQK4.mjs.map +1 -0
- package/dist/chunk-WW4XAUJ3.mjs +208 -0
- package/dist/chunk-WW4XAUJ3.mjs.map +1 -0
- package/dist/index.d.mts +84 -79
- package/dist/index.d.ts +84 -79
- package/dist/index.js +514 -406
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +233 -199
- package/dist/index.mjs.map +1 -1
- package/dist/logging/index.js +5 -0
- package/dist/logging/index.js.map +1 -1
- package/dist/logging/index.mjs +1 -1
- package/dist/otlp-2BML6FIK.mjs +7 -0
- package/dist/otlp-2BML6FIK.mjs.map +1 -0
- package/package.json +1 -1
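Most of this release is observability work in package/dist/index.mjs: the MetricNames table and latency buckets now sit ahead of their first use, OTLPExporter is imported from its own chunk (chunk-WW4XAUJ3.mjs), the ErrorTypes export is dropped, and spans, counters, and histograms are threaded through the microphone, scheduler, inference, and orchestration paths. The sketch below is a minimal illustration of the recurring instrumentation shape, using only calls that appear in the diff; it assumes getTelemetry, getClock, and MetricNames are exported from the package root, and doInference is a hypothetical stand-in for a call like backend.infer().

// A minimal sketch of the span + histogram + counter pattern this diff
// applies in A2EProcessor, the unified adapters, and VoiceOrchestrator.
// getTelemetry, getClock, and MetricNames are assumed package-root exports;
// doInference is a hypothetical stand-in, not a real @omote/core API.
import { getTelemetry, getClock, MetricNames } from "@omote/core";

async function instrumentedInfer<T>(doInference: () => Promise<T>): Promise<T> {
  const span = getTelemetry()?.startSpan("Example.infer");
  const t0 = getClock().now();
  try {
    const result = await doInference();
    // Latency goes to a histogram, throughput to a counter (with attributes).
    getTelemetry()?.recordHistogram(MetricNames.INFERENCE_LATENCY, getClock().now() - t0);
    getTelemetry()?.incrementCounter(MetricNames.INFERENCE_TOTAL, 1, { status: "success" });
    span?.end();
    return result;
  } catch (err) {
    getTelemetry()?.incrementCounter(MetricNames.ERRORS_TOTAL, 1, { source: "Example" });
    span?.endWithError(err instanceof Error ? err : new Error(String(err)));
    throw err;
  }
}

Call sites in the diff below follow this shape, usually adding model and backend attributes identifying the specific adapter involved.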
package/dist/index.mjs
CHANGED
@@ -6,7 +6,6 @@ import {
   DEFAULT_LOGGING_CONFIG,
   ErrorCodes,
   LOG_LEVEL_PRIORITY,
-  OTLPExporter,
   OmoteTelemetry,
   configureClock,
   configureLogging,
@@ -19,7 +18,10 @@ import {
   resetLoggingConfig,
   setLogLevel,
   setLoggingEnabled
-} from "./chunk-
+} from "./chunk-NWZMIQK4.mjs";
+import {
+  OTLPExporter
+} from "./chunk-WW4XAUJ3.mjs";
 
 // src/audio/audioConvert.ts
 function float32ToPcm16(samples) {
@@ -74,6 +76,67 @@ function int16ToFloat32(int16) {
   return float32;
 }
 
+// src/telemetry/types.ts
+var MetricNames = {
+  // --- Inference ---
+  /** Histogram: Inference latency in ms */
+  INFERENCE_LATENCY: "omote.inference.latency",
+  /** Histogram: Model load time in ms */
+  MODEL_LOAD_TIME: "omote.model.load_time",
+  /** Counter: Total inference operations */
+  INFERENCE_TOTAL: "omote.inference.total",
+  /** Counter: Total errors */
+  ERRORS_TOTAL: "omote.errors.total",
+  /** Counter: Cache hits */
+  CACHE_HITS: "omote.cache.hits",
+  /** Counter: Cache misses */
+  CACHE_MISSES: "omote.cache.misses",
+  /** Counter: Cache stale (version/etag mismatch) */
+  CACHE_STALE: "omote.cache.stale",
+  /** Counter: Cache quota warning (>90% used) */
+  CACHE_QUOTA_WARNING: "omote.cache.quota_warning",
+  /** Counter: Cache eviction (LRU) */
+  CACHE_EVICTION: "omote.cache.eviction",
+  // --- Pipeline ---
+  /** Histogram: Voice turn latency (speech end → transcript ready, excludes playback) */
+  VOICE_TURN_LATENCY: "omote.voice.turn.latency",
+  /** Histogram: ASR transcription latency in ms */
+  VOICE_TRANSCRIPTION_LATENCY: "omote.voice.transcription.latency",
+  /** Histogram: Response handler latency in ms */
+  VOICE_RESPONSE_LATENCY: "omote.voice.response.latency",
+  /** Counter: Total transcriptions */
+  VOICE_TRANSCRIPTIONS: "omote.voice.transcriptions",
+  /** Counter: Total interruptions */
+  VOICE_INTERRUPTIONS: "omote.voice.interruptions",
+  // --- Playback ---
+  /** Histogram: PlaybackPipeline session duration in ms */
+  PLAYBACK_SESSION_DURATION: "omote.playback.session.duration",
+  /** Histogram: Audio chunk processing latency in ms */
+  PLAYBACK_CHUNK_LATENCY: "omote.playback.chunk.latency",
+  // --- TTS ---
+  /** Histogram: TTSSpeaker.connect() latency in ms */
+  TTS_CONNECT_LATENCY: "omote.tts.connect.latency",
+  /** Histogram: TTSSpeaker.speak() latency in ms */
+  TTS_SPEAK_LATENCY: "omote.tts.speak.latency",
+  /** Counter: TTSSpeaker.stop() aborted speak calls */
+  TTS_SPEAK_ABORTED: "omote.tts.speak.aborted",
+  // --- Mic ---
+  /** Counter: MicLipSync sessions started */
+  MIC_SESSIONS: "omote.mic.sessions",
+  // --- Frame budget ---
+  /** Histogram: CharacterController.update() latency in µs */
+  AVATAR_FRAME_LATENCY: "omote.avatar.frame.latency_us",
+  /** Histogram: FaceCompositor.compose() latency in µs */
+  COMPOSITOR_COMPOSE_LATENCY: "omote.compositor.compose.latency_us",
+  /** Counter: Frames exceeding budget threshold */
+  AVATAR_FRAME_DROPS: "omote.avatar.frame.drops",
+  // --- Audio scheduling ---
+  /** Counter: Audio scheduling gaps (playback fell behind) */
+  AUDIO_SCHEDULE_GAP: "omote.audio.schedule_gap"
+};
+var INFERENCE_LATENCY_BUCKETS = [1, 5, 10, 25, 50, 100, 250, 500, 1e3, 2500, 5e3];
+var MODEL_LOAD_TIME_BUCKETS = [100, 500, 1e3, 2500, 5e3, 1e4, 3e4, 6e4];
+
 // src/audio/MicrophoneCapture.ts
 var logger = createLogger("MicrophoneCapture");
 var MicrophoneCapture = class {
@@ -110,6 +173,7 @@ var MicrophoneCapture = class {
       return;
     }
     if (this._isRecording) return;
+    const span = getTelemetry()?.startSpan("MicrophoneCapture.start");
     try {
       this.stream = await navigator.mediaDevices.getUserMedia({
         audio: {
@@ -183,6 +247,8 @@ var MicrophoneCapture = class {
       source.connect(this.processor);
       this.processor.connect(this.context.destination);
      this._isRecording = true;
+      getTelemetry()?.incrementCounter(MetricNames.MIC_SESSIONS);
+      span?.end();
       logger.info("Started recording", {
         contextState: this.context.state,
         sampleRate: this.config.sampleRate,
@@ -203,6 +269,7 @@ var MicrophoneCapture = class {
         message: err.message,
         details: err
       });
+      span?.endWithError(err instanceof Error ? err : new Error(String(err)));
     }
   }
   stop() {
@@ -404,6 +471,7 @@ var AudioScheduler = class {
     if (scheduleTime < ctx.currentTime) {
       const gap = ctx.currentTime - scheduleTime;
       const gapMs = gap * 1e3;
+      getTelemetry()?.incrementCounter(MetricNames.AUDIO_SCHEDULE_GAP, 1, { gap_ms: Math.round(gapMs) });
       if (gap > 0.5) {
         logger2.error("Critical audio scheduling gap", {
           code: ErrorCodes.AUD_SCHEDULE_GAP,
@@ -981,6 +1049,8 @@ var _A2EProcessor = class _A2EProcessor {
      const t0 = getClock().now();
      const result = await this.backend.infer(chunk, this.identityIndex);
      const inferMs = Math.round(getClock().now() - t0);
+      getTelemetry()?.recordHistogram(MetricNames.INFERENCE_LATENCY, inferMs);
+      getTelemetry()?.incrementCounter(MetricNames.INFERENCE_TOTAL);
      const effectiveSamples = actualSamples ?? chunk.length;
      const actualDuration = effectiveSamples / this.sampleRate;
      const actualFrameCount = Math.ceil(actualDuration * FRAME_RATE);
@@ -1027,79 +1097,13 @@ var _A2EProcessor = class _A2EProcessor {
        error: error.message,
        code
      });
+      getTelemetry()?.incrementCounter(MetricNames.ERRORS_TOTAL, 1, { source: "A2EProcessor", code });
      this.onError?.(error);
    }
  };
 _A2EProcessor.MAX_PENDING_CHUNKS = 10;
 var A2EProcessor = _A2EProcessor;
 
-// src/telemetry/types.ts
-var MetricNames = {
-  // --- Inference ---
-  /** Histogram: Inference latency in ms */
-  INFERENCE_LATENCY: "omote.inference.latency",
-  /** Histogram: Model load time in ms */
-  MODEL_LOAD_TIME: "omote.model.load_time",
-  /** Counter: Total inference operations */
-  INFERENCE_TOTAL: "omote.inference.total",
-  /** Counter: Total errors */
-  ERRORS_TOTAL: "omote.errors.total",
-  /** Counter: Cache hits */
-  CACHE_HITS: "omote.cache.hits",
-  /** Counter: Cache misses */
-  CACHE_MISSES: "omote.cache.misses",
-  /** Counter: Cache stale (version/etag mismatch) */
-  CACHE_STALE: "omote.cache.stale",
-  /** Counter: Cache quota warning (>90% used) */
-  CACHE_QUOTA_WARNING: "omote.cache.quota_warning",
-  /** Counter: Cache eviction (LRU) */
-  CACHE_EVICTION: "omote.cache.eviction",
-  // --- Pipeline ---
-  /** Histogram: Voice turn latency (speech end → transcript ready, excludes playback) */
-  VOICE_TURN_LATENCY: "omote.voice.turn.latency",
-  /** Histogram: ASR transcription latency in ms */
-  VOICE_TRANSCRIPTION_LATENCY: "omote.voice.transcription.latency",
-  /** Histogram: Response handler latency in ms */
-  VOICE_RESPONSE_LATENCY: "omote.voice.response.latency",
-  /** Counter: Total transcriptions */
-  VOICE_TRANSCRIPTIONS: "omote.voice.transcriptions",
-  /** Counter: Total interruptions */
-  VOICE_INTERRUPTIONS: "omote.voice.interruptions",
-  // --- Playback ---
-  /** Histogram: PlaybackPipeline session duration in ms */
-  PLAYBACK_SESSION_DURATION: "omote.playback.session.duration",
-  /** Histogram: Audio chunk processing latency in ms */
-  PLAYBACK_CHUNK_LATENCY: "omote.playback.chunk.latency",
-  // --- TTS ---
-  /** Histogram: TTSSpeaker.connect() latency in ms */
-  TTS_CONNECT_LATENCY: "omote.tts.connect.latency",
-  /** Histogram: TTSSpeaker.speak() latency in ms */
-  TTS_SPEAK_LATENCY: "omote.tts.speak.latency",
-  /** Counter: TTSSpeaker.stop() aborted speak calls */
-  TTS_SPEAK_ABORTED: "omote.tts.speak.aborted",
-  // --- Mic ---
-  /** Counter: MicLipSync sessions started */
-  MIC_SESSIONS: "omote.mic.sessions",
-  // --- Frame budget ---
-  /** Histogram: CharacterController.update() latency in µs */
-  AVATAR_FRAME_LATENCY: "omote.avatar.frame.latency_us",
-  /** Histogram: FaceCompositor.compose() latency in µs */
-  COMPOSITOR_COMPOSE_LATENCY: "omote.compositor.compose.latency_us",
-  /** Counter: Frames exceeding budget threshold */
-  AVATAR_FRAME_DROPS: "omote.avatar.frame.drops"
-};
-var ErrorTypes = {
-  INFERENCE: "inference_error",
-  NETWORK: "network_error",
-  TIMEOUT: "timeout",
-  USER: "user_error",
-  RUNTIME: "runtime_error",
-  MEDIA: "media_error",
-  MODEL: "model_error"
-};
-var INFERENCE_LATENCY_BUCKETS = [1, 5, 10, 25, 50, 100, 250, 500, 1e3, 2500, 5e3];
-var MODEL_LOAD_TIME_BUCKETS = [100, 500, 1e3, 2500, 5e3, 1e4, 3e4, 6e4];
-
 // src/inference/blendshapeUtils.ts
 var ARKIT_BLENDSHAPES = [
   "browDownLeft",
@@ -3299,7 +3303,7 @@ var SenseVoiceUnifiedAdapter = class {
      });
      span?.setAttributes({ "model.backend": "wasm", "model.load_time_ms": result.loadTimeMs });
      span?.end();
-      telemetry?.recordHistogram(
+      telemetry?.recordHistogram(MetricNames.MODEL_LOAD_TIME, result.loadTimeMs, {
        model: "sensevoice-unified",
        backend: "wasm"
      });
@@ -3323,11 +3327,11 @@ var SenseVoiceUnifiedAdapter = class {
      try {
        const result = await this.worker.transcribe(audio);
        const latencyMs = getClock().now() - startTime;
-        telemetry?.recordHistogram(
+        telemetry?.recordHistogram(MetricNames.INFERENCE_LATENCY, latencyMs, {
          model: "sensevoice-unified",
          backend: "wasm"
        });
-        telemetry?.incrementCounter(
+        telemetry?.incrementCounter(MetricNames.INFERENCE_TOTAL, 1, {
          model: "sensevoice-unified",
          backend: "wasm",
          status: "success"
@@ -3336,7 +3340,7 @@ var SenseVoiceUnifiedAdapter = class {
        span?.end();
        resolve(result);
      } catch (err) {
-        telemetry?.incrementCounter(
+        telemetry?.incrementCounter(MetricNames.INFERENCE_TOTAL, 1, {
          model: "sensevoice-unified",
          backend: "wasm",
          status: "error"
@@ -3404,7 +3408,7 @@ var A2EUnifiedAdapter = class {
      });
      span?.setAttributes({ "model.backend": result.backend, "model.load_time_ms": result.loadTimeMs });
      span?.end();
-      telemetry?.recordHistogram(
+      telemetry?.recordHistogram(MetricNames.MODEL_LOAD_TIME, result.loadTimeMs, {
        model: "a2e-unified",
        backend: result.backend
      });
@@ -4773,14 +4777,14 @@ var KokoroTTSUnifiedAdapter = class {
      });
      span?.setAttributes({ "model.backend": this._backend, "model.load_time_ms": loadTimeMs });
      span?.end();
-      telemetry?.recordHistogram(
+      telemetry?.recordHistogram(MetricNames.MODEL_LOAD_TIME, loadTimeMs, {
        model: "kokoro-tts-unified",
        backend: this._backend
      });
      return { backend: this._backend, loadTimeMs, defaultVoice: this.config.defaultVoice };
    } catch (error) {
      span?.endWithError(error instanceof Error ? error : new Error(String(error)));
-      getTelemetry()?.incrementCounter(
+      getTelemetry()?.incrementCounter(MetricNames.ERRORS_TOTAL, 1, {
        model: "kokoro-tts-unified",
        error_type: "load_failed"
      });
@@ -4845,22 +4849,27 @@ var KokoroTTSUnifiedAdapter = class {
      try {
        const result = await this.worker.inferKokoro(tokens, style, speed);
        const latencyMs = getClock().now() - startTime;
-        telemetry?.recordHistogram(
+        telemetry?.recordHistogram(MetricNames.INFERENCE_LATENCY, latencyMs, {
          model: "kokoro-tts-unified",
          backend: this._backend
        });
-        telemetry?.incrementCounter(
+        telemetry?.incrementCounter(MetricNames.INFERENCE_TOTAL, 1, {
          model: "kokoro-tts-unified",
          backend: this._backend,
          status: "success"
        });
        resolve(result.audio);
      } catch (err) {
-        telemetry?.incrementCounter(
+        telemetry?.incrementCounter(MetricNames.INFERENCE_TOTAL, 1, {
          model: "kokoro-tts-unified",
          backend: this._backend,
          status: "error"
        });
+        const span = telemetry?.startSpan("KokoroTTSUnifiedAdapter.inferError", {
+          "model.name": "kokoro-tts-unified",
+          "model.backend": this._backend
+        });
+        span?.endWithError(err instanceof Error ? err : new Error(String(err)));
        reject(err);
      }
    });
@@ -4938,7 +4947,7 @@ var SileroVADUnifiedAdapter = class {
      });
      span?.setAttributes({ "model.backend": "wasm", "model.load_time_ms": result.loadTimeMs });
      span?.end();
-      telemetry?.recordHistogram(
+      telemetry?.recordHistogram(MetricNames.MODEL_LOAD_TIME, result.loadTimeMs, {
        model: "silero-vad-unified",
        backend: "wasm"
      });
@@ -5549,6 +5558,7 @@ function createKokoroTTS(config = {}) {
 }
 
 // src/audio/createTTSPlayer.ts
+var logger22 = createLogger("TTSPlayer");
 function createTTSPlayer(config) {
   return new TTSPlayer(config);
 }
@@ -5562,19 +5572,27 @@ var TTSPlayer = class extends TTSSpeaker {
   }
   /** Load TTS model and connect in audio-only mode. */
   async load() {
-
-
-    worker =
-
-
-
-
-
-
-
-
-
-
+    const span = getTelemetry()?.startSpan("TTSPlayer.load");
+    try {
+      let worker = this.ttsConfig.unifiedWorker;
+      if (!worker) {
+        worker = await acquireSharedWorker();
+        this.ttsPlayerUsesSharedWorker = true;
+      }
+      this.backend = createKokoroTTS({
+        defaultVoice: this.ttsConfig.voice,
+        modelUrl: this.ttsConfig.modelUrl,
+        voiceBaseUrl: this.ttsConfig.voiceBaseUrl,
+        unifiedWorker: worker
+      });
+      await this.backend.load();
+      await this.connect(this.backend, { audioOnly: true });
+      logger22.info("TTSPlayer loaded");
+      span?.end();
+    } catch (err) {
+      span?.endWithError(err instanceof Error ? err : new Error(String(err)));
+      throw err;
+    }
   }
   /** Whether the TTS model is loaded and ready. */
   get isLoaded() {
@@ -5593,7 +5611,7 @@ var TTSPlayer = class extends TTSSpeaker {
 };
 
 // src/inference/createSenseVoice.ts
-var
+var logger23 = createLogger("createSenseVoice");
 var LazySenseVoice = class {
   constructor(config) {
     this.inner = null;
@@ -5641,7 +5659,7 @@ var LazySenseVoice = class {
 function createSenseVoice(config = {}) {
   const modelUrl = config.modelUrl ?? DEFAULT_MODEL_URLS.senseVoice;
   if (config.unifiedWorker) {
-
+    logger23.info("Creating SenseVoiceUnifiedAdapter (shared unified worker)");
     return new SenseVoiceUnifiedAdapter(config.unifiedWorker, {
       modelUrl,
      tokensUrl: config.tokensUrl,
@@ -5649,12 +5667,12 @@ function createSenseVoice(config = {}) {
      textNorm: config.textNorm
    });
  }
-
+  logger23.info("Creating SenseVoiceUnifiedAdapter (dedicated worker, lazy init)");
  return new LazySenseVoice(config);
 }
 
 // src/inference/createSileroVAD.ts
-var
+var logger24 = createLogger("createSileroVAD");
 var LazySileroVAD = class {
   constructor(config) {
     this.inner = null;
@@ -5715,15 +5733,15 @@ function createSileroVAD(config = {}) {
   const modelUrl = config.modelUrl ?? DEFAULT_MODEL_URLS.sileroVad;
   const resolvedConfig = { ...config, modelUrl };
   if (config.unifiedWorker) {
-
+    logger24.info("Creating SileroVADUnifiedAdapter (shared unified worker)");
     return new SileroVADUnifiedAdapter(config.unifiedWorker, resolvedConfig);
   }
-
+  logger24.info("Creating SileroVADUnifiedAdapter (dedicated worker, lazy init)");
   return new LazySileroVAD(config);
 }
 
 // src/audio/SpeechListener.ts
-var
+var logger25 = createLogger("SpeechListener");
 var _SpeechListener = class _SpeechListener extends EventEmitter {
   constructor(config) {
     super();
@@ -5847,11 +5865,11 @@ var _SpeechListener = class _SpeechListener extends EventEmitter {
      }
      span?.end();
      this.setState("ready");
-
+      logger25.info("SpeechListener models loaded");
    } catch (error) {
      const err = error instanceof Error ? error : new Error(String(error));
      span?.endWithError(err);
-
+      logger25.error("Model loading failed", { message: err.message });
      this.emit("error", err);
      this.setState("idle");
      throw err;
@@ -5884,7 +5902,7 @@ var _SpeechListener = class _SpeechListener extends EventEmitter {
    });
    await this.mic.start();
    this.setState("listening");
-
+    logger25.info("Listening started");
  }
  /** Stop listening — deactivates mic, clears buffers. */
  stop() {
@@ -5904,7 +5922,7 @@ var _SpeechListener = class _SpeechListener extends EventEmitter {
    if (this._state !== "idle") {
      this.setState("ready");
    }
-
+    logger25.info("Listening stopped");
  }
  /** Pause VAD/ASR but keep mic active for audio:chunk events (for interruption detection). */
  pause() {
@@ -5925,7 +5943,7 @@ var _SpeechListener = class _SpeechListener extends EventEmitter {
  }
  /** Dispose all resources. */
  async dispose() {
-
+    logger25.debug("Disposing SpeechListener");
    this.stop();
    this.epoch++;
    await Promise.allSettled([
@@ -5960,14 +5978,14 @@ var _SpeechListener = class _SpeechListener extends EventEmitter {
      this.audioBufferSamples = 0;
      this.lastProgressiveResult = null;
      this.lastProgressiveSamples = 0;
-
+      logger25.debug("Speech start");
      this.emit("speech:start");
      this.startProgressiveTranscription();
    }
    this.audioBuffer.push(new Float32Array(samples));
    this.audioBufferSamples += samples.length;
    if (this.audioBufferSamples >= _SpeechListener.MAX_AUDIO_BUFFER_SAMPLES) {
-
+      logger25.warn("Audio buffer exceeded max, forcing transcription flush");
      this.onSilenceDetected();
      return;
    }
@@ -5983,7 +6001,7 @@ var _SpeechListener = class _SpeechListener extends EventEmitter {
        }
      }
    } catch (err) {
-
+      logger25.warn("VAD error", { error: String(err) });
    }
  }
  // ---------------------------------------------------------------------------
@@ -6001,11 +6019,11 @@ var _SpeechListener = class _SpeechListener extends EventEmitter {
    const capturedEpoch = this.epoch;
    this.isSpeechActive = false;
    const durationMs = getClock().now() - this.speechStartTime;
-
+    logger25.debug("Speech end", { durationMs: Math.round(durationMs) });
    this.emit("speech:end", { durationMs });
    this.clearSilenceTimer();
    this.processEndOfSpeech(capturedEpoch).catch((err) => {
-
+      logger25.error("End of speech processing failed", { error: String(err) });
      if (this.epoch === capturedEpoch) {
        this.emit("error", err instanceof Error ? err : new Error(String(err)));
        this.setState("listening");
@@ -6037,7 +6055,7 @@ var _SpeechListener = class _SpeechListener extends EventEmitter {
    const minEnergy = this.config.minAudioEnergy ?? 0.02;
    const durationSec = totalSamples / 16e3;
    if (durationSec < minDuration) {
-
+      logger25.info("Audio too short, discarding", { durationSec });
      this.setState("listening");
      return;
    }
@@ -6047,7 +6065,7 @@ var _SpeechListener = class _SpeechListener extends EventEmitter {
    }
    rms = Math.sqrt(rms / fullAudio.length);
    if (rms < minEnergy) {
-
+      logger25.info("Audio too quiet, discarding", { rms });
      this.setState("listening");
      return;
    }
@@ -6064,7 +6082,7 @@ var _SpeechListener = class _SpeechListener extends EventEmitter {
    }
    if (this.epoch !== capturedEpoch) return;
    if (!transcript || !transcript.text.trim()) {
-
+      logger25.info("No transcript, resuming listening");
      this.setState("listening");
      return;
    }
@@ -6100,7 +6118,7 @@ var _SpeechListener = class _SpeechListener extends EventEmitter {
    } catch (err) {
      this.progressiveErrorCount = (this.progressiveErrorCount ?? 0) + 1;
      if (this.progressiveErrorCount % 10 === 1) {
-
+        logger25.warn("Progressive transcription error", {
          code: ErrorCodes.SPH_ASR_ERROR,
          error: String(err),
          count: this.progressiveErrorCount
@@ -6152,9 +6170,9 @@ var _SpeechListener = class _SpeechListener extends EventEmitter {
    } catch (error) {
      span?.endWithError(error instanceof Error ? error : new Error(String(error)));
      this.asrErrorCount++;
-
+      logger25.warn("Transcription failed", { attempt: this.asrErrorCount, error: String(error) });
      if (this.asrErrorCount >= 3 && this.config.models) {
-
+        logger25.warn("3 consecutive ASR errors, recreating session");
        try {
          await this.asr.dispose();
          this.asr = createSenseVoice({
@@ -6167,7 +6185,7 @@ var _SpeechListener = class _SpeechListener extends EventEmitter {
          await this.asr.load();
          this.asrErrorCount = 0;
        } catch (recreateErr) {
-
+          logger25.error("ASR session recreation failed", { error: String(recreateErr) });
        }
      }
      return null;
@@ -6196,7 +6214,7 @@ var _SpeechListener = class _SpeechListener extends EventEmitter {
  // ---------------------------------------------------------------------------
  setState(state) {
    if (this._state === state) return;
-
+    logger25.debug("State transition", { from: this._state, to: state });
    this._state = state;
    this.emit("state", state);
  }
@@ -6215,7 +6233,7 @@ _SpeechListener.MAX_AUDIO_BUFFER_SAMPLES = 16e3 * 30;
 var SpeechListener = _SpeechListener;
 
 // src/audio/InterruptionHandler.ts
-var
+var logger26 = createLogger("InterruptionHandler");
 var InterruptionHandler = class extends EventEmitter {
   constructor(config = {}) {
     super();
@@ -6236,7 +6254,7 @@ var InterruptionHandler = class extends EventEmitter {
      enabled: true,
      ...config
    };
-
+    logger26.debug("Constructed with config", {
      vadThreshold: this.config.vadThreshold,
      minSpeechDurationMs: this.config.minSpeechDurationMs,
      silenceTimeoutMs: this.config.silenceTimeoutMs,
@@ -6267,7 +6285,7 @@ var InterruptionHandler = class extends EventEmitter {
  processVADResult(vadProbability, audioEnergy = 0) {
    if (!this.config.enabled) return;
    if (this.aiIsSpeaking) {
-
+      logger26.trace("VAD during AI speech", {
        vadProbability,
        audioEnergy,
        threshold: this.config.vadThreshold
@@ -6281,12 +6299,12 @@ var InterruptionHandler = class extends EventEmitter {
  }
  /** Notify that AI started/stopped speaking */
  setAISpeaking(speaking) {
-
+    logger26.debug("AI speaking state changed", { speaking });
    this.aiIsSpeaking = speaking;
  }
  /** Enable/disable interruption detection */
  setEnabled(enabled) {
-
+    logger26.debug("Enabled state changed", { enabled });
    this.config.enabled = enabled;
    if (!enabled) {
      this.reset();
@@ -6330,7 +6348,8 @@ var InterruptionHandler = class extends EventEmitter {
      const speechDuration = now - this.speechStartTime;
      if (speechDuration >= this.config.minSpeechDurationMs) {
        this.interruptionTriggeredThisSession = true;
-
+        logger26.debug("Interruption triggered", { rms, durationMs: speechDuration });
+        getTelemetry()?.incrementCounter(MetricNames.VOICE_INTERRUPTIONS, 1, { source: "detector" });
        this.emit("interruption.triggered", { rms, durationMs: speechDuration });
      }
    }
@@ -6350,7 +6369,7 @@ var InterruptionHandler = class extends EventEmitter {
 };
 
 // src/inference/SafariSpeechRecognition.ts
-var
+var logger27 = createLogger("SafariSpeech");
 var SafariSpeechRecognition = class _SafariSpeechRecognition {
   constructor(config = {}) {
     this.recognition = null;
@@ -6369,7 +6388,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
      interimResults: config.interimResults ?? true,
      maxAlternatives: config.maxAlternatives ?? 1
    };
-
+    logger27.debug("SafariSpeechRecognition created", {
      language: this.config.language,
      continuous: this.config.continuous
    });
@@ -6430,7 +6449,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
  */
  async start() {
    if (this.isListening) {
-
+      logger27.warn("Already listening");
      return;
    }
    if (!_SafariSpeechRecognition.isAvailable()) {
@@ -6460,7 +6479,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
      this.isListening = true;
      this.startTime = getClock().now();
      this.accumulatedText = "";
-
+      logger27.info("Speech recognition started", {
        language: this.config.language
      });
      span?.end();
@@ -6475,7 +6494,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
  */
  async stop() {
    if (!this.isListening || !this.recognition) {
-
+      logger27.warn("Not currently listening");
      return {
        text: this.accumulatedText,
        language: this.config.language,
@@ -6504,7 +6523,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
    if (this.recognition && this.isListening) {
      this.recognition.abort();
      this.isListening = false;
-
+      logger27.info("Speech recognition aborted");
    }
  }
  /**
@@ -6526,7 +6545,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
  * Dispose of recognition resources
  */
  dispose() {
-
+    logger27.debug("Disposed");
    if (this.recognition) {
      if (this.isListening) {
        this.recognition.abort();
@@ -6536,7 +6555,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
    this.isListening = false;
    this.resultCallbacks = [];
    this.errorCallbacks = [];
-
+    logger27.debug("SafariSpeechRecognition disposed");
  }
  /**
  * Set up event handlers for the recognition instance
@@ -6564,7 +6583,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
          confidence: alternative.confidence
        };
        this.emitResult(speechResult);
-
+        logger27.trace("Speech result", {
          text: text.substring(0, 50),
          isFinal,
          confidence: alternative.confidence
@@ -6574,12 +6593,12 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
        span?.end();
      } catch (error) {
        span?.endWithError(error instanceof Error ? error : new Error(String(error)));
-
+        logger27.error("Error processing speech result", { error });
      }
    };
    this.recognition.onerror = (event) => {
      const error = new Error(`Speech recognition error: ${event.error} - ${event.message}`);
-
+      logger27.error("Speech recognition error", { error: event.error, message: event.message });
      this.emitError(error);
      if (this.stopRejecter) {
        this.stopRejecter(error);
@@ -6589,7 +6608,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
    };
    this.recognition.onend = () => {
      this.isListening = false;
-
+      logger27.info("Speech recognition ended", {
        totalText: this.accumulatedText.length,
        durationMs: getClock().now() - this.startTime
      });
@@ -6606,13 +6625,13 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
      }
    };
    this.recognition.onstart = () => {
-
+      logger27.debug("Speech recognition started by browser");
    };
    this.recognition.onspeechstart = () => {
-
+      logger27.debug("Speech detected");
    };
    this.recognition.onspeechend = () => {
-
+      logger27.debug("Speech ended");
    };
  }
  /**
@@ -6623,7 +6642,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
      try {
        callback(result);
      } catch (error) {
-
+        logger27.error("Error in result callback", { error });
      }
    }
  }
@@ -6635,14 +6654,14 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
      try {
        callback(error);
      } catch (callbackError) {
-
+        logger27.error("Error in error callback", { error: callbackError });
      }
    }
  }
 };
 
 // src/inference/ElevenLabsTTSBackend.ts
-var
+var logger28 = createLogger("ElevenLabsTTS");
 var DEFAULT_MODEL = "eleven_multilingual_v2";
 var DEFAULT_OUTPUT_FORMAT = "pcm_16000";
 var DEFAULT_STABILITY = 0.5;
@@ -6687,7 +6706,7 @@ var ElevenLabsTTSBackend = class {
  */
  async load() {
    this._isLoaded = true;
-
+    logger28.info("ElevenLabs TTS ready", { voiceId: this.voiceId, model: this.model });
  }
  // ─── Stream ─────────────────────────────────────────────────────────────
  /**
@@ -6733,7 +6752,7 @@ var ElevenLabsTTSBackend = class {
    if (!response.ok) {
      const errorText = await response.text().catch(() => "unknown");
      const msg = `ElevenLabsTTS: HTTP ${response.status} \u2014 ${this.getHttpErrorMessage(response.status, errorText)}`;
-
+      logger28.error(msg);
      throw new Error(msg);
    }
    if (!response.body) {
@@ -6743,7 +6762,7 @@ var ElevenLabsTTSBackend = class {
      const latency2 = getClock().now() - startTime;
      span?.setAttributes({ "tts.duration_s": duration, "tts.latency_ms": latency2 });
      span?.end();
-      telemetry?.recordHistogram(
+      telemetry?.recordHistogram(MetricNames.INFERENCE_LATENCY, latency2, {
        model: "elevenlabs-tts",
        backend: "cloud"
      });
@@ -6756,7 +6775,7 @@ var ElevenLabsTTSBackend = class {
    while (true) {
      if (options?.signal?.aborted) {
        reader.cancel();
-
+        logger28.debug("Stream aborted by signal");
        return;
      }
      const { done, value } = await reader.read();
@@ -6775,32 +6794,32 @@ var ElevenLabsTTSBackend = class {
      }
      const latency = getClock().now() - startTime;
      const totalDuration = totalSamples / this._sampleRate;
-
+      logger28.debug("Stream complete", {
        totalDuration: `${totalDuration.toFixed(2)}s`,
        latencyMs: Math.round(latency),
        totalSamples
      });
      span?.setAttributes({ "tts.duration_s": totalDuration, "tts.latency_ms": latency });
      span?.end();
-      telemetry?.recordHistogram(
+      telemetry?.recordHistogram(MetricNames.INFERENCE_LATENCY, latency, {
        model: "elevenlabs-tts",
        backend: "cloud"
      });
-      telemetry?.incrementCounter(
+      telemetry?.incrementCounter(MetricNames.INFERENCE_TOTAL, 1, {
        model: "elevenlabs-tts",
        backend: "cloud",
        status: "success"
      });
    } catch (err) {
      if (err instanceof DOMException && err.name === "AbortError") {
-
+        logger28.debug("Stream aborted");
        span?.end();
        return;
      }
      const errMsg = err instanceof Error ? err.message : String(err);
-
+      logger28.error("Stream failed", { error: errMsg });
      span?.endWithError(err instanceof Error ? err : new Error(String(err)));
-      telemetry?.incrementCounter(
+      telemetry?.incrementCounter(MetricNames.INFERENCE_TOTAL, 1, {
        model: "elevenlabs-tts",
        backend: "cloud",
        status: "error"
@@ -6811,7 +6830,7 @@ var ElevenLabsTTSBackend = class {
  // ─── Dispose ────────────────────────────────────────────────────────────
  async dispose() {
    this._isLoaded = false;
-
+    logger28.info("ElevenLabs TTS disposed");
  }
  // ─── Private ────────────────────────────────────────────────────────────
  getHttpErrorMessage(status, body) {
@@ -6831,7 +6850,7 @@ var ElevenLabsTTSBackend = class {
 };
 
 // src/emotion/Emotion.ts
-var
+var logger29 = createLogger("EmotionController");
 var EMOTION_NAMES = [
   "amazement",
   "anger",
@@ -6853,7 +6872,7 @@ function createEmotionVector(weights = {}) {
    if (idx >= 0) {
      vector[idx] = Math.max(0, Math.min(1, value));
    } else {
-
+      logger29.warn(`Invalid emotion name in createEmotionVector: "${name}"`);
    }
  }
  return vector;
@@ -6936,7 +6955,7 @@ var EmotionController = class {
    this.targetEmotion.set(newEmotion);
    this.currentEmotion.set(newEmotion);
    this.transitionProgress = 1;
-
+    logger29.debug("set", { weights });
  }
  /**
  * Set emotion from preset immediately
@@ -6946,7 +6965,7 @@ var EmotionController = class {
    this.targetEmotion.set(newEmotion);
    this.currentEmotion.set(newEmotion);
    this.transitionProgress = 1;
-
+    logger29.debug("setPreset", { preset });
  }
  /**
  * Transition to new emotion over time
@@ -6960,7 +6979,7 @@ var EmotionController = class {
    this.transitionDuration = durationMs;
    this.transitionStartTime = getClock().now();
    this.transitionProgress = 0;
-
+    logger29.debug("transitionTo", { weights, durationMs });
  }
  /**
  * Transition to preset over time
@@ -6993,7 +7012,7 @@ var EmotionController = class {
    this.currentEmotion.fill(0);
    this.targetEmotion.fill(0);
    this.transitionProgress = 1;
-
+    logger29.debug("reset");
  }
 };
 
@@ -7074,7 +7093,7 @@ var DEFAULT_ANIMATION_CONFIG = {
 };
 
 // src/animation/AnimationGraph.ts
-var
+var logger30 = createLogger("AnimationGraph");
 var AnimationGraph = class extends EventEmitter {
   constructor(config = {}) {
     super();
@@ -7107,7 +7126,7 @@ var AnimationGraph = class extends EventEmitter {
    this.stateEnterTime = Date.now();
    this.lastUpdateTime = Date.now();
    this.cachedOutput = this.computeOutput();
-
+    logger30.info("constructor", {
      initialState: this.config.initialState,
      stateCount: this.config.states.length,
      transitionCount: this.config.transitions.length
@@ -7178,7 +7197,7 @@ var AnimationGraph = class extends EventEmitter {
  setState(stateName, blendDuration = 300) {
    const targetState = this.config.states.find((s) => s.name === stateName);
    if (!targetState) {
-
+      logger30.warn(`State '${stateName}' not found`);
      return;
    }
    if (targetState.name === this.currentState.name && !this.isTransitioning) {
@@ -7256,7 +7275,7 @@ var AnimationGraph = class extends EventEmitter {
      (s) => s.name === transition.to
    );
    if (!targetState) {
-
+      logger30.warn(`Target state '${transition.to}' not found`);
      return;
    }
    const fromState = this.currentState.name;
@@ -7270,7 +7289,7 @@ var AnimationGraph = class extends EventEmitter {
    if (!this.currentState.emotionBlendEnabled) {
      this.targetEmotionWeight = 0;
    }
-
+    logger30.debug("state transition", {
      from: fromState,
      to: targetState.name,
      trigger: event,
@@ -7307,7 +7326,7 @@ var AnimationGraph = class extends EventEmitter {
    if (this.currentState.timeout <= 0) return;
    const elapsed = now - this.stateEnterTime;
    if (elapsed >= this.currentState.timeout) {
-
+      logger30.debug("timeout transition", {
        state: this.currentState.name,
        elapsed,
        timeout: this.currentState.timeout
@@ -7537,7 +7556,7 @@ var EmphasisDetector = class {
 
 // src/animation/ProceduralLifeLayer.ts
 import { createNoise2D } from "simplex-noise";
-var
+var logger31 = createLogger("ProceduralLifeLayer");
 var simplex2d = createNoise2D();
 var LIFE_BS_INDEX = /* @__PURE__ */ new Map();
 for (let i = 0; i < ARKIT_BLENDSHAPES.length; i++) {
@@ -7643,7 +7662,7 @@ var ProceduralLifeLayer = class {
    }
    this.blinkInterval = this.nextBlinkInterval();
    this.gazeBreakInterval = randomRange(...this.gazeBreakIntervalRange);
-
+    logger31.debug("constructor", {
      blinkIntervalRange: this.blinkIntervalRange,
      useLogNormalBlinks: this.useLogNormalBlinks,
      gazeBreakIntervalRange: this.gazeBreakIntervalRange,
@@ -7747,7 +7766,7 @@ var ProceduralLifeLayer = class {
  * Reset all internal state to initial values.
  */
  reset() {
-
+    logger31.debug("reset");
    this.blinkTimer = 0;
    this.blinkInterval = this.nextBlinkInterval();
    this.blinkPhase = PHASE_OPEN;
@@ -7799,7 +7818,7 @@ var ProceduralLifeLayer = class {
      this.blinkTimer = 0;
      this.blinkInterval = this.nextBlinkInterval();
      this.asymmetryRight = 0.95 + Math.random() * 0.08;
-
+      logger31.trace("blink", { nextInterval: this.blinkInterval });
    }
    if (this.blinkPhase > PHASE_OPEN) {
      this.blinkProgress += delta;
@@ -7880,7 +7899,7 @@ var ProceduralLifeLayer = class {
    this.gazeBreakTargetX = (Math.random() - 0.5) * 2 * amp;
    this.gazeBreakTargetY = (Math.random() - 0.5) * amp * 0.4;
    this.gazeBreakInterval = randomRange(...params.interval);
-
+    logger31.trace("gaze break", {
      targetX: this.gazeBreakTargetX.toFixed(3),
      targetY: this.gazeBreakTargetY.toFixed(3),
      nextInterval: this.gazeBreakInterval.toFixed(2),
@@ -8123,7 +8142,7 @@ var ALL_AUS = [...new Set(
 )];
 
 // src/face/EmotionResolver.ts
-var
+var logger32 = createLogger("EmotionResolver");
 var BS_INDEX = /* @__PURE__ */ new Map();
 for (let i = 0; i < ARKIT_BLENDSHAPES.length; i++) {
   BS_INDEX.set(ARKIT_BLENDSHAPES[i], i);
@@ -8150,7 +8169,7 @@ var EmotionResolver = class {
      if (!emotionWeight || emotionWeight < 0.01) continue;
      const auActivations = EMOTION_TO_AU[emotionName];
      if (!auActivations) {
-
+        logger32.warn(`Unknown emotion name with no AU mapping: "${emotionName}"`);
        continue;
      }
      for (const activation of auActivations) {
@@ -8175,7 +8194,7 @@ var EmotionResolver = class {
 };
 
 // src/face/FaceCompositor.ts
-var
+var logger33 = createLogger("FaceCompositor");
 function smoothstep(t) {
   return t * t * (3 - 2 * t);
 }
@@ -8206,7 +8225,7 @@ var FaceCompositor = class {
    if (config?.profile) {
      this.applyProfileArrays(config.profile);
    }
-
+    logger33.debug("constructor", {
      emotionSmoothing: this.emotionSmoothing,
      hasProfile: !!config?.profile,
      hasLifeLayer: !!config?.lifeLayer
@@ -8279,7 +8298,7 @@ var FaceCompositor = class {
  */
  setEmotion(weights) {
    this.stickyEmotion = weights;
-
+    logger33.debug("setEmotion", { weights });
  }
  /**
  * Update character profile at runtime.
@@ -8288,7 +8307,7 @@ var FaceCompositor = class {
    this.multiplier.fill(1);
    this.offset.fill(0);
    this.applyProfileArrays(profile);
-
+    logger33.debug("setProfile", {
      multiplierKeys: profile.multiplier ? Object.keys(profile.multiplier).length : 0,
      offsetKeys: profile.offset ? Object.keys(profile.offset).length : 0
    });
@@ -8302,7 +8321,7 @@ var FaceCompositor = class {
    this.lifeBuffer.fill(0);
    this.stickyEmotion = void 0;
    this.lifeLayer.reset();
-
+    logger33.debug("reset");
  }
  /** Expand partial profile maps into dense Float32Arrays */
  applyProfileArrays(profile) {
@@ -8387,7 +8406,7 @@ function parseEmotionTags(text) {
 }
 
 // src/character/CharacterController.ts
-var
+var logger34 = createLogger("CharacterController");
 var FRAME_BUDGET_US = 33e3;
 var EMOTION_MAP = {
   // Synced with EmotionPresets (packages/core/src/emotion/Emotion.ts)
@@ -8457,7 +8476,7 @@ var CharacterController = class {
    this.gazeYawInfluence = config?.gaze?.yawInfluence ?? 0.4;
    this.gazePitchInfluence = config?.gaze?.pitchInfluence ?? 0.3;
    this.gazeSmoothing = config?.gaze?.smoothing ?? 5;
-
+    logger34.debug("constructor", {
      gazeEnabled: this.gazeEnabled,
      gazeYawInfluence: this.gazeYawInfluence,
      gazePitchInfluence: this.gazePitchInfluence,
@@ -8521,13 +8540,13 @@ var CharacterController = class {
    const resolved = resolveEmotion(emotion);
    if (resolved) {
      this._compositor.setEmotion(resolved);
-
+      logger34.debug("setEmotion", { emotion, resolved });
    }
  }
  /** Update character profile at runtime. */
  setProfile(profile) {
    this._compositor.setProfile(profile);
-
+    logger34.debug("setProfile", {
      multiplierKeys: profile.multiplier ? Object.keys(profile.multiplier).length : 0,
      offsetKeys: profile.offset ? Object.keys(profile.offset).length : 0
    });
@@ -8562,11 +8581,11 @@ var CharacterController = class {
    this._compositor.reset();
    this.gazeHeadYaw = 0;
    this.gazeHeadPitch = -0.1;
-
+    logger34.debug("reset");
  }
  dispose() {
    this.reset();
-
+    logger34.debug("dispose");
  }
  // ---------------------------------------------------------------------------
  // Eye angle math (extracted from r3f useGazeTracking.computeEyeTargets)
@@ -8648,7 +8667,7 @@ var CharacterController = class {
 };
 
 // src/orchestration/MicLipSync.ts
-var
+var logger35 = createLogger("MicLipSync");
 var MicLipSync = class extends EventEmitter {
   constructor(config) {
     super();
@@ -8667,7 +8686,7 @@ var MicLipSync = class extends EventEmitter {
    this.vadChunkSize = 0;
    this.vadBuffer = null;
    this.vadBufferOffset = 0;
-
+    logger35.info("MicLipSync created", {
      sampleRate: config.sampleRate ?? 16e3,
      micChunkSize: config.micChunkSize ?? 512,
      hasVAD: !!config.vad,
@@ -8689,12 +8708,12 @@ var MicLipSync = class extends EventEmitter {
        this._currentFrame = scaled;
        if (!this._firstFrameEmitted) {
          this._firstFrameEmitted = true;
-
+          logger35.trace("First blendshape frame emitted");
        }
        this.emit("frame", { blendshapes: scaled, rawBlendshapes: raw });
      },
      onError: (error) => {
-
+        logger35.error("A2E inference error", { message: error.message });
        this.emit("error", error);
      }
    });
@@ -8703,7 +8722,7 @@ var MicLipSync = class extends EventEmitter {
      this.processor.pushAudio(float32);
      if (this.vad) {
        this.vadQueue = this.vadQueue.then(() => this.processVAD(float32)).catch((err) => {
-
+          logger35.warn("VAD processing error", { error: String(err), code: ErrorCodes.SPH_VAD_ERROR });
          this.emit("error", err instanceof Error ? err : new Error(String(err)));
        });
      }
@@ -8739,7 +8758,7 @@ var MicLipSync = class extends EventEmitter {
  /** Start microphone capture and inference loop */
  async start() {
    if (this._state === "active") return;
-
+    logger35.info("Starting MicLipSync");
    getTelemetry()?.incrementCounter(MetricNames.MIC_SESSIONS);
    await this.mic.start();
    this.processor.startDrip();
@@ -8749,7 +8768,7 @@ var MicLipSync = class extends EventEmitter {
  /** Stop microphone and inference */
  stop() {
    if (this._state === "idle") return;
-
+    logger35.info("Stopping MicLipSync");
    this.processor.stopDrip();
    this.mic.stop();
    this._isSpeaking = false;
@@ -8798,7 +8817,7 @@ var MicLipSync = class extends EventEmitter {
        this.emit("speech:end", { durationMs });
      }
    } catch (err) {
-
+      logger35.warn("VAD process error", { error: String(err), code: ErrorCodes.SPH_VAD_ERROR });
      this.emit("error", err instanceof Error ? err : new Error(String(err)));
    }
    this.vadBufferOffset = 0;
@@ -8816,7 +8835,7 @@ var MicLipSync = class extends EventEmitter {
 };
 
 // src/orchestration/VoiceOrchestrator.ts
-var
+var logger36 = createLogger("VoiceOrchestrator");
 var VoiceOrchestrator = class extends EventEmitter {
   constructor() {
     super(...arguments);
@@ -8868,12 +8887,16 @@ var VoiceOrchestrator = class extends EventEmitter {
    const epoch = ++this.connectEpoch;
    this._mode = config.mode ?? "local";
    this._sessionId = crypto.randomUUID();
+    const span = getTelemetry()?.startSpan("VoiceOrchestrator.connect", {
+      "mode": this._mode,
+      "session.id": this._sessionId
+    });
    if (config.onStateChange) this.on("state", config.onStateChange);
    if (config.onLoadingProgress) this.on("loading:progress", config.onLoadingProgress);
    if (config.onError) this.on("error", config.onError);
    if (config.onTranscriptEvent) this.on("transcript", config.onTranscriptEvent);
    if (config.onInterruption) this.on("interruption", config.onInterruption);
-
+    logger36.info("Connecting voice orchestrator", { mode: this._mode });
    try {
      if (this._mode === "local") {
        const localCfg = config;
@@ -8954,9 +8977,11 @@ var VoiceOrchestrator = class extends EventEmitter {
      } else {
        this.wireCloudTranscript(config);
      }
-
+      logger36.info("Voice orchestrator connected", { mode: this._mode });
+      span?.end();
    } catch (err) {
-
+      logger36.error("Voice orchestrator connect failed, cleaning up", { error: String(err) });
+      span?.endWithError(err instanceof Error ? err : new Error(String(err)));
      await this.disconnect();
      throw err;
    }
@@ -9065,6 +9090,7 @@ var VoiceOrchestrator = class extends EventEmitter {
    const handler = async (result) => {
      this.emit("transcript", result);
      if (!result.isFinal || !result.text.trim()) return;
+      const turnStart = getClock().now();
      this.setState("thinking");
      this.speechListener?.pause();
      this.interruption?.setAISpeaking(true);
@@ -9081,10 +9107,11 @@ var VoiceOrchestrator = class extends EventEmitter {
          await this.speak(text);
        }
      } catch (e) {
-
+        logger36.error("Voice transcript handler error", { error: String(e) });
      } finally {
        this.interruption?.setAISpeaking(false);
        this.speechListener?.resume();
+        getTelemetry()?.recordHistogram(MetricNames.VOICE_TURN_LATENCY, getClock().now() - turnStart);
        this.setState("listening");
      }
    };
@@ -9095,6 +9122,8 @@ var VoiceOrchestrator = class extends EventEmitter {
    const handler = async (result) => {
      this.emit("transcript", result);
      if (!result.isFinal || !result.text.trim()) return;
+      const turnStart = getClock().now();
+      let firstChunkSent = false;
      this.setState("thinking");
      this.speechListener?.pause();
      this.interruption?.setAISpeaking(true);
@@ -9111,10 +9140,15 @@ var VoiceOrchestrator = class extends EventEmitter {
        setEmotion: (emotion) => this.playbackPipeline?.setEmotion(emotion),
        send: async (chunk) => {
          if (abortController.signal.aborted) return;
+          if (!firstChunkSent) {
+            firstChunkSent = true;
+            getTelemetry()?.recordHistogram(MetricNames.VOICE_RESPONSE_LATENCY, getClock().now() - turnStart);
+          }
          await this.playbackPipeline.onAudioChunk(chunk);
        },
        done: async () => {
          if (abortController.signal.aborted) return;
+          getTelemetry()?.recordHistogram(MetricNames.VOICE_TURN_LATENCY, getClock().now() - turnStart);
          await this.playbackPipeline.end();
        },
        signal: abortController.signal,
@@ -9122,7 +9156,7 @@ var VoiceOrchestrator = class extends EventEmitter {
      });
    } catch (e) {
      if (!abortController.signal.aborted) {
-
+        logger36.error("Cloud response handler error", { error: String(e) });
      }
    } finally {
      this.responseAbortController = null;
@@ -9136,9 +9170,10 @@ var VoiceOrchestrator = class extends EventEmitter {
  // -------------------------------------------------------------------------
  handleInterruption() {
    if (this._state !== "speaking") return;
-
+    logger36.info("Interruption triggered");
    this.stopSpeaking();
    this.emit("interruption");
+    getTelemetry()?.incrementCounter(MetricNames.VOICE_INTERRUPTIONS, 1, { source: "orchestrator" });
    this.speechListener?.resume();
    this.setState("listening");
  }
@@ -9188,7 +9223,6 @@ export {
   EmotionResolver,
   EmphasisDetector,
   ErrorCodes,
-  ErrorTypes,
   EventEmitter,
   FaceCompositor,
   HF_CDN_URLS,