osborn 0.9.68 → 0.9.72
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +174 -3
- package/dist/voice-io.js +2 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -15,6 +15,9 @@ import { dirname, join } from 'node:path';
|
|
|
15
15
|
import { fileURLToPath } from 'node:url';
|
|
16
16
|
import { spawn } from 'node:child_process';
|
|
17
17
|
import { randomUUID } from 'node:crypto';
|
|
18
|
+
import { createRequire } from 'node:module';
|
|
19
|
+
// 0.9.71: createRequire for resolving package.json versions inside ESM
|
|
20
|
+
const __sdkVersionRequire = createRequire(import.meta.url);
|
|
18
21
|
import { homedir, tmpdir } from 'node:os';
|
|
19
22
|
import { PassThrough } from 'node:stream';
|
|
20
23
|
import { createGunzip } from 'node:zlib';
|
|
@@ -2167,14 +2170,42 @@ async function main() {
|
|
|
2167
2170
|
// a full 3s window to keep talking before deciding it was false and
|
|
2168
2171
|
// resuming. Other two knobs left at SDK defaults.
|
|
2169
2172
|
interruption: {
|
|
2170
|
-
minDuration:
|
|
2171
|
-
minWords:
|
|
2172
|
-
falseInterruptionTimeout:
|
|
2173
|
+
minDuration: 1500, // default 500 — require 1.5s sustained speech (faster barge-in than 2500)
|
|
2174
|
+
minWords: 2, // default 0 — require ≥2 transcript words
|
|
2175
|
+
falseInterruptionTimeout: 3500, // default 2000 — 3.5s false-interrupt window (belt-and-suspenders since minDuration was loosened)
|
|
2173
2176
|
// resumeFalseInterruption: true, // default true (unchanged)
|
|
2174
2177
|
// discardAudioIfUninterruptible: true,// default true (unchanged)
|
|
2175
2178
|
},
|
|
2176
2179
|
},
|
|
2177
2180
|
});
|
|
2181
|
+
// 0.9.71: dump the RESOLVED AgentSession options (after defaults applied)
|
|
2182
|
+
// so prod logs prove exactly what tuning is live for any given session.
|
|
2183
|
+
try {
|
|
2184
|
+
const so = session.sessionOptions ?? {};
|
|
2185
|
+
const detect = session.interruptionDetection;
|
|
2186
|
+
const turn = so.turnHandling ?? {};
|
|
2187
|
+
console.log('🧪 [BE-AGENT-SESSION-CONFIG]', JSON.stringify({
|
|
2188
|
+
t: new Date().toISOString(),
|
|
2189
|
+
maxToolSteps: so.maxToolSteps,
|
|
2190
|
+
userAwayTimeout: so.userAwayTimeout,
|
|
2191
|
+
aecWarmupDuration: so.aecWarmupDuration,
|
|
2192
|
+
ttsReadIdleTimeout: so.ttsReadIdleTimeout,
|
|
2193
|
+
forwardAudioIdleTimeout: so.forwardAudioIdleTimeout,
|
|
2194
|
+
useTtsAlignedTranscript: so.useTtsAlignedTranscript,
|
|
2195
|
+
ttsTextTransforms: so.ttsTextTransforms,
|
|
2196
|
+
interruptionDetectionMode: detect, // 'vad' | 'adaptive' | undefined
|
|
2197
|
+
turnHandling: {
|
|
2198
|
+
turnDetection: turn.turnDetection,
|
|
2199
|
+
endpointing: turn.endpointing,
|
|
2200
|
+
interruption: turn.interruption,
|
|
2201
|
+
preemptiveGeneration: turn.preemptiveGeneration,
|
|
2202
|
+
userTurnLimit: turn.userTurnLimit,
|
|
2203
|
+
},
|
|
2204
|
+
}));
|
|
2205
|
+
}
|
|
2206
|
+
catch (err) {
|
|
2207
|
+
console.log('🧪 [BE-AGENT-SESSION-CONFIG] failed:', err instanceof Error ? err.message : String(err));
|
|
2208
|
+
}
|
|
2178
2209
|
return { session, agent };
|
|
2179
2210
|
}
|
|
2180
2211
|
// ============================================================
|
|
@@ -2713,6 +2744,35 @@ async function main() {
|
|
|
2713
2744
|
}).catch((err) => {
|
|
2714
2745
|
console.log(`⚠️ [LIVEKIT-DASHBOARD] failed to fetch room SID: ${err instanceof Error ? err.message : String(err)}`);
|
|
2715
2746
|
});
|
|
2747
|
+
// 0.9.71: SDK + runtime snapshot — proves what's actually running so
|
|
2748
|
+
// future log forensics can rule out version drift in one grep.
|
|
2749
|
+
try {
|
|
2750
|
+
const pkgs = {};
|
|
2751
|
+
for (const name of [
|
|
2752
|
+
'osborn',
|
|
2753
|
+
'@livekit/agents',
|
|
2754
|
+
'@livekit/agents-plugin-openai',
|
|
2755
|
+
'@livekit/agents-plugin-deepgram',
|
|
2756
|
+
'@livekit/agents-plugin-silero',
|
|
2757
|
+
'@livekit/agents-plugin-google',
|
|
2758
|
+
'@livekit/agents-plugin-elevenlabs',
|
|
2759
|
+
'@livekit/agents-plugin-livekit',
|
|
2760
|
+
'@livekit/rtc-node',
|
|
2761
|
+
'livekit-server-sdk',
|
|
2762
|
+
'@anthropic-ai/claude-agent-sdk',
|
|
2763
|
+
'@google/genai',
|
|
2764
|
+
'openai',
|
|
2765
|
+
]) {
|
|
2766
|
+
try {
|
|
2767
|
+
pkgs[name] = __sdkVersionRequire(`${name}/package.json`).version;
|
|
2768
|
+
}
|
|
2769
|
+
catch { }
|
|
2770
|
+
}
|
|
2771
|
+
console.log('🧪 [BE-SDK-VERSIONS]', JSON.stringify({ t: new Date().toISOString(), node: process.version, pkgs }));
|
|
2772
|
+
}
|
|
2773
|
+
catch (err) {
|
|
2774
|
+
console.log('🧪 [BE-SDK-VERSIONS] failed:', err instanceof Error ? err.message : String(err));
|
|
2775
|
+
}
|
|
2716
2776
|
localParticipant = room.localParticipant;
|
|
2717
2777
|
// Arm the alone timer: if we connected but no user joins within the grace
|
|
2718
2778
|
// window (e.g. machine woken then abandoned mid-handshake), leave the room
|
|
@@ -2729,6 +2789,46 @@ async function main() {
|
|
|
2729
2789
|
// Flux STT's speech-vs-noise classification: slower (~100-300ms) but
|
|
2730
2790
|
// confidence-aware. The latency tradeoff is worth eliminating the false
|
|
2731
2791
|
// interrupts at the root.
|
|
2792
|
+
// 0.9.71: Room-level audio observability — observe-only logs so we can
|
|
2793
|
+
// cross-reference user mic mute/quality changes against TTS cutoffs without
|
|
2794
|
+
// re-introducing the over-eager ActiveSpeakers interrupt.
|
|
2795
|
+
room.on(RoomEvent.ActiveSpeakersChanged, (speakers) => {
|
|
2796
|
+
try {
|
|
2797
|
+
const ids = (speakers || []).map((s) => s?.identity).filter(Boolean);
|
|
2798
|
+
console.log(`🎙️ [ROOM-SPEAKERS] count=${ids.length} ids=${JSON.stringify(ids)} t=${new Date().toISOString()}`);
|
|
2799
|
+
}
|
|
2800
|
+
catch { }
|
|
2801
|
+
});
|
|
2802
|
+
room.on(RoomEvent.ConnectionQualityChanged, (quality, participant) => {
|
|
2803
|
+
try {
|
|
2804
|
+
console.log(`📶 [ROOM-QUALITY] participant=${participant?.identity} quality=${quality} t=${new Date().toISOString()}`);
|
|
2805
|
+
}
|
|
2806
|
+
catch { }
|
|
2807
|
+
});
|
|
2808
|
+
room.on(RoomEvent.TrackMuted, (publication, participant) => {
|
|
2809
|
+
try {
|
|
2810
|
+
console.log(`🔇 [ROOM-TRACK-MUTED] participant=${participant?.identity} kind=${publication?.kind} source=${publication?.source} sid=${publication?.sid} t=${new Date().toISOString()}`);
|
|
2811
|
+
}
|
|
2812
|
+
catch { }
|
|
2813
|
+
});
|
|
2814
|
+
room.on(RoomEvent.TrackUnmuted, (publication, participant) => {
|
|
2815
|
+
try {
|
|
2816
|
+
console.log(`🔊 [ROOM-TRACK-UNMUTED] participant=${participant?.identity} kind=${publication?.kind} source=${publication?.source} sid=${publication?.sid} t=${new Date().toISOString()}`);
|
|
2817
|
+
}
|
|
2818
|
+
catch { }
|
|
2819
|
+
});
|
|
2820
|
+
room.on(RoomEvent.TrackSubscribed, (track, publication, participant) => {
|
|
2821
|
+
try {
|
|
2822
|
+
console.log(`📥 [ROOM-TRACK-SUBSCRIBED] participant=${participant?.identity} kind=${track?.kind} source=${publication?.source} sid=${publication?.sid} t=${new Date().toISOString()}`);
|
|
2823
|
+
}
|
|
2824
|
+
catch { }
|
|
2825
|
+
});
|
|
2826
|
+
room.on(RoomEvent.TrackUnsubscribed, (track, publication, participant) => {
|
|
2827
|
+
try {
|
|
2828
|
+
console.log(`📤 [ROOM-TRACK-UNSUBSCRIBED] participant=${participant?.identity} kind=${track?.kind} source=${publication?.source} sid=${publication?.sid} t=${new Date().toISOString()}`);
|
|
2829
|
+
}
|
|
2830
|
+
catch { }
|
|
2831
|
+
});
|
|
2732
2832
|
room.on(RoomEvent.Disconnected, () => {
|
|
2733
2833
|
console.log('👋 Disconnected from room');
|
|
2734
2834
|
// Clean up active research and voice queue
|
|
@@ -3188,10 +3288,81 @@ async function main() {
|
|
|
3188
3288
|
// FALLBACK: playout_completed
|
|
3189
3289
|
sess.on('playout_completed', (ev) => {
|
|
3190
3290
|
const message = ev.message || ev.text || ev.content;
|
|
3291
|
+
console.log(`🎧 PLAYOUT COMPLETED [${new Date().toISOString()}]:`, JSON.stringify({
|
|
3292
|
+
speechId: ev.speechHandle?.id ?? ev.speechId,
|
|
3293
|
+
interrupted: ev.interrupted,
|
|
3294
|
+
durationMs: ev.durationMs,
|
|
3295
|
+
messageLen: message ? message.length : 0,
|
|
3296
|
+
}));
|
|
3191
3297
|
if (message && message.length > 0) {
|
|
3192
3298
|
sendAgentTranscript(message, 'playout');
|
|
3193
3299
|
}
|
|
3194
3300
|
});
|
|
3301
|
+
// 0.9.71: metrics_collected — per-call latency for STT/TTS/LLM/VAD/EOU/Interruption.
|
|
3302
|
+
// SINGLE highest-signal event for diagnosing audio cutoffs.
|
|
3303
|
+
// • TTSMetrics.ttfbMs / durationMs / audioDurationMs / cancelled → directly answers
|
|
3304
|
+
// "did the OpenAI HTTP fetch hang or did it complete and the SDK aborted?"
|
|
3305
|
+
// • STTMetrics.audioDurationMs / durationMs → Deepgram latency per utterance
|
|
3306
|
+
// • LLMMetrics.ttftMs → cold-vs-warm Claude subprocess
|
|
3307
|
+
// • EOUMetrics.endOfUtteranceDelayMs / transcriptionDelayMs → end-of-turn timing
|
|
3308
|
+
// • InterruptionMetrics.{detectionDelay, numInterruptions, numBackchannels} →
|
|
3309
|
+
// turn-detector signal at the source
|
|
3310
|
+
sess.on('metrics_collected', (ev) => {
|
|
3311
|
+
const m = ev?.metrics;
|
|
3312
|
+
if (!m)
|
|
3313
|
+
return;
|
|
3314
|
+
const compact = { type: m.type, label: m.label, t: new Date().toISOString() };
|
|
3315
|
+
// Per-type subset — keep tight
|
|
3316
|
+
if (m.type === 'tts_metrics') {
|
|
3317
|
+
compact.ttfbMs = Math.round(m.ttfbMs ?? -1);
|
|
3318
|
+
compact.durationMs = Math.round(m.durationMs ?? -1);
|
|
3319
|
+
compact.audioDurationMs = Math.round(m.audioDurationMs ?? -1);
|
|
3320
|
+
compact.cancelled = !!m.cancelled;
|
|
3321
|
+
compact.charactersCount = m.charactersCount;
|
|
3322
|
+
compact.streamed = !!m.streamed;
|
|
3323
|
+
compact.speechId = m.speechId;
|
|
3324
|
+
}
|
|
3325
|
+
else if (m.type === 'stt_metrics') {
|
|
3326
|
+
compact.audioDurationMs = Math.round(m.audioDurationMs ?? -1);
|
|
3327
|
+
compact.durationMs = Math.round(m.durationMs ?? -1);
|
|
3328
|
+
compact.streamed = !!m.streamed;
|
|
3329
|
+
}
|
|
3330
|
+
else if (m.type === 'llm_metrics') {
|
|
3331
|
+
compact.ttftMs = Math.round(m.ttftMs ?? -1);
|
|
3332
|
+
compact.durationMs = Math.round(m.durationMs ?? -1);
|
|
3333
|
+
compact.cancelled = !!m.cancelled;
|
|
3334
|
+
compact.completionTokens = m.completionTokens;
|
|
3335
|
+
compact.promptTokens = m.promptTokens;
|
|
3336
|
+
compact.speechId = m.speechId;
|
|
3337
|
+
}
|
|
3338
|
+
else if (m.type === 'vad_metrics') {
|
|
3339
|
+
compact.idleTimeMs = Math.round(m.idleTimeMs ?? -1);
|
|
3340
|
+
compact.inferenceCount = m.inferenceCount;
|
|
3341
|
+
}
|
|
3342
|
+
else if (m.type === 'eou_metrics') {
|
|
3343
|
+
compact.endOfUtteranceDelayMs = Math.round(m.endOfUtteranceDelayMs ?? -1);
|
|
3344
|
+
compact.transcriptionDelayMs = Math.round(m.transcriptionDelayMs ?? -1);
|
|
3345
|
+
compact.onUserTurnCompletedDelayMs = Math.round(m.onUserTurnCompletedDelayMs ?? -1);
|
|
3346
|
+
compact.speechId = m.speechId;
|
|
3347
|
+
}
|
|
3348
|
+
else if (m.type === 'interruption_metrics') {
|
|
3349
|
+
compact.detectionDelay = Math.round(m.detectionDelay ?? -1);
|
|
3350
|
+
compact.predictionDuration = Math.round(m.predictionDuration ?? -1);
|
|
3351
|
+
compact.numInterruptions = m.numInterruptions;
|
|
3352
|
+
compact.numBackchannels = m.numBackchannels;
|
|
3353
|
+
compact.numRequests = m.numRequests;
|
|
3354
|
+
}
|
|
3355
|
+
console.log(`📈 [METRICS]`, JSON.stringify(compact));
|
|
3356
|
+
});
|
|
3357
|
+
// 0.9.71: function_tools_executed — when a tool batch completes inside the SDK.
|
|
3358
|
+
sess.on('function_tools_executed', (ev) => {
|
|
3359
|
+
try {
|
|
3360
|
+
const calls = ev?.functionCalls?.length ?? 0;
|
|
3361
|
+
const outputs = ev?.functionOutputs?.length ?? 0;
|
|
3362
|
+
console.log(`🛠️ [TOOLS-EXECUTED] calls=${calls} outputs=${outputs} t=${new Date().toISOString()}`);
|
|
3363
|
+
}
|
|
3364
|
+
catch { }
|
|
3365
|
+
});
|
|
3195
3366
|
// 0.9.68: mirror SDK's internal unrecoverable-error counters so we can
|
|
3196
3367
|
// see EXACTLY how close we are to closeImpl() firing (default threshold 3).
|
|
3197
3368
|
// Counter resets on each successful "speaking" transition (agent_session.js:740).
|
package/dist/voice-io.js
CHANGED
|
@@ -137,7 +137,8 @@ export const DIRECT_MODE_STT = {
|
|
|
137
137
|
export const DIRECT_MODE_TTS = {
|
|
138
138
|
// provider: 'deepgram', model: 'aura-2-asteria-en', // WebSocket-based: handles TTS abort cleanly (no unrecoverable crash on interruption)
|
|
139
139
|
// provider: 'gemini', model: 'gemini-2.5-flash-preview-tts', voice: 'apollo',
|
|
140
|
-
provider: 'openai', model: 'tts-1', voice: 'fable',
|
|
140
|
+
// provider: 'openai', model: 'tts-1', voice: 'fable', // HTTP streaming: throws APIUserAbortError on interrupt → unrecoverable session crash
|
|
141
|
+
provider: 'openai', model: 'tts-1-hd', voice: 'fable', // 0.9.70: test tts-1-hd — tts-1 had chronic per-sentence HTTP hangs (40s SDK watchdog → APIUserAbortError mid-message)
|
|
141
142
|
// provider: 'groq-orpheus', model: 'canopylabs/orpheus-v1-english', voice: 'autumn', // $22/M chars — voices: autumn, diana, hannah, austin, daniel, troy
|
|
142
143
|
};
|
|
143
144
|
/**
|