osborn 0.9.67 → 0.9.71

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -15,6 +15,9 @@ import { dirname, join } from 'node:path';
15
15
  import { fileURLToPath } from 'node:url';
16
16
  import { spawn } from 'node:child_process';
17
17
  import { randomUUID } from 'node:crypto';
18
+ import { createRequire } from 'node:module';
19
+ // 0.9.71: createRequire for resolving package.json versions inside ESM
20
+ const __sdkVersionRequire = createRequire(import.meta.url);
18
21
  import { homedir, tmpdir } from 'node:os';
19
22
  import { PassThrough } from 'node:stream';
20
23
  import { createGunzip } from 'node:zlib';
@@ -2175,6 +2178,34 @@ async function main() {
2175
2178
  },
2176
2179
  },
2177
2180
  });
2181
+ // 0.9.71: dump the RESOLVED AgentSession options (after defaults applied)
2182
+ // so prod logs prove exactly what tuning is live for any given session.
2183
+ try {
2184
+ const so = session.sessionOptions ?? {};
2185
+ const detect = session.interruptionDetection;
2186
+ const turn = so.turnHandling ?? {};
2187
+ console.log('๐Ÿงช [BE-AGENT-SESSION-CONFIG]', JSON.stringify({
2188
+ t: new Date().toISOString(),
2189
+ maxToolSteps: so.maxToolSteps,
2190
+ userAwayTimeout: so.userAwayTimeout,
2191
+ aecWarmupDuration: so.aecWarmupDuration,
2192
+ ttsReadIdleTimeout: so.ttsReadIdleTimeout,
2193
+ forwardAudioIdleTimeout: so.forwardAudioIdleTimeout,
2194
+ useTtsAlignedTranscript: so.useTtsAlignedTranscript,
2195
+ ttsTextTransforms: so.ttsTextTransforms,
2196
+ interruptionDetectionMode: detect, // 'vad' | 'adaptive' | undefined
2197
+ turnHandling: {
2198
+ turnDetection: turn.turnDetection,
2199
+ endpointing: turn.endpointing,
2200
+ interruption: turn.interruption,
2201
+ preemptiveGeneration: turn.preemptiveGeneration,
2202
+ userTurnLimit: turn.userTurnLimit,
2203
+ },
2204
+ }));
2205
+ }
2206
+ catch (err) {
2207
+ console.log('๐Ÿงช [BE-AGENT-SESSION-CONFIG] failed:', err instanceof Error ? err.message : String(err));
2208
+ }
2178
2209
  return { session, agent };
2179
2210
  }
2180
2211
  // ============================================================
@@ -2703,7 +2734,45 @@ async function main() {
2703
2734
  // Room Event Handlers
2704
2735
  // ============================================================
2705
2736
  room.on(RoomEvent.Connected, () => {
2706
- console.log('โœ… Connected to room:', roomName);
2737
+ // 0.9.68: log Room SID + name PROMINENTLY so we can cross-reference
2738
+ // this specific session in LiveKit Cloud dashboard → Sessions tab.
2739
+ // @livekit/rtc-node Room exposes SID via async getSid() (it's resolved
2740
+ // after WebRTC handshake), so we fetch it asynchronously and log when ready.
2741
+ console.log(`โœ… Connected to room: ${roomName} | t=${new Date().toISOString()}`);
2742
+ room.getSid().then((sid) => {
2743
+ console.log(`๐Ÿ”— [LIVEKIT-DASHBOARD] room sid=${sid} name=${roomName} โ€” search at https://cloud.livekit.io/projects โ†’ Sessions โ†’ "${sid}"`);
2744
+ }).catch((err) => {
2745
+ console.log(`โš ๏ธ [LIVEKIT-DASHBOARD] failed to fetch room SID: ${err instanceof Error ? err.message : String(err)}`);
2746
+ });
2747
+ // 0.9.71: SDK + runtime snapshot — proves what's actually running so
2748
+ // future log forensics can rule out version drift in one grep.
2749
+ try {
2750
+ const pkgs = {};
2751
+ for (const name of [
2752
+ 'osborn',
2753
+ '@livekit/agents',
2754
+ '@livekit/agents-plugin-openai',
2755
+ '@livekit/agents-plugin-deepgram',
2756
+ '@livekit/agents-plugin-silero',
2757
+ '@livekit/agents-plugin-google',
2758
+ '@livekit/agents-plugin-elevenlabs',
2759
+ '@livekit/agents-plugin-livekit',
2760
+ '@livekit/rtc-node',
2761
+ 'livekit-server-sdk',
2762
+ '@anthropic-ai/claude-agent-sdk',
2763
+ '@google/genai',
2764
+ 'openai',
2765
+ ]) {
2766
+ try {
2767
+ pkgs[name] = __sdkVersionRequire(`${name}/package.json`).version;
2768
+ }
2769
+ catch { }
2770
+ }
2771
+ console.log('๐Ÿงช [BE-SDK-VERSIONS]', JSON.stringify({ t: new Date().toISOString(), node: process.version, pkgs }));
2772
+ }
2773
+ catch (err) {
2774
+ console.log('๐Ÿงช [BE-SDK-VERSIONS] failed:', err instanceof Error ? err.message : String(err));
2775
+ }
2707
2776
  localParticipant = room.localParticipant;
2708
2777
  // Arm the alone timer: if we connected but no user joins within the grace
2709
2778
  // window (e.g. machine woken then abandoned mid-handshake), leave the room
@@ -2720,6 +2789,46 @@ async function main() {
2720
2789
  // Flux STT's speech-vs-noise classification: slower (~100-300ms) but
2721
2790
  // confidence-aware. The latency tradeoff is worth eliminating the false
2722
2791
  // interrupts at the root.
2792
+ // 0.9.71: Room-level audio observability — observe-only logs so we can
2793
+ // cross-reference user mic mute/quality changes against TTS cutoffs without
2794
+ // re-introducing the over-eager ActiveSpeakers interrupt.
2795
+ room.on(RoomEvent.ActiveSpeakersChanged, (speakers) => {
2796
+ try {
2797
+ const ids = (speakers || []).map((s) => s?.identity).filter(Boolean);
2798
+ console.log(`๐ŸŽ™๏ธ [ROOM-SPEAKERS] count=${ids.length} ids=${JSON.stringify(ids)} t=${new Date().toISOString()}`);
2799
+ }
2800
+ catch { }
2801
+ });
2802
+ room.on(RoomEvent.ConnectionQualityChanged, (quality, participant) => {
2803
+ try {
2804
+ console.log(`๐Ÿ“ถ [ROOM-QUALITY] participant=${participant?.identity} quality=${quality} t=${new Date().toISOString()}`);
2805
+ }
2806
+ catch { }
2807
+ });
2808
+ room.on(RoomEvent.TrackMuted, (publication, participant) => {
2809
+ try {
2810
+ console.log(`๐Ÿ”‡ [ROOM-TRACK-MUTED] participant=${participant?.identity} kind=${publication?.kind} source=${publication?.source} sid=${publication?.sid} t=${new Date().toISOString()}`);
2811
+ }
2812
+ catch { }
2813
+ });
2814
+ room.on(RoomEvent.TrackUnmuted, (publication, participant) => {
2815
+ try {
2816
+ console.log(`๐Ÿ”Š [ROOM-TRACK-UNMUTED] participant=${participant?.identity} kind=${publication?.kind} source=${publication?.source} sid=${publication?.sid} t=${new Date().toISOString()}`);
2817
+ }
2818
+ catch { }
2819
+ });
2820
+ room.on(RoomEvent.TrackSubscribed, (track, publication, participant) => {
2821
+ try {
2822
+ console.log(`๐Ÿ“ฅ [ROOM-TRACK-SUBSCRIBED] participant=${participant?.identity} kind=${track?.kind} source=${publication?.source} sid=${publication?.sid} t=${new Date().toISOString()}`);
2823
+ }
2824
+ catch { }
2825
+ });
2826
+ room.on(RoomEvent.TrackUnsubscribed, (track, publication, participant) => {
2827
+ try {
2828
+ console.log(`๐Ÿ“ค [ROOM-TRACK-UNSUBSCRIBED] participant=${participant?.identity} kind=${track?.kind} source=${publication?.source} sid=${publication?.sid} t=${new Date().toISOString()}`);
2829
+ }
2830
+ catch { }
2831
+ });
2723
2832
  room.on(RoomEvent.Disconnected, () => {
2724
2833
  console.log('๐Ÿ‘‹ Disconnected from room');
2725
2834
  // Clean up active research and voice queue
@@ -3179,13 +3288,101 @@ async function main() {
3179
3288
  // FALLBACK: playout_completed
3180
3289
  sess.on('playout_completed', (ev) => {
3181
3290
  const message = ev.message || ev.text || ev.content;
3291
+ console.log(`๐ŸŽง PLAYOUT COMPLETED [${new Date().toISOString()}]:`, JSON.stringify({
3292
+ speechId: ev.speechHandle?.id ?? ev.speechId,
3293
+ interrupted: ev.interrupted,
3294
+ durationMs: ev.durationMs,
3295
+ messageLen: message ? message.length : 0,
3296
+ }));
3182
3297
  if (message && message.length > 0) {
3183
3298
  sendAgentTranscript(message, 'playout');
3184
3299
  }
3185
3300
  });
3301
+ // 0.9.71: metrics_collected — per-call latency for STT/TTS/LLM/VAD/EOU/Interruption.
3302
+ // SINGLE highest-signal event for diagnosing audio cutoffs.
3303
+ // • TTSMetrics.ttfbMs / durationMs / audioDurationMs / cancelled → directly answers
3304
+ // "did the OpenAI HTTP fetch hang or did it complete and the SDK aborted?"
3305
+ // • STTMetrics.audioDurationMs / durationMs → Deepgram latency per utterance
3306
+ // • LLMMetrics.ttftMs → cold-vs-warm Claude subprocess
3307
+ // • EOUMetrics.endOfUtteranceDelayMs / transcriptionDelayMs → end-of-turn timing
3308
+ // • InterruptionMetrics.{detectionDelay, numInterruptions, numBackchannels} →
3309
+ // turn-detector signal at the source
3310
+ sess.on('metrics_collected', (ev) => {
3311
+ const m = ev?.metrics;
3312
+ if (!m)
3313
+ return;
3314
+ const compact = { type: m.type, label: m.label, t: new Date().toISOString() };
3315
+ // Per-type subset — keep tight
3316
+ if (m.type === 'tts_metrics') {
3317
+ compact.ttfbMs = Math.round(m.ttfbMs ?? -1);
3318
+ compact.durationMs = Math.round(m.durationMs ?? -1);
3319
+ compact.audioDurationMs = Math.round(m.audioDurationMs ?? -1);
3320
+ compact.cancelled = !!m.cancelled;
3321
+ compact.charactersCount = m.charactersCount;
3322
+ compact.streamed = !!m.streamed;
3323
+ compact.speechId = m.speechId;
3324
+ }
3325
+ else if (m.type === 'stt_metrics') {
3326
+ compact.audioDurationMs = Math.round(m.audioDurationMs ?? -1);
3327
+ compact.durationMs = Math.round(m.durationMs ?? -1);
3328
+ compact.streamed = !!m.streamed;
3329
+ }
3330
+ else if (m.type === 'llm_metrics') {
3331
+ compact.ttftMs = Math.round(m.ttftMs ?? -1);
3332
+ compact.durationMs = Math.round(m.durationMs ?? -1);
3333
+ compact.cancelled = !!m.cancelled;
3334
+ compact.completionTokens = m.completionTokens;
3335
+ compact.promptTokens = m.promptTokens;
3336
+ compact.speechId = m.speechId;
3337
+ }
3338
+ else if (m.type === 'vad_metrics') {
3339
+ compact.idleTimeMs = Math.round(m.idleTimeMs ?? -1);
3340
+ compact.inferenceCount = m.inferenceCount;
3341
+ }
3342
+ else if (m.type === 'eou_metrics') {
3343
+ compact.endOfUtteranceDelayMs = Math.round(m.endOfUtteranceDelayMs ?? -1);
3344
+ compact.transcriptionDelayMs = Math.round(m.transcriptionDelayMs ?? -1);
3345
+ compact.onUserTurnCompletedDelayMs = Math.round(m.onUserTurnCompletedDelayMs ?? -1);
3346
+ compact.speechId = m.speechId;
3347
+ }
3348
+ else if (m.type === 'interruption_metrics') {
3349
+ compact.detectionDelay = Math.round(m.detectionDelay ?? -1);
3350
+ compact.predictionDuration = Math.round(m.predictionDuration ?? -1);
3351
+ compact.numInterruptions = m.numInterruptions;
3352
+ compact.numBackchannels = m.numBackchannels;
3353
+ compact.numRequests = m.numRequests;
3354
+ }
3355
+ console.log(`๐Ÿ“ˆ [METRICS]`, JSON.stringify(compact));
3356
+ });
3357
+ // 0.9.71: function_tools_executed — when a tool batch completes inside the SDK.
3358
+ sess.on('function_tools_executed', (ev) => {
3359
+ try {
3360
+ const calls = ev?.functionCalls?.length ?? 0;
3361
+ const outputs = ev?.functionOutputs?.length ?? 0;
3362
+ console.log(`๐Ÿ› ๏ธ [TOOLS-EXECUTED] calls=${calls} outputs=${outputs} t=${new Date().toISOString()}`);
3363
+ }
3364
+ catch { }
3365
+ });
3366
+ // 0.9.68: mirror SDK's internal unrecoverable-error counters so we can
3367
+ // see EXACTLY how close we are to closeImpl() firing (default threshold 3).
3368
+ // Counter resets on each successful "speaking" transition (agent_session.js:740).
3369
+ let __ttsErrorCounter = 0;
3370
+ let __llmErrorCounter = 0;
3371
+ const __maxUnrecov = 3; // SDK default DEFAULT_SESSION_CONNECT_OPTIONS.maxUnrecoverableErrors
3186
3372
  // Error handler
3187
3373
  sess.on('error', (ev) => {
3188
3374
  const msg = ev.error?.message || String(ev.error);
3375
+ const errType = ev.type || 'unknown';
3376
+ const recoverable = ev.recoverable;
3377
+ // 0.9.68: counter mirror — increment for recoverable:false same as SDK does
3378
+ if (recoverable === false) {
3379
+ if (errType === 'tts_error')
3380
+ __ttsErrorCounter++;
3381
+ else if (errType === 'llm_error')
3382
+ __llmErrorCounter++;
3383
+ }
3384
+ const willCloseNext = (__ttsErrorCounter > __maxUnrecov || __llmErrorCounter > __maxUnrecov);
3385
+ console.log(`๐Ÿ“Š [ERROR-COUNTER] type=${errType} recoverable=${recoverable} ttsErrorCount=${__ttsErrorCounter}/${__maxUnrecov} llmErrorCount=${__llmErrorCounter}/${__maxUnrecov} willCloseNext=${willCloseNext} t=${new Date().toISOString()}`);
3189
3386
  // OpenAI race: voice queue collided with server-side VAD auto-response
3190
3387
  if (msg.includes('conversation_already_has_active_response') || msg.includes('active_response')) {
3191
3388
  console.log('โš ๏ธ OpenAI active response collision โ€” queue will retry on next listening state');
@@ -3198,6 +3395,15 @@ async function main() {
3198
3395
  }
3199
3396
  console.error('โŒ Session error:', ev.error);
3200
3397
  });
3398
+ // 0.9.68: reset error counter mirror when SDK does (on speaking transition).
3399
+ // Reuses the existing agent_state_changed handler logic — fires AFTER.
3400
+ sess.on('agent_state_changed', (ev) => {
3401
+ if (ev.newState === 'speaking' && (__ttsErrorCounter > 0 || __llmErrorCounter > 0)) {
3402
+ console.log(`๐Ÿ“Š [COUNTER-RESET] speaking transition cleared ttsErrorCount=${__ttsErrorCounter}โ†’0 llmErrorCount=${__llmErrorCounter}โ†’0`);
3403
+ __ttsErrorCounter = 0;
3404
+ __llmErrorCounter = 0;
3405
+ }
3406
+ });
3201
3407
  // Capture voice mode at session creation โ€” prevents state confusion
3202
3408
  // if currentVoiceMode changes between session start and crash recovery
3203
3409
  const sessionVoiceMode = currentVoiceMode;
package/dist/voice-io.js CHANGED
@@ -137,7 +137,8 @@ export const DIRECT_MODE_STT = {
137
137
  export const DIRECT_MODE_TTS = {
138
138
  // provider: 'deepgram', model: 'aura-2-asteria-en', // WebSocket-based: handles TTS abort cleanly (no unrecoverable crash on interruption)
139
139
  // provider: 'gemini', model: 'gemini-2.5-flash-preview-tts', voice: 'apollo',
140
- provider: 'openai', model: 'tts-1', voice: 'fable', // HTTP streaming: throws APIUserAbortError on interrupt โ†’ unrecoverable session crash
140
+ // provider: 'openai', model: 'tts-1', voice: 'fable', // HTTP streaming: throws APIUserAbortError on interrupt โ†’ unrecoverable session crash
141
+ provider: 'openai', model: 'tts-1-hd', voice: 'fable', // 0.9.70: test tts-1-hd โ€” tts-1 had chronic per-sentence HTTP hangs (40s SDK watchdog โ†’ APIUserAbortError mid-message)
141
142
  // provider: 'groq-orpheus', model: 'canopylabs/orpheus-v1-english', voice: 'autumn', // $22/M chars โ€” voices: autumn, diana, hannah, austin, daniel, troy
142
143
  };
143
144
  /**
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "osborn",
3
- "version": "0.9.67",
3
+ "version": "0.9.71",
4
4
  "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
5
5
  "type": "module",
6
6
  "bin": {