osborn 0.9.68 → 0.9.71

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -15,6 +15,9 @@ import { dirname, join } from 'node:path';
15
15
  import { fileURLToPath } from 'node:url';
16
16
  import { spawn } from 'node:child_process';
17
17
  import { randomUUID } from 'node:crypto';
18
+ import { createRequire } from 'node:module';
19
+ // 0.9.71: createRequire for resolving package.json versions inside ESM
20
+ const __sdkVersionRequire = createRequire(import.meta.url);
18
21
  import { homedir, tmpdir } from 'node:os';
19
22
  import { PassThrough } from 'node:stream';
20
23
  import { createGunzip } from 'node:zlib';
@@ -2175,6 +2178,34 @@ async function main() {
2175
2178
  },
2176
2179
  },
2177
2180
  });
2181
+ // 0.9.71: dump the RESOLVED AgentSession options (after defaults applied)
2182
+ // so prod logs prove exactly what tuning is live for any given session.
2183
+ try {
2184
+ const so = session.sessionOptions ?? {};
2185
+ const detect = session.interruptionDetection;
2186
+ const turn = so.turnHandling ?? {};
2187
+ console.log('๐Ÿงช [BE-AGENT-SESSION-CONFIG]', JSON.stringify({
2188
+ t: new Date().toISOString(),
2189
+ maxToolSteps: so.maxToolSteps,
2190
+ userAwayTimeout: so.userAwayTimeout,
2191
+ aecWarmupDuration: so.aecWarmupDuration,
2192
+ ttsReadIdleTimeout: so.ttsReadIdleTimeout,
2193
+ forwardAudioIdleTimeout: so.forwardAudioIdleTimeout,
2194
+ useTtsAlignedTranscript: so.useTtsAlignedTranscript,
2195
+ ttsTextTransforms: so.ttsTextTransforms,
2196
+ interruptionDetectionMode: detect, // 'vad' | 'adaptive' | undefined
2197
+ turnHandling: {
2198
+ turnDetection: turn.turnDetection,
2199
+ endpointing: turn.endpointing,
2200
+ interruption: turn.interruption,
2201
+ preemptiveGeneration: turn.preemptiveGeneration,
2202
+ userTurnLimit: turn.userTurnLimit,
2203
+ },
2204
+ }));
2205
+ }
2206
+ catch (err) {
2207
+ console.log('๐Ÿงช [BE-AGENT-SESSION-CONFIG] failed:', err instanceof Error ? err.message : String(err));
2208
+ }
2178
2209
  return { session, agent };
2179
2210
  }
2180
2211
  // ============================================================
@@ -2713,6 +2744,35 @@ async function main() {
2713
2744
  }).catch((err) => {
2714
2745
  console.log(`โš ๏ธ [LIVEKIT-DASHBOARD] failed to fetch room SID: ${err instanceof Error ? err.message : String(err)}`);
2715
2746
  });
2747
+ // 0.9.71: SDK + runtime snapshot — proves what's actually running so
2748
+ // future log forensics can rule out version drift in one grep.
2749
+ try {
2750
+ const pkgs = {};
2751
+ for (const name of [
2752
+ 'osborn',
2753
+ '@livekit/agents',
2754
+ '@livekit/agents-plugin-openai',
2755
+ '@livekit/agents-plugin-deepgram',
2756
+ '@livekit/agents-plugin-silero',
2757
+ '@livekit/agents-plugin-google',
2758
+ '@livekit/agents-plugin-elevenlabs',
2759
+ '@livekit/agents-plugin-livekit',
2760
+ '@livekit/rtc-node',
2761
+ 'livekit-server-sdk',
2762
+ '@anthropic-ai/claude-agent-sdk',
2763
+ '@google/genai',
2764
+ 'openai',
2765
+ ]) {
2766
+ try {
2767
+ pkgs[name] = __sdkVersionRequire(`${name}/package.json`).version;
2768
+ }
2769
+ catch { }
2770
+ }
2771
+ console.log('๐Ÿงช [BE-SDK-VERSIONS]', JSON.stringify({ t: new Date().toISOString(), node: process.version, pkgs }));
2772
+ }
2773
+ catch (err) {
2774
+ console.log('๐Ÿงช [BE-SDK-VERSIONS] failed:', err instanceof Error ? err.message : String(err));
2775
+ }
2716
2776
  localParticipant = room.localParticipant;
2717
2777
  // Arm the alone timer: if we connected but no user joins within the grace
2718
2778
  // window (e.g. machine woken then abandoned mid-handshake), leave the room
@@ -2729,6 +2789,46 @@ async function main() {
2729
2789
  // Flux STT's speech-vs-noise classification: slower (~100-300ms) but
2730
2790
  // confidence-aware. The latency tradeoff is worth eliminating the false
2731
2791
  // interrupts at the root.
2792
+ // 0.9.71: Room-level audio observability — observe-only logs so we can
2793
+ // cross-reference user mic mute/quality changes against TTS cutoffs without
2794
+ // re-introducing the over-eager ActiveSpeakers interrupt.
2795
+ room.on(RoomEvent.ActiveSpeakersChanged, (speakers) => {
2796
+ try {
2797
+ const ids = (speakers || []).map((s) => s?.identity).filter(Boolean);
2798
+ console.log(`๐ŸŽ™๏ธ [ROOM-SPEAKERS] count=${ids.length} ids=${JSON.stringify(ids)} t=${new Date().toISOString()}`);
2799
+ }
2800
+ catch { }
2801
+ });
2802
+ room.on(RoomEvent.ConnectionQualityChanged, (quality, participant) => {
2803
+ try {
2804
+ console.log(`๐Ÿ“ถ [ROOM-QUALITY] participant=${participant?.identity} quality=${quality} t=${new Date().toISOString()}`);
2805
+ }
2806
+ catch { }
2807
+ });
2808
+ room.on(RoomEvent.TrackMuted, (publication, participant) => {
2809
+ try {
2810
+ console.log(`๐Ÿ”‡ [ROOM-TRACK-MUTED] participant=${participant?.identity} kind=${publication?.kind} source=${publication?.source} sid=${publication?.sid} t=${new Date().toISOString()}`);
2811
+ }
2812
+ catch { }
2813
+ });
2814
+ room.on(RoomEvent.TrackUnmuted, (publication, participant) => {
2815
+ try {
2816
+ console.log(`๐Ÿ”Š [ROOM-TRACK-UNMUTED] participant=${participant?.identity} kind=${publication?.kind} source=${publication?.source} sid=${publication?.sid} t=${new Date().toISOString()}`);
2817
+ }
2818
+ catch { }
2819
+ });
2820
+ room.on(RoomEvent.TrackSubscribed, (track, publication, participant) => {
2821
+ try {
2822
+ console.log(`๐Ÿ“ฅ [ROOM-TRACK-SUBSCRIBED] participant=${participant?.identity} kind=${track?.kind} source=${publication?.source} sid=${publication?.sid} t=${new Date().toISOString()}`);
2823
+ }
2824
+ catch { }
2825
+ });
2826
+ room.on(RoomEvent.TrackUnsubscribed, (track, publication, participant) => {
2827
+ try {
2828
+ console.log(`๐Ÿ“ค [ROOM-TRACK-UNSUBSCRIBED] participant=${participant?.identity} kind=${track?.kind} source=${publication?.source} sid=${publication?.sid} t=${new Date().toISOString()}`);
2829
+ }
2830
+ catch { }
2831
+ });
2732
2832
  room.on(RoomEvent.Disconnected, () => {
2733
2833
  console.log('๐Ÿ‘‹ Disconnected from room');
2734
2834
  // Clean up active research and voice queue
@@ -3188,10 +3288,81 @@ async function main() {
3188
3288
  // FALLBACK: playout_completed
3189
3289
  sess.on('playout_completed', (ev) => {
3190
3290
  const message = ev.message || ev.text || ev.content;
3291
+ console.log(`๐ŸŽง PLAYOUT COMPLETED [${new Date().toISOString()}]:`, JSON.stringify({
3292
+ speechId: ev.speechHandle?.id ?? ev.speechId,
3293
+ interrupted: ev.interrupted,
3294
+ durationMs: ev.durationMs,
3295
+ messageLen: message ? message.length : 0,
3296
+ }));
3191
3297
  if (message && message.length > 0) {
3192
3298
  sendAgentTranscript(message, 'playout');
3193
3299
  }
3194
3300
  });
3301
+ // 0.9.71: metrics_collected — per-call latency for STT/TTS/LLM/VAD/EOU/Interruption.
3302
+ // SINGLE highest-signal event for diagnosing audio cutoffs.
3303
+ // • TTSMetrics.ttfbMs / durationMs / audioDurationMs / cancelled → directly answers
3304
+ // "did the OpenAI HTTP fetch hang or did it complete and the SDK aborted?"
3305
+ // • STTMetrics.audioDurationMs / durationMs → Deepgram latency per utterance
3306
+ // • LLMMetrics.ttftMs → cold-vs-warm Claude subprocess
3307
+ // • EOUMetrics.endOfUtteranceDelayMs / transcriptionDelayMs → end-of-turn timing
3308
+ // • InterruptionMetrics.{detectionDelay, numInterruptions, numBackchannels} →
3309
+ // turn-detector signal at the source
3310
+ sess.on('metrics_collected', (ev) => {
3311
+ const m = ev?.metrics;
3312
+ if (!m)
3313
+ return;
3314
+ const compact = { type: m.type, label: m.label, t: new Date().toISOString() };
3315
+ // Per-type subset โ€” keep tight
3316
+ if (m.type === 'tts_metrics') {
3317
+ compact.ttfbMs = Math.round(m.ttfbMs ?? -1);
3318
+ compact.durationMs = Math.round(m.durationMs ?? -1);
3319
+ compact.audioDurationMs = Math.round(m.audioDurationMs ?? -1);
3320
+ compact.cancelled = !!m.cancelled;
3321
+ compact.charactersCount = m.charactersCount;
3322
+ compact.streamed = !!m.streamed;
3323
+ compact.speechId = m.speechId;
3324
+ }
3325
+ else if (m.type === 'stt_metrics') {
3326
+ compact.audioDurationMs = Math.round(m.audioDurationMs ?? -1);
3327
+ compact.durationMs = Math.round(m.durationMs ?? -1);
3328
+ compact.streamed = !!m.streamed;
3329
+ }
3330
+ else if (m.type === 'llm_metrics') {
3331
+ compact.ttftMs = Math.round(m.ttftMs ?? -1);
3332
+ compact.durationMs = Math.round(m.durationMs ?? -1);
3333
+ compact.cancelled = !!m.cancelled;
3334
+ compact.completionTokens = m.completionTokens;
3335
+ compact.promptTokens = m.promptTokens;
3336
+ compact.speechId = m.speechId;
3337
+ }
3338
+ else if (m.type === 'vad_metrics') {
3339
+ compact.idleTimeMs = Math.round(m.idleTimeMs ?? -1);
3340
+ compact.inferenceCount = m.inferenceCount;
3341
+ }
3342
+ else if (m.type === 'eou_metrics') {
3343
+ compact.endOfUtteranceDelayMs = Math.round(m.endOfUtteranceDelayMs ?? -1);
3344
+ compact.transcriptionDelayMs = Math.round(m.transcriptionDelayMs ?? -1);
3345
+ compact.onUserTurnCompletedDelayMs = Math.round(m.onUserTurnCompletedDelayMs ?? -1);
3346
+ compact.speechId = m.speechId;
3347
+ }
3348
+ else if (m.type === 'interruption_metrics') {
3349
+ compact.detectionDelay = Math.round(m.detectionDelay ?? -1);
3350
+ compact.predictionDuration = Math.round(m.predictionDuration ?? -1);
3351
+ compact.numInterruptions = m.numInterruptions;
3352
+ compact.numBackchannels = m.numBackchannels;
3353
+ compact.numRequests = m.numRequests;
3354
+ }
3355
+ console.log(`๐Ÿ“ˆ [METRICS]`, JSON.stringify(compact));
3356
+ });
3357
+ // 0.9.71: function_tools_executed — when a tool batch completes inside the SDK.
3358
+ sess.on('function_tools_executed', (ev) => {
3359
+ try {
3360
+ const calls = ev?.functionCalls?.length ?? 0;
3361
+ const outputs = ev?.functionOutputs?.length ?? 0;
3362
+ console.log(`๐Ÿ› ๏ธ [TOOLS-EXECUTED] calls=${calls} outputs=${outputs} t=${new Date().toISOString()}`);
3363
+ }
3364
+ catch { }
3365
+ });
3195
3366
  // 0.9.68: mirror SDK's internal unrecoverable-error counters so we can
3196
3367
  // see EXACTLY how close we are to closeImpl() firing (default threshold 3).
3197
3368
  // Counter resets on each successful "speaking" transition (agent_session.js:740).
package/dist/voice-io.js CHANGED
@@ -137,7 +137,8 @@ export const DIRECT_MODE_STT = {
137
137
  export const DIRECT_MODE_TTS = {
138
138
  // provider: 'deepgram', model: 'aura-2-asteria-en', // WebSocket-based: handles TTS abort cleanly (no unrecoverable crash on interruption)
139
139
  // provider: 'gemini', model: 'gemini-2.5-flash-preview-tts', voice: 'apollo',
140
- provider: 'openai', model: 'tts-1', voice: 'fable', // HTTP streaming: throws APIUserAbortError on interrupt → unrecoverable session crash
140
+ // provider: 'openai', model: 'tts-1', voice: 'fable', // HTTP streaming: throws APIUserAbortError on interrupt → unrecoverable session crash
141
+ provider: 'openai', model: 'tts-1-hd', voice: 'fable', // 0.9.70: test tts-1-hd — tts-1 had chronic per-sentence HTTP hangs (40s SDK watchdog → APIUserAbortError mid-message)
141
142
  // provider: 'groq-orpheus', model: 'canopylabs/orpheus-v1-english', voice: 'autumn', // $22/M chars โ€” voices: autumn, diana, hannah, austin, daniel, troy
142
143
  };
143
144
  /**
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "osborn",
3
- "version": "0.9.68",
3
+ "version": "0.9.71",
4
4
  "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
5
5
  "type": "module",
6
6
  "bin": {