osborn 0.9.37 → 0.9.39

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -31,10 +31,23 @@ RUN npm install -g "osborn@${OSBORN_VERSION}" @anthropic-ai/claude-code
31
31
  # Persistent workspace + claude config dirs
32
32
  RUN mkdir -p /workspace /root/.claude
33
33
 
34
+ # Marker so orchestration (machines.ts isManifestAware) can detect this image
35
+ # supports the manifest-driven update flow. Pre-marker machines fall back to
36
+ # the image-swap update path, which brings them onto a marker-aware image;
37
+ # from then on, all updates use the manifest flow defined in the entrypoint.
38
+ RUN touch /etc/osborn-manifest-aware
39
+
34
40
  ENV OSBORN_CWD=/workspace
35
41
  ENV OSBORN_API_PORT=8741
36
42
  ENV NODE_ENV=production
37
43
 
44
+ # HOME points at the volume so user-space config from any tool that respects
45
+ # HOME (gh, git, ssh, aws, etc.) automatically writes to the persistent
46
+ # volume instead of the ephemeral container overlay. The existing /root/.claude
47
+ # symlink machinery below stays in place — it's redundant with HOME=/workspace
48
+ # but harmless.
49
+ ENV HOME=/workspace
50
+
38
51
  WORKDIR /workspace
39
52
 
40
53
  EXPOSE 8741
@@ -52,6 +65,10 @@ ln -sf /workspace/.claude /root/.claude
52
65
  # Suppress Claude Code interactive onboarding prompts
53
66
  ONBOARDING_JSON='{"numStartups":10,"installMethod":"npm","autoUpdates":false,"hasCompletedOnboarding":true,"hasTrustDialogAccepted":true,"hasTrustDialogHooksAccepted":true,"hasCompletedProjectOnboarding":true,"hasAcknowledgedCostThreshold":true,"effortCalloutV2Dismissed":true,"theme":"dark","projects":{"/workspace":{"hasTrustDialogAccepted":true,"hasTrustDialogHooksAccepted":true,"hasCompletedProjectOnboarding":true}}}'
54
67
  echo "$ONBOARDING_JSON" > /root/.claude.json
68
+ # Additional write at $HOME/.claude.json. With HOME=/workspace this is where
69
+ # Claude Code actually reads its top-level config from; the /root/.claude.json
70
+ # write above becomes dead but is left in place (harmless).
71
+ echo "$ONBOARDING_JSON" > /workspace/.claude.json
55
72
  mkdir -p /workspace/.claude
56
73
  echo "$ONBOARDING_JSON" > /workspace/.claude/.config.json
57
74
  echo "$ONBOARDING_JSON" > /workspace/.claude/claude.json
@@ -79,6 +96,23 @@ if [ -d "$PKG_SKILLS_DIR" ]; then
79
96
  done
80
97
  fi
81
98
 
99
+ # Manifest-driven version check.
100
+ # Orchestration writes /workspace/.osborn-want-version on update (machines.ts
101
+ # updateViaManifest). On every boot we compare to the currently-installed
102
+ # osborn and run `npm install -g osborn@<want>` if they differ. The install
103
+ # lands in the container overlay (default npm prefix) — Fly wipes overlay on
104
+ # stop/start so the install re-runs on every boot until the base image is
105
+ # rebuilt with that version baked in. Update is fast between Fly restarts;
106
+ # only the first boot after a restart pays the npm install cost.
107
+ WANT=$(cat /workspace/.osborn-want-version 2>/dev/null | tr -d '[:space:]')
108
+ if [ -n "$WANT" ]; then
109
+ CURRENT=$(osborn --version 2>/dev/null | head -1 | tr -d '[:space:]')
110
+ if [ "$WANT" != "$CURRENT" ]; then
111
+ echo "[sandbox] osborn ${CURRENT:-none} → ${WANT} (manifest install)"
112
+ npm install -g "osborn@${WANT}" || echo "[sandbox] install failed — running ${CURRENT:-image-baked} version"
113
+ fi
114
+ fi
115
+
82
116
  exec osborn
83
117
  ENTRYPOINT
84
118
 
package/dist/index.js CHANGED
@@ -1,7 +1,7 @@
1
1
  // Load environment variables FIRST before any other imports
2
2
  import 'dotenv/config';
3
3
  import { voice, initializeLogger } from '@livekit/agents';
4
- import { Room, RoomEvent, RemoteParticipant } from '@livekit/rtc-node';
4
+ import { Room, RoomEvent } from '@livekit/rtc-node';
5
5
  import { AccessToken } from 'livekit-server-sdk';
6
6
  // Initialize logger before anything else
7
7
  initializeLogger({ pretty: true, level: 'info' });
@@ -2530,51 +2530,16 @@ async function main() {
2530
2530
  console.log('✅ Connected to room:', roomName);
2531
2531
  localParticipant = room.localParticipant;
2532
2532
  });
2533
- // EARLIEST possible "user is speaking" signal in our setup. Driven by LiveKit's
2534
- // server-side audio-level VAD on the participant's WebRTC track — fires ~50-100ms
2535
- // after mic onset, independent of Deepgram STT or any local VAD (we don't run one).
2536
- //
2537
- // Flow: user starts talking → ActiveSpeakersChanged includes a RemoteParticipant
2538
- // → if agent is currently speaking → interrupt the SpeechHandle to flush TTS playback.
2539
- // The existing handleSpeechDone callback (around line 1320) captures the spoken-text
2540
- // + JSONL context into lastInterruption; PipelineDirectLLM consumes it on the next
2541
- // chat() call to enrich the user's message with [INTERRUPTED] context — so the
2542
- // post-interrupt note flow is preserved even though we're cutting TTS earlier.
2543
- //
2544
- // Filter is `instanceof RemoteParticipant`. The agent IS the LocalParticipant in this
2545
- // room, and when its TTS plays it appears in the active-speakers list too. An earlier
2546
- // attempt that compared `s.identity !== room.localParticipant?.identity` failed because
2547
- // localParticipant.identity could be undefined at event-fire time, letting the agent's
2548
- // own speech trigger a self-interrupt. The type check is bulletproof.
2549
- //
2550
- // Realtime mode skipped — the SDK handles interruption internally there, and manual
2551
- // interrupt for Gemini realtime crashes its state machine (code 1008, memory v0.4.5).
2552
- let lastActiveSpeakerInterruptAt = 0;
2553
- room.on(RoomEvent.ActiveSpeakersChanged, (speakers) => {
2554
- if (!Array.isArray(speakers) || speakers.length === 0)
2555
- return;
2556
- const remoteSpeakers = speakers.filter((s) => s instanceof RemoteParticipant);
2557
- if (remoteSpeakers.length === 0)
2558
- return;
2559
- if (currentVoiceMode === 'realtime')
2560
- return;
2561
- if (agentState !== 'speaking')
2562
- return;
2563
- const now = Date.now();
2564
- const debounced = now - lastActiveSpeakerInterruptAt < 1000;
2565
- lastActiveSpeakerInterruptAt = now;
2566
- try {
2567
- if (!debounced) {
2568
- const ids = remoteSpeakers.map((s) => s.identity).join(',');
2569
- console.log(`🎤 ActiveSpeakersChanged: remote speakers [${ids}] + agent speaking → interrupting TTS`);
2570
- }
2571
- currentSession?.interrupt();
2572
- }
2573
- catch (err) {
2574
- if (!debounced)
2575
- console.warn('⚠️ active-speaker interrupt failed:', err instanceof Error ? err.message : err);
2576
- }
2577
- });
2533
+ // NOTE: previously this section also had a RoomEvent.ActiveSpeakersChanged
2534
+ // handler that interrupted TTS on any sustained audio activity (~50ms after
2535
+ // mic onset). That fired too eagerly — coughs, paper rustles, the agent's
2536
+ // own TTS bleeding through the mic, and other non-speech sounds tripped it
2537
+ // ~10-15% of the time, leaving the agent silent with no recovery path
2538
+ // (because no STT transcript would follow). Dropped in favor of the
2539
+ // user_state_changed 'speaking' handler below, which is fed by Deepgram
2540
+ // Flux STT's speech-vs-noise classification: slower (~100-300ms) but
2541
+ // confidence-aware. The latency tradeoff is worth eliminating the false
2542
+ // interrupts at the root.
2578
2543
  room.on(RoomEvent.Disconnected, () => {
2579
2544
  console.log('👋 Disconnected from room');
2580
2545
  // Clean up active research and voice queue
@@ -2868,19 +2833,20 @@ async function main() {
2868
2833
  }
2869
2834
  });
2870
2835
  // User state tracking — prevents queue from colliding with server-side VAD.
2871
- // Also a secondary interrupt trigger: when Deepgram STT classifies speech onset
2872
- // it propagates here via agent_activity.onStartOfSpeech → _updateUserState('speaking').
2873
- // Fires later than ActiveSpeakersChanged (Deepgram has ~100-300ms classification
2874
- // latency vs LiveKit's ~50-100ms audio-level) but acts as a redundant fallback in
2875
- // case the room-level event drops. interrupt() is idempotent on an already-
2876
- // interrupted SpeechHandle so calling both paths is harmless.
2836
+ // Also the PRIMARY interrupt trigger now that the over-eager ActiveSpeakersChanged
2837
+ // path is gone. Fires when Deepgram Flux STT classifies frames as speech (not noise)
2838
+ // and propagates via agent_activity.onStartOfSpeech → _updateUserState('speaking').
2839
+ // Latency ~100-300ms after mic onset, which is the cost of confidence-aware
2840
+ // detection — vs the prior ActiveSpeakers handler that fired at ~50ms on any audio
2841
+ // activity and tripped ~10-15% false interrupts on coughs, paper rustle, agent's
2842
+ // own TTS bleeding through the mic, etc.
2877
2843
  sess.on('user_state_changed', (ev) => {
2878
2844
  const prev = userState;
2879
2845
  userState = ev.newState;
2880
2846
  console.log(`👤 User state: ${prev} → ${ev.newState} (agent: ${agentState})`);
2881
2847
  if (ev.newState === 'speaking' && agentState === 'speaking' && sessionVoiceMode !== 'realtime') {
2882
2848
  try {
2883
- console.log('🎤 user_state_changed=speaking + agent speaking → interrupting TTS (fallback)');
2849
+ console.log('🎤 user_state_changed=speaking + agent speaking → interrupting TTS');
2884
2850
  currentSession?.interrupt();
2885
2851
  }
2886
2852
  catch (err) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "osborn",
3
- "version": "0.9.37",
3
+ "version": "0.9.39",
4
4
  "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
5
5
  "type": "module",
6
6
  "bin": {