npm - osborn - Versions diffs - 0.9.37 → 0.9.39 - Mend

osborn 0.9.37 → 0.9.39

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.

Files changed (3)

package/Dockerfile.sandbox CHANGED Viewed

@@ -31,10 +31,23 @@ RUN npm install -g "osborn@${OSBORN_VERSION}" @anthropic-ai/claude-code
 # Persistent workspace + claude config dirs
 RUN mkdir -p /workspace /root/.claude
+# Marker so orchestration (machines.ts isManifestAware) can detect this image
+# supports the manifest-driven update flow. Pre-marker machines fall back to
+# the image-swap update path, which brings them onto a marker-aware image;
+# from then on, all updates use the manifest flow defined in the entrypoint.
+RUN touch /etc/osborn-manifest-aware
 ENV OSBORN_CWD=/workspace
 ENV OSBORN_API_PORT=8741
 ENV NODE_ENV=production
+# HOME points at the volume so user-space config from any tool that respects
+# HOME (gh, git, ssh, aws, etc.) automatically writes to the persistent
+# volume instead of the ephemeral container overlay. The existing /root/.claude
+# symlink machinery below stays in place — it's redundant with HOME=/workspace
+# but harmless.
+ENV HOME=/workspace
 WORKDIR /workspace
 EXPOSE 8741
@@ -52,6 +65,10 @@ ln -sf /workspace/.claude /root/.claude
 # Suppress Claude Code interactive onboarding prompts
 ONBOARDING_JSON='{"numStartups":10,"installMethod":"npm","autoUpdates":false,"hasCompletedOnboarding":true,"hasTrustDialogAccepted":true,"hasTrustDialogHooksAccepted":true,"hasCompletedProjectOnboarding":true,"hasAcknowledgedCostThreshold":true,"effortCalloutV2Dismissed":true,"theme":"dark","projects":{"/workspace":{"hasTrustDialogAccepted":true,"hasTrustDialogHooksAccepted":true,"hasCompletedProjectOnboarding":true}}}'
 echo "$ONBOARDING_JSON" > /root/.claude.json
+# Additional write at $HOME/.claude.json. With HOME=/workspace this is where
+# Claude Code actually reads its top-level config from; the /root/.claude.json
+# write above becomes dead but is left in place (harmless).
+echo "$ONBOARDING_JSON" > /workspace/.claude.json
 mkdir -p /workspace/.claude
 echo "$ONBOARDING_JSON" > /workspace/.claude/.config.json
 echo "$ONBOARDING_JSON" > /workspace/.claude/claude.json
@@ -79,6 +96,23 @@ if [ -d "$PKG_SKILLS_DIR" ]; then
   done
 fi
+# Manifest-driven version check.
+# Orchestration writes /workspace/.osborn-want-version on update (machines.ts
+# updateViaManifest). On every boot we compare to the currently-installed
+# osborn and run `npm install -g osborn@<want>` if they differ. The install
+# lands in the container overlay (default npm prefix) — Fly wipes overlay on
+# stop/start so the install re-runs on every boot until the base image is
+# rebuilt with that version baked in. Update is fast between Fly restarts;
+# only the first boot after a restart pays the npm install cost.
+WANT=$(cat /workspace/.osborn-want-version 2>/dev/null | tr -d '[:space:]')
+if [ -n "$WANT" ]; then
+  CURRENT=$(osborn --version 2>/dev/null | head -1 | tr -d '[:space:]')
+  if [ "$WANT" != "$CURRENT" ]; then
+    echo "[sandbox] osborn ${CURRENT:-none} → ${WANT} (manifest install)"
+    npm install -g "osborn@${WANT}" || echo "[sandbox] install failed — running ${CURRENT:-image-baked} version"
+  fi
+fi
 exec osborn
 ENTRYPOINT

package/dist/index.js CHANGED Viewed

@@ -1,7 +1,7 @@
 // Load environment variables FIRST before any other imports
 import 'dotenv/config';
 import { voice, initializeLogger } from '@livekit/agents';
-import { Room, RoomEvent, RemoteParticipant } from '@livekit/rtc-node';
+import { Room, RoomEvent } from '@livekit/rtc-node';
 import { AccessToken } from 'livekit-server-sdk';
 // Initialize logger before anything else
 initializeLogger({ pretty: true, level: 'info' });
@@ -2530,51 +2530,16 @@ async function main() {
         console.log('✅ Connected to room:', roomName);
         localParticipant = room.localParticipant;
     });
-    // EARLIEST possible "user is speaking" signal in our setup. Driven by LiveKit's
-    // server-side audio-level VAD on the participant's WebRTC track — fires ~50-100ms
-    // after mic onset, independent of Deepgram STT or any local VAD (we don't run one).
-    //
-    // Flow: user starts talking → ActiveSpeakersChanged includes a RemoteParticipant →
-    // if agent is currently speaking → interrupt the SpeechHandle to flush TTS playback.
-    // The existing handleSpeechDone callback (around line 1320) captures the spoken-text
-    // + JSONL context into lastInterruption; PipelineDirectLLM consumes it on the next
-    // chat() call to enrich the user's message with [INTERRUPTED] context — so the
-    // post-interrupt note flow is preserved even though we're cutting TTS earlier.
-    //
-    // Filter is `instanceof RemoteParticipant`. The agent IS the LocalParticipant in this
-    // room, and when its TTS plays it appears in the active-speakers list too. An earlier
-    // attempt that compared `s.identity !== room.localParticipant?.identity` failed because
-    // localParticipant.identity could be undefined at event-fire time, letting the agent's
-    // own speech trigger a self-interrupt. The type check is bulletproof.
-    //
-    // Realtime mode skipped — the SDK handles interruption internally there, and manual
-    // interrupt for Gemini realtime crashes its state machine (code 1008, memory v0.4.5).
-    let lastActiveSpeakerInterruptAt = 0;
-    room.on(RoomEvent.ActiveSpeakersChanged, (speakers) => {
-        if (!Array.isArray(speakers) || speakers.length === 0)
-            return;
-        const remoteSpeakers = speakers.filter((s) => s instanceof RemoteParticipant);
-        if (remoteSpeakers.length === 0)
-            return;
-        if (currentVoiceMode === 'realtime')
-            return;
-        if (agentState !== 'speaking')
-            return;
-        const now = Date.now();
-        const debounced = now - lastActiveSpeakerInterruptAt < 1000;
-        lastActiveSpeakerInterruptAt = now;
-        try {
-            if (!debounced) {
-                const ids = remoteSpeakers.map((s) => s.identity).join(',');
-                console.log(`🎤 ActiveSpeakersChanged: remote speakers [${ids}] + agent speaking → interrupting TTS`);
-            }
-            currentSession?.interrupt();
-        }
-        catch (err) {
-            if (!debounced)
-                console.warn('⚠️ active-speaker interrupt failed:', err instanceof Error ? err.message : err);
-        }
-    });
+    // NOTE: previously this section also had a RoomEvent.ActiveSpeakersChanged
+    // handler that interrupted TTS on any sustained audio activity (~50ms after
+    // mic onset). That fired too eagerly — coughs, paper rustles, the agent's
+    // own TTS bleeding through the mic, and other non-speech sounds tripped it
+    // ~10-15% of the time, leaving the agent silent with no recovery path
+    // (because no STT transcript would follow). Dropped in favor of the
+    // user_state_changed → 'speaking' handler below, which is fed by Deepgram
+    // Flux STT's speech-vs-noise classification: slower (~100-300ms) but
+    // confidence-aware. The latency tradeoff is worth eliminating the false
+    // interrupts at the root.
     room.on(RoomEvent.Disconnected, () => {
         console.log('👋 Disconnected from room');
         // Clean up active research and voice queue
@@ -2868,19 +2833,20 @@ async function main() {
                 }
             });
             // User state tracking — prevents queue from colliding with server-side VAD.
-            // Also a secondary interrupt trigger: when Deepgram STT classifies speech onset
-            // it propagates here via agent_activity.onStartOfSpeech → _updateUserState('speaking').
-            // Fires later than ActiveSpeakersChanged (Deepgram has ~100-300ms classification
-            // latency vs LiveKit's ~50-100ms audio-level) but acts as a redundant fallback in
-            // case the room-level event drops. interrupt() is idempotent on an already-
-            // interrupted SpeechHandle so calling both paths is harmless.
+            // Also the PRIMARY interrupt trigger now that the over-eager ActiveSpeakersChanged
+            // path is gone. Fires when Deepgram Flux STT classifies frames as speech (not noise)
+            // and propagates via agent_activity.onStartOfSpeech → _updateUserState('speaking').
+            // Latency ~100-300ms after mic onset, which is the cost of confidence-aware
+            // detection — vs the prior ActiveSpeakers handler that fired at ~50ms on any audio
+            // activity and tripped ~10-15% false interrupts on coughs, paper rustle, agent's
+            // own TTS bleeding through the mic, etc.
             sess.on('user_state_changed', (ev) => {
                 const prev = userState;
                 userState = ev.newState;
                 console.log(`👤 User state: ${prev} → ${ev.newState} (agent: ${agentState})`);
                 if (ev.newState === 'speaking' && agentState === 'speaking' && sessionVoiceMode !== 'realtime') {
                     try {
-                        console.log('🎤 user_state_changed=speaking + agent speaking → interrupting TTS (fallback)');
+                        console.log('🎤 user_state_changed=speaking + agent speaking → interrupting TTS');
                         currentSession?.interrupt();
                     }
                     catch (err) {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "osborn",
-  "version": "0.9.37",
+  "version": "0.9.39",
   "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
   "type": "module",
   "bin": {