osborn 0.9.44 → 0.9.46

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -54,9 +54,37 @@ EXPOSE 8741
54
54
 
55
55
  # Entrypoint: credential persistence + onboarding suppression + start
56
56
  COPY <<'ENTRYPOINT' /entrypoint.sh
57
- #!/bin/sh
57
+ #!/bin/bash
58
58
  set -e
59
59
 
60
+ # Persistent log capture for post-disconnect upload to Supabase Storage.
61
+ # Fly Machines has NO REST endpoint for fetching machine logs (the previous
62
+ # implementation hit /v1/apps/{app}/machines/{id}/logs which returns 404 → that
63
+ # 404 error string was getting uploaded as "the log" for every session).
64
+ # Volume-backed /workspace/osborn.log survives reboots and is readable via
65
+ # the documented /exec endpoint (`tail -n 500 /workspace/osborn.log`).
66
+ #
67
+ # We use process substitution to tee output to BOTH the log file AND the
68
+ # original stdout (so `flyctl logs` keeps working for ad-hoc debugging).
69
+ # Requires bash, hence the #!/bin/bash shebang.
70
+ #
71
+ # Size cap (checked when this entrypoint runs): if the log is past 100 MB, keep only the last 50 MB. Prevents
72
+ # disk-fill from long-running retry loops (we saw 17h × ~1 line/min = ~1000
73
+ # lines today, but anything connecting to LiveKit produces orders of
74
+ # magnitude more output).
75
+ LOGFILE=/workspace/osborn.log
76
+ mkdir -p /workspace
77
+ if [ -f "$LOGFILE" ] && [ "$(stat -c%s "$LOGFILE" 2>/dev/null || echo 0)" -gt 104857600 ]; then
78
+ echo "[sandbox] Rotating /workspace/osborn.log (>100MB, keeping last 50MB)"
79
+ tail -c 52428800 "$LOGFILE" > "$LOGFILE.tmp" && mv "$LOGFILE.tmp" "$LOGFILE"
80
+ fi
81
+ echo "[sandbox] === boot at $(date -Iseconds) ===" >> "$LOGFILE"
82
+ # Redirect all subsequent stdout+stderr from this script (and the eventual
83
+ # `exec osborn`) to both the original fd (Fly stdout collector) AND the
84
+ # append-only log file. tee runs as a backgrounded subshell that survives
85
+ # the final exec replacement.
86
+ exec > >(tee -a "$LOGFILE") 2>&1
87
+
60
88
  # Claude credential persistence (volume at /workspace)
61
89
  mkdir -p /workspace/.claude
62
90
  rm -rf /root/.claude
@@ -660,6 +660,28 @@ export class ClaudeLLM extends llm.LLM {
660
660
  }
661
661
  }
662
662
  }
663
+ // Compaction signals observed on the SDK iterator (parallel to hook path).
664
+ // The SDK emits TWO message subtypes during compaction independent of
665
+ // hook registration:
666
+ // - type:'system', subtype:'compact_boundary' (with compact_metadata)
667
+ // - type:'system', subtype:'status', status:'compacting' | null
668
+ // We DON'T route these through onCompactionEvent (to avoid duplicate
669
+ // chat bubbles — hooks already do that), but we LOG them. If the hook
670
+ // path ever silently fails, these logs will be the only signal that
671
+ // compaction actually happened — making the failure obvious in fly logs.
672
+ if (msg.type === 'system' && msg.subtype === 'compact_boundary') {
673
+ const meta = msg.compact_metadata || {};
674
+ console.log(`[COMPACT-SDK-ITER] compact_boundary observed: trigger=${meta.trigger ?? '?'} pre_tokens=${meta.pre_tokens ?? '?'} preserved=${meta.preserved_segment ? 'yes' : 'no'}`);
675
+ // TODO: fire onCompactionEvent as a FALLBACK if hooks didn't fire (could be
676
+ // detected by checking whether 'compaction_started' was seen recently).
677
+ // For now, log only; can wire as fallback if hooks fail.
678
+ }
679
+ if (msg.type === 'system' && msg.subtype === 'status') {
680
+ const status = msg.status;
681
+ if (status === 'compacting' || status === null) {
682
+ console.log(`[COMPACT-SDK-ITER] status change: ${status === 'compacting' ? 'ENTERED compacting' : 'EXITED compacting'} session=${msg.session_id?.substring(0, 8) ?? '?'}`);
683
+ }
684
+ }
663
685
  // Checkpoint capture
664
686
  if (msg.type === 'user' && msg.uuid) {
665
687
  callbacks.onCheckpoint(msg.uuid);
package/dist/index.js CHANGED
@@ -147,6 +147,24 @@ process.on('uncaughtException', (error) => {
147
147
  // ============================================================
148
148
  // Module-level room code so the HTTP server can expose it via GET /room-code
149
149
  let currentRoomCode = null;
150
+ // Module-level LiveKit connection state. Shared between main() (which runs the
151
+ // connect-with-retry loop) and the /health handler in startApiServer (which
152
+ // reports it to the frontend so the user sees a meaningful error instead of a
153
+ // dashboard redirect when LiveKit is unreachable / out of quota / etc).
154
+ //
155
+ // We deliberately do NOT 503 /health on connect failure — Fly's machine
156
+ // health-check uses /health, and returning non-2xx triggers a restart loop
157
+ // which (a) burns the same failing LiveKit calls every 30s and (b) gets the
158
+ // machine killed after 3 failed restarts. By staying 200 OK and surfacing the
159
+ // status as a field, we keep the container alive long enough for LiveKit to
160
+ // recover (auto-retry) or for the user to read the error and upgrade quota.
161
+ const livekitState = {
162
+ status: 'connecting',
163
+ error: null,
164
+ errorCode: null,
165
+ lastAttemptAt: null,
166
+ attemptCount: 0,
167
+ };
150
168
  function startApiServer(workingDir, port) {
151
169
  const server = createServer(async (req, res) => {
152
170
  // CORS headers for cloud frontend
@@ -221,7 +239,22 @@ function startApiServer(workingDir, port) {
221
239
  }
222
240
  catch { /* version optional */ }
223
241
  res.writeHead(200, { 'Content-Type': 'application/json' });
224
- res.end(JSON.stringify({ status: 'ok', workingDir, version }));
242
+ res.end(JSON.stringify({
243
+ status: 'ok',
244
+ workingDir,
245
+ version,
246
+ // LiveKit subsystem status — frontend can use this to surface a real
247
+ // error instead of treating the sandbox as totally broken. The HTTP
248
+ // status code stays 200 so Fly health-check stays green and the
249
+ // container isn't restart-looped while LiveKit is unreachable.
250
+ livekit: {
251
+ status: livekitState.status,
252
+ error: livekitState.error,
253
+ errorCode: livekitState.errorCode,
254
+ attemptCount: livekitState.attemptCount,
255
+ lastAttemptAt: livekitState.lastAttemptAt,
256
+ },
257
+ }));
225
258
  return;
226
259
  }
227
260
  // POST /webhook/recall — Recall.ai real-time transcript webhooks
@@ -1545,31 +1578,52 @@ async function main() {
1545
1578
  // previously skipped this entirely, so compaction events fired into the void
1546
1579
  // in pipeline mode.
1547
1580
  const buildOnCompactionEvent = () => (event) => {
1581
+ // CRITICAL diagnostic — every compaction event MUST appear in the agent
1582
+ // log first. If you don't see [COMPACT-AGENT-RX] for an event type, the
1583
+ // ClaudeLLM hook isn't calling this callback (most likely culprits:
1584
+ // PreCompact/PostCompact hook never registered, or the callback wasn't
1585
+ // passed through createPipelineDirectLLM's opts). If you see RX but no
1586
+ // CHAT-EMIT, the type didn't match the chat-emit branch. If you see both
1587
+ // but the frontend log never shows [COMPACT-FRONTEND], the data channel
1588
+ // dropped the message (room not connected, payload too big, etc.).
1589
+ console.log(`[COMPACT-AGENT-RX] type=${event.type} keys=[${Object.keys(event).filter(k => k !== 'type').join(',')}]`);
1548
1590
  try {
1549
1591
  // Raw event → banner state machine (compaction_started/progress/complete handlers in VoiceRoom.tsx).
1550
1592
  sendToFrontend({ ...event });
1593
+ console.log(`[COMPACT-AGENT-RAW-SENT] type=${event.type}`);
1551
1594
  // Inline chat bubble — reuses the existing claude_output path that's already working.
1552
1595
  if (event.type === 'compaction_started') {
1553
1596
  const triggerLabel = event.trigger ? ` (${event.trigger})` : '';
1597
+ const text = `🧠 _Crystallizing session memory…_${triggerLabel}`;
1554
1598
  sendToFrontend({
1555
1599
  type: 'claude_output',
1556
- text: `🧠 _Crystallizing session memory…_${triggerLabel}`,
1600
+ text,
1557
1601
  agentRole: 'direct',
1558
1602
  });
1603
+ console.log(`[COMPACT-AGENT-CHAT-EMIT] started → "${text.substring(0, 60)}"`);
1559
1604
  }
1560
1605
  else if (event.type === 'compaction_complete') {
1561
1606
  const n = event.skillsWritten ?? 0;
1562
1607
  const names = Array.isArray(event.skillNames) && event.skillNames.length > 0
1563
1608
  ? ` — ${event.skillNames.join(', ')}`
1564
1609
  : '';
1610
+ const text = `🧠 Memory crystallized — ${n} skill${n === 1 ? '' : 's'} updated${names}.`;
1565
1611
  sendToFrontend({
1566
1612
  type: 'claude_output',
1567
- text: `🧠 Memory crystallized — ${n} skill${n === 1 ? '' : 's'} updated${names}.`,
1613
+ text,
1568
1614
  agentRole: 'direct',
1569
1615
  });
1616
+ console.log(`[COMPACT-AGENT-CHAT-EMIT] complete → "${text.substring(0, 80)}"`);
1570
1617
  }
1618
+ else {
1619
+ // progress events don't get a chat bubble (too noisy) — they only feed the banner.
1620
+ // Still logged (via console.log) so we can confirm they fired.
1621
+ console.log(`[COMPACT-AGENT-CHAT-SKIP] type=${event.type} (progress events feed the banner only, no inline bubble)`);
1622
+ }
1623
+ }
1624
+ catch (err) {
1625
+ console.error(`[COMPACT-AGENT-ERROR] ${err.message}`);
1571
1626
  }
1572
- catch { /* non-fatal */ }
1573
1627
  };
1574
1628
  // Create DIRECT session (STT + Claude Agent SDK + TTS)
1575
1629
  async function createDirectSession(resumeSessionId, llmOverride) {
@@ -3931,22 +3985,87 @@ async function main() {
3931
3985
  // ============================================================
3932
3986
  // Connect to Room
3933
3987
  // ============================================================
3934
- try {
3935
- await room.connect(livekitUrl, jwt, {
3936
- autoSubscribe: true,
3937
- dynacast: true,
3938
- });
3939
- localParticipant = room.localParticipant;
3940
- console.log('✅ Connected to room:', roomName);
3941
- console.log('\n⏳ Waiting for user to connect...');
3942
- console.log(` Room: ${roomCode}\n`);
3943
- // Keep process alive
3944
- await new Promise(() => { });
3945
- }
3946
- catch (err) {
3947
- console.error('❌ Failed to connect:', err);
3948
- process.exit(1);
3949
- }
3988
+ // Connect to LiveKit with retry-on-failure.
3989
+ //
3990
+ // Earlier behavior: a single attempt followed by `process.exit(1)` on error.
3991
+ // Combined with Fly's restart policy this produced a tight restart loop that
3992
+ // (a) hit the same 429 / auth error every ~30s, (b) burned the LiveKit
3993
+ // quota's retry budget, and (c) hit Fly's max-restart-count (3) and killed
3994
+ // the machine — at which point the frontend's /api/sandbox probe saw the
3995
+ // sandbox as failed and bounced the user back to the dashboard with no
3996
+ // useful error.
3997
+ //
3998
+ // New behavior: bounded-backoff retry, infinite attempts. The API server
3999
+ // stays up the whole time serving /health (which surfaces the LiveKit error
4000
+ // as a field, NOT as an HTTP failure — see the /health handler comment).
4001
+ // When the underlying issue is resolved (quota reset, key fixed, LiveKit
4002
+ // service back), the next retry succeeds and the agent picks up where it
4003
+ // left off without anyone needing to manually restart.
4004
+ //
4005
+ // Backoff: 5s → 10s → 20s → 40s → 60s (capped). The index restarts at 5s on
4006
+ // every call; the loop returns after the first successful connect, so it
4007
+ // does not by itself retry after a later disconnect.
4008
+ const connectWithRetry = async () => {
4009
+ const backoffSchedule = [5_000, 10_000, 20_000, 40_000, 60_000];
4010
+ let backoffIdx = 0;
4011
+ while (true) {
4012
+ livekitState.status = livekitState.attemptCount === 0 ? 'connecting' : 'retrying';
4013
+ livekitState.lastAttemptAt = Date.now();
4014
+ livekitState.attemptCount += 1;
4015
+ try {
4016
+ await room.connect(livekitUrl, jwt, {
4017
+ autoSubscribe: true,
4018
+ dynacast: true,
4019
+ });
4020
+ localParticipant = room.localParticipant;
4021
+ livekitState.status = 'connected';
4022
+ livekitState.error = null;
4023
+ livekitState.errorCode = null;
4024
+ backoffIdx = 0;
4025
+ console.log('✅ Connected to room:', roomName);
4026
+ console.log('\n⏳ Waiting for user to connect...');
4027
+ console.log(` Room: ${roomCode}\n`);
4028
+ return;
4029
+ }
4030
+ catch (err) {
4031
+ const msg = err instanceof Error ? err.message : String(err);
4032
+ // Categorize the error so the frontend can show specific guidance.
4033
+ // Substring matching on the message because LiveKit's rtc-node SDK
4034
+ // wraps the underlying HTTP status in a generic ConnectError.
4035
+ let errorCode;
4036
+ if (/429|connection minutes limit/i.test(msg))
4037
+ errorCode = 'quota_exceeded';
4038
+ else if (/401|403|unauthorized|invalid/i.test(msg))
4039
+ errorCode = 'auth';
4040
+ else if (/ENOTFOUND|ECONNREFUSED|ETIMEDOUT|network/i.test(msg))
4041
+ errorCode = 'network';
4042
+ else
4043
+ errorCode = 'unknown';
4044
+ livekitState.status = 'failed';
4045
+ livekitState.error = msg;
4046
+ livekitState.errorCode = errorCode;
4047
+ const waitMs = backoffSchedule[Math.min(backoffIdx, backoffSchedule.length - 1)];
4048
+ backoffIdx += 1;
4049
+ console.error(`❌ LiveKit connect failed (${errorCode}, attempt ${livekitState.attemptCount}): ${msg.substring(0, 200)}`);
4050
+ console.error(` Retrying in ${waitMs / 1000}s — process staying alive; /health remains 200 with livekit.status='failed'`);
4051
+ await new Promise(r => setTimeout(r, waitMs));
4052
+ // loop — try again
4053
+ }
4054
+ }
4055
+ };
4056
+ // Fire and forget: connectWithRetry() is intentionally not awaited here;
4057
+ // main() is instead parked on the never-resolving promise further down.
4058
+ // Errors that escape the retry loop should never happen, but if they do,
4059
+ // log them rather than crash.
4060
+ connectWithRetry().catch(err => {
4061
+ console.error('❌ Unrecoverable error in LiveKit retry loop (should not happen):', err);
4062
+ livekitState.status = 'failed';
4063
+ livekitState.error = err instanceof Error ? err.message : String(err);
4064
+ livekitState.errorCode = 'unrecoverable';
4065
+ });
4066
+ // Keep main() alive forever — without this the await chain ends and Node
4067
+ // exits 0, which Fly treats as a clean shutdown.
4068
+ await new Promise(() => { });
3950
4069
  }
3951
4070
  // Run
3952
4071
  main().catch(console.error);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "osborn",
3
- "version": "0.9.44",
3
+ "version": "0.9.46",
4
4
  "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
5
5
  "type": "module",
6
6
  "bin": {