osborn 0.9.44 → 0.9.46
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Dockerfile.sandbox +29 -1
- package/dist/claude-llm.js +22 -0
- package/dist/index.js +139 -20
- package/package.json +1 -1
package/Dockerfile.sandbox
CHANGED
|
@@ -54,9 +54,37 @@ EXPOSE 8741
|
|
|
54
54
|
|
|
55
55
|
# Entrypoint: credential persistence + onboarding suppression + start
|
|
56
56
|
COPY <<'ENTRYPOINT' /entrypoint.sh
|
|
57
|
-
#!/bin/sh
|
|
57
|
+
#!/bin/bash
|
|
58
58
|
set -e
|
|
59
59
|
|
|
60
|
+
# Persistent log capture for post-disconnect upload to Supabase Storage.
|
|
61
|
+
# Fly Machines has NO REST endpoint for fetching machine logs (the previous
|
|
62
|
+
# implementation hit /v1/apps/{app}/machines/{id}/logs which returns 404 → that
|
|
63
|
+
# 404 error string was getting uploaded as "the log" for every session).
|
|
64
|
+
# Volume-backed /workspace/osborn.log survives reboots and is readable via
|
|
65
|
+
# the documented /exec endpoint (`tail -n 500 /workspace/osborn.log`).
|
|
66
|
+
#
|
|
67
|
+
# We use process substitution to tee output to BOTH the log file AND the
|
|
68
|
+
# original stdout (so `flyctl logs` keeps working for ad-hoc debugging).
|
|
69
|
+
# Requires bash, hence the #!/bin/bash shebang.
|
|
70
|
+
#
|
|
71
|
+
# Size cap: if log grows past 100 MB, keep only the last 50 MB. Prevents
|
|
72
|
+
# disk-fill from long-running retry loops (we saw 17h × ~1 line/min = ~1000
|
|
73
|
+
# lines today, but anything connecting to LiveKit produces orders of
|
|
74
|
+
# magnitude more output).
|
|
75
|
+
LOGFILE=/workspace/osborn.log
|
|
76
|
+
mkdir -p /workspace
|
|
77
|
+
if [ -f "$LOGFILE" ] && [ "$(stat -c%s "$LOGFILE" 2>/dev/null || echo 0)" -gt 104857600 ]; then
|
|
78
|
+
echo "[sandbox] Rotating /workspace/osborn.log (>100MB, keeping last 50MB)"
|
|
79
|
+
tail -c 52428800 "$LOGFILE" > "$LOGFILE.tmp" && mv "$LOGFILE.tmp" "$LOGFILE"
|
|
80
|
+
fi
|
|
81
|
+
echo "[sandbox] === boot at $(date -Iseconds) ===" >> "$LOGFILE"
|
|
82
|
+
# Redirect all subsequent stdout+stderr from this script (and the eventual
|
|
83
|
+
# `exec osborn`) to both the original fd (Fly stdout collector) AND the
|
|
84
|
+
# append-only log file. tee runs as a backgrounded subshell that survives
|
|
85
|
+
# the final exec replacement.
|
|
86
|
+
exec > >(tee -a "$LOGFILE") 2>&1
|
|
87
|
+
|
|
60
88
|
# Claude credential persistence (volume at /workspace)
|
|
61
89
|
mkdir -p /workspace/.claude
|
|
62
90
|
rm -rf /root/.claude
|
package/dist/claude-llm.js
CHANGED
|
@@ -660,6 +660,28 @@ export class ClaudeLLM extends llm.LLM {
|
|
|
660
660
|
}
|
|
661
661
|
}
|
|
662
662
|
}
|
|
663
|
+
// Compaction signals observed on the SDK iterator (parallel to hook path).
|
|
664
|
+
// The SDK emits TWO message subtypes during compaction independent of
|
|
665
|
+
// hook registration:
|
|
666
|
+
// - type:'system', subtype:'compact_boundary' (with compact_metadata)
|
|
667
|
+
// - type:'system', subtype:'status', status:'compacting' | null
|
|
668
|
+
// We DON'T route these through onCompactionEvent (to avoid duplicate
|
|
669
|
+
// chat bubbles — hooks already do that), but we LOG them. If the hook
|
|
670
|
+
// path ever silently fails, these logs will be the only signal that
|
|
671
|
+
// compaction actually happened — making the failure obvious in fly logs.
|
|
672
|
+
if (msg.type === 'system' && msg.subtype === 'compact_boundary') {
|
|
673
|
+
const meta = msg.compact_metadata || {};
|
|
674
|
+
console.log(`[COMPACT-SDK-ITER] compact_boundary observed: trigger=${meta.trigger ?? '?'} pre_tokens=${meta.pre_tokens ?? '?'} preserved=${meta.preserved_segment ? 'yes' : 'no'}`);
|
|
675
|
+
// Fire onCompactionEvent as a FALLBACK if hooks didn't fire — we
|
|
676
|
+
// detect this by checking whether we've seen 'compaction_started'
|
|
677
|
+
// recently. For now, log only; can wire as fallback if hooks fail.
|
|
678
|
+
}
|
|
679
|
+
if (msg.type === 'system' && msg.subtype === 'status') {
|
|
680
|
+
const status = msg.status;
|
|
681
|
+
if (status === 'compacting' || status === null) {
|
|
682
|
+
console.log(`[COMPACT-SDK-ITER] status change: ${status === 'compacting' ? 'ENTERED compacting' : 'EXITED compacting'} session=${msg.session_id?.substring(0, 8) ?? '?'}`);
|
|
683
|
+
}
|
|
684
|
+
}
|
|
663
685
|
// Checkpoint capture
|
|
664
686
|
if (msg.type === 'user' && msg.uuid) {
|
|
665
687
|
callbacks.onCheckpoint(msg.uuid);
|
package/dist/index.js
CHANGED
|
@@ -147,6 +147,24 @@ process.on('uncaughtException', (error) => {
|
|
|
147
147
|
// ============================================================
|
|
148
148
|
// Module-level room code so the HTTP server can expose it via GET /room-code
|
|
149
149
|
let currentRoomCode = null;
|
|
150
|
+
// Module-level LiveKit connection state. Shared between main() (which runs the
|
|
151
|
+
// connect-with-retry loop) and the /health handler in startApiServer (which
|
|
152
|
+
// reports it to the frontend so the user sees a meaningful error instead of a
|
|
153
|
+
// dashboard redirect when LiveKit is unreachable / out of quota / etc).
|
|
154
|
+
//
|
|
155
|
+
// We deliberately do NOT 503 /health on connect failure — Fly's machine
|
|
156
|
+
// health-check uses /health, and returning non-2xx triggers a restart loop
|
|
157
|
+
// which (a) burns the same failing LiveKit calls every 30s and (b) gets the
|
|
158
|
+
// machine killed after 3 failed restarts. By staying 200 OK and surfacing the
|
|
159
|
+
// status as a field, we keep the container alive long enough for LiveKit to
|
|
160
|
+
// recover (auto-retry) or for the user to read the error and upgrade quota.
|
|
161
|
+
const livekitState = {
|
|
162
|
+
status: 'connecting',
|
|
163
|
+
error: null,
|
|
164
|
+
errorCode: null,
|
|
165
|
+
lastAttemptAt: null,
|
|
166
|
+
attemptCount: 0,
|
|
167
|
+
};
|
|
150
168
|
function startApiServer(workingDir, port) {
|
|
151
169
|
const server = createServer(async (req, res) => {
|
|
152
170
|
// CORS headers for cloud frontend
|
|
@@ -221,7 +239,22 @@ function startApiServer(workingDir, port) {
|
|
|
221
239
|
}
|
|
222
240
|
catch { /* version optional */ }
|
|
223
241
|
res.writeHead(200, { 'Content-Type': 'application/json' });
|
|
224
|
-
res.end(JSON.stringify({ status: 'ok', workingDir, version }));
|
|
242
|
+
res.end(JSON.stringify({
|
|
243
|
+
status: 'ok',
|
|
244
|
+
workingDir,
|
|
245
|
+
version,
|
|
246
|
+
// LiveKit subsystem status — frontend can use this to surface a real
|
|
247
|
+
// error instead of treating the sandbox as totally broken. The HTTP
|
|
248
|
+
// status code stays 200 so Fly health-check stays green and the
|
|
249
|
+
// container isn't restart-looped while LiveKit is unreachable.
|
|
250
|
+
livekit: {
|
|
251
|
+
status: livekitState.status,
|
|
252
|
+
error: livekitState.error,
|
|
253
|
+
errorCode: livekitState.errorCode,
|
|
254
|
+
attemptCount: livekitState.attemptCount,
|
|
255
|
+
lastAttemptAt: livekitState.lastAttemptAt,
|
|
256
|
+
},
|
|
257
|
+
}));
|
|
225
258
|
return;
|
|
226
259
|
}
|
|
227
260
|
// POST /webhook/recall — Recall.ai real-time transcript webhooks
|
|
@@ -1545,31 +1578,52 @@ async function main() {
|
|
|
1545
1578
|
// previously skipped this entirely, so compaction events fired into the void
|
|
1546
1579
|
// in pipeline mode.
|
|
1547
1580
|
const buildOnCompactionEvent = () => (event) => {
|
|
1581
|
+
// CRITICAL diagnostic — every compaction event MUST appear in the agent
|
|
1582
|
+
// log first. If you don't see [COMPACT-AGENT-RX] for an event type, the
|
|
1583
|
+
// ClaudeLLM hook isn't calling this callback (most likely culprits:
|
|
1584
|
+
// PreCompact/PostCompact hook never registered, or the callback wasn't
|
|
1585
|
+
// passed through createPipelineDirectLLM's opts). If you see RX but no
|
|
1586
|
+
// CHAT-EMIT, the type didn't match the chat-emit branch. If you see both
|
|
1587
|
+
// but the frontend log never shows [COMPACT-FRONTEND], the data channel
|
|
1588
|
+
// dropped the message (room not connected, payload too big, etc.).
|
|
1589
|
+
console.log(`[COMPACT-AGENT-RX] type=${event.type} keys=[${Object.keys(event).filter(k => k !== 'type').join(',')}]`);
|
|
1548
1590
|
try {
|
|
1549
1591
|
// Raw event → banner state machine (compaction_started/progress/complete handlers in VoiceRoom.tsx).
|
|
1550
1592
|
sendToFrontend({ ...event });
|
|
1593
|
+
console.log(`[COMPACT-AGENT-RAW-SENT] type=${event.type}`);
|
|
1551
1594
|
// Inline chat bubble — reuses the existing claude_output path that's already working.
|
|
1552
1595
|
if (event.type === 'compaction_started') {
|
|
1553
1596
|
const triggerLabel = event.trigger ? ` (${event.trigger})` : '';
|
|
1597
|
+
const text = `🧠 _Crystallizing session memory…_${triggerLabel}`;
|
|
1554
1598
|
sendToFrontend({
|
|
1555
1599
|
type: 'claude_output',
|
|
1556
|
-
text: `🧠 _Crystallizing session memory…_${triggerLabel}`,
|
|
1600
|
+
text,
|
|
1557
1601
|
agentRole: 'direct',
|
|
1558
1602
|
});
|
|
1603
|
+
console.log(`[COMPACT-AGENT-CHAT-EMIT] started → "${text.substring(0, 60)}"`);
|
|
1559
1604
|
}
|
|
1560
1605
|
else if (event.type === 'compaction_complete') {
|
|
1561
1606
|
const n = event.skillsWritten ?? 0;
|
|
1562
1607
|
const names = Array.isArray(event.skillNames) && event.skillNames.length > 0
|
|
1563
1608
|
? ` — ${event.skillNames.join(', ')}`
|
|
1564
1609
|
: '';
|
|
1610
|
+
const text = `🧠 Memory crystallized — ${n} skill${n === 1 ? '' : 's'} updated${names}.`;
|
|
1565
1611
|
sendToFrontend({
|
|
1566
1612
|
type: 'claude_output',
|
|
1567
|
-
text: `🧠 Memory crystallized — ${n} skill${n === 1 ? '' : 's'} updated${names}.`,
|
|
1613
|
+
text,
|
|
1568
1614
|
agentRole: 'direct',
|
|
1569
1615
|
});
|
|
1616
|
+
console.log(`[COMPACT-AGENT-CHAT-EMIT] complete → "${text.substring(0, 80)}"`);
|
|
1570
1617
|
}
|
|
1618
|
+
else {
|
|
1619
|
+
// progress events don't get a chat bubble (too noisy) — they only feed the banner.
|
|
1620
|
+
// Log at debug level so we can confirm they fired.
|
|
1621
|
+
console.log(`[COMPACT-AGENT-CHAT-SKIP] type=${event.type} (progress events feed the banner only, no inline bubble)`);
|
|
1622
|
+
}
|
|
1623
|
+
}
|
|
1624
|
+
catch (err) {
|
|
1625
|
+
console.error(`[COMPACT-AGENT-ERROR] ${err.message}`);
|
|
1571
1626
|
}
|
|
1572
|
-
catch { /* non-fatal */ }
|
|
1573
1627
|
};
|
|
1574
1628
|
// Create DIRECT session (STT + Claude Agent SDK + TTS)
|
|
1575
1629
|
async function createDirectSession(resumeSessionId, llmOverride) {
|
|
@@ -3931,22 +3985,87 @@ async function main() {
|
|
|
3931
3985
|
// ============================================================
|
|
3932
3986
|
// Connect to Room
|
|
3933
3987
|
// ============================================================
|
|
3934
|
-
|
|
3935
|
-
|
|
3936
|
-
|
|
3937
|
-
|
|
3938
|
-
|
|
3939
|
-
|
|
3940
|
-
|
|
3941
|
-
|
|
3942
|
-
|
|
3943
|
-
|
|
3944
|
-
|
|
3945
|
-
|
|
3946
|
-
|
|
3947
|
-
|
|
3948
|
-
|
|
3949
|
-
|
|
3988
|
+
// Connect to LiveKit with retry-on-failure.
|
|
3989
|
+
//
|
|
3990
|
+
// Earlier behavior: a single attempt followed by `process.exit(1)` on error.
|
|
3991
|
+
// Combined with Fly's restart policy this produced a tight restart loop that
|
|
3992
|
+
// (a) hit the same 429 / auth error every ~30s, (b) burned the LiveKit
|
|
3993
|
+
// quota's retry budget, and (c) hit Fly's max-restart-count (3) and killed
|
|
3994
|
+
// the machine — at which point the frontend's /api/sandbox probe saw the
|
|
3995
|
+
// sandbox as failed and bounced the user back to the dashboard with no
|
|
3996
|
+
// useful error.
|
|
3997
|
+
//
|
|
3998
|
+
// New behavior: bounded-backoff retry, infinite attempts. The API server
|
|
3999
|
+
// stays up the whole time serving /health (which surfaces the LiveKit error
|
|
4000
|
+
// as a field, NOT as an HTTP failure — see the /health handler comment).
|
|
4001
|
+
// When the underlying issue is resolved (quota reset, key fixed, LiveKit
|
|
4002
|
+
// service back), the next retry succeeds and the agent picks up where it
|
|
4003
|
+
// left off without anyone needing to manually restart.
|
|
4004
|
+
//
|
|
4005
|
+
// Backoff: 5s → 10s → 20s → 40s → 60s (capped). Resets to 5s after each
|
|
4006
|
+
// successful connect (so a single transient hiccup doesn't disable fast
|
|
4007
|
+
// recovery on the next disconnect).
|
|
4008
|
+
const connectWithRetry = async () => {
|
|
4009
|
+
const backoffSchedule = [5_000, 10_000, 20_000, 40_000, 60_000];
|
|
4010
|
+
let backoffIdx = 0;
|
|
4011
|
+
while (true) {
|
|
4012
|
+
livekitState.status = livekitState.attemptCount === 0 ? 'connecting' : 'retrying';
|
|
4013
|
+
livekitState.lastAttemptAt = Date.now();
|
|
4014
|
+
livekitState.attemptCount += 1;
|
|
4015
|
+
try {
|
|
4016
|
+
await room.connect(livekitUrl, jwt, {
|
|
4017
|
+
autoSubscribe: true,
|
|
4018
|
+
dynacast: true,
|
|
4019
|
+
});
|
|
4020
|
+
localParticipant = room.localParticipant;
|
|
4021
|
+
livekitState.status = 'connected';
|
|
4022
|
+
livekitState.error = null;
|
|
4023
|
+
livekitState.errorCode = null;
|
|
4024
|
+
backoffIdx = 0;
|
|
4025
|
+
console.log('✅ Connected to room:', roomName);
|
|
4026
|
+
console.log('\n⏳ Waiting for user to connect...');
|
|
4027
|
+
console.log(` Room: ${roomCode}\n`);
|
|
4028
|
+
return;
|
|
4029
|
+
}
|
|
4030
|
+
catch (err) {
|
|
4031
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
4032
|
+
// Categorize the error so the frontend can show specific guidance.
|
|
4033
|
+
// Substring matching on the message because LiveKit's rtc-node SDK
|
|
4034
|
+
// wraps the underlying HTTP status in a generic ConnectError.
|
|
4035
|
+
let errorCode;
|
|
4036
|
+
if (/429|connection minutes limit/i.test(msg))
|
|
4037
|
+
errorCode = 'quota_exceeded';
|
|
4038
|
+
else if (/401|403|unauthorized|invalid/i.test(msg))
|
|
4039
|
+
errorCode = 'auth';
|
|
4040
|
+
else if (/ENOTFOUND|ECONNREFUSED|ETIMEDOUT|network/i.test(msg))
|
|
4041
|
+
errorCode = 'network';
|
|
4042
|
+
else
|
|
4043
|
+
errorCode = 'unknown';
|
|
4044
|
+
livekitState.status = 'failed';
|
|
4045
|
+
livekitState.error = msg;
|
|
4046
|
+
livekitState.errorCode = errorCode;
|
|
4047
|
+
const waitMs = backoffSchedule[Math.min(backoffIdx, backoffSchedule.length - 1)];
|
|
4048
|
+
backoffIdx += 1;
|
|
4049
|
+
console.error(`❌ LiveKit connect failed (${errorCode}, attempt ${livekitState.attemptCount}): ${msg.substring(0, 200)}`);
|
|
4050
|
+
console.error(` Retrying in ${waitMs / 1000}s — process staying alive; /health remains 200 with livekit.status='failed'`);
|
|
4051
|
+
await new Promise(r => setTimeout(r, waitMs));
|
|
4052
|
+
// loop — try again
|
|
4053
|
+
}
|
|
4054
|
+
}
|
|
4055
|
+
};
|
|
4056
|
+
// Fire and forget; the retry loop keeps the process alive on its own (so
|
|
4057
|
+
// we don't need the explicit `new Promise(() => {})` keepalive anymore).
|
|
4058
|
+
// Errors that escape the retry loop should never happen, but if they do,
|
|
4059
|
+
// log them rather than crash.
|
|
4060
|
+
connectWithRetry().catch(err => {
|
|
4061
|
+
console.error('❌ Unrecoverable error in LiveKit retry loop (should not happen):', err);
|
|
4062
|
+
livekitState.status = 'failed';
|
|
4063
|
+
livekitState.error = err instanceof Error ? err.message : String(err);
|
|
4064
|
+
livekitState.errorCode = 'unrecoverable';
|
|
4065
|
+
});
|
|
4066
|
+
// Keep main() alive forever — without this the await chain ends and Node
|
|
4067
|
+
// exits 0, which Fly treats as a clean shutdown.
|
|
4068
|
+
await new Promise(() => { });
|
|
3950
4069
|
}
|
|
3951
4070
|
// Run
|
|
3952
4071
|
main().catch(console.error);
|