osborn 0.9.44 → 0.9.45

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/dist/index.js +115 -17
  2. package/package.json +1 -1
package/dist/index.js CHANGED
@@ -147,6 +147,24 @@ process.on('uncaughtException', (error) => {
147
147
  // ============================================================
148
148
  // Module-level room code so the HTTP server can expose it via GET /room-code
149
149
  let currentRoomCode = null;
150
+ // Module-level LiveKit connection state. Shared between main() (which runs the
151
+ // connect-with-retry loop) and the /health handler in startApiServer (which
152
+ // reports it to the frontend so the user sees a meaningful error instead of a
153
+ // dashboard redirect when LiveKit is unreachable / out of quota / etc).
154
+ //
155
+ // We deliberately do NOT 503 /health on connect failure — Fly's machine
156
+ // health-check uses /health, and returning non-2xx triggers a restart loop
157
+ // which (a) burns the same failing LiveKit calls every 30s and (b) gets the
158
+ // machine killed after 3 failed restarts. By staying 200 OK and surfacing the
159
+ // status as a field, we keep the container alive long enough for LiveKit to
160
+ // recover (auto-retry) or for the user to read the error and upgrade quota.
161
+ const livekitState = {
162
+ status: 'connecting',
163
+ error: null,
164
+ errorCode: null,
165
+ lastAttemptAt: null,
166
+ attemptCount: 0,
167
+ };
150
168
  function startApiServer(workingDir, port) {
151
169
  const server = createServer(async (req, res) => {
152
170
  // CORS headers for cloud frontend
@@ -221,7 +239,22 @@ function startApiServer(workingDir, port) {
221
239
  }
222
240
  catch { /* version optional */ }
223
241
  res.writeHead(200, { 'Content-Type': 'application/json' });
224
- res.end(JSON.stringify({ status: 'ok', workingDir, version }));
242
+ res.end(JSON.stringify({
243
+ status: 'ok',
244
+ workingDir,
245
+ version,
246
+ // LiveKit subsystem status — frontend can use this to surface a real
247
+ // error instead of treating the sandbox as totally broken. The HTTP
248
+ // status code stays 200 so Fly health-check stays green and the
249
+ // container isn't restart-looped while LiveKit is unreachable.
250
+ livekit: {
251
+ status: livekitState.status,
252
+ error: livekitState.error,
253
+ errorCode: livekitState.errorCode,
254
+ attemptCount: livekitState.attemptCount,
255
+ lastAttemptAt: livekitState.lastAttemptAt,
256
+ },
257
+ }));
225
258
  return;
226
259
  }
227
260
  // POST /webhook/recall — Recall.ai real-time transcript webhooks
@@ -3931,22 +3964,87 @@ async function main() {
3931
3964
  // ============================================================
3932
3965
  // Connect to Room
3933
3966
  // ============================================================
3934
- try {
3935
- await room.connect(livekitUrl, jwt, {
3936
- autoSubscribe: true,
3937
- dynacast: true,
3938
- });
3939
- localParticipant = room.localParticipant;
3940
- console.log('✅ Connected to room:', roomName);
3941
- console.log('\n⏳ Waiting for user to connect...');
3942
- console.log(` Room: ${roomCode}\n`);
3943
- // Keep process alive
3944
- await new Promise(() => { });
3945
- }
3946
- catch (err) {
3947
- console.error('❌ Failed to connect:', err);
3948
- process.exit(1);
3949
- }
3967
+ // Connect to LiveKit with retry-on-failure.
3968
+ //
3969
+ // Earlier behavior: a single attempt followed by `process.exit(1)` on error.
3970
+ // Combined with Fly's restart policy this produced a tight restart loop that
3971
+ // (a) hit the same 429 / auth error every ~30s, (b) burned the LiveKit
3972
+ // quota's retry budget, and (c) hit Fly's max-restart-count (3) and killed
3973
+ // the machine — at which point the frontend's /api/sandbox probe saw the
3974
+ // sandbox as failed and bounced the user back to the dashboard with no
3975
+ // useful error.
3976
+ //
3977
+ // New behavior: bounded-backoff retry, infinite attempts. The API server
3978
+ // stays up the whole time serving /health (which surfaces the LiveKit error
3979
+ // as a field, NOT as an HTTP failure — see the /health handler comment).
3980
+ // When the underlying issue is resolved (quota reset, key fixed, LiveKit
3981
+ // service back), the next retry succeeds and the agent picks up where it
3982
+ // left off without anyone needing to manually restart.
3983
+ //
3984
+ // Backoff: 5s → 10s → 20s → 40s → 60s (capped). The index is local to each
3985
+ // call, so every invocation starts at 5s; the loop returns after the first
3986
+ // successful connect and does not itself reconnect after a later disconnect.
3987
+ const connectWithRetry = async () => {
3988
+ const backoffSchedule = [5_000, 10_000, 20_000, 40_000, 60_000];
3989
+ let backoffIdx = 0;
3990
+ while (true) {
3991
+ livekitState.status = livekitState.attemptCount === 0 ? 'connecting' : 'retrying';
3992
+ livekitState.lastAttemptAt = Date.now();
3993
+ livekitState.attemptCount += 1;
3994
+ try {
3995
+ await room.connect(livekitUrl, jwt, {
3996
+ autoSubscribe: true,
3997
+ dynacast: true,
3998
+ });
3999
+ localParticipant = room.localParticipant;
4000
+ livekitState.status = 'connected';
4001
+ livekitState.error = null;
4002
+ livekitState.errorCode = null;
4003
+ backoffIdx = 0;
4004
+ console.log('✅ Connected to room:', roomName);
4005
+ console.log('\n⏳ Waiting for user to connect...');
4006
+ console.log(` Room: ${roomCode}\n`);
4007
+ return;
4008
+ }
4009
+ catch (err) {
4010
+ const msg = err instanceof Error ? err.message : String(err);
4011
+ // Categorize the error so the frontend can show specific guidance.
4012
+ // Substring matching on the message because LiveKit's rtc-node SDK
4013
+ // wraps the underlying HTTP status in a generic ConnectError.
4014
+ let errorCode;
4015
+ if (/429|connection minutes limit/i.test(msg))
4016
+ errorCode = 'quota_exceeded';
4017
+ else if (/401|403|unauthorized|invalid/i.test(msg))
4018
+ errorCode = 'auth';
4019
+ else if (/ENOTFOUND|ECONNREFUSED|ETIMEDOUT|network/i.test(msg))
4020
+ errorCode = 'network';
4021
+ else
4022
+ errorCode = 'unknown';
4023
+ livekitState.status = 'failed';
4024
+ livekitState.error = msg;
4025
+ livekitState.errorCode = errorCode;
4026
+ const waitMs = backoffSchedule[Math.min(backoffIdx, backoffSchedule.length - 1)];
4027
+ backoffIdx += 1;
4028
+ console.error(`❌ LiveKit connect failed (${errorCode}, attempt ${livekitState.attemptCount}): ${msg.substring(0, 200)}`);
4029
+ console.error(` Retrying in ${waitMs / 1000}s — process staying alive; /health remains 200 with livekit.status='failed'`);
4030
+ await new Promise(r => setTimeout(r, waitMs));
4031
+ // loop — try again
4032
+ }
4033
+ }
4034
+ };
4035
+ // Fire and forget: connectWithRetry() returns once connected, so it cannot
4036
+ // keep the process alive by itself; main() still awaits the keepalive below.
4037
+ // Errors that escape the retry loop should never happen, but if they do,
4038
+ // log them rather than crash.
4039
+ connectWithRetry().catch(err => {
4040
+ console.error('❌ Unrecoverable error in LiveKit retry loop (should not happen):', err);
4041
+ livekitState.status = 'failed';
4042
+ livekitState.error = err instanceof Error ? err.message : String(err);
4043
+ livekitState.errorCode = 'unrecoverable';
4044
+ });
4045
+ // Keep main() alive forever — without this the await chain ends and Node
4046
+ // exits 0, which Fly treats as a clean shutdown.
4047
+ await new Promise(() => { });
3950
4048
  }
3951
4049
  // Run
3952
4050
  main().catch(console.error);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "osborn",
3
- "version": "0.9.44",
3
+ "version": "0.9.45",
4
4
  "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
5
5
  "type": "module",
6
6
  "bin": {