osborn 0.9.44 → 0.9.45
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +115 -17
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -147,6 +147,24 @@ process.on('uncaughtException', (error) => {
|
|
|
147
147
|
// ============================================================
|
|
148
148
|
// Module-level room code so the HTTP server can expose it via GET /room-code
|
|
149
149
|
let currentRoomCode = null;
|
|
150
|
+
// Module-level LiveKit connection state. Shared between main() (which runs the
|
|
151
|
+
// connect-with-retry loop) and the /health handler in startApiServer (which
|
|
152
|
+
// reports it to the frontend so the user sees a meaningful error instead of a
|
|
153
|
+
// dashboard redirect when LiveKit is unreachable / out of quota / etc).
|
|
154
|
+
//
|
|
155
|
+
// We deliberately do NOT 503 /health on connect failure — Fly's machine
|
|
156
|
+
// health-check uses /health, and returning non-2xx triggers a restart loop
|
|
157
|
+
// which (a) burns the same failing LiveKit calls every 30s and (b) gets the
|
|
158
|
+
// machine killed after 3 failed restarts. By staying 200 OK and surfacing the
|
|
159
|
+
// status as a field, we keep the container alive long enough for LiveKit to
|
|
160
|
+
// recover (auto-retry) or for the user to read the error and upgrade quota.
|
|
161
|
+
/**
 * Mutable snapshot of the LiveKit connection, written by the connect/retry
 * code in main() and read by the /health handler.
 */
const livekitState = {
    // Lifecycle marker: 'connecting' | 'retrying' | 'connected' | 'failed'.
    status: 'connecting',
    // Message of the most recent connect failure; null when there is none.
    error: null,
    // Coarse category of that failure: 'quota_exceeded' | 'auth' | 'network' |
    // 'unknown' | 'unrecoverable'; null when there is none.
    errorCode: null,
    // Date.now() timestamp (ms) taken at the start of the latest attempt.
    lastAttemptAt: null,
    // Number of connect attempts started so far.
    attemptCount: 0,
};
|
|
150
168
|
function startApiServer(workingDir, port) {
|
|
151
169
|
const server = createServer(async (req, res) => {
|
|
152
170
|
// CORS headers for cloud frontend
|
|
@@ -221,7 +239,22 @@ function startApiServer(workingDir, port) {
|
|
|
221
239
|
}
|
|
222
240
|
catch { /* version optional */ }
|
|
223
241
|
res.writeHead(200, { 'Content-Type': 'application/json' });
|
|
224
|
-
res.end(JSON.stringify({
|
|
242
|
+
res.end(JSON.stringify({
|
|
243
|
+
status: 'ok',
|
|
244
|
+
workingDir,
|
|
245
|
+
version,
|
|
246
|
+
// LiveKit subsystem status — frontend can use this to surface a real
|
|
247
|
+
// error instead of treating the sandbox as totally broken. The HTTP
|
|
248
|
+
// status code stays 200 so Fly health-check stays green and the
|
|
249
|
+
// container isn't restart-looped while LiveKit is unreachable.
|
|
250
|
+
livekit: {
|
|
251
|
+
status: livekitState.status,
|
|
252
|
+
error: livekitState.error,
|
|
253
|
+
errorCode: livekitState.errorCode,
|
|
254
|
+
attemptCount: livekitState.attemptCount,
|
|
255
|
+
lastAttemptAt: livekitState.lastAttemptAt,
|
|
256
|
+
},
|
|
257
|
+
}));
|
|
225
258
|
return;
|
|
226
259
|
}
|
|
227
260
|
// POST /webhook/recall — Recall.ai real-time transcript webhooks
|
|
@@ -3931,22 +3964,87 @@ async function main() {
|
|
|
3931
3964
|
// ============================================================
|
|
3932
3965
|
// Connect to Room
|
|
3933
3966
|
// ============================================================
|
|
3934
|
-
|
|
3935
|
-
|
|
3936
|
-
|
|
3937
|
-
|
|
3938
|
-
|
|
3939
|
-
|
|
3940
|
-
|
|
3941
|
-
|
|
3942
|
-
|
|
3943
|
-
|
|
3944
|
-
|
|
3945
|
-
|
|
3946
|
-
|
|
3947
|
-
|
|
3948
|
-
|
|
3949
|
-
|
|
3967
|
+
// Connect to LiveKit with retry-on-failure.
//
// Earlier behavior: a single attempt followed by `process.exit(1)` on error.
// Combined with Fly's restart policy this produced a tight restart loop that
// (a) hit the same 429 / auth error every ~30s, (b) burned the LiveKit
// quota's retry budget, and (c) hit Fly's max-restart-count (3) and killed
// the machine — at which point the frontend's /api/sandbox probe saw the
// sandbox as failed and bounced the user back to the dashboard with no
// useful error.
//
// New behavior: bounded-backoff retry, infinite attempts. The API server
// stays up the whole time serving /health (which surfaces the LiveKit error
// as a field, NOT as an HTTP failure — see the /health handler comment).
// When the underlying issue is resolved (quota reset, key fixed, LiveKit
// service back), the next retry succeeds and the agent picks up where it
// left off without anyone needing to manually restart.
//
// Backoff: 5s → 10s → 20s → 40s → 60s (capped). The backoff position is local
// to a single call and the function returns on the first successful connect,
// so every new invocation starts again at 5s.

// Maps a connect-error message to the coarse category exposed via /health.
// Substring matching on the message because LiveKit's rtc-node SDK wraps the
// underlying HTTP status in a generic ConnectError. Status codes are matched
// only when not embedded in a longer digit run, so ports, durations or ids
// such as "44290" or "4031" are not mistaken for HTTP 429 / 403.
const classifyLivekitConnectError = (msg) => {
    if (/(?<!\d)429(?!\d)|connection minutes limit/i.test(msg)) {
        return 'quota_exceeded';
    }
    if (/(?<!\d)(?:401|403)(?!\d)|unauthorized|invalid/i.test(msg)) {
        return 'auth';
    }
    if (/ENOTFOUND|ECONNREFUSED|ETIMEDOUT|network/i.test(msg)) {
        return 'network';
    }
    return 'unknown';
};

// Attempts room.connect() until it succeeds, recording progress in
// livekitState along the way. Resolves (with no value) once connected; it is
// written so that connect failures never reject the returned promise.
const connectWithRetry = async () => {
    const backoffSchedule = [5_000, 10_000, 20_000, 40_000, 60_000];
    let backoffIdx = 0;
    while (true) {
        // The very first attempt reports 'connecting'; later ones 'retrying'.
        livekitState.status = livekitState.attemptCount === 0 ? 'connecting' : 'retrying';
        livekitState.lastAttemptAt = Date.now();
        livekitState.attemptCount += 1;
        try {
            await room.connect(livekitUrl, jwt, {
                autoSubscribe: true,
                dynacast: true,
            });
            localParticipant = room.localParticipant;
            livekitState.status = 'connected';
            livekitState.error = null;
            livekitState.errorCode = null;
            console.log('✅ Connected to room:', roomName);
            console.log('\n⏳ Waiting for user to connect...');
            console.log(`   Room: ${roomCode}\n`);
            return;
        }
        catch (err) {
            const msg = err instanceof Error ? err.message : String(err);
            // Categorize the error so the frontend can show specific guidance.
            const errorCode = classifyLivekitConnectError(msg);
            livekitState.status = 'failed';
            livekitState.error = msg;
            livekitState.errorCode = errorCode;
            // Clamp to the last entry so the delay stays at the 60s cap.
            const waitMs = backoffSchedule[Math.min(backoffIdx, backoffSchedule.length - 1)];
            backoffIdx += 1;
            console.error(`❌ LiveKit connect failed (${errorCode}, attempt ${livekitState.attemptCount}): ${msg.substring(0, 200)}`);
            console.error(`   Retrying in ${waitMs / 1000}s — process staying alive; /health remains 200 with livekit.status='failed'`);
            await new Promise((resolve) => setTimeout(resolve, waitMs));
            // loop — try again
        }
    }
};
|
|
4035
|
+
// Fire and forget: main() does not await the retry loop. The loop returns as
|
|
4036
|
+
// soon as a connect succeeds, so it does not replace the keepalive await below.
|
|
4037
|
+
// Errors that escape the retry loop should never happen, but if they do,
|
|
4038
|
+
// log them rather than crash.
|
|
4039
|
+
connectWithRetry().catch(err => {
|
|
4040
|
+
console.error('❌ Unrecoverable error in LiveKit retry loop (should not happen):', err);
|
|
4041
|
+
livekitState.status = 'failed';
|
|
4042
|
+
livekitState.error = err instanceof Error ? err.message : String(err);
|
|
4043
|
+
livekitState.errorCode = 'unrecoverable';
|
|
4044
|
+
});
|
|
4045
|
+
// Keep main() alive forever — without this the await chain ends and Node
|
|
4046
|
+
// exits 0, which Fly treats as a clean shutdown.
|
|
4047
|
+
await new Promise(() => { });
|
|
3950
4048
|
}
|
|
3951
4049
|
// Run
|
|
3952
4050
|
main().catch(console.error);
|