@agent-relay/cloud 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/api/admin.d.ts +8 -0
- package/dist/api/admin.d.ts.map +1 -0
- package/dist/api/admin.js +225 -0
- package/dist/api/admin.js.map +1 -0
- package/dist/api/auth.d.ts +20 -0
- package/dist/api/auth.d.ts.map +1 -0
- package/dist/api/auth.js +136 -0
- package/dist/api/auth.js.map +1 -0
- package/dist/api/billing.d.ts +7 -0
- package/dist/api/billing.d.ts.map +1 -0
- package/dist/api/billing.js +564 -0
- package/dist/api/billing.js.map +1 -0
- package/dist/api/cli-pty-runner.d.ts +53 -0
- package/dist/api/cli-pty-runner.d.ts.map +1 -0
- package/dist/api/cli-pty-runner.js +193 -0
- package/dist/api/cli-pty-runner.js.map +1 -0
- package/dist/api/codex-auth-helper.d.ts +21 -0
- package/dist/api/codex-auth-helper.d.ts.map +1 -0
- package/dist/api/codex-auth-helper.js +327 -0
- package/dist/api/codex-auth-helper.js.map +1 -0
- package/dist/api/consensus.d.ts +13 -0
- package/dist/api/consensus.d.ts.map +1 -0
- package/dist/api/consensus.js +261 -0
- package/dist/api/consensus.js.map +1 -0
- package/dist/api/coordinators.d.ts +8 -0
- package/dist/api/coordinators.d.ts.map +1 -0
- package/dist/api/coordinators.js +750 -0
- package/dist/api/coordinators.js.map +1 -0
- package/dist/api/daemons.d.ts +12 -0
- package/dist/api/daemons.d.ts.map +1 -0
- package/dist/api/daemons.js +535 -0
- package/dist/api/daemons.js.map +1 -0
- package/dist/api/generic-webhooks.d.ts +8 -0
- package/dist/api/generic-webhooks.d.ts.map +1 -0
- package/dist/api/generic-webhooks.js +129 -0
- package/dist/api/generic-webhooks.js.map +1 -0
- package/dist/api/git.d.ts +8 -0
- package/dist/api/git.d.ts.map +1 -0
- package/dist/api/git.js +269 -0
- package/dist/api/git.js.map +1 -0
- package/dist/api/github-app.d.ts +11 -0
- package/dist/api/github-app.d.ts.map +1 -0
- package/dist/api/github-app.js +223 -0
- package/dist/api/github-app.js.map +1 -0
- package/dist/api/middleware/planLimits.d.ts +43 -0
- package/dist/api/middleware/planLimits.d.ts.map +1 -0
- package/dist/api/middleware/planLimits.js +202 -0
- package/dist/api/middleware/planLimits.js.map +1 -0
- package/dist/api/monitoring.d.ts +11 -0
- package/dist/api/monitoring.d.ts.map +1 -0
- package/dist/api/monitoring.js +578 -0
- package/dist/api/monitoring.js.map +1 -0
- package/dist/api/nango-auth.d.ts +9 -0
- package/dist/api/nango-auth.d.ts.map +1 -0
- package/dist/api/nango-auth.js +674 -0
- package/dist/api/nango-auth.js.map +1 -0
- package/dist/api/onboarding.d.ts +15 -0
- package/dist/api/onboarding.d.ts.map +1 -0
- package/dist/api/onboarding.js +679 -0
- package/dist/api/onboarding.js.map +1 -0
- package/dist/api/policy.d.ts +8 -0
- package/dist/api/policy.d.ts.map +1 -0
- package/dist/api/policy.js +229 -0
- package/dist/api/policy.js.map +1 -0
- package/dist/api/provider-env.d.ts +14 -0
- package/dist/api/provider-env.d.ts.map +1 -0
- package/dist/api/provider-env.js +75 -0
- package/dist/api/provider-env.js.map +1 -0
- package/dist/api/providers.d.ts +7 -0
- package/dist/api/providers.d.ts.map +1 -0
- package/dist/api/providers.js +564 -0
- package/dist/api/providers.js.map +1 -0
- package/dist/api/repos.d.ts +8 -0
- package/dist/api/repos.d.ts.map +1 -0
- package/dist/api/repos.js +577 -0
- package/dist/api/repos.js.map +1 -0
- package/dist/api/sessions.d.ts +11 -0
- package/dist/api/sessions.d.ts.map +1 -0
- package/dist/api/sessions.js +302 -0
- package/dist/api/sessions.js.map +1 -0
- package/dist/api/teams.d.ts +7 -0
- package/dist/api/teams.d.ts.map +1 -0
- package/dist/api/teams.js +281 -0
- package/dist/api/teams.js.map +1 -0
- package/dist/api/test-helpers.d.ts +10 -0
- package/dist/api/test-helpers.d.ts.map +1 -0
- package/dist/api/test-helpers.js +745 -0
- package/dist/api/test-helpers.js.map +1 -0
- package/dist/api/usage.d.ts +7 -0
- package/dist/api/usage.d.ts.map +1 -0
- package/dist/api/usage.js +111 -0
- package/dist/api/usage.js.map +1 -0
- package/dist/api/webhooks.d.ts +8 -0
- package/dist/api/webhooks.d.ts.map +1 -0
- package/dist/api/webhooks.js +645 -0
- package/dist/api/webhooks.js.map +1 -0
- package/dist/api/workspaces.d.ts +25 -0
- package/dist/api/workspaces.d.ts.map +1 -0
- package/dist/api/workspaces.js +1799 -0
- package/dist/api/workspaces.js.map +1 -0
- package/dist/billing/index.d.ts +9 -0
- package/dist/billing/index.d.ts.map +1 -0
- package/dist/billing/index.js +9 -0
- package/dist/billing/index.js.map +1 -0
- package/dist/billing/plans.d.ts +39 -0
- package/dist/billing/plans.d.ts.map +1 -0
- package/dist/billing/plans.js +245 -0
- package/dist/billing/plans.js.map +1 -0
- package/dist/billing/service.d.ts +80 -0
- package/dist/billing/service.d.ts.map +1 -0
- package/dist/billing/service.js +388 -0
- package/dist/billing/service.js.map +1 -0
- package/dist/billing/types.d.ts +141 -0
- package/dist/billing/types.d.ts.map +1 -0
- package/dist/billing/types.js +7 -0
- package/dist/billing/types.js.map +1 -0
- package/dist/config.d.ts +5 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +5 -0
- package/dist/config.js.map +1 -0
- package/dist/db/bulk-ingest.d.ts +89 -0
- package/dist/db/bulk-ingest.d.ts.map +1 -0
- package/dist/db/bulk-ingest.js +268 -0
- package/dist/db/bulk-ingest.js.map +1 -0
- package/dist/db/drizzle.d.ts +256 -0
- package/dist/db/drizzle.d.ts.map +1 -0
- package/dist/db/drizzle.js +1286 -0
- package/dist/db/drizzle.js.map +1 -0
- package/dist/db/index.d.ts +55 -0
- package/dist/db/index.d.ts.map +1 -0
- package/dist/db/index.js +68 -0
- package/dist/db/index.js.map +1 -0
- package/dist/db/schema.d.ts +4873 -0
- package/dist/db/schema.d.ts.map +1 -0
- package/dist/db/schema.js +620 -0
- package/dist/db/schema.js.map +1 -0
- package/dist/index.d.ts +11 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +38 -0
- package/dist/index.js.map +1 -0
- package/dist/provisioner/index.d.ts +207 -0
- package/dist/provisioner/index.d.ts.map +1 -0
- package/dist/provisioner/index.js +2114 -0
- package/dist/provisioner/index.js.map +1 -0
- package/dist/server.d.ts +17 -0
- package/dist/server.d.ts.map +1 -0
- package/dist/server.js +1924 -0
- package/dist/server.js.map +1 -0
- package/dist/services/auto-scaler.d.ts +152 -0
- package/dist/services/auto-scaler.d.ts.map +1 -0
- package/dist/services/auto-scaler.js +439 -0
- package/dist/services/auto-scaler.js.map +1 -0
- package/dist/services/capacity-manager.d.ts +148 -0
- package/dist/services/capacity-manager.d.ts.map +1 -0
- package/dist/services/capacity-manager.js +449 -0
- package/dist/services/capacity-manager.js.map +1 -0
- package/dist/services/ci-agent-spawner.d.ts +49 -0
- package/dist/services/ci-agent-spawner.d.ts.map +1 -0
- package/dist/services/ci-agent-spawner.js +373 -0
- package/dist/services/ci-agent-spawner.js.map +1 -0
- package/dist/services/cloud-message-bus.d.ts +28 -0
- package/dist/services/cloud-message-bus.d.ts.map +1 -0
- package/dist/services/cloud-message-bus.js +19 -0
- package/dist/services/cloud-message-bus.js.map +1 -0
- package/dist/services/compute-enforcement.d.ts +57 -0
- package/dist/services/compute-enforcement.d.ts.map +1 -0
- package/dist/services/compute-enforcement.js +175 -0
- package/dist/services/compute-enforcement.js.map +1 -0
- package/dist/services/coordinator.d.ts +62 -0
- package/dist/services/coordinator.d.ts.map +1 -0
- package/dist/services/coordinator.js +389 -0
- package/dist/services/coordinator.js.map +1 -0
- package/dist/services/index.d.ts +17 -0
- package/dist/services/index.d.ts.map +1 -0
- package/dist/services/index.js +25 -0
- package/dist/services/index.js.map +1 -0
- package/dist/services/intro-expiration.d.ts +60 -0
- package/dist/services/intro-expiration.d.ts.map +1 -0
- package/dist/services/intro-expiration.js +252 -0
- package/dist/services/intro-expiration.js.map +1 -0
- package/dist/services/mention-handler.d.ts +65 -0
- package/dist/services/mention-handler.d.ts.map +1 -0
- package/dist/services/mention-handler.js +405 -0
- package/dist/services/mention-handler.js.map +1 -0
- package/dist/services/nango.d.ts +201 -0
- package/dist/services/nango.d.ts.map +1 -0
- package/dist/services/nango.js +392 -0
- package/dist/services/nango.js.map +1 -0
- package/dist/services/persistence.d.ts +131 -0
- package/dist/services/persistence.d.ts.map +1 -0
- package/dist/services/persistence.js +200 -0
- package/dist/services/persistence.js.map +1 -0
- package/dist/services/planLimits.d.ts +147 -0
- package/dist/services/planLimits.d.ts.map +1 -0
- package/dist/services/planLimits.js +335 -0
- package/dist/services/planLimits.js.map +1 -0
- package/dist/services/presence-registry.d.ts +56 -0
- package/dist/services/presence-registry.d.ts.map +1 -0
- package/dist/services/presence-registry.js +91 -0
- package/dist/services/presence-registry.js.map +1 -0
- package/dist/services/scaling-orchestrator.d.ts +159 -0
- package/dist/services/scaling-orchestrator.d.ts.map +1 -0
- package/dist/services/scaling-orchestrator.js +502 -0
- package/dist/services/scaling-orchestrator.js.map +1 -0
- package/dist/services/scaling-policy.d.ts +121 -0
- package/dist/services/scaling-policy.d.ts.map +1 -0
- package/dist/services/scaling-policy.js +415 -0
- package/dist/services/scaling-policy.js.map +1 -0
- package/dist/services/ssh-security.d.ts +31 -0
- package/dist/services/ssh-security.d.ts.map +1 -0
- package/dist/services/ssh-security.js +63 -0
- package/dist/services/ssh-security.js.map +1 -0
- package/dist/services/workspace-keepalive.d.ts +76 -0
- package/dist/services/workspace-keepalive.d.ts.map +1 -0
- package/dist/services/workspace-keepalive.js +234 -0
- package/dist/services/workspace-keepalive.js.map +1 -0
- package/dist/shims/consensus.d.ts +23 -0
- package/dist/shims/consensus.d.ts.map +1 -0
- package/dist/shims/consensus.js +5 -0
- package/dist/shims/consensus.js.map +1 -0
- package/dist/webhooks/index.d.ts +24 -0
- package/dist/webhooks/index.d.ts.map +1 -0
- package/dist/webhooks/index.js +29 -0
- package/dist/webhooks/index.js.map +1 -0
- package/dist/webhooks/parsers/github.d.ts +8 -0
- package/dist/webhooks/parsers/github.d.ts.map +1 -0
- package/dist/webhooks/parsers/github.js +234 -0
- package/dist/webhooks/parsers/github.js.map +1 -0
- package/dist/webhooks/parsers/index.d.ts +23 -0
- package/dist/webhooks/parsers/index.d.ts.map +1 -0
- package/dist/webhooks/parsers/index.js +30 -0
- package/dist/webhooks/parsers/index.js.map +1 -0
- package/dist/webhooks/parsers/linear.d.ts +9 -0
- package/dist/webhooks/parsers/linear.d.ts.map +1 -0
- package/dist/webhooks/parsers/linear.js +258 -0
- package/dist/webhooks/parsers/linear.js.map +1 -0
- package/dist/webhooks/parsers/slack.d.ts +9 -0
- package/dist/webhooks/parsers/slack.d.ts.map +1 -0
- package/dist/webhooks/parsers/slack.js +214 -0
- package/dist/webhooks/parsers/slack.js.map +1 -0
- package/dist/webhooks/responders/github.d.ts +8 -0
- package/dist/webhooks/responders/github.d.ts.map +1 -0
- package/dist/webhooks/responders/github.js +73 -0
- package/dist/webhooks/responders/github.js.map +1 -0
- package/dist/webhooks/responders/index.d.ts +23 -0
- package/dist/webhooks/responders/index.d.ts.map +1 -0
- package/dist/webhooks/responders/index.js +30 -0
- package/dist/webhooks/responders/index.js.map +1 -0
- package/dist/webhooks/responders/linear.d.ts +9 -0
- package/dist/webhooks/responders/linear.d.ts.map +1 -0
- package/dist/webhooks/responders/linear.js +149 -0
- package/dist/webhooks/responders/linear.js.map +1 -0
- package/dist/webhooks/responders/slack.d.ts +20 -0
- package/dist/webhooks/responders/slack.d.ts.map +1 -0
- package/dist/webhooks/responders/slack.js +178 -0
- package/dist/webhooks/responders/slack.js.map +1 -0
- package/dist/webhooks/router.d.ts +25 -0
- package/dist/webhooks/router.d.ts.map +1 -0
- package/dist/webhooks/router.js +504 -0
- package/dist/webhooks/router.js.map +1 -0
- package/dist/webhooks/rules-engine.d.ts +24 -0
- package/dist/webhooks/rules-engine.d.ts.map +1 -0
- package/dist/webhooks/rules-engine.js +287 -0
- package/dist/webhooks/rules-engine.js.map +1 -0
- package/dist/webhooks/types.d.ts +186 -0
- package/dist/webhooks/types.d.ts.map +1 -0
- package/dist/webhooks/types.js +8 -0
- package/dist/webhooks/types.js.map +1 -0
- package/package.json +55 -0
package/dist/provisioner/index.js
@@ -0,0 +1,2114 @@
/**
 * Agent Relay Cloud - Workspace Provisioner
 *
 * One-click provisioning for compute resources (Fly.io, Railway, Docker).
 */
import * as crypto from 'crypto';
import { createHash } from 'node:crypto';
import { getConfig } from '../config.js';
import { db } from '../db/index.js';
import { nangoService } from '../services/nango.js';
import { canAutoScale, canScaleToTier, getResourceTierForPlan, getPlanLimits, } from '../services/planLimits.js';
import { deriveSshPassword } from '../services/ssh-security.js';
// ============================================================================
// Daemon API Key Management
// ============================================================================
/**
 * Generate a daemon API key in the format ar_live_<64 hex chars>
 */
function generateDaemonApiKey() {
    const random = crypto.randomBytes(32).toString('hex');
    return `ar_live_${random}`;
}
/**
 * Hash an API key for secure storage
 */
function hashApiKey(apiKey) {
    return createHash('sha256').update(apiKey).digest('hex');
}
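// ----------------------------------------------------------------------------
// Illustrative sketch (editor's addition, not part of the published bundle):
// the hash-then-store pattern above implies verification by recomputing the
// sha256 of a presented key and looking it up. The `db.linkedDaemons.findByApiKeyHash`
// helper is assumed for illustration; only hashApiKey comes from this module.
// ----------------------------------------------------------------------------
// async function verifyDaemonApiKey(apiKey) {
//     const candidateHash = hashApiKey(apiKey); // sha256 hex, as stored at creation
//     const daemon = await db.linkedDaemons.findByApiKeyHash(candidateHash);
//     return daemon ?? null; // no match -> reject the daemon connection
// }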
/**
 * Create a linked daemon record for a workspace during provisioning
 * @param preGeneratedApiKey - Pre-generated API key (if not provided, one will be generated)
 */
async function createLinkedDaemon(userId, workspaceId, machineId, preGeneratedApiKey) {
    const apiKey = preGeneratedApiKey ?? generateDaemonApiKey();
    const apiKeyHash = hashApiKey(apiKey);
    const daemon = await db.linkedDaemons.create({
        userId,
        workspaceId,
        name: `auto-provisioned-${Date.now()}`,
        machineId,
        apiKeyHash,
        status: 'offline',
    });
    return { daemonId: daemon.id, apiKey };
}
const WORKSPACE_PORT = 3888;
const WORKSPACE_HEALTH_PORT = 3889; // Health check on separate thread - always responsive
const WORKSPACE_SSH_PORT = 3022;
const CODEX_OAUTH_PORT = 1455; // Codex CLI OAuth callback port - must be mapped for local dev
const FETCH_TIMEOUT_MS = 10_000;
const WORKSPACE_IMAGE = process.env.WORKSPACE_IMAGE || 'ghcr.io/agentworkforce/relay-workspace:latest';
// In-memory tracker for provisioning progress (workspace ID -> progress)
const provisioningProgress = new Map();
/**
 * Update the provisioning stage for a workspace
 */
function updateProvisioningStage(workspaceId, stage) {
    const existing = provisioningProgress.get(workspaceId);
    provisioningProgress.set(workspaceId, {
        stage,
        startedAt: existing?.startedAt ?? Date.now(),
        updatedAt: Date.now(),
    });
    console.log(`[provisioner] Workspace ${workspaceId.substring(0, 8)} stage: ${stage}`);
}
/**
 * Get the current provisioning stage for a workspace
 */
export function getProvisioningStage(workspaceId) {
    return provisioningProgress.get(workspaceId) ?? null;
}
/**
 * Clear provisioning progress (call when complete or failed)
 */
function clearProvisioningProgress(workspaceId) {
    provisioningProgress.delete(workspaceId);
}
/**
 * Schedule cleanup of provisioning progress after a delay
 * This gives the frontend time to poll and see the 'complete' stage
 */
function scheduleProgressCleanup(workspaceId, delayMs = 30_000) {
    setTimeout(() => {
        clearProvisioningProgress(workspaceId);
        console.log(`[provisioner] Cleaned up provisioning progress for ${workspaceId.substring(0, 8)}`);
    }, delayMs);
}
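// ----------------------------------------------------------------------------
// Illustrative sketch (editor's addition, not part of the published bundle):
// how a status endpoint could expose the in-memory tracker so the frontend
// can poll during provisioning. The Express-style handler and route path are
// assumptions; only getProvisioningStage comes from this module.
// ----------------------------------------------------------------------------
// app.get('/api/workspaces/:id/provisioning', (req, res) => {
//     const progress = getProvisioningStage(req.params.id);
//     if (!progress) {
//         return res.status(404).json({ stage: null });
//     }
//     res.json(progress); // { stage, startedAt, updatedAt }
// });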
/**
 * Get a fresh GitHub App installation token from Nango.
 * Looks up the user's connected repositories to find a valid Nango connection.
 */
async function getGithubAppTokenForUser(userId) {
    try {
        // Find any repository with a Nango connection for this user
        const repos = await db.repositories.findByUserId(userId);
        const repoWithConnection = repos.find(r => r.nangoConnectionId);
        if (!repoWithConnection?.nangoConnectionId) {
            console.warn(`[provisioner] No Nango GitHub App connection found for user ${userId}`);
            return null;
        }
        // Get fresh installation token from Nango (handles refresh automatically)
        const token = await nangoService.getGithubAppToken(repoWithConnection.nangoConnectionId);
        return token;
    }
    catch (error) {
        console.error(`[provisioner] Failed to get GitHub App token for user ${userId}:`, error);
        return null;
    }
}
async function wait(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
}
async function fetchWithRetry(url, options = {}) {
    const retries = options.retries ?? 2;
    let attempt = 0;
    while (attempt <= retries) {
        const controller = new AbortController();
        const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);
        try {
            const response = await fetch(url, { ...options, signal: controller.signal });
            clearTimeout(timer);
            if (!response.ok && response.status >= 500 && attempt < retries) {
                attempt += 1;
                await wait(500 * attempt);
                continue;
            }
            return response;
        }
        catch (error) {
            clearTimeout(timer);
            if (attempt >= retries) {
                throw error;
            }
            attempt += 1;
            await wait(500 * attempt);
        }
    }
    throw new Error('fetchWithRetry exhausted retries');
}
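// ----------------------------------------------------------------------------
// Illustrative usage (editor's addition, not part of the published bundle):
// fetchWithRetry takes standard fetch options plus a non-standard `retries`
// count; each attempt is aborted after FETCH_TIMEOUT_MS, and 5xx responses or
// network errors are retried with linear backoff (500ms * attempt number).
// `apiToken` here is a placeholder.
// ----------------------------------------------------------------------------
// const res = await fetchWithRetry('https://api.machines.dev/v1/apps', {
//     method: 'GET',
//     headers: { Authorization: `Bearer ${apiToken}` },
//     retries: 3, // overrides the default of 2
// });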
async function softHealthCheck(url) {
    try {
        const res = await fetchWithRetry(`${url.replace(/\/$/, '')}/health`, { method: 'GET', retries: 1 });
        if (!res.ok) {
            console.warn(`[health] Non-200 from ${url}/health: ${res.status}`);
        }
    }
    catch (error) {
        console.warn(`[health] Failed to reach ${url}/health`, error);
    }
}
/**
 * Wait for machine to be in "started" state using Fly.io's /wait endpoint
 * This is more efficient than polling - the API blocks until the state is reached
 * @see https://fly.io/docs/machines/api/machines-resource/#wait-for-a-machine-to-reach-a-specific-state
 */
async function waitForMachineStarted(apiToken, appName, machineId, timeoutSeconds = 120) {
    console.log(`[provisioner] Waiting for machine ${machineId} to start (timeout: ${timeoutSeconds}s)...`);
    // Fly.io /wait endpoint has max timeout of 60s, so we need to loop for longer waits
    const maxSingleWait = 60;
    const startTime = Date.now();
    const deadline = startTime + timeoutSeconds * 1000;
    while (Date.now() < deadline) {
        const remainingMs = deadline - Date.now();
        const waitSeconds = Math.min(maxSingleWait, Math.ceil(remainingMs / 1000));
        if (waitSeconds <= 0)
            break;
        try {
            // Use Fly.io's /wait endpoint - blocks until machine reaches target state
            // timeout is an integer in seconds (max 60)
            const res = await fetch(`https://api.machines.dev/v1/apps/${appName}/machines/${machineId}/wait?state=started&timeout=${waitSeconds}`, {
                headers: { Authorization: `Bearer ${apiToken}` },
            });
            if (res.ok) {
                console.log(`[provisioner] Machine ${machineId} is now started`);
                return;
            }
            // 408 = timeout, machine didn't reach state in time - try again if we have time
            if (res.status === 408) {
                console.log(`[provisioner] Machine ${machineId} not ready yet, continuing to wait...`);
                continue;
            }
            // Other error
            const errorText = await res.text();
            throw new Error(`Wait for machine failed: ${res.status} ${errorText}`);
        }
        catch (error) {
            if (error instanceof Error && error.message.includes('Wait for machine failed')) {
                throw error;
            }
            console.warn(`[provisioner] Error waiting for machine:`, error);
            throw new Error(`Failed to wait for machine ${machineId}: ${error.message}`);
        }
    }
    // Timeout reached - get current state for error message
    const stateRes = await fetch(`https://api.machines.dev/v1/apps/${appName}/machines/${machineId}`, { headers: { Authorization: `Bearer ${apiToken}` } });
    const machine = stateRes.ok ? (await stateRes.json()) : { state: 'unknown' };
    throw new Error(`Machine ${machineId} did not start within ${timeoutSeconds}s (last state: ${machine.state})`);
}
/**
 * Wait for health check to pass (with DNS propagation time)
 * Tries internal Fly network first if available, then falls back to public URL
 */
async function waitForHealthy(url, appName, maxWaitMs = 60_000 // Reduced from 90s - health check is best-effort anyway
) {
    const startTime = Date.now();
    // Build list of URLs to try - internal first (faster, more reliable from inside Fly)
    const urlsToTry = [];
    // If running on Fly and app name provided, try internal network first
    const isOnFly = !!process.env.FLY_APP_NAME;
    if (isOnFly && appName) {
        urlsToTry.push(`http://${appName}.internal:8080/health`);
    }
    // Always add the public URL as fallback
    urlsToTry.push(`${url.replace(/\/$/, '')}/health`);
    console.log(`[provisioner] Waiting for workspace to become healthy (trying: ${urlsToTry.join(', ')})...`);
    // Exponential backoff: start at 1s, max 5s (reduces unnecessary polling)
    let retryDelayMs = 1000;
    const maxRetryDelayMs = 5000;
    while (Date.now() - startTime < maxWaitMs) {
        // Try each URL in order
        for (const healthUrl of urlsToTry) {
            try {
                const controller = new AbortController();
                const timer = setTimeout(() => controller.abort(), 5_000);
                const res = await fetch(healthUrl, {
                    method: 'GET',
                    signal: controller.signal,
                });
                clearTimeout(timer);
                if (res.ok) {
                    console.log(`[provisioner] Health check passed via ${healthUrl}`);
                    return;
                }
                console.log(`[provisioner] Health check to ${healthUrl} returned ${res.status}`);
            }
            catch (error) {
                const elapsed = Math.round((Date.now() - startTime) / 1000);
                const errMsg = error.message;
                // Only log detailed error for last URL attempt
                if (healthUrl === urlsToTry[urlsToTry.length - 1]) {
                    console.log(`[provisioner] Health check failed (${elapsed}s elapsed): ${errMsg}`);
                }
            }
        }
        await wait(retryDelayMs);
        // Exponential backoff with cap
        retryDelayMs = Math.min(retryDelayMs * 1.5, maxRetryDelayMs);
    }
    // Don't throw - workspace is provisioned, health check is best-effort
    console.warn(`[provisioner] Health check did not pass within ${maxWaitMs}ms, continuing anyway`);
}
// Resource tiers sized for Claude Code agents (~1-2GB RAM per agent)
// cpuKind: 'shared' = cheaper but can be throttled, 'performance' = dedicated
// Note: Team tier (large) uses shared CPUs for better margins (~50% vs ~7% with perf)
export const RESOURCE_TIERS = {
    small: { name: 'small', cpuCores: 2, memoryMb: 2048, maxAgents: 2, cpuKind: 'shared' },
    medium: { name: 'medium', cpuCores: 2, memoryMb: 4096, maxAgents: 5, cpuKind: 'shared' },
    large: { name: 'large', cpuCores: 4, memoryMb: 8192, maxAgents: 10, cpuKind: 'shared' },
    xlarge: { name: 'xlarge', cpuCores: 8, memoryMb: 16384, maxAgents: 20, cpuKind: 'performance' },
};
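// ----------------------------------------------------------------------------
// Illustrative sketch (editor's addition, not part of the published bundle):
// a plausible plan-to-tier mapping behind getResourceTierForPlan (imported
// from ../services/planLimits.js). The exact mapping is an assumption; the
// provision() method below relies only on the returned key indexing into
// RESOURCE_TIERS.
// ----------------------------------------------------------------------------
// function getResourceTierForPlanSketch(plan) {
//     switch (plan) {
//         case 'free': return 'small';
//         case 'pro': return 'medium';
//         case 'team': return 'large'; // "Team tier (large)" per the note above
//         default: return 'small';
//     }
// }
// const tier = RESOURCE_TIERS[getResourceTierForPlanSketch('pro')]; // medium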
/**
 * Fly.io provisioner
 */
class FlyProvisioner {
    apiToken;
    org;
    region;
    workspaceDomain;
    cloudApiUrl;
    sessionSecret;
    registryAuth;
    snapshotRetentionDays;
    volumeSizeGb;
    constructor() {
        const config = getConfig();
        if (!config.compute.fly) {
            throw new Error('Fly.io configuration missing');
        }
        this.apiToken = config.compute.fly.apiToken;
        this.org = config.compute.fly.org;
        this.region = config.compute.fly.region || 'sjc';
        this.workspaceDomain = config.compute.fly.workspaceDomain;
        this.registryAuth = config.compute.fly.registryAuth;
        this.cloudApiUrl = config.publicUrl;
        this.sessionSecret = config.sessionSecret;
        // Snapshot settings: default 14 days retention, 10GB volume
        this.snapshotRetentionDays = Math.min(60, Math.max(1, config.compute.fly.snapshotRetentionDays ?? 14));
        this.volumeSizeGb = config.compute.fly.volumeSizeGb ?? 10;
    }
    /**
     * Generate a workspace token for API authentication
     * This is a simple HMAC - in production, consider using JWTs
     */
    generateWorkspaceToken(workspaceId) {
        return crypto
            .createHmac('sha256', this.sessionSecret)
            .update(`workspace:${workspaceId}`)
            .digest('hex');
    }
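    // ------------------------------------------------------------------------
    // Illustrative sketch (editor's addition, not part of the published bundle):
    // the server side can verify a presented workspace token by recomputing the
    // HMAC and comparing in constant time. crypto.timingSafeEqual is standard
    // Node; the verify helper itself is an assumption.
    // ------------------------------------------------------------------------
    // verifyWorkspaceToken(workspaceId, presentedToken) {
    //     const expected = Buffer.from(this.generateWorkspaceToken(workspaceId), 'hex');
    //     const presented = Buffer.from(presentedToken, 'hex');
    //     // Length check first: timingSafeEqual throws on mismatched lengths
    //     return expected.length === presented.length && crypto.timingSafeEqual(expected, presented);
    // }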
    /**
     * Create a volume with automatic snapshot settings
     * Fly.io takes daily snapshots automatically; we configure retention
     */
    async createVolume(appName) {
        const volumeName = 'workspace_data';
        console.log(`[fly] Creating volume ${volumeName} with ${this.snapshotRetentionDays}-day snapshot retention...`);
        const response = await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/volumes`, {
            method: 'POST',
            headers: {
                Authorization: `Bearer ${this.apiToken}`,
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({
                name: volumeName,
                region: this.region,
                size_gb: this.volumeSizeGb,
                // Enable automatic daily snapshots (default is true, but be explicit)
                auto_backup_enabled: true,
                // Retain snapshots for configured days (default 5, we use 14)
                snapshot_retention: this.snapshotRetentionDays,
            }),
        });
        if (!response.ok) {
            const error = await response.text();
            throw new Error(`Failed to create volume: ${error}`);
        }
        const volume = await response.json();
        console.log(`[fly] Volume ${volume.id} created with auto-snapshots (${this.snapshotRetentionDays} days retention)`);
        return volume;
    }
    /**
     * Create an on-demand snapshot of a workspace volume
     * Use before risky operations or as manual backup
     */
    async createSnapshot(appName, volumeId) {
        console.log(`[fly] Creating on-demand snapshot for volume ${volumeId}...`);
        const response = await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/volumes/${volumeId}/snapshots`, {
            method: 'POST',
            headers: {
                Authorization: `Bearer ${this.apiToken}`,
                'Content-Type': 'application/json',
            },
        });
        if (!response.ok) {
            const error = await response.text();
            throw new Error(`Failed to create snapshot: ${error}`);
        }
        const snapshot = await response.json();
        console.log(`[fly] Snapshot ${snapshot.id} created`);
        return snapshot;
    }
    /**
     * List snapshots for a workspace volume
     */
    async listSnapshots(appName, volumeId) {
        const response = await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/volumes/${volumeId}/snapshots`, {
            headers: {
                Authorization: `Bearer ${this.apiToken}`,
            },
        });
        if (!response.ok) {
            return [];
        }
        return await response.json();
    }
    /**
     * Get volume info for a workspace
     */
    async getVolume(appName) {
        const response = await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/volumes`, {
            headers: {
                Authorization: `Bearer ${this.apiToken}`,
            },
        });
        if (!response.ok) {
            return null;
        }
        const volumes = await response.json();
        return volumes.find(v => v.name === 'workspace_data') || null;
    }
    async provision(workspace, credentials) {
        const appName = `ar-${workspace.id.substring(0, 8)}`;
        // Stage: Creating workspace
        updateProvisioningStage(workspace.id, 'creating');
        // Create Fly app
        await fetchWithRetry('https://api.machines.dev/v1/apps', {
            method: 'POST',
            headers: {
                Authorization: `Bearer ${this.apiToken}`,
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({
                app_name: appName,
                org_slug: this.org,
            }),
        });
        // Stage: Networking
        updateProvisioningStage(workspace.id, 'networking');
        // Allocate IPs for the app (required for public DNS)
        // Must use GraphQL API - Machines REST API doesn't support IP allocation
        // IMPORTANT: We use dedicated IPv4 ($2/mo) instead of shared because:
        // - Shared IPv4 doesn't properly handle raw TCP on non-standard ports (like SSH on 3022)
        // - SSH tunnel connections fail with "Connection closed by remote host" on shared IPs
        // - Dedicated IPv4 is required for raw TCP services to work correctly
        console.log(`[fly] Allocating IPs for ${appName}...`);
        const allocateIP = async (type) => {
            try {
                // Map our type to Fly GraphQL enum (v4 = dedicated IPv4)
                const graphqlType = type;
                const res = await fetchWithRetry('https://api.fly.io/graphql', {
                    method: 'POST',
                    headers: {
                        Authorization: `Bearer ${this.apiToken}`,
                        'Content-Type': 'application/json',
                    },
                    body: JSON.stringify({
                        query: `
                            mutation AllocateIPAddress($input: AllocateIPAddressInput!) {
                                allocateIpAddress(input: $input) {
                                    ipAddress {
                                        id
                                        address
                                        type
                                    }
                                }
                            }
                        `,
                        variables: {
                            input: {
                                appId: appName,
                                type: graphqlType,
                            },
                        },
                    }),
                });
                if (!res.ok) {
                    const errorText = await res.text();
                    console.warn(`[fly] Failed to allocate ${type}: ${res.status} ${errorText}`);
                    return false;
                }
                const data = await res.json();
                if (data.errors?.length) {
                    // Ignore "already allocated" errors
                    const alreadyAllocated = data.errors.some(e => e.message.includes('already') || e.message.includes('exists'));
                    if (!alreadyAllocated) {
                        console.warn(`[fly] GraphQL error allocating ${type}: ${data.errors[0].message}`);
                        return false;
                    }
                    console.log(`[fly] IP ${type} already allocated`);
                    return true;
                }
                const address = data.data?.allocateIpAddress?.ipAddress?.address;
                console.log(`[fly] Allocated ${type}: ${address}`);
                return true;
            }
            catch (err) {
                console.warn(`[fly] Failed to allocate ${type}: ${err.message}`);
                return false;
            }
        };
        const [v4Result, v6Result] = await Promise.all([
            allocateIP('v4'),
            allocateIP('v6'),
        ]);
        console.log(`[fly] IP allocation results: v4=${v4Result}, v6=${v6Result}`);
        // Stage: Secrets
        updateProvisioningStage(workspace.id, 'secrets');
        // Set secrets (provider credentials)
        const secrets = {};
        for (const [provider, token] of credentials) {
            secrets[`${provider.toUpperCase()}_TOKEN`] = token;
            // Also set GH_TOKEN for gh CLI compatibility
            if (provider === 'github') {
                secrets['GH_TOKEN'] = token;
            }
        }
        if (Object.keys(secrets).length > 0) {
            await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/secrets`, {
                method: 'POST',
                headers: {
                    Authorization: `Bearer ${this.apiToken}`,
                    'Content-Type': 'application/json',
                },
                body: JSON.stringify(secrets),
            });
        }
        // If custom workspace domain is configured, add certificate
        const customHostname = this.workspaceDomain
            ? `${appName}.${this.workspaceDomain}`
            : null;
        if (customHostname) {
            await this.allocateCertificate(appName, customHostname);
        }
        // Stage: Machine (includes volume creation)
        updateProvisioningStage(workspace.id, 'machine');
        // Generate API key for cloud message sync BEFORE creating the machine
        // The key is set as an env var on the machine and stored hashed in linkedDaemons
        const machineApiKey = generateDaemonApiKey();
        // Create volume with automatic daily snapshots before machine
        // Fly.io takes daily snapshots automatically; we configure retention
        const volume = await this.createVolume(appName);
        // Determine instance size based on user's plan using RESOURCE_TIERS
        const user = await db.users.findById(workspace.userId);
        const userPlan = user?.plan || 'free';
        const planLimits = getPlanLimits(userPlan);
        const isFreeTier = userPlan === 'free';
        // Check if user is in introductory period (first 14 days)
        // Free users get Pro-level resources during intro period
        const INTRO_PERIOD_DAYS = 14;
        const userCreatedAt = user?.createdAt ? new Date(user.createdAt) : new Date();
        const daysSinceSignup = (Date.now() - userCreatedAt.getTime()) / (1000 * 60 * 60 * 24);
        const isIntroPeriod = isFreeTier && daysSinceSignup < INTRO_PERIOD_DAYS;
        // Get the appropriate resource tier for this plan
        // Intro period free users get 'pro' tier resources
        const effectivePlan = isIntroPeriod ? 'pro' : userPlan;
        const tierName = getResourceTierForPlan(effectivePlan);
        const tier = RESOURCE_TIERS[tierName];
        const guestConfig = {
            cpu_kind: tier.cpuKind,
            cpus: tier.cpuCores,
            memory_mb: tier.memoryMb,
        };
        // Get max agents for the effective plan (intro users get pro limits)
        const effectiveLimits = isIntroPeriod ? getPlanLimits('pro') : planLimits;
        const maxAgents = effectiveLimits.maxConcurrentAgents === Infinity
            ? 100 // Cap at 100 for practical purposes
            : effectiveLimits.maxConcurrentAgents;
        if (isIntroPeriod) {
            const daysRemaining = Math.ceil(INTRO_PERIOD_DAYS - daysSinceSignup);
            console.log(`[fly] Introductory bonus active (${daysRemaining} days remaining) - using ${tierName} tier`);
        }
        console.log(`[fly] Using ${tierName} tier: ${guestConfig.cpus} CPU / ${guestConfig.memory_mb}MB / max ${maxAgents} agents for ${userPlan} plan`);
        // Create machine with auto-stop/start for cost optimization
        const machineResponse = await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/machines`, {
            method: 'POST',
            headers: {
                Authorization: `Bearer ${this.apiToken}`,
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({
                region: this.region,
                config: {
                    image: WORKSPACE_IMAGE,
                    // Registry auth for private ghcr.io images
                    ...(this.registryAuth && {
                        image_registry_auth: {
                            registry: 'ghcr.io',
                            username: this.registryAuth.username,
                            password: this.registryAuth.password,
                        },
                    }),
                    env: {
                        WORKSPACE_ID: workspace.id,
                        WORKSPACE_OWNER_USER_ID: workspace.userId,
                        SUPERVISOR_ENABLED: String(workspace.config.supervisorEnabled ?? false),
                        MAX_AGENTS: String(maxAgents),
                        REPOSITORIES: (workspace.config.repositories ?? []).join(','),
                        PROVIDERS: (workspace.config.providers ?? []).join(','),
                        PORT: String(WORKSPACE_PORT),
                        AGENT_RELAY_DASHBOARD_PORT: String(WORKSPACE_PORT),
                        // Store repos on persistent volume (/data) so they survive container restarts
                        // Without this, repos are cloned to /workspace (ephemeral) and lost on restart
                        WORKSPACE_DIR: '/data/repos',
                        // Git gateway configuration
                        CLOUD_API_URL: this.cloudApiUrl,
                        WORKSPACE_TOKEN: this.generateWorkspaceToken(workspace.id),
                        // Daemon API key for cloud message sync
                        // Auto-generated during provisioning, stored in linkedDaemons table
                        AGENT_RELAY_API_KEY: machineApiKey,
                        // SSH for CLI tunneling (Codex OAuth callback forwarding)
                        // Each workspace gets a unique password derived from its ID + secret salt
                        ENABLE_SSH: 'true',
                        SSH_PASSWORD: deriveSshPassword(workspace.id),
                        SSH_PORT: String(WORKSPACE_SSH_PORT),
                        // Enable cloud persistence for agent sessions/summaries via API
                        RELAY_CLOUD_ENABLED: 'true',
                    },
                    services: [
                        {
                            ports: [
                                {
                                    port: 443,
                                    handlers: ['tls', 'http'],
                                    // Force HTTP/1.1 to backend for WebSocket upgrade compatibility
                                    // HTTP/2 doesn't support traditional WebSocket upgrade mechanism
                                    http_options: {
                                        h2_backend: false,
                                    },
                                },
                                { port: 80, handlers: ['http'] },
                            ],
                            protocol: 'tcp',
                            internal_port: WORKSPACE_PORT,
                            // Auto-stop after inactivity to reduce costs
                            // Fly Proxy automatically wakes machines on incoming requests
                            auto_stop_machines: 'stop', // stop (not suspend) for faster wake
                            auto_start_machines: true,
                            min_machines_running: 0,
                            // Idle timeout before auto-stop (in seconds)
                            // Longer timeout = better UX, shorter = lower costs
                            concurrency: {
                                type: 'requests',
                                soft_limit: 25,
                                hard_limit: 50,
                            },
                        },
                        // SSH service for CLI tunneling (Codex OAuth callback forwarding)
                        // Exposes port 3022 publicly for SSH connections from user's machine
                        {
                            ports: [
                                {
                                    port: WORKSPACE_SSH_PORT,
                                    handlers: [], // Empty handlers = raw TCP passthrough
                                },
                            ],
                            protocol: 'tcp',
                            internal_port: WORKSPACE_SSH_PORT,
                            // SSH connections should also wake the machine
                            auto_stop_machines: 'stop',
                            auto_start_machines: true,
                            min_machines_running: 0,
                        },
                    ],
                    checks: {
                        health: {
                            type: 'http',
                            port: WORKSPACE_HEALTH_PORT, // Health worker thread - responds even when main loop blocked
                            path: '/health',
                            interval: '30s',
                            timeout: '10s', // Increased timeout for safety
                            grace_period: '30s', // Longer grace period for startup
                        },
                    },
                    // Instance size based on plan - free tier gets smaller instance
                    guest: guestConfig,
                    // Mount the volume we created with snapshot settings
                    mounts: [
                        {
                            volume: volume.id,
                            path: '/data',
                        },
                    ],
                },
            }),
        });
        if (!machineResponse.ok) {
            const error = await machineResponse.text();
            throw new Error(`Failed to create Fly machine: ${error}`);
        }
        const machine = (await machineResponse.json());
        // Create linked daemon for cloud message sync
        // Pass the pre-generated API key so it matches what was set in the machine env vars
        const { daemonId } = await createLinkedDaemon(workspace.userId, workspace.id, machine.id, // Use Fly machine ID as daemon's machine ID
        machineApiKey);
        console.log(`[fly] Created linked daemon ${daemonId.substring(0, 8)} for workspace ${workspace.id.substring(0, 8)}`);
        // Return custom domain URL if configured, otherwise default fly.dev
        const publicUrl = customHostname
            ? `https://${customHostname}`
            : `https://${appName}.fly.dev`;
        // Stage: Booting
        updateProvisioningStage(workspace.id, 'booting');
        // Wait for machine to be in started state
        await waitForMachineStarted(this.apiToken, appName, machine.id);
        // Stage: Health check
        updateProvisioningStage(workspace.id, 'health');
        // Wait for health check to pass (includes DNS propagation time)
        // Pass appName to enable internal Fly network health checks
        await waitForHealthy(publicUrl, appName);
        // Stage: Complete
        updateProvisioningStage(workspace.id, 'complete');
        // Schedule cleanup of provisioning progress after 30s (gives frontend time to see 'complete')
        scheduleProgressCleanup(workspace.id);
        return {
            computeId: machine.id,
            publicUrl,
        };
    }
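    // ------------------------------------------------------------------------
    // Illustrative usage (editor's addition, not part of the published bundle):
    // a caller passes the workspace row plus a Map of provider credentials; the
    // variable names are placeholders inferred from the signature above.
    // ------------------------------------------------------------------------
    // const provisioner = new FlyProvisioner();
    // const { computeId, publicUrl } = await provisioner.provision(workspace, new Map([
    //     ['github', githubInstallationToken], // becomes GITHUB_TOKEN + GH_TOKEN secrets
    // ]));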
|
|
678
|
+
/**
|
|
679
|
+
* Allocate SSL certificate for custom domain
|
|
680
|
+
*/
|
|
681
|
+
async allocateCertificate(appName, hostname) {
|
|
682
|
+
const response = await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/certificates`, {
|
|
683
|
+
method: 'POST',
|
|
684
|
+
headers: {
|
|
685
|
+
Authorization: `Bearer ${this.apiToken}`,
|
|
686
|
+
'Content-Type': 'application/json',
|
|
687
|
+
},
|
|
688
|
+
body: JSON.stringify({ hostname }),
|
|
689
|
+
});
|
|
690
|
+
if (!response.ok) {
|
|
691
|
+
const error = await response.text();
|
|
692
|
+
// Don't fail if cert already exists
|
|
693
|
+
if (!error.includes('already exists')) {
|
|
694
|
+
throw new Error(`Failed to allocate certificate for ${hostname}: ${error}`);
|
|
695
|
+
}
|
|
696
|
+
}
|
|
697
|
+
}
|
|
698
|
+
async deprovision(workspace) {
|
|
699
|
+
const appName = `ar-${workspace.id.substring(0, 8)}`;
|
|
700
|
+
await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}`, {
|
|
701
|
+
method: 'DELETE',
|
|
702
|
+
headers: {
|
|
703
|
+
Authorization: `Bearer ${this.apiToken}`,
|
|
704
|
+
},
|
|
705
|
+
});
|
|
706
|
+
}
|
|
707
|
+
async getStatus(workspace) {
|
|
708
|
+
if (!workspace.computeId)
|
|
709
|
+
return 'error';
|
|
710
|
+
const appName = `ar-${workspace.id.substring(0, 8)}`;
|
|
711
|
+
const response = await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/machines/${workspace.computeId}`, {
|
|
712
|
+
headers: {
|
|
713
|
+
Authorization: `Bearer ${this.apiToken}`,
|
|
714
|
+
},
|
|
715
|
+
});
|
|
716
|
+
if (!response.ok)
|
|
717
|
+
return 'error';
|
|
718
|
+
const machine = await response.json();
|
|
719
|
+
switch (machine.state) {
|
|
720
|
+
case 'started':
|
|
721
|
+
return 'running';
|
|
722
|
+
case 'stopped':
|
|
723
|
+
return 'stopped';
|
|
724
|
+
case 'created':
|
|
725
|
+
case 'starting':
|
|
726
|
+
return 'provisioning';
|
|
727
|
+
default:
|
|
728
|
+
return 'error';
|
|
729
|
+
}
|
|
730
|
+
}
|
|
731
|
+
async restart(workspace) {
|
|
732
|
+
if (!workspace.computeId)
|
|
733
|
+
return;
|
|
734
|
+
const appName = `ar-${workspace.id.substring(0, 8)}`;
|
|
735
|
+
await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/machines/${workspace.computeId}/restart`, {
|
|
736
|
+
method: 'POST',
|
|
737
|
+
headers: {
|
|
738
|
+
Authorization: `Bearer ${this.apiToken}`,
|
|
739
|
+
},
|
|
740
|
+
});
|
|
741
|
+
}
|
|
742
|
+
/**
|
|
743
|
+
* Resize workspace - vertical scaling via Fly Machines API
|
|
744
|
+
* @param skipRestart - If true, config is saved but machine won't restart (changes apply on next start)
|
|
745
|
+
*/
|
|
746
|
+
async resize(workspaceOrId, tier, skipRestart = false) {
|
|
747
|
+
const workspaceId = typeof workspaceOrId === 'string' ? workspaceOrId : workspaceOrId.id;
|
|
748
|
+
const computeId = typeof workspaceOrId === 'string' ? undefined : workspaceOrId.computeId;
|
|
749
|
+
// If passed just an ID, look up the workspace
|
|
750
|
+
let machineId = computeId;
|
|
751
|
+
if (!machineId) {
|
|
752
|
+
const workspace = await db.workspaces.findById(workspaceId);
|
|
753
|
+
if (!workspace?.computeId)
|
|
754
|
+
return;
|
|
755
|
+
machineId = workspace.computeId;
|
|
756
|
+
}
|
|
757
|
+
const appName = `ar-${workspaceId.substring(0, 8)}`;
|
|
758
|
+
// Get current machine config first to merge with new specs
|
|
759
|
+
// This is critical - Fly.io replaces the entire config, so we must preserve
|
|
760
|
+
// existing settings (image, services, auto_stop, other env vars, etc.)
|
|
761
|
+
const getResponse = await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/machines/${machineId}`, {
|
|
762
|
+
headers: {
|
|
763
|
+
Authorization: `Bearer ${this.apiToken}`,
|
|
764
|
+
},
|
|
765
|
+
});
|
|
766
|
+
if (!getResponse.ok) {
|
|
767
|
+
throw new Error(`Failed to get machine config for resize: ${await getResponse.text()}`);
|
|
768
|
+
}
|
|
769
|
+
const machine = await getResponse.json();
|
|
770
|
+
// Merge new specs into existing config, preserving everything else
|
|
771
|
+
const updatedConfig = {
|
|
772
|
+
...machine.config,
|
|
773
|
+
guest: {
|
|
774
|
+
...(machine.config.guest || {}),
|
|
775
|
+
// Use tier-specific CPU type (shared for cost, performance for power)
|
|
776
|
+
cpu_kind: tier.cpuKind,
|
|
777
|
+
cpus: tier.cpuCores,
|
|
778
|
+
memory_mb: tier.memoryMb,
|
|
779
|
+
},
|
|
780
|
+
env: {
|
|
781
|
+
...(machine.config.env || {}),
|
|
782
|
+
MAX_AGENTS: String(tier.maxAgents),
|
|
783
|
+
},
|
|
784
|
+
};
|
|
785
|
+
// Update machine configuration
|
|
786
|
+
// If running: reboots with new specs (unless skip_launch: true)
|
|
787
|
+
// If stopped: config saved, applies on next start
|
|
788
|
+
const updateUrl = skipRestart
|
|
789
|
+
? `https://api.machines.dev/v1/apps/${appName}/machines/${machineId}?skip_launch=true`
|
|
790
|
+
: `https://api.machines.dev/v1/apps/${appName}/machines/${machineId}`;
|
|
791
|
+
const updateResponse = await fetchWithRetry(updateUrl, {
|
|
792
|
+
method: 'POST',
|
|
793
|
+
headers: {
|
|
794
|
+
Authorization: `Bearer ${this.apiToken}`,
|
|
795
|
+
'Content-Type': 'application/json',
|
|
796
|
+
},
|
|
797
|
+
body: JSON.stringify({
|
|
798
|
+
config: updatedConfig,
|
|
799
|
+
}),
|
|
800
|
+
});
|
|
801
|
+
if (!updateResponse.ok) {
|
|
802
|
+
throw new Error(`Failed to resize machine: ${await updateResponse.text()}`);
|
|
803
|
+
}
|
|
804
|
+
const restartNote = skipRestart ? ' (will apply on next restart)' : ' (restarting)';
|
|
805
|
+
console.log(`[fly] Resized workspace ${workspaceId.substring(0, 8)} to ${tier.name} (${tier.cpuCores} CPU, ${tier.memoryMb}MB RAM)${restartNote}`);
|
|
806
|
+
}
|
|
807
|
+
/**
|
|
808
|
+
* Update the max agent limit for a workspace
|
|
809
|
+
*/
|
|
810
|
+
async updateAgentLimit(workspace, newLimit) {
|
|
811
|
+
if (!workspace.computeId)
|
|
812
|
+
return;
|
|
813
|
+
const appName = `ar-${workspace.id.substring(0, 8)}`;
|
|
814
|
+
// Update environment variable
|
|
815
|
+
await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/machines/${workspace.computeId}`, {
|
|
816
|
+
method: 'POST',
|
|
817
|
+
headers: {
|
|
818
|
+
Authorization: `Bearer ${this.apiToken}`,
|
|
819
|
+
'Content-Type': 'application/json',
|
|
820
|
+
},
|
|
821
|
+
body: JSON.stringify({
|
|
822
|
+
config: {
|
|
823
|
+
env: {
|
|
824
|
+
MAX_AGENTS: String(newLimit),
|
|
825
|
+
},
|
|
826
|
+
},
|
|
827
|
+
}),
|
|
828
|
+
});
|
|
829
|
+
console.log(`[fly] Updated workspace ${workspace.id} agent limit to ${newLimit}`);
|
|
830
|
+
}
|
|
831
|
+
/**
|
|
832
|
+
* Get current resource tier for a workspace
|
|
833
|
+
*/
|
|
834
|
+
async getCurrentTier(workspace) {
|
|
835
|
+
if (!workspace.computeId) {
|
|
836
|
+
return RESOURCE_TIERS.small;
|
|
837
|
+
}
|
|
838
|
+
const appName = `ar-${workspace.id.substring(0, 8)}`;
|
|
839
|
+
const response = await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/machines/${workspace.computeId}`, {
|
|
840
|
+
headers: {
|
|
841
|
+
Authorization: `Bearer ${this.apiToken}`,
|
|
842
|
+
},
|
|
843
|
+
});
|
|
844
|
+
if (!response.ok) {
|
|
845
|
+
return RESOURCE_TIERS.small;
|
|
846
|
+
}
|
|
847
|
+
const machine = await response.json();
|
|
848
|
+
const _cpus = machine.config?.guest?.cpus || 1;
|
|
849
|
+
const memoryMb = machine.config?.guest?.memory_mb || 512;
|
|
850
|
+
// Map to nearest tier based on actual tier thresholds:
|
|
851
|
+
// small: 2048MB, medium: 4096MB, large: 8192MB, xlarge: 16384MB
|
|
852
|
+
if (memoryMb >= 16384)
|
|
853
|
+
return RESOURCE_TIERS.xlarge;
|
|
854
|
+
if (memoryMb >= 8192)
|
|
855
|
+
return RESOURCE_TIERS.large;
|
|
856
|
+
if (memoryMb >= 4096)
|
|
857
|
+
return RESOURCE_TIERS.medium;
|
|
858
|
+
return RESOURCE_TIERS.small;
|
|
859
|
+
}
|
|
860
|
+
/**
|
|
861
|
+
* Update machine image without restarting
|
|
862
|
+
* Note: The machine needs to be restarted later to use the new image
|
|
863
|
+
*/
|
|
864
|
+
async updateMachineImage(workspace, newImage) {
|
|
865
|
+
if (!workspace.computeId)
|
|
866
|
+
return;
|
|
867
|
+
const appName = `ar-${workspace.id.substring(0, 8)}`;
|
|
868
|
+
// Get current machine config first
|
|
869
|
+
const getResponse = await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/machines/${workspace.computeId}`, {
|
|
870
|
+
headers: {
|
|
871
|
+
Authorization: `Bearer ${this.apiToken}`,
|
|
872
|
+
},
|
|
873
|
+
});
|
|
874
|
+
if (!getResponse.ok) {
|
|
875
|
+
throw new Error(`Failed to get machine config: ${await getResponse.text()}`);
|
|
876
|
+
}
|
|
877
|
+
const machine = await getResponse.json();
|
|
878
|
+
// Update the image in the config
|
|
879
|
+
const updatedConfig = {
|
|
880
|
+
...machine.config,
|
|
881
|
+
image: newImage,
|
|
882
|
+
// Include registry auth if configured
|
|
883
|
+
...(this.registryAuth && {
|
|
884
|
+
image_registry_auth: {
|
|
885
|
+
registry: 'ghcr.io',
|
|
886
|
+
username: this.registryAuth.username,
|
|
887
|
+
password: this.registryAuth.password,
|
|
888
|
+
},
|
|
889
|
+
}),
|
|
890
|
+
};
|
|
891
|
+
// Update machine with new image config (skip_launch keeps it in current state)
|
|
892
|
+
const updateResponse = await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/machines/${workspace.computeId}?skip_launch=true`, {
|
|
893
|
+
method: 'POST',
|
|
894
|
+
headers: {
|
|
895
|
+
Authorization: `Bearer ${this.apiToken}`,
|
|
896
|
+
'Content-Type': 'application/json',
|
|
897
|
+
},
|
|
898
|
+
body: JSON.stringify({ config: updatedConfig }),
|
|
899
|
+
});
|
|
900
|
+
if (!updateResponse.ok) {
|
|
901
|
+
throw new Error(`Failed to update machine image: ${await updateResponse.text()}`);
|
|
902
|
+
}
|
|
903
|
+
console.log(`[fly] Updated machine image for workspace ${workspace.id.substring(0, 8)} to ${newImage}`);
|
|
904
|
+
}
|
|
905
|
+
/**
|
|
906
|
+
* Set secrets as environment variables for a workspace.
|
|
907
|
+
*/
|
|
908
|
+
async setSecrets(workspace, secrets) {
|
|
909
|
+
if (!workspace.computeId || Object.keys(secrets).length === 0)
|
|
910
|
+
return;
|
|
911
|
+
const appName = `ar-${workspace.id.substring(0, 8)}`;
|
|
912
|
+
await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/secrets`, {
|
|
913
|
+
method: 'POST',
|
|
914
|
+
headers: {
|
|
915
|
+
Authorization: `Bearer ${this.apiToken}`,
|
|
916
|
+
'Content-Type': 'application/json',
|
|
917
|
+
},
|
|
918
|
+
body: JSON.stringify(secrets),
|
|
919
|
+
});
|
|
920
|
+
}
    /**
     * Check if workspace has active agents by querying the daemon
     * Retries up to 3 times with backoff to handle machines that are starting up
     */
    async checkActiveAgents(workspace) {
        if (!workspace.publicUrl) {
            return { hasActiveAgents: false, agentCount: 0, agents: [], verified: true };
        }
        // Use internal Fly network URL if available (more reliable)
        const appName = `ar-${workspace.id.substring(0, 8)}`;
        const isOnFly = !!process.env.FLY_APP_NAME;
        const baseUrl = isOnFly
            ? `http://${appName}.internal:3888`
            : workspace.publicUrl;
        const maxRetries = 3;
        const retryDelays = [2000, 4000, 6000]; // 2s, 4s, 6s backoff
        for (let attempt = 0; attempt < maxRetries; attempt++) {
            try {
                const controller = new AbortController();
                const timer = setTimeout(() => controller.abort(), 10_000);
                // Use /api/data endpoint which returns { agents: [...], ... }
                // Note: /api/agents doesn't exist on the workspace dashboard-server
                const response = await fetch(`${baseUrl}/api/data`, {
                    method: 'GET',
                    headers: {
                        'Accept': 'application/json',
                    },
                    signal: controller.signal,
                });
                clearTimeout(timer);
                if (!response.ok) {
                    console.warn(`[fly] Failed to check agents for ${workspace.id.substring(0, 8)}: HTTP ${response.status} (attempt ${attempt + 1}/${maxRetries})`);
                    if (attempt < maxRetries - 1) {
                        await new Promise(resolve => setTimeout(resolve, retryDelays[attempt]));
                        continue;
                    }
                    return { hasActiveAgents: false, agentCount: 0, agents: [], verified: false };
                }
                const data = await response.json();
                const agents = data.agents || [];
                // Diagnostic logging: capture raw agent data before filtering
                if (agents.length > 0) {
                    console.log(`[fly] Workspace ${workspace.id.substring(0, 8)} raw agent data:`, agents.map(a => ({ name: a.name, status: a.status, activityState: a.activityState })));
                }
                // Treat any online agent as active unless explicitly disconnected/offline.
                const activeAgents = agents.filter(a => {
                    const status = (a.status ?? '').toLowerCase();
                    const activityState = (a.activityState ?? '').toLowerCase();
                    const isProcessing = a.isProcessing === true;
                    if (activityState === 'active' || activityState === 'idle')
                        return true;
                    if (status && status !== 'disconnected' && status !== 'offline')
                        return true;
                    if (isProcessing)
                        return true;
                    return false;
                });
                // Log filtering results for diagnostics
                if (agents.length > 0 && activeAgents.length !== agents.length) {
                    const filteredOut = agents.filter(a => {
                        const status = (a.status ?? '').toLowerCase();
                        const activityState = (a.activityState ?? '').toLowerCase();
                        const isProcessing = a.isProcessing === true;
                        if (activityState === 'active' || activityState === 'idle')
                            return false;
                        if (status && status !== 'disconnected' && status !== 'offline')
                            return false;
                        if (isProcessing)
                            return false;
                        return true;
                    });
                    console.log(`[fly] Workspace ${workspace.id.substring(0, 8)} filtered out agents:`, filteredOut.map(a => ({ name: a.name, status: a.status, activityState: a.activityState })));
                }
                return {
                    hasActiveAgents: activeAgents.length > 0,
                    agentCount: activeAgents.length,
                    agents: agents.map(a => ({ name: a.name, status: a.status || a.activityState || 'unknown' })),
                    verified: true,
                };
            }
            catch (error) {
                // Workspace might be stopped or unreachable - retry with backoff
                console.warn(`[fly] Could not reach workspace ${workspace.id.substring(0, 8)} (attempt ${attempt + 1}/${maxRetries}):`, error.message);
                if (attempt < maxRetries - 1) {
                    await new Promise(resolve => setTimeout(resolve, retryDelays[attempt]));
                    continue;
                }
            }
        }
        // All retries exhausted
        console.warn(`[fly] Workspace ${workspace.id.substring(0, 8)} unreachable after ${maxRetries} attempts`);
        return { hasActiveAgents: false, agentCount: 0, agents: [], verified: false };
    }
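    // Editorial sketch (not part of the published file): the activity rule used
    // twice inside checkActiveAgents above, extracted for readability. An agent
    // counts as active when activityState is 'active'/'idle', when it reports
    // any status other than 'disconnected'/'offline', or when isProcessing is set.
    //
    //   function isAgentActive(a) {
    //       const status = (a.status ?? '').toLowerCase();
    //       const activityState = (a.activityState ?? '').toLowerCase();
    //       if (activityState === 'active' || activityState === 'idle')
    //           return true;
    //       if (status && status !== 'disconnected' && status !== 'offline')
    //           return true;
    //       return a.isProcessing === true;
    //   }
    //
    // With it, the two filters above become agents.filter(isAgentActive) and
    // agents.filter(a => !isAgentActive(a)).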
    /**
     * Get the current machine state
     */
    async getMachineState(workspace) {
        if (!workspace.computeId)
            return 'unknown';
        const appName = `ar-${workspace.id.substring(0, 8)}`;
        try {
            const response = await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/machines/${workspace.computeId}`, {
                headers: {
                    Authorization: `Bearer ${this.apiToken}`,
                },
            });
            if (!response.ok)
                return 'unknown';
            const machine = await response.json();
            return machine.state;
        }
        catch {
            return 'unknown';
        }
    }
}
/**
 * Railway provisioner
 */
class RailwayProvisioner {
    apiToken;
    cloudApiUrl;
    sessionSecret;
    constructor() {
        const config = getConfig();
        if (!config.compute.railway) {
            throw new Error('Railway configuration missing');
        }
        this.apiToken = config.compute.railway.apiToken;
        this.cloudApiUrl = config.publicUrl;
        this.sessionSecret = config.sessionSecret;
    }
    generateWorkspaceToken(workspaceId) {
        return crypto
            .createHmac('sha256', this.sessionSecret)
            .update(`workspace:${workspaceId}`)
            .digest('hex');
    }
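    // Editorial sketch (not part of the published file): verifying a token
    // minted by generateWorkspaceToken above on the receiving side. Assumes the
    // verifier shares the same sessionSecret; crypto.timingSafeEqual keeps the
    // comparison from leaking timing information.
    //
    //   function verifyWorkspaceToken(workspaceId, token, sessionSecret) {
    //       const expected = crypto
    //           .createHmac('sha256', sessionSecret)
    //           .update(`workspace:${workspaceId}`)
    //           .digest('hex');
    //       const a = Buffer.from(expected, 'hex');
    //       const b = Buffer.from(token, 'hex');
    //       return a.length === b.length && crypto.timingSafeEqual(a, b);
    //   }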
    async provision(workspace, credentials) {
        // Create project
        const projectResponse = await fetchWithRetry('https://backboard.railway.app/graphql/v2', {
            method: 'POST',
            headers: {
                Authorization: `Bearer ${this.apiToken}`,
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({
                query: `
          mutation CreateProject($input: ProjectCreateInput!) {
            projectCreate(input: $input) {
              id
              name
            }
          }
        `,
                variables: {
                    input: {
                        name: `agent-relay-${workspace.id.substring(0, 8)}`,
                    },
                },
            }),
        });
        const projectData = await projectResponse.json();
        const projectId = projectData.data.projectCreate.id;
        // Deploy service
        const serviceResponse = await fetchWithRetry('https://backboard.railway.app/graphql/v2', {
            method: 'POST',
            headers: {
                Authorization: `Bearer ${this.apiToken}`,
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({
                query: `
          mutation CreateService($input: ServiceCreateInput!) {
            serviceCreate(input: $input) {
              id
            }
          }
        `,
                variables: {
                    input: {
                        projectId,
                        name: 'workspace',
                        source: {
                            image: WORKSPACE_IMAGE,
                        },
                    },
                },
            }),
        });
        const serviceData = await serviceResponse.json();
        const serviceId = serviceData.data.serviceCreate.id;
        // Create linked daemon for cloud message sync
        // This generates an API key and registers the daemon in the linkedDaemons table
        const { daemonId, apiKey: railwayApiKey } = await createLinkedDaemon(workspace.userId, workspace.id, serviceId);
        console.log(`[railway] Created linked daemon ${daemonId.substring(0, 8)} for workspace ${workspace.id.substring(0, 8)}`);
        // Set environment variables
        const envVars = {
            WORKSPACE_ID: workspace.id,
            WORKSPACE_OWNER_USER_ID: workspace.userId,
            SUPERVISOR_ENABLED: String(workspace.config.supervisorEnabled ?? false),
            MAX_AGENTS: String(workspace.config.maxAgents ?? 10),
            REPOSITORIES: (workspace.config.repositories ?? []).join(','),
            PROVIDERS: (workspace.config.providers ?? []).join(','),
            PORT: String(WORKSPACE_PORT),
            AGENT_RELAY_DASHBOARD_PORT: String(WORKSPACE_PORT),
            // Store repos on persistent volume so they survive container restarts
            WORKSPACE_DIR: '/data/repos',
            CLOUD_API_URL: this.cloudApiUrl,
            WORKSPACE_TOKEN: this.generateWorkspaceToken(workspace.id),
            // Daemon API key for cloud message sync
            // Auto-generated during provisioning, stored in linkedDaemons table
            AGENT_RELAY_API_KEY: railwayApiKey,
            // Enable cloud persistence for agent sessions/summaries via API
            RELAY_CLOUD_ENABLED: 'true',
        };
        for (const [provider, token] of credentials) {
            envVars[`${provider.toUpperCase()}_TOKEN`] = token;
            // Also set GH_TOKEN for gh CLI compatibility
            if (provider === 'github') {
                envVars['GH_TOKEN'] = token;
            }
        }
        await fetchWithRetry('https://backboard.railway.app/graphql/v2', {
            method: 'POST',
            headers: {
                Authorization: `Bearer ${this.apiToken}`,
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({
                query: `
          mutation SetVariables($input: VariableCollectionUpsertInput!) {
            variableCollectionUpsert(input: $input)
          }
        `,
                variables: {
                    input: {
                        projectId,
                        serviceId,
                        variables: envVars,
                    },
                },
            }),
        });
        // Generate domain
        const domainResponse = await fetchWithRetry('https://backboard.railway.app/graphql/v2', {
            method: 'POST',
            headers: {
                Authorization: `Bearer ${this.apiToken}`,
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({
                query: `
          mutation CreateDomain($input: ServiceDomainCreateInput!) {
            serviceDomainCreate(input: $input) {
              domain
            }
          }
        `,
                variables: {
                    input: {
                        serviceId,
                    },
                },
            }),
        });
        const domainData = await domainResponse.json();
        const domain = domainData.data.serviceDomainCreate.domain;
        await softHealthCheck(`https://${domain}`);
        return {
            computeId: projectId,
            publicUrl: `https://${domain}`,
        };
    }
    async deprovision(workspace) {
        if (!workspace.computeId)
            return;
        await fetchWithRetry('https://backboard.railway.app/graphql/v2', {
            method: 'POST',
            headers: {
                Authorization: `Bearer ${this.apiToken}`,
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({
                query: `
          mutation DeleteProject($id: String!) {
            projectDelete(id: $id)
          }
        `,
                variables: {
                    id: workspace.computeId,
                },
            }),
        });
    }
    async getStatus(workspace) {
        if (!workspace.computeId)
            return 'error';
        const response = await fetchWithRetry('https://backboard.railway.app/graphql/v2', {
            method: 'POST',
            headers: {
                Authorization: `Bearer ${this.apiToken}`,
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({
                query: `
          query GetProject($id: String!) {
            project(id: $id) {
              deployments {
                edges {
                  node {
                    status
                  }
                }
              }
            }
          }
        `,
                variables: {
                    id: workspace.computeId,
                },
            }),
        });
        const data = await response.json();
        const deployments = data.data?.project?.deployments?.edges;
        if (!deployments || deployments.length === 0)
            return 'provisioning';
        const latestStatus = deployments[0].node.status;
        switch (latestStatus) {
            case 'SUCCESS':
                return 'running';
            case 'BUILDING':
            case 'DEPLOYING':
                return 'provisioning';
            case 'CRASHED':
            case 'FAILED':
                return 'error';
            default:
                return 'stopped';
        }
    }
    async restart(workspace) {
        // Railway doesn't have a direct restart - redeploy instead
        if (!workspace.computeId)
            return;
        await fetchWithRetry('https://backboard.railway.app/graphql/v2', {
            method: 'POST',
            headers: {
                Authorization: `Bearer ${this.apiToken}`,
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({
                query: `
          mutation RedeployService($input: DeploymentTriggerInput!) {
            deploymentTrigger(input: $input)
          }
        `,
                variables: {
                    input: {
                        projectId: workspace.computeId,
                    },
                },
            }),
        });
    }
    async setEnvVars(workspace, envVars) {
        if (!workspace.computeId || Object.keys(envVars).length === 0)
            return;
        const linkedDaemons = await db.linkedDaemons.findByWorkspaceId(workspace.id);
        const serviceId = linkedDaemons[0]?.machineId;
        if (!serviceId) {
            console.warn(`[railway] No service ID found for workspace ${workspace.id}`);
            return;
        }
        await fetchWithRetry('https://backboard.railway.app/graphql/v2', {
            method: 'POST',
            headers: {
                Authorization: `Bearer ${this.apiToken}`,
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({
                query: `
          mutation SetVariables($input: VariableCollectionUpsertInput!) {
            variableCollectionUpsert(input: $input)
          }
        `,
                variables: {
                    input: {
                        projectId: workspace.computeId,
                        serviceId,
                        variables: envVars,
                    },
                },
            }),
        });
    }
}
/**
 * Local Docker provisioner (for development/self-hosted)
 */
class DockerProvisioner {
    cloudApiUrl;
    cloudApiUrlForContainer;
    sessionSecret;
    constructor() {
        const config = getConfig();
        this.cloudApiUrl = config.publicUrl;
        this.sessionSecret = config.sessionSecret;
        // For Docker containers, localhost won't work - they need to reach the host
        // Convert localhost URLs to host.docker.internal for container access
        if (this.cloudApiUrl.includes('localhost') || this.cloudApiUrl.includes('127.0.0.1')) {
            this.cloudApiUrlForContainer = this.cloudApiUrl
                .replace('localhost', 'host.docker.internal')
                .replace('127.0.0.1', 'host.docker.internal');
            console.log(`[docker] Container API URL: ${this.cloudApiUrlForContainer} (host: ${this.cloudApiUrl})`);
        }
        else {
            this.cloudApiUrlForContainer = this.cloudApiUrl;
        }
    }
    generateWorkspaceToken(workspaceId) {
        return crypto
            .createHmac('sha256', this.sessionSecret)
            .update(`workspace:${workspaceId}`)
            .digest('hex');
    }
    /**
     * Wait for container to be healthy by polling the health endpoint
     */
    async waitForHealthy(publicUrl, timeoutMs = 60_000) {
        const startTime = Date.now();
        const pollInterval = 2000;
        console.log(`[docker] Waiting for container to be healthy at ${publicUrl}...`);
        while (Date.now() - startTime < timeoutMs) {
            try {
                const response = await fetch(`${publicUrl}/health`, {
                    method: 'GET',
                    signal: AbortSignal.timeout(5000),
                });
                if (response.ok) {
                    console.log(`[docker] Container healthy after ${Date.now() - startTime}ms`);
                    return;
                }
            }
            catch {
                // Container not ready yet, continue polling
            }
            await wait(pollInterval);
        }
        throw new Error(`Container did not become healthy within ${timeoutMs}ms`);
    }
    async provision(workspace, credentials) {
        const containerName = `ar-${workspace.id.substring(0, 8)}`;
        // Create linked daemon for cloud message sync
        // This generates an API key and registers the daemon in the linkedDaemons table
        // Use container name as daemon's machine ID (will be updated to actual container ID after creation)
        const { daemonId, apiKey: dockerApiKey } = await createLinkedDaemon(workspace.userId, workspace.id, containerName);
        console.log(`[docker] Created linked daemon ${daemonId.substring(0, 8)} for workspace ${workspace.id.substring(0, 8)}`);
        // Build environment variables
        const envArgs = [
            `-e WORKSPACE_ID=${workspace.id}`,
            `-e WORKSPACE_OWNER_USER_ID=${workspace.userId}`,
            `-e SUPERVISOR_ENABLED=${workspace.config.supervisorEnabled ?? false}`,
            `-e MAX_AGENTS=${workspace.config.maxAgents ?? 10}`,
            `-e REPOSITORIES=${(workspace.config.repositories ?? []).join(',')}`,
            `-e PROVIDERS=${(workspace.config.providers ?? []).join(',')}`,
            `-e PORT=${WORKSPACE_PORT}`,
            `-e AGENT_RELAY_DASHBOARD_PORT=${WORKSPACE_PORT}`,
            // Store repos on persistent volume so they survive container restarts
            `-e WORKSPACE_DIR=/data/repos`,
            `-e CLOUD_API_URL=${this.cloudApiUrlForContainer}`,
            `-e WORKSPACE_TOKEN=${this.generateWorkspaceToken(workspace.id)}`,
            // Daemon API key for cloud message sync
            // Auto-generated during provisioning, stored in linkedDaemons table
            `-e AGENT_RELAY_API_KEY=${dockerApiKey}`,
            // Enable cloud persistence for agent sessions/summaries via API
            `-e RELAY_CLOUD_ENABLED=true`,
        ];
        for (const [provider, token] of credentials) {
            envArgs.push(`-e ${provider.toUpperCase()}_TOKEN=${token}`);
            // Also set GH_TOKEN for gh CLI compatibility
            if (provider === 'github') {
                envArgs.push(`-e GH_TOKEN=${token}`);
            }
        }
        // Run container
        const { execSync } = await import('child_process');
        const hostPort = 3000 + Math.floor(Math.random() * 1000);
        // SSH port for tunneling (Codex OAuth callback forwarding)
        // Derive from hostPort to avoid collisions: API port 3500 -> SSH port 22500
        const sshHostPort = 22000 + (hostPort - 3000);
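        // Worked example of the derivation above (editorial note, not part of the
        // published file): hostPort is drawn from [3000, 3999], so hostPort 3500
        // yields 22000 + (3500 - 3000) = 22500, and distinct API ports always map
        // to distinct SSH ports in [22000, 22999].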
        // When running in Docker, connect to the same network for container-to-container communication
        const runningInDocker = process.env.RUNNING_IN_DOCKER === 'true';
        const networkArg = runningInDocker ? '--network agent-relay-dev' : '';
        // In development, mount local dist and docs folders for faster iteration
        // Set WORKSPACE_DEV_MOUNT=true to enable
        const devMount = process.env.WORKSPACE_DEV_MOUNT === 'true';
        const volumeArgs = devMount
            ? `-v "${process.cwd()}/dist:/app/dist:ro" -v "${process.cwd()}/docs:/app/docs:ro"`
            : '';
        if (devMount) {
            console.log('[provisioner] Dev mode: mounting local dist/ and docs/ folders into workspace container');
        }
        try {
            // Map workspace API port and SSH port (for tunneling)
            // SSH is used by CLI to forward localhost:1455 to workspace container for Codex OAuth
            // Set CODEX_DIRECT_PORT=true to also map port 1455 directly (for debugging only)
            const directCodexPort = process.env.CODEX_DIRECT_PORT === 'true';
            const portMappings = directCodexPort
                ? `-p ${hostPort}:${WORKSPACE_PORT} -p ${sshHostPort}:${WORKSPACE_SSH_PORT} -p ${CODEX_OAUTH_PORT}:${CODEX_OAUTH_PORT}`
                : `-p ${hostPort}:${WORKSPACE_PORT} -p ${sshHostPort}:${WORKSPACE_SSH_PORT}`;
            // Enable SSH in the container for tunneling
            // Each workspace gets a unique password derived from its ID + secret salt
            envArgs.push('-e ENABLE_SSH=true');
            envArgs.push(`-e SSH_PASSWORD=${deriveSshPassword(workspace.id)}`);
            envArgs.push(`-e SSH_PORT=${WORKSPACE_SSH_PORT}`);
            execSync(`docker run -d --user root --name ${containerName} ${networkArg} ${volumeArgs} ${portMappings} ${envArgs.join(' ')} ${WORKSPACE_IMAGE}`, { stdio: 'pipe' });
            const publicUrl = `http://localhost:${hostPort}`;
            // Wait for container to be healthy before returning
            // When running in Docker, use the internal container name for health check
            const healthCheckUrl = runningInDocker
                ? `http://${containerName}:${WORKSPACE_PORT}`
                : publicUrl;
            await this.waitForHealthy(healthCheckUrl);
            return {
                computeId: containerName,
                publicUrl,
                sshPort: sshHostPort,
            };
        }
        catch (error) {
            // Clean up container if it was created but health check failed
            try {
                const { execSync: execSyncCleanup } = await import('child_process');
                execSyncCleanup(`docker rm -f ${containerName}`, { stdio: 'pipe' });
            }
            catch {
                // Ignore cleanup errors
            }
            throw new Error(`Failed to start Docker container: ${error}`);
        }
    }
    async deprovision(workspace) {
        if (!workspace.computeId)
            return;
        const { execSync } = await import('child_process');
        try {
            execSync(`docker rm -f ${workspace.computeId}`, { stdio: 'pipe' });
        }
        catch {
            // Container may already be removed
        }
    }
    async getStatus(workspace) {
        if (!workspace.computeId)
            return 'error';
        const { execSync } = await import('child_process');
        try {
            const result = execSync(`docker inspect -f '{{.State.Status}}' ${workspace.computeId}`, { stdio: 'pipe' }).toString().trim();
            switch (result) {
                case 'running':
                    return 'running';
                case 'exited':
                case 'dead':
                    return 'stopped';
                case 'created':
                case 'restarting':
                    return 'provisioning';
                default:
                    return 'error';
            }
        }
        catch {
            return 'error';
        }
    }
    async restart(workspace) {
        if (!workspace.computeId)
            return;
        const { execSync } = await import('child_process');
        try {
            execSync(`docker restart ${workspace.computeId}`, { stdio: 'pipe' });
        }
        catch (error) {
            throw new Error(`Failed to restart container: ${error}`);
        }
    }
    async setEnvVars(_workspace, _envVars) {
        console.warn('[docker] Updating environment variables for running containers is not supported.');
    }
}
/**
 * Main Workspace Provisioner
 */
export class WorkspaceProvisioner {
    provisioner;
    constructor() {
        const config = getConfig();
        switch (config.compute.provider) {
            case 'fly':
                this.provisioner = new FlyProvisioner();
                break;
            case 'railway':
                this.provisioner = new RailwayProvisioner();
                break;
            case 'docker':
            default:
                this.provisioner = new DockerProvisioner();
        }
    }
    /**
     * Provision a new workspace (one-click)
     * Returns immediately with 'provisioning' status and runs actual provisioning in background
     */
    async provision(config) {
        // Create workspace record
        const workspace = await db.workspaces.create({
            userId: config.userId,
            name: config.name,
            computeProvider: getConfig().compute.provider,
            config: {
                providers: config.providers,
                repositories: config.repositories,
                supervisorEnabled: config.supervisorEnabled ?? true,
                maxAgents: config.maxAgents ?? 10,
            },
        });
        // Add creator as owner in workspace_members for team collaboration support
        await db.workspaceMembers.addMember({
            workspaceId: workspace.id,
            userId: config.userId,
            role: 'owner',
            invitedBy: config.userId, // Self-invited as creator
        });
        // Auto-accept the creator's membership
        await db.workspaceMembers.acceptInvite(workspace.id, config.userId);
        // Link repositories to this workspace
        // This enables auto-access for users with GitHub access to these repos
        for (const repoFullName of config.repositories) {
            try {
                // Find the user's repo record (may not exist if user didn't import it first)
                const userRepos = await db.repositories.findByUserId(config.userId);
                const repoRecord = userRepos.find(r => r.githubFullName.toLowerCase() === repoFullName.toLowerCase());
                if (repoRecord) {
                    await db.repositories.assignToWorkspace(repoRecord.id, workspace.id);
                    console.log(`[provisioner] Linked repo ${repoFullName} to workspace ${workspace.id.substring(0, 8)}`);
                }
                else {
                    // Create a placeholder repo record if it doesn't exist
                    // This ensures the repo is tracked for workspace access checks
                    console.log(`[provisioner] Creating repo record for ${repoFullName}`);
                    const newRepo = await db.repositories.upsert({
                        userId: config.userId,
                        githubFullName: repoFullName,
                        githubId: 0, // Will be updated when actually synced
                        defaultBranch: 'main',
                        isPrivate: true, // Assume private, will be updated
                        workspaceId: workspace.id,
                    });
                    console.log(`[provisioner] Created and linked repo ${repoFullName} (id: ${newRepo.id.substring(0, 8)})`);
                }
            }
            catch (err) {
                console.warn(`[provisioner] Failed to link repo ${repoFullName}:`, err);
                // Continue with other repos
            }
        }
        // Initialize stage tracking immediately
        updateProvisioningStage(workspace.id, 'creating');
        // Run provisioning in the background so frontend can poll for stages
        this.runProvisioningAsync(workspace, config).catch((error) => {
            console.error(`[provisioner] Background provisioning failed for ${workspace.id}:`, error);
        });
        // Return immediately with 'provisioning' status
        return {
            workspaceId: workspace.id,
            status: 'provisioning',
        };
    }
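    // Editorial caller-side sketch (not part of the published file): because
    // provision() returns before any compute exists, a client is expected to
    // poll getStatus. Names here are hypothetical.
    //
    //   const { workspaceId } = await provisioner.provision(request);
    //   let status = 'provisioning';
    //   while (status === 'provisioning') {
    //       await new Promise(r => setTimeout(r, 2000));
    //       status = await provisioner.getStatus(workspaceId);
    //   }
    //   // status is now 'running', 'stopped', or 'error'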
    /**
     * Run the actual provisioning work asynchronously
     */
    async runProvisioningAsync(workspace, config) {
        // Build credentials map for workspace provisioning
        // Note: Provider tokens (Claude, Codex, etc.) are no longer stored centrally.
        // CLI tools authenticate directly on workspace instances.
        // Only GitHub App tokens are obtained from Nango for repository cloning.
        const credentials = new Map();
        // GitHub token is required for cloning repositories
        // Use direct token if provided (for testing), otherwise get from Nango
        if (config.repositories.length > 0) {
            if (config.githubToken) {
                // Direct token provided (for testing)
                credentials.set('github', config.githubToken);
                console.log('[provisioner] Using provided GitHub token');
            }
            else {
                // Get fresh installation token from Nango GitHub App
                const githubToken = await getGithubAppTokenForUser(config.userId);
                if (githubToken) {
                    credentials.set('github', githubToken);
                }
                else {
                    console.warn(`[provisioner] No GitHub App token for user ${config.userId}; repository cloning may fail.`);
                }
            }
        }
        // Provision compute
        try {
            const { computeId, publicUrl } = await this.provisioner.provision(workspace, credentials);
            await db.workspaces.updateStatus(workspace.id, 'running', {
                computeId,
                publicUrl,
            });
            // Schedule cleanup of provisioning progress after 30s (gives frontend time to see 'complete')
            setTimeout(() => {
                clearProvisioningProgress(workspace.id);
                console.log(`[provisioner] Cleaned up provisioning progress for ${workspace.id.substring(0, 8)}`);
            }, 30_000);
            console.log(`[provisioner] Workspace ${workspace.id} provisioned successfully at ${publicUrl}`);
        }
        catch (error) {
            const errorMessage = error instanceof Error ? error.message : 'Unknown error';
            await db.workspaces.updateStatus(workspace.id, 'error', {
                errorMessage,
            });
            // Clear provisioning progress on error
            clearProvisioningProgress(workspace.id);
            console.error(`[provisioner] Workspace ${workspace.id} provisioning failed:`, errorMessage);
        }
    }
    /**
     * Deprovision a workspace
     */
    async deprovision(workspaceId) {
        const workspace = await db.workspaces.findById(workspaceId);
        if (!workspace) {
            throw new Error('Workspace not found');
        }
        await this.provisioner.deprovision(workspace);
        await db.workspaces.delete(workspaceId);
    }
    /**
     * Get workspace status
     */
    async getStatus(workspaceId) {
        const workspace = await db.workspaces.findById(workspaceId);
        if (!workspace) {
            throw new Error('Workspace not found');
        }
        // During early provisioning, computeId isn't set yet
        // Return the database status instead of querying the provider
        if (!workspace.computeId && workspace.status === 'provisioning') {
            return 'provisioning';
        }
        const status = await this.provisioner.getStatus(workspace);
        // Update database if status changed
        if (status !== workspace.status) {
            await db.workspaces.updateStatus(workspaceId, status);
        }
        return status;
    }
    /**
     * Restart a workspace
     */
    async restart(workspaceId) {
        const workspace = await db.workspaces.findById(workspaceId);
        if (!workspace) {
            throw new Error('Workspace not found');
        }
        await this.provisioner.restart(workspace);
    }
    /**
     * Update environment variables for a workspace instance.
     */
    async setWorkspaceEnvVars(workspace, envVars) {
        if (Object.keys(envVars).length === 0)
            return;
        if (this.provisioner instanceof FlyProvisioner) {
            await this.provisioner.setSecrets(workspace, envVars);
            return;
        }
        if (this.provisioner instanceof RailwayProvisioner) {
            await this.provisioner.setEnvVars(workspace, envVars);
            return;
        }
        if (this.provisioner instanceof DockerProvisioner) {
            await this.provisioner.setEnvVars(workspace, envVars);
            return;
        }
    }
    /**
     * Stop a workspace
     */
    async stop(workspaceId) {
        const workspace = await db.workspaces.findById(workspaceId);
        if (!workspace) {
            throw new Error('Workspace not found');
        }
        // For now, just deprovision to stop
        await this.provisioner.deprovision(workspace);
        await db.workspaces.updateStatus(workspaceId, 'stopped');
    }
    /**
     * Resize a workspace (vertical scaling)
     * @param skipRestart - If true, config is saved but machine won't restart (changes apply on next start)
     */
    async resize(workspaceId, tier, skipRestart = false) {
        const workspace = await db.workspaces.findById(workspaceId);
        if (!workspace) {
            throw new Error('Workspace not found');
        }
        if (!this.provisioner.resize) {
            throw new Error('Resize not supported by current compute provider');
        }
        await this.provisioner.resize(workspace, tier, skipRestart);
        // Update workspace config with new limits
        await db.workspaces.updateConfig(workspaceId, {
            ...workspace.config,
            maxAgents: tier.maxAgents,
            resourceTier: tier.name,
        });
    }
    /**
     * Update the max agent limit for a workspace
     */
    async updateAgentLimit(workspaceId, newLimit) {
        const workspace = await db.workspaces.findById(workspaceId);
        if (!workspace) {
            throw new Error('Workspace not found');
        }
        if (this.provisioner.updateAgentLimit) {
            await this.provisioner.updateAgentLimit(workspace, newLimit);
        }
        // Update workspace config
        await db.workspaces.updateConfig(workspaceId, {
            ...workspace.config,
            maxAgents: newLimit,
        });
    }
    /**
     * Get current resource tier for a workspace
     */
    async getCurrentTier(workspaceId) {
        const workspace = await db.workspaces.findById(workspaceId);
        if (!workspace) {
            throw new Error('Workspace not found');
        }
        if (this.provisioner.getCurrentTier) {
            return this.provisioner.getCurrentTier(workspace);
        }
        // Fallback: determine from config or default to small
        const tierName = workspace.config.resourceTier || 'small';
        return RESOURCE_TIERS[tierName] || RESOURCE_TIERS.small;
    }
    /**
     * Get recommended tier based on agent count
     * Uses 1.5-2GB per agent as baseline for Claude Code
     */
    getRecommendedTier(agentCount) {
        // Find the smallest tier that supports this agent count
        const tiers = Object.values(RESOURCE_TIERS).sort((a, b) => a.maxAgents - b.maxAgents);
        for (const tier of tiers) {
            if (tier.maxAgents >= agentCount) {
                return tier;
            }
        }
        // If agent count exceeds all tiers, return the largest
        return RESOURCE_TIERS.xlarge;
    }
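    // Editorial note (assumption, not part of the published file):
    // getRecommendedTier only relies on RESOURCE_TIERS entries exposing
    // { name, memoryMb, maxAgents }. Under that assumption, with hypothetical
    // tiers sized around the ~2GB-per-agent baseline noted above:
    //
    //   small:  { name: 'small',  memoryMb: 2048, maxAgents: 1 }
    //   medium: { name: 'medium', memoryMb: 4096, maxAgents: 2 }
    //   large:  { name: 'large',  memoryMb: 8192, maxAgents: 4 }
    //
    // getRecommendedTier(3) sorts by maxAgents and returns the first tier with
    // maxAgents >= 3, i.e. 'large'.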
    /**
     * Auto-scale workspace based on current agent count
     * Respects plan limits - free tier cannot scale, others have max tier limits
     * Returns { scaled: boolean, reason?: string }
     */
    async autoScale(workspaceId, currentAgentCount) {
        const workspace = await db.workspaces.findById(workspaceId);
        if (!workspace) {
            throw new Error('Workspace not found');
        }
        // Get user's plan
        const user = await db.users.findById(workspace.userId);
        const plan = user?.plan || 'free';
        // Check if plan allows auto-scaling
        if (!canAutoScale(plan)) {
            return {
                scaled: false,
                reason: 'Auto-scaling requires Pro plan or higher',
            };
        }
        const currentTier = await this.getCurrentTier(workspaceId);
        const recommendedTier = this.getRecommendedTier(currentAgentCount);
        // Only scale UP, never down (to avoid disruption)
        if (recommendedTier.memoryMb <= currentTier.memoryMb) {
            return {
                scaled: false,
                currentTier: currentTier.name,
            };
        }
        // Check if plan allows scaling to the recommended tier
        if (!canScaleToTier(plan, recommendedTier.name)) {
            // Find the max tier allowed for this plan
            const maxTierName = getResourceTierForPlan(plan);
            const maxTier = RESOURCE_TIERS[maxTierName];
            if (maxTier.memoryMb <= currentTier.memoryMb) {
                return {
                    scaled: false,
                    reason: `Already at max tier (${currentTier.name}) for ${plan} plan`,
                    currentTier: currentTier.name,
                };
            }
            // Scale to max allowed tier instead
            console.log(`[provisioner] Auto-scaling workspace ${workspaceId.substring(0, 8)} from ${currentTier.name} to ${maxTierName} (max for ${plan} plan)`);
            await this.resize(workspaceId, maxTier);
            return {
                scaled: true,
                currentTier: currentTier.name,
                targetTier: maxTierName,
                reason: `Scaled to max tier for ${plan} plan`,
            };
        }
        console.log(`[provisioner] Auto-scaling workspace ${workspaceId.substring(0, 8)} from ${currentTier.name} to ${recommendedTier.name} (${currentAgentCount} agents)`);
        await this.resize(workspaceId, recommendedTier);
        return {
            scaled: true,
            currentTier: currentTier.name,
            targetTier: recommendedTier.name,
        };
    }
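    // Illustrative usage of autoScale above (editorial sketch, not part of the
    // published file): a monitoring hook could feed it the live agent count.
    //
    //   const { agentCount } = await flyProvisioner.checkActiveAgents(workspace);
    //   const outcome = await provisioner.autoScale(workspace.id, agentCount);
    //   if (outcome.scaled) {
    //       console.log(`scaled ${outcome.currentTier} -> ${outcome.targetTier}`);
    //   }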
    // ============================================================================
    // Snapshot Management
    // ============================================================================
    /**
     * Create an on-demand snapshot of a workspace's volume
     * Use before risky operations (e.g., major refactors, untrusted code execution)
     */
    async createSnapshot(workspaceId) {
        const workspace = await db.workspaces.findById(workspaceId);
        if (!workspace) {
            throw new Error('Workspace not found');
        }
        // Only Fly.io provisioner supports snapshots
        if (!(this.provisioner instanceof FlyProvisioner)) {
            console.warn('[provisioner] Snapshots only supported on Fly.io');
            return null;
        }
        const appName = `ar-${workspace.id.substring(0, 8)}`;
        const flyProvisioner = this.provisioner;
        // Get the volume
        const volume = await flyProvisioner.getVolume(appName);
        if (!volume) {
            throw new Error('No volume found for workspace');
        }
        // Create snapshot
        const snapshot = await flyProvisioner.createSnapshot(appName, volume.id);
        return { snapshotId: snapshot.id };
    }
    /**
     * List available snapshots for a workspace
     * Includes both automatic daily snapshots and on-demand snapshots
     */
    async listSnapshots(workspaceId) {
        const workspace = await db.workspaces.findById(workspaceId);
        if (!workspace) {
            throw new Error('Workspace not found');
        }
        // Only Fly.io provisioner supports snapshots
        if (!(this.provisioner instanceof FlyProvisioner)) {
            return [];
        }
        const appName = `ar-${workspace.id.substring(0, 8)}`;
        const flyProvisioner = this.provisioner;
        // Get the volume
        const volume = await flyProvisioner.getVolume(appName);
        if (!volume) {
            return [];
        }
        // List snapshots
        const snapshots = await flyProvisioner.listSnapshots(appName, volume.id);
        return snapshots.map(s => ({
            id: s.id,
            createdAt: s.created_at,
            sizeBytes: s.size,
        }));
    }
    /**
     * Get the volume ID for a workspace (needed for restore operations)
     */
    async getVolumeId(workspaceId) {
        const workspace = await db.workspaces.findById(workspaceId);
        if (!workspace) {
            throw new Error('Workspace not found');
        }
        if (!(this.provisioner instanceof FlyProvisioner)) {
            return null;
        }
        const appName = `ar-${workspace.id.substring(0, 8)}`;
        const flyProvisioner = this.provisioner;
        const volume = await flyProvisioner.getVolume(appName);
        return volume?.id || null;
    }
    // ============================================================================
    // Graceful Image Update
    // ============================================================================
    /**
     * Result of a graceful update attempt
     */
    static UpdateResult = {
        UPDATED: 'updated',
        UPDATED_PENDING_RESTART: 'updated_pending_restart',
        SKIPPED_ACTIVE_AGENTS: 'skipped_active_agents',
        SKIPPED_VERIFICATION_FAILED: 'skipped_verification_failed',
        SKIPPED_NOT_RUNNING: 'skipped_not_running',
        NOT_SUPPORTED: 'not_supported',
        ERROR: 'error',
    };
    /**
     * Gracefully update a single workspace's image
     *
     * Behavior:
     * - If workspace is stopped: Update config, will use new image on next wake
     * - If workspace is running with no agents: Update config and restart
     * - If workspace is running with active agents: Skip (or force if specified)
     *
     * @param workspaceId - Workspace to update
     * @param newImage - New Docker image to use
     * @param options - Update options
     * @returns Update result with details
     */
    async gracefulUpdateImage(workspaceId, newImage, options = {}) {
        const workspace = await db.workspaces.findById(workspaceId);
        if (!workspace) {
            return {
                result: WorkspaceProvisioner.UpdateResult.ERROR,
                workspaceId,
                error: 'Workspace not found',
            };
        }
        // Only Fly.io supports graceful updates
        if (!(this.provisioner instanceof FlyProvisioner)) {
            return {
                result: WorkspaceProvisioner.UpdateResult.NOT_SUPPORTED,
                workspaceId,
                error: 'Graceful updates only supported on Fly.io',
            };
        }
        const flyProvisioner = this.provisioner;
        try {
            // Check machine state
            const machineState = await flyProvisioner.getMachineState(workspace);
            if (machineState === 'stopped' || machineState === 'suspended') {
                // Machine is not running - safe to update, will apply on next wake
                await flyProvisioner.updateMachineImage(workspace, newImage);
                console.log(`[provisioner] Updated stopped workspace ${workspaceId.substring(0, 8)} to ${newImage}`);
                return {
                    result: WorkspaceProvisioner.UpdateResult.UPDATED_PENDING_RESTART,
                    workspaceId,
                    machineState,
                };
            }
            if (machineState === 'started') {
                // Machine is running - check for active agents
                const agentCheck = await flyProvisioner.checkActiveAgents(workspace);
                // If we couldn't verify agent status and not forcing, skip to be safe
                // This is expected behavior for workspaces that are waking up from auto-stop
                // or experiencing temporary network issues - not an error condition
                if (!agentCheck.verified && !options.force) {
                    console.log(`[provisioner] Skipped workspace ${workspaceId.substring(0, 8)}: workspace unreachable (will update on next restart)`);
                    return {
                        result: WorkspaceProvisioner.UpdateResult.SKIPPED_VERIFICATION_FAILED,
                        workspaceId,
                        machineState,
                        // Use 'reason' instead of 'error' - this is expected behavior, not an error
                        reason: 'Workspace unreachable - will update on next restart or when accessible',
                    };
                }
                if (agentCheck.hasActiveAgents && !options.force) {
                    // Has active agents and not forcing - skip
                    console.log(`[provisioner] Skipped workspace ${workspaceId.substring(0, 8)}: ${agentCheck.agentCount} active agents`);
                    return {
                        result: WorkspaceProvisioner.UpdateResult.SKIPPED_ACTIVE_AGENTS,
                        workspaceId,
                        machineState,
                        agentCount: agentCheck.agentCount,
                        agents: agentCheck.agents,
                    };
                }
                // Update the image config
                await flyProvisioner.updateMachineImage(workspace, newImage);
                if (options.skipRestart) {
                    // Config updated but not restarting - will apply on next restart/auto-stop-wake
                    console.log(`[provisioner] Updated workspace ${workspaceId.substring(0, 8)} config (restart skipped)`);
                    return {
                        result: WorkspaceProvisioner.UpdateResult.UPDATED_PENDING_RESTART,
                        workspaceId,
                        machineState,
                        agentCount: agentCheck.agentCount,
                        agents: agentCheck.agents,
                    };
                }
                // Restart to apply new image
                await flyProvisioner.restart(workspace);
                console.log(`[provisioner] Updated and restarted workspace ${workspaceId.substring(0, 8)}`);
                return {
                    result: WorkspaceProvisioner.UpdateResult.UPDATED,
                    workspaceId,
                    machineState,
                    agentCount: agentCheck.agentCount,
                };
            }
            // Unknown state
            return {
                result: WorkspaceProvisioner.UpdateResult.SKIPPED_NOT_RUNNING,
                workspaceId,
                machineState,
            };
        }
        catch (error) {
            console.error(`[provisioner] Error updating workspace ${workspaceId.substring(0, 8)}:`, error);
            return {
                result: WorkspaceProvisioner.UpdateResult.ERROR,
                workspaceId,
                error: error.message,
            };
        }
    }
    /**
     * Gracefully update all workspaces to a new image
     *
     * Processes workspaces in batches, respecting active agents unless forced.
     * Returns detailed results for each workspace.
     *
     * @param newImage - New Docker image to use
     * @param options - Update options
     * @returns Summary and per-workspace results
     */
    async gracefulUpdateAllImages(newImage, options = {}) {
        // Get all workspaces to update
        let workspaces;
        if (options.workspaceIds?.length) {
            // Specific workspaces
            workspaces = (await Promise.all(options.workspaceIds.map(id => db.workspaces.findById(id)))).filter((w) => w !== null);
        }
        else if (options.userIds?.length) {
            // Workspaces for specific users
            const allWorkspaces = await Promise.all(options.userIds.map(userId => db.workspaces.findByUserId(userId)));
            workspaces = allWorkspaces.flat();
        }
        else {
            // All workspaces - need to query by status to get running ones
            // For now, we'll get all workspaces from the provisioning provider
            workspaces = await db.workspaces.findAll();
        }
        // Filter to only Fly.io workspaces
        workspaces = workspaces.filter(w => w.computeProvider === 'fly' && w.computeId);
        console.log(`[provisioner] Starting graceful update of ${workspaces.length} workspaces to ${newImage}`);
        const batchSize = options.batchSize ?? 5;
        const results = [];
        // Process in batches
        for (let i = 0; i < workspaces.length; i += batchSize) {
            const batch = workspaces.slice(i, i + batchSize);
            const batchResults = await Promise.all(batch.map(workspace => this.gracefulUpdateImage(workspace.id, newImage, {
                force: options.force,
                skipRestart: options.skipRestart,
            })));
            results.push(...batchResults);
            // Small delay between batches to avoid overwhelming Fly API
            if (i + batchSize < workspaces.length) {
                await wait(1000);
            }
        }
        // Compute summary
        const summary = {
            total: results.length,
            updated: results.filter(r => r.result === WorkspaceProvisioner.UpdateResult.UPDATED).length,
            pendingRestart: results.filter(r => r.result === WorkspaceProvisioner.UpdateResult.UPDATED_PENDING_RESTART).length,
            skippedActiveAgents: results.filter(r => r.result === WorkspaceProvisioner.UpdateResult.SKIPPED_ACTIVE_AGENTS).length,
            skippedVerificationFailed: results.filter(r => r.result === WorkspaceProvisioner.UpdateResult.SKIPPED_VERIFICATION_FAILED).length,
            skippedNotRunning: results.filter(r => r.result === WorkspaceProvisioner.UpdateResult.SKIPPED_NOT_RUNNING).length,
            errors: results.filter(r => r.result === WorkspaceProvisioner.UpdateResult.ERROR).length,
        };
        console.log(`[provisioner] Graceful update complete:`, summary);
        return { summary, results };
    }
}
// Singleton instance
let _provisioner = null;
export function getProvisioner() {
    if (!_provisioner) {
        _provisioner = new WorkspaceProvisioner();
    }
    return _provisioner;
}
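// Illustrative usage (editorial sketch, not part of the published file): the
// singleton accessor above is the entry point, e.g. from an admin route. The
// option names match gracefulUpdateAllImages; the image tag is hypothetical.
//
//   const provisioner = getProvisioner();
//   const { summary } = await provisioner.gracefulUpdateAllImages(
//       'registry.fly.io/agent-relay-workspace:v2',
//       { batchSize: 5, skipRestart: true });
//   console.log(summary);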
//# sourceMappingURL=index.js.map