@agent-relay/cloud 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (269)
  1. package/dist/api/admin.d.ts +8 -0
  2. package/dist/api/admin.d.ts.map +1 -0
  3. package/dist/api/admin.js +225 -0
  4. package/dist/api/admin.js.map +1 -0
  5. package/dist/api/auth.d.ts +20 -0
  6. package/dist/api/auth.d.ts.map +1 -0
  7. package/dist/api/auth.js +136 -0
  8. package/dist/api/auth.js.map +1 -0
  9. package/dist/api/billing.d.ts +7 -0
  10. package/dist/api/billing.d.ts.map +1 -0
  11. package/dist/api/billing.js +564 -0
  12. package/dist/api/billing.js.map +1 -0
  13. package/dist/api/cli-pty-runner.d.ts +53 -0
  14. package/dist/api/cli-pty-runner.d.ts.map +1 -0
  15. package/dist/api/cli-pty-runner.js +193 -0
  16. package/dist/api/cli-pty-runner.js.map +1 -0
  17. package/dist/api/codex-auth-helper.d.ts +21 -0
  18. package/dist/api/codex-auth-helper.d.ts.map +1 -0
  19. package/dist/api/codex-auth-helper.js +327 -0
  20. package/dist/api/codex-auth-helper.js.map +1 -0
  21. package/dist/api/consensus.d.ts +13 -0
  22. package/dist/api/consensus.d.ts.map +1 -0
  23. package/dist/api/consensus.js +261 -0
  24. package/dist/api/consensus.js.map +1 -0
  25. package/dist/api/coordinators.d.ts +8 -0
  26. package/dist/api/coordinators.d.ts.map +1 -0
  27. package/dist/api/coordinators.js +750 -0
  28. package/dist/api/coordinators.js.map +1 -0
  29. package/dist/api/daemons.d.ts +12 -0
  30. package/dist/api/daemons.d.ts.map +1 -0
  31. package/dist/api/daemons.js +535 -0
  32. package/dist/api/daemons.js.map +1 -0
  33. package/dist/api/generic-webhooks.d.ts +8 -0
  34. package/dist/api/generic-webhooks.d.ts.map +1 -0
  35. package/dist/api/generic-webhooks.js +129 -0
  36. package/dist/api/generic-webhooks.js.map +1 -0
  37. package/dist/api/git.d.ts +8 -0
  38. package/dist/api/git.d.ts.map +1 -0
  39. package/dist/api/git.js +269 -0
  40. package/dist/api/git.js.map +1 -0
  41. package/dist/api/github-app.d.ts +11 -0
  42. package/dist/api/github-app.d.ts.map +1 -0
  43. package/dist/api/github-app.js +223 -0
  44. package/dist/api/github-app.js.map +1 -0
  45. package/dist/api/middleware/planLimits.d.ts +43 -0
  46. package/dist/api/middleware/planLimits.d.ts.map +1 -0
  47. package/dist/api/middleware/planLimits.js +202 -0
  48. package/dist/api/middleware/planLimits.js.map +1 -0
  49. package/dist/api/monitoring.d.ts +11 -0
  50. package/dist/api/monitoring.d.ts.map +1 -0
  51. package/dist/api/monitoring.js +578 -0
  52. package/dist/api/monitoring.js.map +1 -0
  53. package/dist/api/nango-auth.d.ts +9 -0
  54. package/dist/api/nango-auth.d.ts.map +1 -0
  55. package/dist/api/nango-auth.js +674 -0
  56. package/dist/api/nango-auth.js.map +1 -0
  57. package/dist/api/onboarding.d.ts +15 -0
  58. package/dist/api/onboarding.d.ts.map +1 -0
  59. package/dist/api/onboarding.js +679 -0
  60. package/dist/api/onboarding.js.map +1 -0
  61. package/dist/api/policy.d.ts +8 -0
  62. package/dist/api/policy.d.ts.map +1 -0
  63. package/dist/api/policy.js +229 -0
  64. package/dist/api/policy.js.map +1 -0
  65. package/dist/api/provider-env.d.ts +14 -0
  66. package/dist/api/provider-env.d.ts.map +1 -0
  67. package/dist/api/provider-env.js +75 -0
  68. package/dist/api/provider-env.js.map +1 -0
  69. package/dist/api/providers.d.ts +7 -0
  70. package/dist/api/providers.d.ts.map +1 -0
  71. package/dist/api/providers.js +564 -0
  72. package/dist/api/providers.js.map +1 -0
  73. package/dist/api/repos.d.ts +8 -0
  74. package/dist/api/repos.d.ts.map +1 -0
  75. package/dist/api/repos.js +577 -0
  76. package/dist/api/repos.js.map +1 -0
  77. package/dist/api/sessions.d.ts +11 -0
  78. package/dist/api/sessions.d.ts.map +1 -0
  79. package/dist/api/sessions.js +302 -0
  80. package/dist/api/sessions.js.map +1 -0
  81. package/dist/api/teams.d.ts +7 -0
  82. package/dist/api/teams.d.ts.map +1 -0
  83. package/dist/api/teams.js +281 -0
  84. package/dist/api/teams.js.map +1 -0
  85. package/dist/api/test-helpers.d.ts +10 -0
  86. package/dist/api/test-helpers.d.ts.map +1 -0
  87. package/dist/api/test-helpers.js +745 -0
  88. package/dist/api/test-helpers.js.map +1 -0
  89. package/dist/api/usage.d.ts +7 -0
  90. package/dist/api/usage.d.ts.map +1 -0
  91. package/dist/api/usage.js +111 -0
  92. package/dist/api/usage.js.map +1 -0
  93. package/dist/api/webhooks.d.ts +8 -0
  94. package/dist/api/webhooks.d.ts.map +1 -0
  95. package/dist/api/webhooks.js +645 -0
  96. package/dist/api/webhooks.js.map +1 -0
  97. package/dist/api/workspaces.d.ts +25 -0
  98. package/dist/api/workspaces.d.ts.map +1 -0
  99. package/dist/api/workspaces.js +1799 -0
  100. package/dist/api/workspaces.js.map +1 -0
  101. package/dist/billing/index.d.ts +9 -0
  102. package/dist/billing/index.d.ts.map +1 -0
  103. package/dist/billing/index.js +9 -0
  104. package/dist/billing/index.js.map +1 -0
  105. package/dist/billing/plans.d.ts +39 -0
  106. package/dist/billing/plans.d.ts.map +1 -0
  107. package/dist/billing/plans.js +245 -0
  108. package/dist/billing/plans.js.map +1 -0
  109. package/dist/billing/service.d.ts +80 -0
  110. package/dist/billing/service.d.ts.map +1 -0
  111. package/dist/billing/service.js +388 -0
  112. package/dist/billing/service.js.map +1 -0
  113. package/dist/billing/types.d.ts +141 -0
  114. package/dist/billing/types.d.ts.map +1 -0
  115. package/dist/billing/types.js +7 -0
  116. package/dist/billing/types.js.map +1 -0
  117. package/dist/config.d.ts +5 -0
  118. package/dist/config.d.ts.map +1 -0
  119. package/dist/config.js +5 -0
  120. package/dist/config.js.map +1 -0
  121. package/dist/db/bulk-ingest.d.ts +89 -0
  122. package/dist/db/bulk-ingest.d.ts.map +1 -0
  123. package/dist/db/bulk-ingest.js +268 -0
  124. package/dist/db/bulk-ingest.js.map +1 -0
  125. package/dist/db/drizzle.d.ts +256 -0
  126. package/dist/db/drizzle.d.ts.map +1 -0
  127. package/dist/db/drizzle.js +1286 -0
  128. package/dist/db/drizzle.js.map +1 -0
  129. package/dist/db/index.d.ts +55 -0
  130. package/dist/db/index.d.ts.map +1 -0
  131. package/dist/db/index.js +68 -0
  132. package/dist/db/index.js.map +1 -0
  133. package/dist/db/schema.d.ts +4873 -0
  134. package/dist/db/schema.d.ts.map +1 -0
  135. package/dist/db/schema.js +620 -0
  136. package/dist/db/schema.js.map +1 -0
  137. package/dist/index.d.ts +11 -0
  138. package/dist/index.d.ts.map +1 -0
  139. package/dist/index.js +38 -0
  140. package/dist/index.js.map +1 -0
  141. package/dist/provisioner/index.d.ts +207 -0
  142. package/dist/provisioner/index.d.ts.map +1 -0
  143. package/dist/provisioner/index.js +2114 -0
  144. package/dist/provisioner/index.js.map +1 -0
  145. package/dist/server.d.ts +17 -0
  146. package/dist/server.d.ts.map +1 -0
  147. package/dist/server.js +1924 -0
  148. package/dist/server.js.map +1 -0
  149. package/dist/services/auto-scaler.d.ts +152 -0
  150. package/dist/services/auto-scaler.d.ts.map +1 -0
  151. package/dist/services/auto-scaler.js +439 -0
  152. package/dist/services/auto-scaler.js.map +1 -0
  153. package/dist/services/capacity-manager.d.ts +148 -0
  154. package/dist/services/capacity-manager.d.ts.map +1 -0
  155. package/dist/services/capacity-manager.js +449 -0
  156. package/dist/services/capacity-manager.js.map +1 -0
  157. package/dist/services/ci-agent-spawner.d.ts +49 -0
  158. package/dist/services/ci-agent-spawner.d.ts.map +1 -0
  159. package/dist/services/ci-agent-spawner.js +373 -0
  160. package/dist/services/ci-agent-spawner.js.map +1 -0
  161. package/dist/services/cloud-message-bus.d.ts +28 -0
  162. package/dist/services/cloud-message-bus.d.ts.map +1 -0
  163. package/dist/services/cloud-message-bus.js +19 -0
  164. package/dist/services/cloud-message-bus.js.map +1 -0
  165. package/dist/services/compute-enforcement.d.ts +57 -0
  166. package/dist/services/compute-enforcement.d.ts.map +1 -0
  167. package/dist/services/compute-enforcement.js +175 -0
  168. package/dist/services/compute-enforcement.js.map +1 -0
  169. package/dist/services/coordinator.d.ts +62 -0
  170. package/dist/services/coordinator.d.ts.map +1 -0
  171. package/dist/services/coordinator.js +389 -0
  172. package/dist/services/coordinator.js.map +1 -0
  173. package/dist/services/index.d.ts +17 -0
  174. package/dist/services/index.d.ts.map +1 -0
  175. package/dist/services/index.js +25 -0
  176. package/dist/services/index.js.map +1 -0
  177. package/dist/services/intro-expiration.d.ts +60 -0
  178. package/dist/services/intro-expiration.d.ts.map +1 -0
  179. package/dist/services/intro-expiration.js +252 -0
  180. package/dist/services/intro-expiration.js.map +1 -0
  181. package/dist/services/mention-handler.d.ts +65 -0
  182. package/dist/services/mention-handler.d.ts.map +1 -0
  183. package/dist/services/mention-handler.js +405 -0
  184. package/dist/services/mention-handler.js.map +1 -0
  185. package/dist/services/nango.d.ts +201 -0
  186. package/dist/services/nango.d.ts.map +1 -0
  187. package/dist/services/nango.js +392 -0
  188. package/dist/services/nango.js.map +1 -0
  189. package/dist/services/persistence.d.ts +131 -0
  190. package/dist/services/persistence.d.ts.map +1 -0
  191. package/dist/services/persistence.js +200 -0
  192. package/dist/services/persistence.js.map +1 -0
  193. package/dist/services/planLimits.d.ts +147 -0
  194. package/dist/services/planLimits.d.ts.map +1 -0
  195. package/dist/services/planLimits.js +335 -0
  196. package/dist/services/planLimits.js.map +1 -0
  197. package/dist/services/presence-registry.d.ts +56 -0
  198. package/dist/services/presence-registry.d.ts.map +1 -0
  199. package/dist/services/presence-registry.js +91 -0
  200. package/dist/services/presence-registry.js.map +1 -0
  201. package/dist/services/scaling-orchestrator.d.ts +159 -0
  202. package/dist/services/scaling-orchestrator.d.ts.map +1 -0
  203. package/dist/services/scaling-orchestrator.js +502 -0
  204. package/dist/services/scaling-orchestrator.js.map +1 -0
  205. package/dist/services/scaling-policy.d.ts +121 -0
  206. package/dist/services/scaling-policy.d.ts.map +1 -0
  207. package/dist/services/scaling-policy.js +415 -0
  208. package/dist/services/scaling-policy.js.map +1 -0
  209. package/dist/services/ssh-security.d.ts +31 -0
  210. package/dist/services/ssh-security.d.ts.map +1 -0
  211. package/dist/services/ssh-security.js +63 -0
  212. package/dist/services/ssh-security.js.map +1 -0
  213. package/dist/services/workspace-keepalive.d.ts +76 -0
  214. package/dist/services/workspace-keepalive.d.ts.map +1 -0
  215. package/dist/services/workspace-keepalive.js +234 -0
  216. package/dist/services/workspace-keepalive.js.map +1 -0
  217. package/dist/shims/consensus.d.ts +23 -0
  218. package/dist/shims/consensus.d.ts.map +1 -0
  219. package/dist/shims/consensus.js +5 -0
  220. package/dist/shims/consensus.js.map +1 -0
  221. package/dist/webhooks/index.d.ts +24 -0
  222. package/dist/webhooks/index.d.ts.map +1 -0
  223. package/dist/webhooks/index.js +29 -0
  224. package/dist/webhooks/index.js.map +1 -0
  225. package/dist/webhooks/parsers/github.d.ts +8 -0
  226. package/dist/webhooks/parsers/github.d.ts.map +1 -0
  227. package/dist/webhooks/parsers/github.js +234 -0
  228. package/dist/webhooks/parsers/github.js.map +1 -0
  229. package/dist/webhooks/parsers/index.d.ts +23 -0
  230. package/dist/webhooks/parsers/index.d.ts.map +1 -0
  231. package/dist/webhooks/parsers/index.js +30 -0
  232. package/dist/webhooks/parsers/index.js.map +1 -0
  233. package/dist/webhooks/parsers/linear.d.ts +9 -0
  234. package/dist/webhooks/parsers/linear.d.ts.map +1 -0
  235. package/dist/webhooks/parsers/linear.js +258 -0
  236. package/dist/webhooks/parsers/linear.js.map +1 -0
  237. package/dist/webhooks/parsers/slack.d.ts +9 -0
  238. package/dist/webhooks/parsers/slack.d.ts.map +1 -0
  239. package/dist/webhooks/parsers/slack.js +214 -0
  240. package/dist/webhooks/parsers/slack.js.map +1 -0
  241. package/dist/webhooks/responders/github.d.ts +8 -0
  242. package/dist/webhooks/responders/github.d.ts.map +1 -0
  243. package/dist/webhooks/responders/github.js +73 -0
  244. package/dist/webhooks/responders/github.js.map +1 -0
  245. package/dist/webhooks/responders/index.d.ts +23 -0
  246. package/dist/webhooks/responders/index.d.ts.map +1 -0
  247. package/dist/webhooks/responders/index.js +30 -0
  248. package/dist/webhooks/responders/index.js.map +1 -0
  249. package/dist/webhooks/responders/linear.d.ts +9 -0
  250. package/dist/webhooks/responders/linear.d.ts.map +1 -0
  251. package/dist/webhooks/responders/linear.js +149 -0
  252. package/dist/webhooks/responders/linear.js.map +1 -0
  253. package/dist/webhooks/responders/slack.d.ts +20 -0
  254. package/dist/webhooks/responders/slack.d.ts.map +1 -0
  255. package/dist/webhooks/responders/slack.js +178 -0
  256. package/dist/webhooks/responders/slack.js.map +1 -0
  257. package/dist/webhooks/router.d.ts +25 -0
  258. package/dist/webhooks/router.d.ts.map +1 -0
  259. package/dist/webhooks/router.js +504 -0
  260. package/dist/webhooks/router.js.map +1 -0
  261. package/dist/webhooks/rules-engine.d.ts +24 -0
  262. package/dist/webhooks/rules-engine.d.ts.map +1 -0
  263. package/dist/webhooks/rules-engine.js +287 -0
  264. package/dist/webhooks/rules-engine.js.map +1 -0
  265. package/dist/webhooks/types.d.ts +186 -0
  266. package/dist/webhooks/types.d.ts.map +1 -0
  267. package/dist/webhooks/types.js +8 -0
  268. package/dist/webhooks/types.js.map +1 -0
  269. package/package.json +55 -0
package/dist/provisioner/index.js
@@ -0,0 +1,2114 @@
+ /**
+  * Agent Relay Cloud - Workspace Provisioner
+  *
+  * One-click provisioning for compute resources (Fly.io, Railway, Docker).
+  */
+ import * as crypto from 'crypto';
+ import { createHash } from 'node:crypto';
+ import { getConfig } from '../config.js';
+ import { db } from '../db/index.js';
+ import { nangoService } from '../services/nango.js';
+ import { canAutoScale, canScaleToTier, getResourceTierForPlan, getPlanLimits, } from '../services/planLimits.js';
+ import { deriveSshPassword } from '../services/ssh-security.js';
+ // ============================================================================
+ // Daemon API Key Management
+ // ============================================================================
+ /**
+  * Generate a daemon API key in the format ar_live_<64 hex chars>
+  */
+ function generateDaemonApiKey() {
+     const random = crypto.randomBytes(32).toString('hex');
+     return `ar_live_${random}`;
+ }
+ /**
+  * Hash an API key for secure storage
+  */
+ function hashApiKey(apiKey) {
+     return createHash('sha256').update(apiKey).digest('hex');
+ }
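// Illustration (editor's sketch, not part of the published file): only the
// SHA-256 hash of a daemon key is persisted, so verifying a presented key means
// re-hashing it and comparing against the apiKeyHash stored in linkedDaemons.
// A constant-time comparison would be preferable in production.
function verifyDaemonApiKey(presentedKey, storedApiKeyHash) {
    return hashApiKey(presentedKey) === storedApiKeyHash;
}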
+ /**
+  * Create a linked daemon record for a workspace during provisioning
+  * @param preGeneratedApiKey - Pre-generated API key (if not provided, one will be generated)
+  */
+ async function createLinkedDaemon(userId, workspaceId, machineId, preGeneratedApiKey) {
+     const apiKey = preGeneratedApiKey ?? generateDaemonApiKey();
+     const apiKeyHash = hashApiKey(apiKey);
+     const daemon = await db.linkedDaemons.create({
+         userId,
+         workspaceId,
+         name: `auto-provisioned-${Date.now()}`,
+         machineId,
+         apiKeyHash,
+         status: 'offline',
+     });
+     return { daemonId: daemon.id, apiKey };
+ }
+ const WORKSPACE_PORT = 3888;
+ const WORKSPACE_HEALTH_PORT = 3889; // Health check on separate thread - always responsive
+ const WORKSPACE_SSH_PORT = 3022;
+ const CODEX_OAUTH_PORT = 1455; // Codex CLI OAuth callback port - must be mapped for local dev
+ const FETCH_TIMEOUT_MS = 10_000;
+ const WORKSPACE_IMAGE = process.env.WORKSPACE_IMAGE || 'ghcr.io/agentworkforce/relay-workspace:latest';
+ // In-memory tracker for provisioning progress (workspace ID -> progress)
+ const provisioningProgress = new Map();
+ /**
+  * Update the provisioning stage for a workspace
+  */
+ function updateProvisioningStage(workspaceId, stage) {
+     const existing = provisioningProgress.get(workspaceId);
+     provisioningProgress.set(workspaceId, {
+         stage,
+         startedAt: existing?.startedAt ?? Date.now(),
+         updatedAt: Date.now(),
+     });
+     console.log(`[provisioner] Workspace ${workspaceId.substring(0, 8)} stage: ${stage}`);
+ }
+ /**
+  * Get the current provisioning stage for a workspace
+  */
+ export function getProvisioningStage(workspaceId) {
+     return provisioningProgress.get(workspaceId) ?? null;
+ }
+ /**
+  * Clear provisioning progress (call when complete or failed)
+  */
+ function clearProvisioningProgress(workspaceId) {
+     provisioningProgress.delete(workspaceId);
+ }
+ /**
+  * Schedule cleanup of provisioning progress after a delay
+  * This gives the frontend time to poll and see the 'complete' stage
+  */
+ function scheduleProgressCleanup(workspaceId, delayMs = 30_000) {
+     setTimeout(() => {
+         clearProvisioningProgress(workspaceId);
+         console.log(`[provisioner] Cleaned up provisioning progress for ${workspaceId.substring(0, 8)}`);
+     }, delayMs);
+ }
+ /**
+  * Get a fresh GitHub App installation token from Nango.
+  * Looks up the user's connected repositories to find a valid Nango connection.
+  */
+ async function getGithubAppTokenForUser(userId) {
+     try {
+         // Find any repository with a Nango connection for this user
+         const repos = await db.repositories.findByUserId(userId);
+         const repoWithConnection = repos.find(r => r.nangoConnectionId);
+         if (!repoWithConnection?.nangoConnectionId) {
+             console.warn(`[provisioner] No Nango GitHub App connection found for user ${userId}`);
+             return null;
+         }
+         // Get fresh installation token from Nango (handles refresh automatically)
+         const token = await nangoService.getGithubAppToken(repoWithConnection.nangoConnectionId);
+         return token;
+     }
+     catch (error) {
+         console.error(`[provisioner] Failed to get GitHub App token for user ${userId}:`, error);
+         return null;
+     }
+ }
+ async function wait(ms) {
+     return new Promise((resolve) => setTimeout(resolve, ms));
+ }
+ async function fetchWithRetry(url, options = {}) {
+     const retries = options.retries ?? 2;
+     let attempt = 0;
+     while (attempt <= retries) {
+         const controller = new AbortController();
+         const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);
+         try {
+             const response = await fetch(url, { ...options, signal: controller.signal });
+             clearTimeout(timer);
+             if (!response.ok && response.status >= 500 && attempt < retries) {
+                 attempt += 1;
+                 await wait(500 * attempt);
+                 continue;
+             }
+             return response;
+         }
+         catch (error) {
+             clearTimeout(timer);
+             if (attempt >= retries) {
+                 throw error;
+             }
+             attempt += 1;
+             await wait(500 * attempt);
+         }
+     }
+     throw new Error('fetchWithRetry exhausted retries');
+ }
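// Illustration (editor's sketch, not part of the published file): a typical
// fetchWithRetry call. Each attempt is aborted after FETCH_TIMEOUT_MS; 5xx
// responses and network errors are retried with linear backoff (500ms, 1000ms).
// Note the extra `retries` option also flows into the fetch init via the
// spread, where fetch simply ignores it.
async function exampleMachinesList(apiToken, appName) {
    const res = await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/machines`, {
        headers: { Authorization: `Bearer ${apiToken}` },
        retries: 2,
    });
    return res.ok ? await res.json() : [];
}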
+ async function softHealthCheck(url) {
+     try {
+         const res = await fetchWithRetry(`${url.replace(/\/$/, '')}/health`, { method: 'GET', retries: 1 });
+         if (!res.ok) {
+             console.warn(`[health] Non-200 from ${url}/health: ${res.status}`);
+         }
+     }
+     catch (error) {
+         console.warn(`[health] Failed to reach ${url}/health`, error);
+     }
+ }
+ /**
+  * Wait for machine to be in "started" state using Fly.io's /wait endpoint
+  * This is more efficient than polling - the API blocks until the state is reached
+  * @see https://fly.io/docs/machines/api/machines-resource/#wait-for-a-machine-to-reach-a-specific-state
+  */
+ async function waitForMachineStarted(apiToken, appName, machineId, timeoutSeconds = 120) {
+     console.log(`[provisioner] Waiting for machine ${machineId} to start (timeout: ${timeoutSeconds}s)...`);
+     // Fly.io /wait endpoint has max timeout of 60s, so we need to loop for longer waits
+     const maxSingleWait = 60;
+     const startTime = Date.now();
+     const deadline = startTime + timeoutSeconds * 1000;
+     while (Date.now() < deadline) {
+         const remainingMs = deadline - Date.now();
+         const waitSeconds = Math.min(maxSingleWait, Math.ceil(remainingMs / 1000));
+         if (waitSeconds <= 0)
+             break;
+         try {
+             // Use Fly.io's /wait endpoint - blocks until machine reaches target state
+             // timeout is an integer in seconds (max 60)
+             const res = await fetch(`https://api.machines.dev/v1/apps/${appName}/machines/${machineId}/wait?state=started&timeout=${waitSeconds}`, {
+                 headers: { Authorization: `Bearer ${apiToken}` },
+             });
+             if (res.ok) {
+                 console.log(`[provisioner] Machine ${machineId} is now started`);
+                 return;
+             }
+             // 408 = timeout, machine didn't reach state in time - try again if we have time
+             if (res.status === 408) {
+                 console.log(`[provisioner] Machine ${machineId} not ready yet, continuing to wait...`);
+                 continue;
+             }
+             // Other error
+             const errorText = await res.text();
+             throw new Error(`Wait for machine failed: ${res.status} ${errorText}`);
+         }
+         catch (error) {
+             if (error instanceof Error && error.message.includes('Wait for machine failed')) {
+                 throw error;
+             }
+             console.warn(`[provisioner] Error waiting for machine:`, error);
+             throw new Error(`Failed to wait for machine ${machineId}: ${error.message}`);
+         }
+     }
+     // Timeout reached - get current state for error message
+     const stateRes = await fetch(`https://api.machines.dev/v1/apps/${appName}/machines/${machineId}`, { headers: { Authorization: `Bearer ${apiToken}` } });
+     const machine = stateRes.ok ? (await stateRes.json()) : { state: 'unknown' };
+     throw new Error(`Machine ${machineId} did not start within ${timeoutSeconds}s (last state: ${machine.state})`);
+ }
+ /**
+  * Wait for health check to pass (with DNS propagation time)
+  * Tries internal Fly network first if available, then falls back to public URL
+  */
+ async function waitForHealthy(url, appName, maxWaitMs = 60_000 // Reduced from 90s - health check is best-effort anyway
+ ) {
+     const startTime = Date.now();
+     // Build list of URLs to try - internal first (faster, more reliable from inside Fly)
+     const urlsToTry = [];
+     // If running on Fly and app name provided, try internal network first
+     const isOnFly = !!process.env.FLY_APP_NAME;
+     if (isOnFly && appName) {
+         urlsToTry.push(`http://${appName}.internal:8080/health`);
+     }
+     // Always add the public URL as fallback
+     urlsToTry.push(`${url.replace(/\/$/, '')}/health`);
+     console.log(`[provisioner] Waiting for workspace to become healthy (trying: ${urlsToTry.join(', ')})...`);
+     // Exponential backoff: start at 1s, max 5s (reduces unnecessary polling)
+     let retryDelayMs = 1000;
+     const maxRetryDelayMs = 5000;
+     while (Date.now() - startTime < maxWaitMs) {
+         // Try each URL in order
+         for (const healthUrl of urlsToTry) {
+             try {
+                 const controller = new AbortController();
+                 const timer = setTimeout(() => controller.abort(), 5_000);
+                 const res = await fetch(healthUrl, {
+                     method: 'GET',
+                     signal: controller.signal,
+                 });
+                 clearTimeout(timer);
+                 if (res.ok) {
+                     console.log(`[provisioner] Health check passed via ${healthUrl}`);
+                     return;
+                 }
+                 console.log(`[provisioner] Health check to ${healthUrl} returned ${res.status}`);
+             }
+             catch (error) {
+                 const elapsed = Math.round((Date.now() - startTime) / 1000);
+                 const errMsg = error.message;
+                 // Only log detailed error for last URL attempt
+                 if (healthUrl === urlsToTry[urlsToTry.length - 1]) {
+                     console.log(`[provisioner] Health check failed (${elapsed}s elapsed): ${errMsg}`);
+                 }
+             }
+         }
+         await wait(retryDelayMs);
+         // Exponential backoff with cap
+         retryDelayMs = Math.min(retryDelayMs * 1.5, maxRetryDelayMs);
+     }
+     // Don't throw - workspace is provisioned, health check is best-effort
+     console.warn(`[provisioner] Health check did not pass within ${maxWaitMs}ms, continuing anyway`);
+ }
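// Worked example (editor's annotation, not part of the published file): with
// retryDelayMs starting at 1000 and multiplying by 1.5 up to the 5000ms cap,
// the waits run 1000 -> 1500 -> 2250 -> 3375 -> 5000 -> 5000 -> ..., which
// fits roughly 14 polling rounds into the default 60s budget.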
+ // Resource tiers sized for Claude Code agents (~1-2GB RAM per agent)
+ // cpuKind: 'shared' = cheaper but can be throttled, 'performance' = dedicated
+ // Note: Team tier (large) uses shared CPUs for better margins (~50% vs ~7% with perf)
+ export const RESOURCE_TIERS = {
+     small: { name: 'small', cpuCores: 2, memoryMb: 2048, maxAgents: 2, cpuKind: 'shared' },
+     medium: { name: 'medium', cpuCores: 2, memoryMb: 4096, maxAgents: 5, cpuKind: 'shared' },
+     large: { name: 'large', cpuCores: 4, memoryMb: 8192, maxAgents: 10, cpuKind: 'shared' },
+     xlarge: { name: 'xlarge', cpuCores: 8, memoryMb: 16384, maxAgents: 20, cpuKind: 'performance' },
+ };
+ /**
+  * Fly.io provisioner
+  */
+ class FlyProvisioner {
+     apiToken;
+     org;
+     region;
+     workspaceDomain;
+     cloudApiUrl;
+     sessionSecret;
+     registryAuth;
+     snapshotRetentionDays;
+     volumeSizeGb;
+     constructor() {
+         const config = getConfig();
+         if (!config.compute.fly) {
+             throw new Error('Fly.io configuration missing');
+         }
+         this.apiToken = config.compute.fly.apiToken;
+         this.org = config.compute.fly.org;
+         this.region = config.compute.fly.region || 'sjc';
+         this.workspaceDomain = config.compute.fly.workspaceDomain;
+         this.registryAuth = config.compute.fly.registryAuth;
+         this.cloudApiUrl = config.publicUrl;
+         this.sessionSecret = config.sessionSecret;
+         // Snapshot settings: default 14 days retention, 10GB volume
+         this.snapshotRetentionDays = Math.min(60, Math.max(1, config.compute.fly.snapshotRetentionDays ?? 14));
+         this.volumeSizeGb = config.compute.fly.volumeSizeGb ?? 10;
+     }
+     /**
+      * Generate a workspace token for API authentication
+      * This is a simple HMAC - in production, consider using JWTs
+      */
+     generateWorkspaceToken(workspaceId) {
+         return crypto
+             .createHmac('sha256', this.sessionSecret)
+             .update(`workspace:${workspaceId}`)
+             .digest('hex');
+     }
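// Illustration (editor's sketch, not part of the published file): the cloud API
// side can verify a workspace token by recomputing the HMAC with the same
// session secret; crypto.timingSafeEqual avoids leaking a match via timing.
function verifyWorkspaceToken(workspaceId, token, sessionSecret) {
    const expected = crypto
        .createHmac('sha256', sessionSecret)
        .update(`workspace:${workspaceId}`)
        .digest();
    const presented = Buffer.from(token, 'hex');
    return presented.length === expected.length && crypto.timingSafeEqual(presented, expected);
}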
+     /**
+      * Create a volume with automatic snapshot settings
+      * Fly.io takes daily snapshots automatically; we configure retention
+      */
+     async createVolume(appName) {
+         const volumeName = 'workspace_data';
+         console.log(`[fly] Creating volume ${volumeName} with ${this.snapshotRetentionDays}-day snapshot retention...`);
+         const response = await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/volumes`, {
+             method: 'POST',
+             headers: {
+                 Authorization: `Bearer ${this.apiToken}`,
+                 'Content-Type': 'application/json',
+             },
+             body: JSON.stringify({
+                 name: volumeName,
+                 region: this.region,
+                 size_gb: this.volumeSizeGb,
+                 // Enable automatic daily snapshots (default is true, but be explicit)
+                 auto_backup_enabled: true,
+                 // Retain snapshots for configured days (default 5, we use 14)
+                 snapshot_retention: this.snapshotRetentionDays,
+             }),
+         });
+         if (!response.ok) {
+             const error = await response.text();
+             throw new Error(`Failed to create volume: ${error}`);
+         }
+         const volume = await response.json();
+         console.log(`[fly] Volume ${volume.id} created with auto-snapshots (${this.snapshotRetentionDays} days retention)`);
+         return volume;
+     }
+     /**
+      * Create an on-demand snapshot of a workspace volume
+      * Use before risky operations or as manual backup
+      */
+     async createSnapshot(appName, volumeId) {
+         console.log(`[fly] Creating on-demand snapshot for volume ${volumeId}...`);
+         const response = await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/volumes/${volumeId}/snapshots`, {
+             method: 'POST',
+             headers: {
+                 Authorization: `Bearer ${this.apiToken}`,
+                 'Content-Type': 'application/json',
+             },
+         });
+         if (!response.ok) {
+             const error = await response.text();
+             throw new Error(`Failed to create snapshot: ${error}`);
+         }
+         const snapshot = await response.json();
+         console.log(`[fly] Snapshot ${snapshot.id} created`);
+         return snapshot;
+     }
+     /**
+      * List snapshots for a workspace volume
+      */
+     async listSnapshots(appName, volumeId) {
+         const response = await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/volumes/${volumeId}/snapshots`, {
+             headers: {
+                 Authorization: `Bearer ${this.apiToken}`,
+             },
+         });
+         if (!response.ok) {
+             return [];
+         }
+         return await response.json();
+     }
+     /**
+      * Get volume info for a workspace
+      */
+     async getVolume(appName) {
+         const response = await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/volumes`, {
+             headers: {
+                 Authorization: `Bearer ${this.apiToken}`,
+             },
+         });
+         if (!response.ok) {
+             return null;
+         }
+         const volumes = await response.json();
+         return volumes.find(v => v.name === 'workspace_data') || null;
+     }
+     async provision(workspace, credentials) {
+         const appName = `ar-${workspace.id.substring(0, 8)}`;
+         // Stage: Creating workspace
+         updateProvisioningStage(workspace.id, 'creating');
+         // Create Fly app
+         await fetchWithRetry('https://api.machines.dev/v1/apps', {
+             method: 'POST',
+             headers: {
+                 Authorization: `Bearer ${this.apiToken}`,
+                 'Content-Type': 'application/json',
+             },
+             body: JSON.stringify({
+                 app_name: appName,
+                 org_slug: this.org,
+             }),
+         });
+         // Stage: Networking
+         updateProvisioningStage(workspace.id, 'networking');
+         // Allocate IPs for the app (required for public DNS)
+         // Must use GraphQL API - Machines REST API doesn't support IP allocation
+         // IMPORTANT: We use dedicated IPv4 ($2/mo) instead of shared because:
+         // - Shared IPv4 doesn't properly handle raw TCP on non-standard ports (like SSH on 3022)
+         // - SSH tunnel connections fail with "Connection closed by remote host" on shared IPs
+         // - Dedicated IPv4 is required for raw TCP services to work correctly
+         console.log(`[fly] Allocating IPs for ${appName}...`);
+         const allocateIP = async (type) => {
+             try {
+                 // Map our type to Fly GraphQL enum (v4 = dedicated IPv4)
+                 const graphqlType = type;
+                 const res = await fetchWithRetry('https://api.fly.io/graphql', {
+                     method: 'POST',
+                     headers: {
+                         Authorization: `Bearer ${this.apiToken}`,
+                         'Content-Type': 'application/json',
+                     },
+                     body: JSON.stringify({
+                         query: `
+                             mutation AllocateIPAddress($input: AllocateIPAddressInput!) {
+                                 allocateIpAddress(input: $input) {
+                                     ipAddress {
+                                         id
+                                         address
+                                         type
+                                     }
+                                 }
+                             }
+                         `,
+                         variables: {
+                             input: {
+                                 appId: appName,
+                                 type: graphqlType,
+                             },
+                         },
+                     }),
+                 });
+                 if (!res.ok) {
+                     const errorText = await res.text();
+                     console.warn(`[fly] Failed to allocate ${type}: ${res.status} ${errorText}`);
+                     return false;
+                 }
+                 const data = await res.json();
+                 if (data.errors?.length) {
+                     // Ignore "already allocated" errors
+                     const alreadyAllocated = data.errors.some(e => e.message.includes('already') || e.message.includes('exists'));
+                     if (!alreadyAllocated) {
+                         console.warn(`[fly] GraphQL error allocating ${type}: ${data.errors[0].message}`);
+                         return false;
+                     }
+                     console.log(`[fly] IP ${type} already allocated`);
+                     return true;
+                 }
+                 const address = data.data?.allocateIpAddress?.ipAddress?.address;
+                 console.log(`[fly] Allocated ${type}: ${address}`);
+                 return true;
+             }
+             catch (err) {
+                 console.warn(`[fly] Failed to allocate ${type}: ${err.message}`);
+                 return false;
+             }
+         };
+         const [v4Result, v6Result] = await Promise.all([
+             allocateIP('v4'),
+             allocateIP('v6'),
+         ]);
+         console.log(`[fly] IP allocation results: v4=${v4Result}, v6=${v6Result}`);
+         // Stage: Secrets
+         updateProvisioningStage(workspace.id, 'secrets');
+         // Set secrets (provider credentials)
+         const secrets = {};
+         for (const [provider, token] of credentials) {
+             secrets[`${provider.toUpperCase()}_TOKEN`] = token;
+             // Also set GH_TOKEN for gh CLI compatibility
+             if (provider === 'github') {
+                 secrets['GH_TOKEN'] = token;
+             }
+         }
+         if (Object.keys(secrets).length > 0) {
+             await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/secrets`, {
+                 method: 'POST',
+                 headers: {
+                     Authorization: `Bearer ${this.apiToken}`,
+                     'Content-Type': 'application/json',
+                 },
+                 body: JSON.stringify(secrets),
+             });
+         }
+         // If custom workspace domain is configured, add certificate
+         const customHostname = this.workspaceDomain
+             ? `${appName}.${this.workspaceDomain}`
+             : null;
+         if (customHostname) {
+             await this.allocateCertificate(appName, customHostname);
+         }
+         // Stage: Machine (includes volume creation)
+         updateProvisioningStage(workspace.id, 'machine');
+         // Generate API key for cloud message sync BEFORE creating the machine
+         // The key is set as an env var on the machine and stored hashed in linkedDaemons
+         const machineApiKey = generateDaemonApiKey();
+         // Create volume with automatic daily snapshots before machine
+         // Fly.io takes daily snapshots automatically; we configure retention
+         const volume = await this.createVolume(appName);
+         // Determine instance size based on user's plan using RESOURCE_TIERS
+         const user = await db.users.findById(workspace.userId);
+         const userPlan = user?.plan || 'free';
+         const planLimits = getPlanLimits(userPlan);
+         const isFreeTier = userPlan === 'free';
+         // Check if user is in introductory period (first 14 days)
+         // Free users get Pro-level resources during intro period
+         const INTRO_PERIOD_DAYS = 14;
+         const userCreatedAt = user?.createdAt ? new Date(user.createdAt) : new Date();
+         const daysSinceSignup = (Date.now() - userCreatedAt.getTime()) / (1000 * 60 * 60 * 24);
+         const isIntroPeriod = isFreeTier && daysSinceSignup < INTRO_PERIOD_DAYS;
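        // Worked example (editor's annotation, not part of the published file):
        // a free-plan user who signed up 10 days ago has daysSinceSignup ~ 10,
        // so isIntroPeriod is true; the machine below is sized with the 'pro'
        // tier and pro agent limits for the remaining ~4 days.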
+         // Get the appropriate resource tier for this plan
+         // Intro period free users get 'pro' tier resources
+         const effectivePlan = isIntroPeriod ? 'pro' : userPlan;
+         const tierName = getResourceTierForPlan(effectivePlan);
+         const tier = RESOURCE_TIERS[tierName];
+         const guestConfig = {
+             cpu_kind: tier.cpuKind,
+             cpus: tier.cpuCores,
+             memory_mb: tier.memoryMb,
+         };
+         // Get max agents for the effective plan (intro users get pro limits)
+         const effectiveLimits = isIntroPeriod ? getPlanLimits('pro') : planLimits;
+         const maxAgents = effectiveLimits.maxConcurrentAgents === Infinity
+             ? 100 // Cap at 100 for practical purposes
+             : effectiveLimits.maxConcurrentAgents;
+         if (isIntroPeriod) {
+             const daysRemaining = Math.ceil(INTRO_PERIOD_DAYS - daysSinceSignup);
+             console.log(`[fly] Introductory bonus active (${daysRemaining} days remaining) - using ${tierName} tier`);
+         }
+         console.log(`[fly] Using ${tierName} tier: ${guestConfig.cpus} CPU / ${guestConfig.memory_mb}MB / max ${maxAgents} agents for ${userPlan} plan`);
+         // Create machine with auto-stop/start for cost optimization
+         const machineResponse = await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/machines`, {
+             method: 'POST',
+             headers: {
+                 Authorization: `Bearer ${this.apiToken}`,
+                 'Content-Type': 'application/json',
+             },
+             body: JSON.stringify({
+                 region: this.region,
+                 config: {
+                     image: WORKSPACE_IMAGE,
+                     // Registry auth for private ghcr.io images
+                     ...(this.registryAuth && {
+                         image_registry_auth: {
+                             registry: 'ghcr.io',
+                             username: this.registryAuth.username,
+                             password: this.registryAuth.password,
+                         },
+                     }),
+                     env: {
+                         WORKSPACE_ID: workspace.id,
+                         WORKSPACE_OWNER_USER_ID: workspace.userId,
+                         SUPERVISOR_ENABLED: String(workspace.config.supervisorEnabled ?? false),
+                         MAX_AGENTS: String(maxAgents),
+                         REPOSITORIES: (workspace.config.repositories ?? []).join(','),
+                         PROVIDERS: (workspace.config.providers ?? []).join(','),
+                         PORT: String(WORKSPACE_PORT),
+                         AGENT_RELAY_DASHBOARD_PORT: String(WORKSPACE_PORT),
+                         // Store repos on persistent volume (/data) so they survive container restarts
+                         // Without this, repos are cloned to /workspace (ephemeral) and lost on restart
+                         WORKSPACE_DIR: '/data/repos',
+                         // Git gateway configuration
+                         CLOUD_API_URL: this.cloudApiUrl,
+                         WORKSPACE_TOKEN: this.generateWorkspaceToken(workspace.id),
+                         // Daemon API key for cloud message sync
+                         // Auto-generated during provisioning, stored in linkedDaemons table
+                         AGENT_RELAY_API_KEY: machineApiKey,
+                         // SSH for CLI tunneling (Codex OAuth callback forwarding)
+                         // Each workspace gets a unique password derived from its ID + secret salt
+                         ENABLE_SSH: 'true',
+                         SSH_PASSWORD: deriveSshPassword(workspace.id),
+                         SSH_PORT: String(WORKSPACE_SSH_PORT),
+                         // Enable cloud persistence for agent sessions/summaries via API
+                         RELAY_CLOUD_ENABLED: 'true',
+                     },
+                     services: [
+                         {
+                             ports: [
+                                 {
+                                     port: 443,
+                                     handlers: ['tls', 'http'],
+                                     // Force HTTP/1.1 to backend for WebSocket upgrade compatibility
+                                     // HTTP/2 doesn't support traditional WebSocket upgrade mechanism
+                                     http_options: {
+                                         h2_backend: false,
+                                     },
+                                 },
+                                 { port: 80, handlers: ['http'] },
+                             ],
+                             protocol: 'tcp',
+                             internal_port: WORKSPACE_PORT,
+                             // Auto-stop after inactivity to reduce costs
+                             // Fly Proxy automatically wakes machines on incoming requests
+                             auto_stop_machines: 'stop', // stop (not suspend) for faster wake
+                             auto_start_machines: true,
+                             min_machines_running: 0,
+                             // Idle timeout before auto-stop (in seconds)
+                             // Longer timeout = better UX, shorter = lower costs
+                             concurrency: {
+                                 type: 'requests',
+                                 soft_limit: 25,
+                                 hard_limit: 50,
+                             },
+                         },
+                         // SSH service for CLI tunneling (Codex OAuth callback forwarding)
+                         // Exposes port 3022 publicly for SSH connections from user's machine
+                         {
+                             ports: [
+                                 {
+                                     port: WORKSPACE_SSH_PORT,
+                                     handlers: [], // Empty handlers = raw TCP passthrough
+                                 },
+                             ],
+                             protocol: 'tcp',
+                             internal_port: WORKSPACE_SSH_PORT,
+                             // SSH connections should also wake the machine
+                             auto_stop_machines: 'stop',
+                             auto_start_machines: true,
+                             min_machines_running: 0,
+                         },
+                     ],
+                     checks: {
+                         health: {
+                             type: 'http',
+                             port: WORKSPACE_HEALTH_PORT, // Health worker thread - responds even when main loop blocked
+                             path: '/health',
+                             interval: '30s',
+                             timeout: '10s', // Increased timeout for safety
+                             grace_period: '30s', // Longer grace period for startup
+                         },
+                     },
+                     // Instance size based on plan - free tier gets smaller instance
+                     guest: guestConfig,
+                     // Mount the volume we created with snapshot settings
+                     mounts: [
+                         {
+                             volume: volume.id,
+                             path: '/data',
+                         },
+                     ],
+                 },
+             }),
+         });
+         if (!machineResponse.ok) {
+             const error = await machineResponse.text();
+             throw new Error(`Failed to create Fly machine: ${error}`);
+         }
+         const machine = (await machineResponse.json());
+         // Create linked daemon for cloud message sync
+         // Pass the pre-generated API key so it matches what was set in the machine env vars
+         const { daemonId } = await createLinkedDaemon(workspace.userId, workspace.id, machine.id, // Use Fly machine ID as daemon's machine ID
+             machineApiKey);
+         console.log(`[fly] Created linked daemon ${daemonId.substring(0, 8)} for workspace ${workspace.id.substring(0, 8)}`);
+         // Return custom domain URL if configured, otherwise default fly.dev
+         const publicUrl = customHostname
+             ? `https://${customHostname}`
+             : `https://${appName}.fly.dev`;
+         // Stage: Booting
+         updateProvisioningStage(workspace.id, 'booting');
+         // Wait for machine to be in started state
+         await waitForMachineStarted(this.apiToken, appName, machine.id);
+         // Stage: Health check
+         updateProvisioningStage(workspace.id, 'health');
+         // Wait for health check to pass (includes DNS propagation time)
+         // Pass appName to enable internal Fly network health checks
+         await waitForHealthy(publicUrl, appName);
+         // Stage: Complete
+         updateProvisioningStage(workspace.id, 'complete');
+         // Schedule cleanup of provisioning progress after 30s (gives frontend time to see 'complete')
+         scheduleProgressCleanup(workspace.id);
+         return {
+             computeId: machine.id,
+             publicUrl,
+         };
+     }
+     /**
+      * Allocate SSL certificate for custom domain
+      */
+     async allocateCertificate(appName, hostname) {
+         const response = await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/certificates`, {
+             method: 'POST',
+             headers: {
+                 Authorization: `Bearer ${this.apiToken}`,
+                 'Content-Type': 'application/json',
+             },
+             body: JSON.stringify({ hostname }),
+         });
+         if (!response.ok) {
+             const error = await response.text();
+             // Don't fail if cert already exists
+             if (!error.includes('already exists')) {
+                 throw new Error(`Failed to allocate certificate for ${hostname}: ${error}`);
+             }
+         }
+     }
+     async deprovision(workspace) {
+         const appName = `ar-${workspace.id.substring(0, 8)}`;
+         await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}`, {
+             method: 'DELETE',
+             headers: {
+                 Authorization: `Bearer ${this.apiToken}`,
+             },
+         });
+     }
+     async getStatus(workspace) {
+         if (!workspace.computeId)
+             return 'error';
+         const appName = `ar-${workspace.id.substring(0, 8)}`;
+         const response = await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/machines/${workspace.computeId}`, {
+             headers: {
+                 Authorization: `Bearer ${this.apiToken}`,
+             },
+         });
+         if (!response.ok)
+             return 'error';
+         const machine = await response.json();
+         switch (machine.state) {
+             case 'started':
+                 return 'running';
+             case 'stopped':
+                 return 'stopped';
+             case 'created':
+             case 'starting':
+                 return 'provisioning';
+             default:
+                 return 'error';
+         }
+     }
+     async restart(workspace) {
+         if (!workspace.computeId)
+             return;
+         const appName = `ar-${workspace.id.substring(0, 8)}`;
+         await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/machines/${workspace.computeId}/restart`, {
+             method: 'POST',
+             headers: {
+                 Authorization: `Bearer ${this.apiToken}`,
+             },
+         });
+     }
+     /**
+      * Resize workspace - vertical scaling via Fly Machines API
+      * @param skipRestart - If true, config is saved but machine won't restart (changes apply on next start)
+      */
+     async resize(workspaceOrId, tier, skipRestart = false) {
+         const workspaceId = typeof workspaceOrId === 'string' ? workspaceOrId : workspaceOrId.id;
+         const computeId = typeof workspaceOrId === 'string' ? undefined : workspaceOrId.computeId;
+         // If passed just an ID, look up the workspace
+         let machineId = computeId;
+         if (!machineId) {
+             const workspace = await db.workspaces.findById(workspaceId);
+             if (!workspace?.computeId)
+                 return;
+             machineId = workspace.computeId;
+         }
+         const appName = `ar-${workspaceId.substring(0, 8)}`;
+         // Get current machine config first to merge with new specs
+         // This is critical - Fly.io replaces the entire config, so we must preserve
+         // existing settings (image, services, auto_stop, other env vars, etc.)
+         const getResponse = await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/machines/${machineId}`, {
+             headers: {
+                 Authorization: `Bearer ${this.apiToken}`,
+             },
+         });
+         if (!getResponse.ok) {
+             throw new Error(`Failed to get machine config for resize: ${await getResponse.text()}`);
+         }
+         const machine = await getResponse.json();
+         // Merge new specs into existing config, preserving everything else
+         const updatedConfig = {
+             ...machine.config,
+             guest: {
+                 ...(machine.config.guest || {}),
+                 // Use tier-specific CPU type (shared for cost, performance for power)
+                 cpu_kind: tier.cpuKind,
+                 cpus: tier.cpuCores,
+                 memory_mb: tier.memoryMb,
+             },
+             env: {
+                 ...(machine.config.env || {}),
+                 MAX_AGENTS: String(tier.maxAgents),
+             },
+         };
+         // Update machine configuration
+         // If running: reboots with new specs (unless skip_launch: true)
+         // If stopped: config saved, applies on next start
+         const updateUrl = skipRestart
+             ? `https://api.machines.dev/v1/apps/${appName}/machines/${machineId}?skip_launch=true`
+             : `https://api.machines.dev/v1/apps/${appName}/machines/${machineId}`;
+         const updateResponse = await fetchWithRetry(updateUrl, {
+             method: 'POST',
+             headers: {
+                 Authorization: `Bearer ${this.apiToken}`,
+                 'Content-Type': 'application/json',
+             },
+             body: JSON.stringify({
+                 config: updatedConfig,
+             }),
+         });
+         if (!updateResponse.ok) {
+             throw new Error(`Failed to resize machine: ${await updateResponse.text()}`);
+         }
+         const restartNote = skipRestart ? ' (will apply on next restart)' : ' (restarting)';
+         console.log(`[fly] Resized workspace ${workspaceId.substring(0, 8)} to ${tier.name} (${tier.cpuCores} CPU, ${tier.memoryMb}MB RAM)${restartNote}`);
+     }
+     /**
+      * Update the max agent limit for a workspace
+      */
+     async updateAgentLimit(workspace, newLimit) {
+         if (!workspace.computeId)
+             return;
+         const appName = `ar-${workspace.id.substring(0, 8)}`;
+         // Update environment variable
+         await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/machines/${workspace.computeId}`, {
+             method: 'POST',
+             headers: {
+                 Authorization: `Bearer ${this.apiToken}`,
+                 'Content-Type': 'application/json',
+             },
+             body: JSON.stringify({
+                 config: {
+                     env: {
+                         MAX_AGENTS: String(newLimit),
+                     },
+                 },
+             }),
+         });
+         console.log(`[fly] Updated workspace ${workspace.id} agent limit to ${newLimit}`);
+     }
+     /**
+      * Get current resource tier for a workspace
+      */
+     async getCurrentTier(workspace) {
+         if (!workspace.computeId) {
+             return RESOURCE_TIERS.small;
+         }
+         const appName = `ar-${workspace.id.substring(0, 8)}`;
+         const response = await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/machines/${workspace.computeId}`, {
+             headers: {
+                 Authorization: `Bearer ${this.apiToken}`,
+             },
+         });
+         if (!response.ok) {
+             return RESOURCE_TIERS.small;
+         }
+         const machine = await response.json();
+         const _cpus = machine.config?.guest?.cpus || 1;
+         const memoryMb = machine.config?.guest?.memory_mb || 512;
+         // Map to nearest tier based on actual tier thresholds:
+         // small: 2048MB, medium: 4096MB, large: 8192MB, xlarge: 16384MB
+         if (memoryMb >= 16384)
+             return RESOURCE_TIERS.xlarge;
+         if (memoryMb >= 8192)
+             return RESOURCE_TIERS.large;
+         if (memoryMb >= 4096)
+             return RESOURCE_TIERS.medium;
+         return RESOURCE_TIERS.small;
+     }
+     /**
+      * Update machine image without restarting
+      * Note: The machine needs to be restarted later to use the new image
+      */
+     async updateMachineImage(workspace, newImage) {
+         if (!workspace.computeId)
+             return;
+         const appName = `ar-${workspace.id.substring(0, 8)}`;
+         // Get current machine config first
+         const getResponse = await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/machines/${workspace.computeId}`, {
+             headers: {
+                 Authorization: `Bearer ${this.apiToken}`,
+             },
+         });
+         if (!getResponse.ok) {
+             throw new Error(`Failed to get machine config: ${await getResponse.text()}`);
+         }
+         const machine = await getResponse.json();
+         // Update the image in the config
+         const updatedConfig = {
+             ...machine.config,
+             image: newImage,
+             // Include registry auth if configured
+             ...(this.registryAuth && {
+                 image_registry_auth: {
+                     registry: 'ghcr.io',
+                     username: this.registryAuth.username,
+                     password: this.registryAuth.password,
+                 },
+             }),
+         };
+         // Update machine with new image config (skip_launch keeps it in current state)
+         const updateResponse = await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/machines/${workspace.computeId}?skip_launch=true`, {
+             method: 'POST',
+             headers: {
+                 Authorization: `Bearer ${this.apiToken}`,
+                 'Content-Type': 'application/json',
+             },
+             body: JSON.stringify({ config: updatedConfig }),
+         });
+         if (!updateResponse.ok) {
+             throw new Error(`Failed to update machine image: ${await updateResponse.text()}`);
+         }
+         console.log(`[fly] Updated machine image for workspace ${workspace.id.substring(0, 8)} to ${newImage}`);
+     }
+     /**
+      * Set secrets as environment variables for a workspace.
+      */
+     async setSecrets(workspace, secrets) {
+         if (!workspace.computeId || Object.keys(secrets).length === 0)
+             return;
+         const appName = `ar-${workspace.id.substring(0, 8)}`;
+         await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/secrets`, {
+             method: 'POST',
+             headers: {
+                 Authorization: `Bearer ${this.apiToken}`,
+                 'Content-Type': 'application/json',
+             },
+             body: JSON.stringify(secrets),
+         });
+     }
+     /**
+      * Check if workspace has active agents by querying the daemon
+      * Retries up to 3 times with backoff to handle machines that are starting up
+      */
+     async checkActiveAgents(workspace) {
+         if (!workspace.publicUrl) {
+             return { hasActiveAgents: false, agentCount: 0, agents: [], verified: true };
+         }
+         // Use internal Fly network URL if available (more reliable)
+         const appName = `ar-${workspace.id.substring(0, 8)}`;
+         const isOnFly = !!process.env.FLY_APP_NAME;
+         const baseUrl = isOnFly
+             ? `http://${appName}.internal:3888`
+             : workspace.publicUrl;
+         const maxRetries = 3;
+         const retryDelays = [2000, 4000, 6000]; // 2s, 4s, 6s backoff
+         for (let attempt = 0; attempt < maxRetries; attempt++) {
+             try {
+                 const controller = new AbortController();
+                 const timer = setTimeout(() => controller.abort(), 10_000);
+                 // Use /api/data endpoint which returns { agents: [...], ... }
+                 // Note: /api/agents doesn't exist on the workspace dashboard-server
+                 const response = await fetch(`${baseUrl}/api/data`, {
+                     method: 'GET',
+                     headers: {
+                         'Accept': 'application/json',
+                     },
+                     signal: controller.signal,
+                 });
+                 clearTimeout(timer);
+                 if (!response.ok) {
+                     console.warn(`[fly] Failed to check agents for ${workspace.id.substring(0, 8)}: HTTP ${response.status} (attempt ${attempt + 1}/${maxRetries})`);
+                     if (attempt < maxRetries - 1) {
+                         await new Promise(resolve => setTimeout(resolve, retryDelays[attempt]));
+                         continue;
+                     }
+                     return { hasActiveAgents: false, agentCount: 0, agents: [], verified: false };
+                 }
+                 const data = await response.json();
+                 const agents = data.agents || [];
+                 // Diagnostic logging: capture raw agent data before filtering
+                 if (agents.length > 0) {
+                     console.log(`[fly] Workspace ${workspace.id.substring(0, 8)} raw agent data:`, agents.map(a => ({ name: a.name, status: a.status, activityState: a.activityState })));
+                 }
+                 // Treat any online agent as active unless explicitly disconnected/offline.
+                 const activeAgents = agents.filter(a => {
+                     const status = (a.status ?? '').toLowerCase();
+                     const activityState = (a.activityState ?? '').toLowerCase();
+                     const isProcessing = a.isProcessing === true;
+                     if (activityState === 'active' || activityState === 'idle')
+                         return true;
+                     if (status && status !== 'disconnected' && status !== 'offline')
+                         return true;
+                     if (isProcessing)
+                         return true;
+                     return false;
+                 });
+                 // Log filtering results for diagnostics
+                 if (agents.length > 0 && activeAgents.length !== agents.length) {
+                     const filteredOut = agents.filter(a => {
+                         const status = (a.status ?? '').toLowerCase();
+                         const activityState = (a.activityState ?? '').toLowerCase();
+                         const isProcessing = a.isProcessing === true;
+                         if (activityState === 'active' || activityState === 'idle')
+                             return false;
+                         if (status && status !== 'disconnected' && status !== 'offline')
+                             return false;
+                         if (isProcessing)
+                             return false;
+                         return true;
+                     });
+                     console.log(`[fly] Workspace ${workspace.id.substring(0, 8)} filtered out agents:`, filteredOut.map(a => ({ name: a.name, status: a.status, activityState: a.activityState })));
+                 }
+                 return {
+                     hasActiveAgents: activeAgents.length > 0,
+                     agentCount: activeAgents.length,
+                     agents: agents.map(a => ({ name: a.name, status: a.status || a.activityState || 'unknown' })),
+                     verified: true,
+                 };
+             }
+             catch (error) {
+                 // Workspace might be stopped or unreachable - retry with backoff
+                 console.warn(`[fly] Could not reach workspace ${workspace.id.substring(0, 8)} (attempt ${attempt + 1}/${maxRetries}):`, error.message);
+                 if (attempt < maxRetries - 1) {
+                     await new Promise(resolve => setTimeout(resolve, retryDelays[attempt]));
+                     continue;
+                 }
+             }
+         }
+         // All retries exhausted
+         console.warn(`[fly] Workspace ${workspace.id.substring(0, 8)} unreachable after ${maxRetries} attempts`);
+         return { hasActiveAgents: false, agentCount: 0, agents: [], verified: false };
+     }
+     /**
+      * Get the current machine state
+      */
+     async getMachineState(workspace) {
+         if (!workspace.computeId)
+             return 'unknown';
+         const appName = `ar-${workspace.id.substring(0, 8)}`;
+         try {
+             const response = await fetchWithRetry(`https://api.machines.dev/v1/apps/${appName}/machines/${workspace.computeId}`, {
+                 headers: {
+                     Authorization: `Bearer ${this.apiToken}`,
+                 },
+             });
+             if (!response.ok)
+                 return 'unknown';
+             const machine = await response.json();
+             return machine.state;
+         }
+         catch {
+             return 'unknown';
+         }
+     }
+ }
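// Usage sketch (editor's illustration, not part of the published file): assuming
// config.compute.fly is populated and `workspace` is a workspaces row, one-click
// provisioning amounts to constructing the provisioner and awaiting provision();
// credentials is iterated as [provider, token] pairs, and 'github' also sets GH_TOKEN.
async function exampleProvisionOnFly(workspace, githubToken) {
    const provisioner = new FlyProvisioner();
    const credentials = new Map([['github', githubToken]]);
    const { computeId, publicUrl } = await provisioner.provision(workspace, credentials);
    console.log(`workspace ${workspace.id} -> ${publicUrl} (machine ${computeId})`);
}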
1037
+ /**
1038
+ * Railway provisioner
1039
+ */
1040
+ class RailwayProvisioner {
1041
+ apiToken;
1042
+ cloudApiUrl;
1043
+ sessionSecret;
1044
+ constructor() {
1045
+ const config = getConfig();
1046
+ if (!config.compute.railway) {
1047
+ throw new Error('Railway configuration missing');
1048
+ }
1049
+ this.apiToken = config.compute.railway.apiToken;
1050
+ this.cloudApiUrl = config.publicUrl;
1051
+ this.sessionSecret = config.sessionSecret;
1052
+ }
1053
+ generateWorkspaceToken(workspaceId) {
1054
+ return crypto
1055
+ .createHmac('sha256', this.sessionSecret)
1056
+ .update(`workspace:${workspaceId}`)
1057
+ .digest('hex');
1058
+ }
+ async provision(workspace, credentials) {
+ // Create project
+ const projectResponse = await fetchWithRetry('https://backboard.railway.app/graphql/v2', {
+ method: 'POST',
+ headers: {
+ Authorization: `Bearer ${this.apiToken}`,
+ 'Content-Type': 'application/json',
+ },
+ body: JSON.stringify({
+ query: `
+ mutation CreateProject($input: ProjectCreateInput!) {
+ projectCreate(input: $input) {
+ id
+ name
+ }
+ }
+ `,
+ variables: {
+ input: {
+ name: `agent-relay-${workspace.id.substring(0, 8)}`,
+ },
+ },
+ }),
+ });
+ const projectData = await projectResponse.json();
+ const projectId = projectData.data.projectCreate.id;
+ // Deploy service
+ const serviceResponse = await fetchWithRetry('https://backboard.railway.app/graphql/v2', {
+ method: 'POST',
+ headers: {
+ Authorization: `Bearer ${this.apiToken}`,
+ 'Content-Type': 'application/json',
+ },
+ body: JSON.stringify({
+ query: `
+ mutation CreateService($input: ServiceCreateInput!) {
+ serviceCreate(input: $input) {
+ id
+ }
+ }
+ `,
+ variables: {
+ input: {
+ projectId,
+ name: 'workspace',
+ source: {
+ image: WORKSPACE_IMAGE,
+ },
+ },
+ },
+ }),
+ });
+ const serviceData = await serviceResponse.json();
+ const serviceId = serviceData.data.serviceCreate.id;
+ // Create linked daemon for cloud message sync
+ // This generates an API key and registers the daemon in the linkedDaemons table
+ const { daemonId, apiKey: railwayApiKey } = await createLinkedDaemon(workspace.userId, workspace.id, serviceId);
+ console.log(`[railway] Created linked daemon ${daemonId.substring(0, 8)} for workspace ${workspace.id.substring(0, 8)}`);
+ // Set environment variables
+ const envVars = {
+ WORKSPACE_ID: workspace.id,
+ WORKSPACE_OWNER_USER_ID: workspace.userId,
+ SUPERVISOR_ENABLED: String(workspace.config.supervisorEnabled ?? false),
+ MAX_AGENTS: String(workspace.config.maxAgents ?? 10),
+ REPOSITORIES: (workspace.config.repositories ?? []).join(','),
+ PROVIDERS: (workspace.config.providers ?? []).join(','),
+ PORT: String(WORKSPACE_PORT),
+ AGENT_RELAY_DASHBOARD_PORT: String(WORKSPACE_PORT),
+ // Store repos on persistent volume so they survive container restarts
+ WORKSPACE_DIR: '/data/repos',
+ CLOUD_API_URL: this.cloudApiUrl,
+ WORKSPACE_TOKEN: this.generateWorkspaceToken(workspace.id),
+ // Daemon API key for cloud message sync
+ // Auto-generated during provisioning, stored in linkedDaemons table
+ AGENT_RELAY_API_KEY: railwayApiKey,
+ // Enable cloud persistence for agent sessions/summaries via API
+ RELAY_CLOUD_ENABLED: 'true',
+ };
+ for (const [provider, token] of credentials) {
+ envVars[`${provider.toUpperCase()}_TOKEN`] = token;
+ // Also set GH_TOKEN for gh CLI compatibility
+ if (provider === 'github') {
+ envVars['GH_TOKEN'] = token;
+ }
+ }
+ await fetchWithRetry('https://backboard.railway.app/graphql/v2', {
+ method: 'POST',
+ headers: {
+ Authorization: `Bearer ${this.apiToken}`,
+ 'Content-Type': 'application/json',
+ },
+ body: JSON.stringify({
+ query: `
+ mutation SetVariables($input: VariableCollectionUpsertInput!) {
+ variableCollectionUpsert(input: $input)
+ }
+ `,
+ variables: {
+ input: {
+ projectId,
+ serviceId,
+ variables: envVars,
+ },
+ },
+ }),
+ });
+ // Generate domain
+ const domainResponse = await fetchWithRetry('https://backboard.railway.app/graphql/v2', {
+ method: 'POST',
+ headers: {
+ Authorization: `Bearer ${this.apiToken}`,
+ 'Content-Type': 'application/json',
+ },
+ body: JSON.stringify({
+ query: `
+ mutation CreateDomain($input: ServiceDomainCreateInput!) {
+ serviceDomainCreate(input: $input) {
+ domain
+ }
+ }
+ `,
+ variables: {
+ input: {
+ serviceId,
+ },
+ },
+ }),
+ });
+ const domainData = await domainResponse.json();
+ const domain = domainData.data.serviceDomainCreate.domain;
+ await softHealthCheck(`https://${domain}`);
+ return {
+ computeId: projectId,
+ publicUrl: `https://${domain}`,
+ };
+ }
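Each Railway call above repeats the same POST envelope against `https://backboard.railway.app/graphql/v2`. A sketch of a helper that could collapse that repetition, assuming the same `fetchWithRetry` used in this file (the helper name and error handling are assumptions):

    // Hypothetical wrapper for Railway's GraphQL endpoint.
    async function railwayGraphQL(apiToken, query, variables) {
        const response = await fetchWithRetry('https://backboard.railway.app/graphql/v2', {
            method: 'POST',
            headers: {
                Authorization: `Bearer ${apiToken}`,
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({ query, variables }),
        });
        const payload = await response.json();
        if (payload.errors?.length) {
            throw new Error(`Railway GraphQL error: ${payload.errors[0].message}`);
        }
        return payload.data;
    }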
+ async deprovision(workspace) {
+ if (!workspace.computeId)
+ return;
+ await fetchWithRetry('https://backboard.railway.app/graphql/v2', {
+ method: 'POST',
+ headers: {
+ Authorization: `Bearer ${this.apiToken}`,
+ 'Content-Type': 'application/json',
+ },
+ body: JSON.stringify({
+ query: `
+ mutation DeleteProject($id: String!) {
+ projectDelete(id: $id)
+ }
+ `,
+ variables: {
+ id: workspace.computeId,
+ },
+ }),
+ });
+ }
+ async getStatus(workspace) {
+ if (!workspace.computeId)
+ return 'error';
+ const response = await fetchWithRetry('https://backboard.railway.app/graphql/v2', {
+ method: 'POST',
+ headers: {
+ Authorization: `Bearer ${this.apiToken}`,
+ 'Content-Type': 'application/json',
+ },
+ body: JSON.stringify({
+ query: `
+ query GetProject($id: String!) {
+ project(id: $id) {
+ deployments {
+ edges {
+ node {
+ status
+ }
+ }
+ }
+ }
+ }
+ `,
+ variables: {
+ id: workspace.computeId,
+ },
+ }),
+ });
+ const data = await response.json();
+ const deployments = data.data?.project?.deployments?.edges;
+ if (!deployments || deployments.length === 0)
+ return 'provisioning';
+ const latestStatus = deployments[0].node.status;
+ switch (latestStatus) {
+ case 'SUCCESS':
+ return 'running';
+ case 'BUILDING':
+ case 'DEPLOYING':
+ return 'provisioning';
+ case 'CRASHED':
+ case 'FAILED':
+ return 'error';
+ default:
+ return 'stopped';
+ }
+ }
+ async restart(workspace) {
+ // Railway doesn't have a direct restart - redeploy instead
+ if (!workspace.computeId)
+ return;
+ await fetchWithRetry('https://backboard.railway.app/graphql/v2', {
+ method: 'POST',
+ headers: {
+ Authorization: `Bearer ${this.apiToken}`,
+ 'Content-Type': 'application/json',
+ },
+ body: JSON.stringify({
+ query: `
+ mutation RedeployService($input: DeploymentTriggerInput!) {
+ deploymentTrigger(input: $input)
+ }
+ `,
+ variables: {
+ input: {
+ projectId: workspace.computeId,
+ },
+ },
+ }),
+ });
+ }
+ async setEnvVars(workspace, envVars) {
+ if (!workspace.computeId || Object.keys(envVars).length === 0)
+ return;
+ const linkedDaemons = await db.linkedDaemons.findByWorkspaceId(workspace.id);
+ const serviceId = linkedDaemons[0]?.machineId;
+ if (!serviceId) {
+ console.warn(`[railway] No service ID found for workspace ${workspace.id}`);
+ return;
+ }
+ await fetchWithRetry('https://backboard.railway.app/graphql/v2', {
+ method: 'POST',
+ headers: {
+ Authorization: `Bearer ${this.apiToken}`,
+ 'Content-Type': 'application/json',
+ },
+ body: JSON.stringify({
+ query: `
+ mutation SetVariables($input: VariableCollectionUpsertInput!) {
+ variableCollectionUpsert(input: $input)
+ }
+ `,
+ variables: {
+ input: {
+ projectId: workspace.computeId,
+ serviceId,
+ variables: envVars,
+ },
+ },
+ }),
+ });
+ }
+ }
+ /**
+ * Local Docker provisioner (for development/self-hosted)
+ */
+ class DockerProvisioner {
+ cloudApiUrl;
+ cloudApiUrlForContainer;
+ sessionSecret;
+ constructor() {
+ const config = getConfig();
+ this.cloudApiUrl = config.publicUrl;
+ this.sessionSecret = config.sessionSecret;
+ // For Docker containers, localhost won't work - they need to reach the host
+ // Convert localhost URLs to host.docker.internal for container access
+ if (this.cloudApiUrl.includes('localhost') || this.cloudApiUrl.includes('127.0.0.1')) {
+ this.cloudApiUrlForContainer = this.cloudApiUrl
+ .replace('localhost', 'host.docker.internal')
+ .replace('127.0.0.1', 'host.docker.internal');
+ console.log(`[docker] Container API URL: ${this.cloudApiUrlForContainer} (host: ${this.cloudApiUrl})`);
+ }
+ else {
+ this.cloudApiUrlForContainer = this.cloudApiUrl;
+ }
+ }
+ generateWorkspaceToken(workspaceId) {
+ return crypto
+ .createHmac('sha256', this.sessionSecret)
+ .update(`workspace:${workspaceId}`)
+ .digest('hex');
+ }
+ /**
+ * Wait for container to be healthy by polling the health endpoint
+ */
+ async waitForHealthy(publicUrl, timeoutMs = 60_000) {
+ const startTime = Date.now();
+ const pollInterval = 2000;
+ console.log(`[docker] Waiting for container to be healthy at ${publicUrl}...`);
+ while (Date.now() - startTime < timeoutMs) {
+ try {
+ const response = await fetch(`${publicUrl}/health`, {
+ method: 'GET',
+ signal: AbortSignal.timeout(5000),
+ });
+ if (response.ok) {
+ console.log(`[docker] Container healthy after ${Date.now() - startTime}ms`);
+ return;
+ }
+ }
+ catch {
+ // Container not ready yet, continue polling
+ }
+ await wait(pollInterval);
+ }
+ throw new Error(`Container did not become healthy within ${timeoutMs}ms`);
+ }
+ async provision(workspace, credentials) {
+ const containerName = `ar-${workspace.id.substring(0, 8)}`;
+ // Create linked daemon for cloud message sync
+ // This generates an API key and registers the daemon in the linkedDaemons table
+ // Use container name as daemon's machine ID (will be updated to actual container ID after creation)
+ const { daemonId, apiKey: dockerApiKey } = await createLinkedDaemon(workspace.userId, workspace.id, containerName);
+ console.log(`[docker] Created linked daemon ${daemonId.substring(0, 8)} for workspace ${workspace.id.substring(0, 8)}`);
+ // Build environment variables
+ const envArgs = [
+ `-e WORKSPACE_ID=${workspace.id}`,
+ `-e WORKSPACE_OWNER_USER_ID=${workspace.userId}`,
+ `-e SUPERVISOR_ENABLED=${workspace.config.supervisorEnabled ?? false}`,
+ `-e MAX_AGENTS=${workspace.config.maxAgents ?? 10}`,
+ `-e REPOSITORIES=${(workspace.config.repositories ?? []).join(',')}`,
+ `-e PROVIDERS=${(workspace.config.providers ?? []).join(',')}`,
+ `-e PORT=${WORKSPACE_PORT}`,
+ `-e AGENT_RELAY_DASHBOARD_PORT=${WORKSPACE_PORT}`,
+ // Store repos on persistent volume so they survive container restarts
+ `-e WORKSPACE_DIR=/data/repos`,
+ `-e CLOUD_API_URL=${this.cloudApiUrlForContainer}`,
+ `-e WORKSPACE_TOKEN=${this.generateWorkspaceToken(workspace.id)}`,
+ // Daemon API key for cloud message sync
+ // Auto-generated during provisioning, stored in linkedDaemons table
+ `-e AGENT_RELAY_API_KEY=${dockerApiKey}`,
+ // Enable cloud persistence for agent sessions/summaries via API
+ `-e RELAY_CLOUD_ENABLED=true`,
+ ];
+ for (const [provider, token] of credentials) {
+ envArgs.push(`-e ${provider.toUpperCase()}_TOKEN=${token}`);
+ // Also set GH_TOKEN for gh CLI compatibility
+ if (provider === 'github') {
+ envArgs.push(`-e GH_TOKEN=${token}`);
+ }
+ }
+ // Run container
+ const { execSync } = await import('child_process');
+ const hostPort = 3000 + Math.floor(Math.random() * 1000);
+ // SSH port for tunneling (Codex OAuth callback forwarding)
+ // Derive from hostPort to avoid collisions: API port 3500 -> SSH port 22500
+ const sshHostPort = 22000 + (hostPort - 3000);
+ // When running in Docker, connect to the same network for container-to-container communication
+ const runningInDocker = process.env.RUNNING_IN_DOCKER === 'true';
+ const networkArg = runningInDocker ? '--network agent-relay-dev' : '';
+ // In development, mount local dist and docs folders for faster iteration
+ // Set WORKSPACE_DEV_MOUNT=true to enable
+ const devMount = process.env.WORKSPACE_DEV_MOUNT === 'true';
+ const volumeArgs = devMount
+ ? `-v "${process.cwd()}/dist:/app/dist:ro" -v "${process.cwd()}/docs:/app/docs:ro"`
+ : '';
+ if (devMount) {
+ console.log('[provisioner] Dev mode: mounting local dist/ and docs/ folders into workspace container');
+ }
+ try {
+ // Map workspace API port and SSH port (for tunneling)
+ // SSH is used by CLI to forward localhost:1455 to workspace container for Codex OAuth
+ // Set CODEX_DIRECT_PORT=true to also map port 1455 directly (for debugging only)
+ const directCodexPort = process.env.CODEX_DIRECT_PORT === 'true';
+ const portMappings = directCodexPort
+ ? `-p ${hostPort}:${WORKSPACE_PORT} -p ${sshHostPort}:${WORKSPACE_SSH_PORT} -p ${CODEX_OAUTH_PORT}:${CODEX_OAUTH_PORT}`
+ : `-p ${hostPort}:${WORKSPACE_PORT} -p ${sshHostPort}:${WORKSPACE_SSH_PORT}`;
+ // Enable SSH in the container for tunneling
+ // Each workspace gets a unique password derived from its ID + secret salt
+ envArgs.push('-e ENABLE_SSH=true');
+ envArgs.push(`-e SSH_PASSWORD=${deriveSshPassword(workspace.id)}`);
+ envArgs.push(`-e SSH_PORT=${WORKSPACE_SSH_PORT}`);
+ execSync(`docker run -d --user root --name ${containerName} ${networkArg} ${volumeArgs} ${portMappings} ${envArgs.join(' ')} ${WORKSPACE_IMAGE}`, { stdio: 'pipe' });
+ const publicUrl = `http://localhost:${hostPort}`;
+ // Wait for container to be healthy before returning
+ // When running in Docker, use the internal container name for health check
+ const healthCheckUrl = runningInDocker
+ ? `http://${containerName}:${WORKSPACE_PORT}`
+ : publicUrl;
+ await this.waitForHealthy(healthCheckUrl);
+ return {
+ computeId: containerName,
+ publicUrl,
+ sshPort: sshHostPort,
+ };
+ }
+ catch (error) {
+ // Clean up container if it was created but health check failed
+ try {
+ execSync(`docker rm -f ${containerName}`, { stdio: 'pipe' });
+ }
+ catch {
+ // Ignore cleanup errors
+ }
+ throw new Error(`Failed to start Docker container: ${error}`);
+ }
+ }
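The comments in `provision` describe the CLI forwarding `localhost:1455` (the Codex OAuth callback) through the container's mapped SSH port. A sketch of what that tunnel could look like from the host; the container user name and ssh flags are assumptions, and the password would come from `deriveSshPassword`:

    import { spawn } from 'node:child_process';

    // Hypothetical tunnel: local 1455 -> container 1455 via the mapped SSH port.
    function openCodexTunnel(sshHostPort) {
        return spawn('ssh', [
            '-N', // forward only, no remote command
            '-L', '1455:localhost:1455',
            '-p', String(sshHostPort),
            'dev@localhost', // user name is an assumption
        ], { stdio: 'inherit' });
    }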
+ async deprovision(workspace) {
+ if (!workspace.computeId)
+ return;
+ const { execSync } = await import('child_process');
+ try {
+ execSync(`docker rm -f ${workspace.computeId}`, { stdio: 'pipe' });
+ }
+ catch {
+ // Container may already be removed
+ }
+ }
+ async getStatus(workspace) {
+ if (!workspace.computeId)
+ return 'error';
+ const { execSync } = await import('child_process');
+ try {
+ const result = execSync(`docker inspect -f '{{.State.Status}}' ${workspace.computeId}`, { stdio: 'pipe' }).toString().trim();
+ switch (result) {
+ case 'running':
+ return 'running';
+ case 'exited':
+ case 'dead':
+ return 'stopped';
+ case 'created':
+ case 'restarting':
+ return 'provisioning';
+ default:
+ return 'error';
+ }
+ }
+ catch {
+ return 'error';
+ }
+ }
+ async restart(workspace) {
+ if (!workspace.computeId)
+ return;
+ const { execSync } = await import('child_process');
+ try {
+ execSync(`docker restart ${workspace.computeId}`, { stdio: 'pipe' });
+ }
+ catch (error) {
+ throw new Error(`Failed to restart container: ${error}`);
+ }
+ }
+ async setEnvVars(_workspace, _envVars) {
+ console.warn('[docker] Updating environment variables for running containers is not supported.');
+ }
+ }
+ /**
+ * Main Workspace Provisioner
+ */
+ export class WorkspaceProvisioner {
+ provisioner;
+ constructor() {
+ const config = getConfig();
+ switch (config.compute.provider) {
+ case 'fly':
+ this.provisioner = new FlyProvisioner();
+ break;
+ case 'railway':
+ this.provisioner = new RailwayProvisioner();
+ break;
+ case 'docker':
+ default:
+ this.provisioner = new DockerProvisioner();
+ }
+ }
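The constructor selects the backend from `config.compute.provider`, falling back to Docker. A sketch of the configuration shape this file implies (only fields referenced here are shown; everything else about the real config is unknown):

    const exampleConfig = {
        publicUrl: 'https://cloud.example.com', // placeholder URL
        sessionSecret: 'replace-me',
        compute: {
            provider: 'docker', // 'fly' | 'railway' | 'docker'
            railway: { apiToken: '...' }, // required when provider === 'railway'
        },
    };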
+ /**
+ * Provision a new workspace (one-click)
+ * Returns immediately with 'provisioning' status and runs actual provisioning in background
+ */
+ async provision(config) {
+ // Create workspace record
+ const workspace = await db.workspaces.create({
+ userId: config.userId,
+ name: config.name,
+ computeProvider: getConfig().compute.provider,
+ config: {
+ providers: config.providers,
+ repositories: config.repositories,
+ supervisorEnabled: config.supervisorEnabled ?? true,
+ maxAgents: config.maxAgents ?? 10,
+ },
+ });
+ // Add creator as owner in workspace_members for team collaboration support
+ await db.workspaceMembers.addMember({
+ workspaceId: workspace.id,
+ userId: config.userId,
+ role: 'owner',
+ invitedBy: config.userId, // Self-invited as creator
+ });
+ // Auto-accept the creator's membership
+ await db.workspaceMembers.acceptInvite(workspace.id, config.userId);
+ // Link repositories to this workspace
+ // This enables auto-access for users with GitHub access to these repos
+ for (const repoFullName of config.repositories) {
+ try {
+ // Find the user's repo record (may not exist if user didn't import it first)
+ const userRepos = await db.repositories.findByUserId(config.userId);
+ const repoRecord = userRepos.find(r => r.githubFullName.toLowerCase() === repoFullName.toLowerCase());
+ if (repoRecord) {
+ await db.repositories.assignToWorkspace(repoRecord.id, workspace.id);
+ console.log(`[provisioner] Linked repo ${repoFullName} to workspace ${workspace.id.substring(0, 8)}`);
+ }
+ else {
+ // Create a placeholder repo record if it doesn't exist
+ // This ensures the repo is tracked for workspace access checks
+ console.log(`[provisioner] Creating repo record for ${repoFullName}`);
+ const newRepo = await db.repositories.upsert({
+ userId: config.userId,
+ githubFullName: repoFullName,
+ githubId: 0, // Will be updated when actually synced
+ defaultBranch: 'main',
+ isPrivate: true, // Assume private, will be updated
+ workspaceId: workspace.id,
+ });
+ console.log(`[provisioner] Created and linked repo ${repoFullName} (id: ${newRepo.id.substring(0, 8)})`);
+ }
+ }
+ catch (err) {
+ console.warn(`[provisioner] Failed to link repo ${repoFullName}:`, err);
+ // Continue with other repos
+ }
+ }
+ // Initialize stage tracking immediately
+ updateProvisioningStage(workspace.id, 'creating');
+ // Run provisioning in the background so frontend can poll for stages
+ this.runProvisioningAsync(workspace, config).catch((error) => {
+ console.error(`[provisioner] Background provisioning failed for ${workspace.id}:`, error);
+ });
+ // Return immediately with 'provisioning' status
+ return {
+ workspaceId: workspace.id,
+ status: 'provisioning',
+ };
+ }
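Because `provision` returns before any compute exists, callers are expected to poll. A minimal polling sketch built on the `getStatus` method defined below (the loop and timeout are assumptions):

    // Poll until the workspace leaves 'provisioning'.
    async function waitForWorkspace(workspaceId, timeoutMs = 300_000) {
        const deadline = Date.now() + timeoutMs;
        while (Date.now() < deadline) {
            const status = await getProvisioner().getStatus(workspaceId);
            if (status === 'running') return status;
            if (status === 'error') throw new Error('Provisioning failed');
            await new Promise(resolve => setTimeout(resolve, 2000));
        }
        throw new Error('Timed out waiting for workspace');
    }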
+ /**
+ * Run the actual provisioning work asynchronously
+ */
+ async runProvisioningAsync(workspace, config) {
+ // Build credentials map for workspace provisioning
+ // Note: Provider tokens (Claude, Codex, etc.) are no longer stored centrally.
+ // CLI tools authenticate directly on workspace instances.
+ // Only GitHub App tokens are obtained from Nango for repository cloning.
+ const credentials = new Map();
+ // GitHub token is required for cloning repositories
+ // Use direct token if provided (for testing), otherwise get from Nango
+ if (config.repositories.length > 0) {
+ if (config.githubToken) {
+ // Direct token provided (for testing)
+ credentials.set('github', config.githubToken);
+ console.log('[provisioner] Using provided GitHub token');
+ }
+ else {
+ // Get fresh installation token from Nango GitHub App
+ const githubToken = await getGithubAppTokenForUser(config.userId);
+ if (githubToken) {
+ credentials.set('github', githubToken);
+ }
+ else {
+ console.warn(`[provisioner] No GitHub App token for user ${config.userId}; repository cloning may fail.`);
+ }
+ }
+ }
+ // Provision compute
+ try {
+ const { computeId, publicUrl } = await this.provisioner.provision(workspace, credentials);
+ await db.workspaces.updateStatus(workspace.id, 'running', {
+ computeId,
+ publicUrl,
+ });
+ // Schedule cleanup of provisioning progress after 30s (gives frontend time to see 'complete')
+ setTimeout(() => {
+ clearProvisioningProgress(workspace.id);
+ console.log(`[provisioner] Cleaned up provisioning progress for ${workspace.id.substring(0, 8)}`);
+ }, 30_000);
+ console.log(`[provisioner] Workspace ${workspace.id} provisioned successfully at ${publicUrl}`);
+ }
+ catch (error) {
+ const errorMessage = error instanceof Error ? error.message : 'Unknown error';
+ await db.workspaces.updateStatus(workspace.id, 'error', {
+ errorMessage,
+ });
+ // Clear provisioning progress on error
+ clearProvisioningProgress(workspace.id);
+ console.error(`[provisioner] Workspace ${workspace.id} provisioning failed:`, errorMessage);
+ }
+ }
+ /**
+ * Deprovision a workspace
+ */
+ async deprovision(workspaceId) {
+ const workspace = await db.workspaces.findById(workspaceId);
+ if (!workspace) {
+ throw new Error('Workspace not found');
+ }
+ await this.provisioner.deprovision(workspace);
+ await db.workspaces.delete(workspaceId);
+ }
+ /**
+ * Get workspace status
+ */
+ async getStatus(workspaceId) {
+ const workspace = await db.workspaces.findById(workspaceId);
+ if (!workspace) {
+ throw new Error('Workspace not found');
+ }
+ // During early provisioning, computeId isn't set yet
+ // Return the database status instead of querying the provider
+ if (!workspace.computeId && workspace.status === 'provisioning') {
+ return 'provisioning';
+ }
+ const status = await this.provisioner.getStatus(workspace);
+ // Update database if status changed
+ if (status !== workspace.status) {
+ await db.workspaces.updateStatus(workspaceId, status);
+ }
+ return status;
+ }
+ /**
+ * Restart a workspace
+ */
+ async restart(workspaceId) {
+ const workspace = await db.workspaces.findById(workspaceId);
+ if (!workspace) {
+ throw new Error('Workspace not found');
+ }
+ await this.provisioner.restart(workspace);
+ }
+ /**
+ * Update environment variables for a workspace instance.
+ */
+ async setWorkspaceEnvVars(workspace, envVars) {
+ if (Object.keys(envVars).length === 0)
+ return;
+ if (this.provisioner instanceof FlyProvisioner) {
+ await this.provisioner.setSecrets(workspace, envVars);
+ return;
+ }
+ if (this.provisioner instanceof RailwayProvisioner) {
+ await this.provisioner.setEnvVars(workspace, envVars);
+ return;
+ }
+ if (this.provisioner instanceof DockerProvisioner) {
+ await this.provisioner.setEnvVars(workspace, envVars);
+ return;
+ }
+ }
+ /**
+ * Stop a workspace
+ */
+ async stop(workspaceId) {
+ const workspace = await db.workspaces.findById(workspaceId);
+ if (!workspace) {
+ throw new Error('Workspace not found');
+ }
+ // For now, just deprovision to stop
+ await this.provisioner.deprovision(workspace);
+ await db.workspaces.updateStatus(workspaceId, 'stopped');
+ }
+ /**
+ * Resize a workspace (vertical scaling)
+ * @param skipRestart - If true, config is saved but machine won't restart (changes apply on next start)
+ */
+ async resize(workspaceId, tier, skipRestart = false) {
+ const workspace = await db.workspaces.findById(workspaceId);
+ if (!workspace) {
+ throw new Error('Workspace not found');
+ }
+ if (!this.provisioner.resize) {
+ throw new Error('Resize not supported by current compute provider');
+ }
+ await this.provisioner.resize(workspace, tier, skipRestart);
+ // Update workspace config with new limits
+ await db.workspaces.updateConfig(workspaceId, {
+ ...workspace.config,
+ maxAgents: tier.maxAgents,
+ resourceTier: tier.name,
+ });
+ }
+ /**
+ * Update the max agent limit for a workspace
+ */
+ async updateAgentLimit(workspaceId, newLimit) {
+ const workspace = await db.workspaces.findById(workspaceId);
+ if (!workspace) {
+ throw new Error('Workspace not found');
+ }
+ if (this.provisioner.updateAgentLimit) {
+ await this.provisioner.updateAgentLimit(workspace, newLimit);
+ }
+ // Update workspace config
+ await db.workspaces.updateConfig(workspaceId, {
+ ...workspace.config,
+ maxAgents: newLimit,
+ });
+ }
+ /**
+ * Get current resource tier for a workspace
+ */
+ async getCurrentTier(workspaceId) {
+ const workspace = await db.workspaces.findById(workspaceId);
+ if (!workspace) {
+ throw new Error('Workspace not found');
+ }
+ if (this.provisioner.getCurrentTier) {
+ return this.provisioner.getCurrentTier(workspace);
+ }
+ // Fallback: determine from config or default to small
+ const tierName = workspace.config.resourceTier || 'small';
+ return RESOURCE_TIERS[tierName] || RESOURCE_TIERS.small;
+ }
+ /**
+ * Get recommended tier based on agent count
+ * Uses 1.5-2GB per agent as baseline for Claude Code
+ */
+ getRecommendedTier(agentCount) {
+ // Find the smallest tier that supports this agent count
+ const tiers = Object.values(RESOURCE_TIERS).sort((a, b) => a.maxAgents - b.maxAgents);
+ for (const tier of tiers) {
+ if (tier.maxAgents >= agentCount) {
+ return tier;
+ }
+ }
+ // If agent count exceeds all tiers, return the largest
+ return RESOURCE_TIERS.xlarge;
+ }
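`getRecommendedTier` is a smallest-fit search over `RESOURCE_TIERS` (defined elsewhere in this file). A worked example under an assumed tier table; the real names and numbers may differ:

    // Assumed tier table for illustration only.
    const EXAMPLE_TIERS = {
        small: { name: 'small', memoryMb: 2048, maxAgents: 1 },
        medium: { name: 'medium', memoryMb: 4096, maxAgents: 2 },
        large: { name: 'large', memoryMb: 8192, maxAgents: 5 },
        xlarge: { name: 'xlarge', memoryMb: 16384, maxAgents: 10 },
    };
    // Smallest-fit: 4 agents -> 'large'; 12 agents exceeds every tier -> 'xlarge'.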
+ /**
+ * Auto-scale workspace based on current agent count
+ * Respects plan limits - free tier cannot scale, others have max tier limits
+ * Returns { scaled: boolean, reason?: string }
+ */
+ async autoScale(workspaceId, currentAgentCount) {
+ const workspace = await db.workspaces.findById(workspaceId);
+ if (!workspace) {
+ throw new Error('Workspace not found');
+ }
+ // Get user's plan
+ const user = await db.users.findById(workspace.userId);
+ const plan = user?.plan || 'free';
+ // Check if plan allows auto-scaling
+ if (!canAutoScale(plan)) {
+ return {
+ scaled: false,
+ reason: 'Auto-scaling requires Pro plan or higher',
+ };
+ }
+ const currentTier = await this.getCurrentTier(workspaceId);
+ const recommendedTier = this.getRecommendedTier(currentAgentCount);
+ // Only scale UP, never down (to avoid disruption)
+ if (recommendedTier.memoryMb <= currentTier.memoryMb) {
+ return {
+ scaled: false,
+ currentTier: currentTier.name,
+ };
+ }
+ // Check if plan allows scaling to the recommended tier
+ if (!canScaleToTier(plan, recommendedTier.name)) {
+ // Find the max tier allowed for this plan
+ const maxTierName = getResourceTierForPlan(plan);
+ const maxTier = RESOURCE_TIERS[maxTierName];
+ if (maxTier.memoryMb <= currentTier.memoryMb) {
+ return {
+ scaled: false,
+ reason: `Already at max tier (${currentTier.name}) for ${plan} plan`,
+ currentTier: currentTier.name,
+ };
+ }
+ // Scale to max allowed tier instead
+ console.log(`[provisioner] Auto-scaling workspace ${workspaceId.substring(0, 8)} from ${currentTier.name} to ${maxTierName} (max for ${plan} plan)`);
+ await this.resize(workspaceId, maxTier);
+ return {
+ scaled: true,
+ currentTier: currentTier.name,
+ targetTier: maxTierName,
+ reason: `Scaled to max tier for ${plan} plan`,
+ };
+ }
+ console.log(`[provisioner] Auto-scaling workspace ${workspaceId.substring(0, 8)} from ${currentTier.name} to ${recommendedTier.name} (${currentAgentCount} agents)`);
+ await this.resize(workspaceId, recommendedTier);
+ return {
+ scaled: true,
+ currentTier: currentTier.name,
+ targetTier: recommendedTier.name,
+ };
+ }
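A minimal sketch of how a caller might react to `autoScale`, e.g. when spawning an agent (the call site is hypothetical):

    const result = await getProvisioner().autoScale(workspaceId, agentCount);
    if (result.scaled) {
        console.log(`Scaled ${result.currentTier} -> ${result.targetTier}`);
    } else if (result.reason) {
        console.log(`autoScale skipped: ${result.reason}`);
    }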
+ // ============================================================================
+ // Snapshot Management
+ // ============================================================================
+ /**
+ * Create an on-demand snapshot of a workspace's volume
+ * Use before risky operations (e.g., major refactors, untrusted code execution)
+ */
+ async createSnapshot(workspaceId) {
+ const workspace = await db.workspaces.findById(workspaceId);
+ if (!workspace) {
+ throw new Error('Workspace not found');
+ }
+ // Only Fly.io provisioner supports snapshots
+ if (!(this.provisioner instanceof FlyProvisioner)) {
+ console.warn('[provisioner] Snapshots only supported on Fly.io');
+ return null;
+ }
+ const appName = `ar-${workspace.id.substring(0, 8)}`;
+ const flyProvisioner = this.provisioner;
+ // Get the volume
+ const volume = await flyProvisioner.getVolume(appName);
+ if (!volume) {
+ throw new Error('No volume found for workspace');
+ }
+ // Create snapshot
+ const snapshot = await flyProvisioner.createSnapshot(appName, volume.id);
+ return { snapshotId: snapshot.id };
+ }
+ /**
+ * List available snapshots for a workspace
+ * Includes both automatic daily snapshots and on-demand snapshots
+ */
+ async listSnapshots(workspaceId) {
+ const workspace = await db.workspaces.findById(workspaceId);
+ if (!workspace) {
+ throw new Error('Workspace not found');
+ }
+ // Only Fly.io provisioner supports snapshots
+ if (!(this.provisioner instanceof FlyProvisioner)) {
+ return [];
+ }
+ const appName = `ar-${workspace.id.substring(0, 8)}`;
+ const flyProvisioner = this.provisioner;
+ // Get the volume
+ const volume = await flyProvisioner.getVolume(appName);
+ if (!volume) {
+ return [];
+ }
+ // List snapshots
+ const snapshots = await flyProvisioner.listSnapshots(appName, volume.id);
+ return snapshots.map(s => ({
+ id: s.id,
+ createdAt: s.created_at,
+ sizeBytes: s.size,
+ }));
+ }
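A sketch of the pattern the `createSnapshot` doc comment suggests, snapshotting before a risky operation (`riskyOperation` is a placeholder):

    async function withSnapshot(workspaceId, riskyOperation) {
        // createSnapshot returns null on non-Fly providers; proceed either way.
        const snap = await getProvisioner().createSnapshot(workspaceId);
        if (snap) {
            console.log(`Snapshot ${snap.snapshotId} taken before risky operation`);
        }
        return riskyOperation();
    }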
+ /**
+ * Get the volume ID for a workspace (needed for restore operations)
+ */
+ async getVolumeId(workspaceId) {
+ const workspace = await db.workspaces.findById(workspaceId);
+ if (!workspace) {
+ throw new Error('Workspace not found');
+ }
+ if (!(this.provisioner instanceof FlyProvisioner)) {
+ return null;
+ }
+ const appName = `ar-${workspace.id.substring(0, 8)}`;
+ const flyProvisioner = this.provisioner;
+ const volume = await flyProvisioner.getVolume(appName);
+ return volume?.id || null;
+ }
+ // ============================================================================
+ // Graceful Image Update
+ // ============================================================================
+ /**
+ * Result of a graceful update attempt
+ */
+ static UpdateResult = {
+ UPDATED: 'updated',
+ UPDATED_PENDING_RESTART: 'updated_pending_restart',
+ SKIPPED_ACTIVE_AGENTS: 'skipped_active_agents',
+ SKIPPED_VERIFICATION_FAILED: 'skipped_verification_failed',
+ SKIPPED_NOT_RUNNING: 'skipped_not_running',
+ NOT_SUPPORTED: 'not_supported',
+ ERROR: 'error',
+ };
+ /**
+ * Gracefully update a single workspace's image
+ *
+ * Behavior:
+ * - If workspace is stopped: Update config, will use new image on next wake
+ * - If workspace is running with no agents: Update config and restart
+ * - If workspace is running with active agents: Skip (or force if specified)
+ *
+ * @param workspaceId - Workspace to update
+ * @param newImage - New Docker image to use
+ * @param options - Update options
+ * @returns Update result with details
+ */
+ async gracefulUpdateImage(workspaceId, newImage, options = {}) {
+ const workspace = await db.workspaces.findById(workspaceId);
+ if (!workspace) {
+ return {
+ result: WorkspaceProvisioner.UpdateResult.ERROR,
+ workspaceId,
+ error: 'Workspace not found',
+ };
+ }
+ // Only Fly.io supports graceful updates
+ if (!(this.provisioner instanceof FlyProvisioner)) {
+ return {
+ result: WorkspaceProvisioner.UpdateResult.NOT_SUPPORTED,
+ workspaceId,
+ error: 'Graceful updates only supported on Fly.io',
+ };
+ }
+ const flyProvisioner = this.provisioner;
+ try {
+ // Check machine state
+ const machineState = await flyProvisioner.getMachineState(workspace);
+ if (machineState === 'stopped' || machineState === 'suspended') {
+ // Machine is not running - safe to update, will apply on next wake
+ await flyProvisioner.updateMachineImage(workspace, newImage);
+ console.log(`[provisioner] Updated stopped workspace ${workspaceId.substring(0, 8)} to ${newImage}`);
+ return {
+ result: WorkspaceProvisioner.UpdateResult.UPDATED_PENDING_RESTART,
+ workspaceId,
+ machineState,
+ };
+ }
+ if (machineState === 'started') {
+ // Machine is running - check for active agents
+ const agentCheck = await flyProvisioner.checkActiveAgents(workspace);
+ // If we couldn't verify agent status and not forcing, skip to be safe
+ // This is expected behavior for workspaces that are waking up from auto-stop
+ // or experiencing temporary network issues - not an error condition
+ if (!agentCheck.verified && !options.force) {
+ console.log(`[provisioner] Skipped workspace ${workspaceId.substring(0, 8)}: workspace unreachable (will update on next restart)`);
+ return {
+ result: WorkspaceProvisioner.UpdateResult.SKIPPED_VERIFICATION_FAILED,
+ workspaceId,
+ machineState,
+ // Use 'reason' instead of 'error' - this is expected behavior, not an error
+ reason: 'Workspace unreachable - will update on next restart or when accessible',
+ };
+ }
+ if (agentCheck.hasActiveAgents && !options.force) {
+ // Has active agents and not forcing - skip
+ console.log(`[provisioner] Skipped workspace ${workspaceId.substring(0, 8)}: ${agentCheck.agentCount} active agents`);
+ return {
+ result: WorkspaceProvisioner.UpdateResult.SKIPPED_ACTIVE_AGENTS,
+ workspaceId,
+ machineState,
+ agentCount: agentCheck.agentCount,
+ agents: agentCheck.agents,
+ };
+ }
+ // Update the image config
+ await flyProvisioner.updateMachineImage(workspace, newImage);
+ if (options.skipRestart) {
+ // Config updated but not restarting - will apply on next restart/auto-stop-wake
+ console.log(`[provisioner] Updated workspace ${workspaceId.substring(0, 8)} config (restart skipped)`);
+ return {
+ result: WorkspaceProvisioner.UpdateResult.UPDATED_PENDING_RESTART,
+ workspaceId,
+ machineState,
+ agentCount: agentCheck.agentCount,
+ agents: agentCheck.agents,
+ };
+ }
+ // Restart to apply new image
+ await flyProvisioner.restart(workspace);
+ console.log(`[provisioner] Updated and restarted workspace ${workspaceId.substring(0, 8)}`);
+ return {
+ result: WorkspaceProvisioner.UpdateResult.UPDATED,
+ workspaceId,
+ machineState,
+ agentCount: agentCheck.agentCount,
+ };
+ }
+ // Unknown state
+ return {
+ result: WorkspaceProvisioner.UpdateResult.SKIPPED_NOT_RUNNING,
+ workspaceId,
+ machineState,
+ };
+ }
+ catch (error) {
+ console.error(`[provisioner] Error updating workspace ${workspaceId.substring(0, 8)}:`, error);
+ return {
+ result: WorkspaceProvisioner.UpdateResult.ERROR,
+ workspaceId,
+ error: error.message,
+ };
+ }
+ }
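A sketch of a deploy script branching on the `UpdateResult` values above (the image name is a placeholder):

    const { UpdateResult } = WorkspaceProvisioner;
    const res = await getProvisioner().gracefulUpdateImage(workspaceId, 'registry.example.com/workspace:v2');
    switch (res.result) {
        case UpdateResult.UPDATED:
        case UpdateResult.UPDATED_PENDING_RESTART:
            break; // applied now, or on next wake
        case UpdateResult.SKIPPED_ACTIVE_AGENTS:
            // retry later instead of forcing a restart under active agents
            break;
        default:
            console.warn(res.reason ?? res.error);
    }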
+ /**
+ * Gracefully update all workspaces to a new image
+ *
+ * Processes workspaces in batches, respecting active agents unless forced.
+ * Returns detailed results for each workspace.
+ *
+ * @param newImage - New Docker image to use
+ * @param options - Update options
+ * @returns Summary and per-workspace results
+ */
+ async gracefulUpdateAllImages(newImage, options = {}) {
+ // Get all workspaces to update
+ let workspaces;
+ if (options.workspaceIds?.length) {
+ // Specific workspaces
+ workspaces = (await Promise.all(options.workspaceIds.map(id => db.workspaces.findById(id)))).filter((w) => w !== null);
+ }
+ else if (options.userIds?.length) {
+ // Workspaces for specific users
+ const allWorkspaces = await Promise.all(options.userIds.map(userId => db.workspaces.findByUserId(userId)));
+ workspaces = allWorkspaces.flat();
+ }
+ else {
+ // All workspaces - ideally this would query by status to get only running ones
+ // For now, fetch every workspace from the database
+ workspaces = await db.workspaces.findAll();
+ }
+ // Filter to only Fly.io workspaces
+ workspaces = workspaces.filter(w => w.computeProvider === 'fly' && w.computeId);
+ console.log(`[provisioner] Starting graceful update of ${workspaces.length} workspaces to ${newImage}`);
+ const batchSize = options.batchSize ?? 5;
+ const results = [];
+ // Process in batches
+ for (let i = 0; i < workspaces.length; i += batchSize) {
+ const batch = workspaces.slice(i, i + batchSize);
+ const batchResults = await Promise.all(batch.map(workspace => this.gracefulUpdateImage(workspace.id, newImage, {
+ force: options.force,
+ skipRestart: options.skipRestart,
+ })));
+ results.push(...batchResults);
+ // Small delay between batches to avoid overwhelming Fly API
+ if (i + batchSize < workspaces.length) {
+ await wait(1000);
+ }
+ }
+ // Compute summary
+ const summary = {
+ total: results.length,
+ updated: results.filter(r => r.result === WorkspaceProvisioner.UpdateResult.UPDATED).length,
+ pendingRestart: results.filter(r => r.result === WorkspaceProvisioner.UpdateResult.UPDATED_PENDING_RESTART).length,
+ skippedActiveAgents: results.filter(r => r.result === WorkspaceProvisioner.UpdateResult.SKIPPED_ACTIVE_AGENTS).length,
+ skippedVerificationFailed: results.filter(r => r.result === WorkspaceProvisioner.UpdateResult.SKIPPED_VERIFICATION_FAILED).length,
+ skippedNotRunning: results.filter(r => r.result === WorkspaceProvisioner.UpdateResult.SKIPPED_NOT_RUNNING).length,
+ errors: results.filter(r => r.result === WorkspaceProvisioner.UpdateResult.ERROR).length,
+ };
+ console.log(`[provisioner] Graceful update complete:`, summary);
+ return { summary, results };
+ }
+ }
+ // Singleton instance
+ let _provisioner = null;
+ export function getProvisioner() {
+ if (!_provisioner) {
+ _provisioner = new WorkspaceProvisioner();
+ }
+ return _provisioner;
+ }
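A sketch of a fleet rollout using the singleton (the image name and follow-up policy are assumptions):

    const { summary } = await getProvisioner().gracefulUpdateAllImages(
        'registry.example.com/workspace:v2',
        { batchSize: 5 },
    );
    if (summary.skippedActiveAgents > 0) {
        // Sweep again later (or pass { force: true }) for busy workspaces.
    }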
+ //# sourceMappingURL=index.js.map