@runcore-sh/runcore 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (351)
  1. package/dictionary.json +2 -2
  2. package/dist/activity/log.js +2 -2
  3. package/dist/activity/log.js.map +1 -1
  4. package/dist/agents/governed-spawn.d.ts.map +1 -1
  5. package/dist/cli.js +101 -11
  6. package/dist/cli.js.map +1 -1
  7. package/dist/extensions/cache.d.ts +57 -0
  8. package/dist/extensions/cache.d.ts.map +1 -0
  9. package/dist/extensions/cache.js +173 -0
  10. package/dist/extensions/cache.js.map +1 -0
  11. package/dist/extensions/client.d.ts +55 -0
  12. package/dist/extensions/client.d.ts.map +1 -0
  13. package/dist/extensions/client.js +120 -0
  14. package/dist/extensions/client.js.map +1 -0
  15. package/dist/extensions/index.d.ts +13 -0
  16. package/dist/extensions/index.d.ts.map +1 -0
  17. package/dist/extensions/index.js +12 -0
  18. package/dist/extensions/index.js.map +1 -0
  19. package/dist/extensions/loader.d.ts +50 -0
  20. package/dist/extensions/loader.d.ts.map +1 -0
  21. package/dist/extensions/loader.js +166 -0
  22. package/dist/extensions/loader.js.map +1 -0
  23. package/dist/extensions/manifest.d.ts +38 -0
  24. package/dist/extensions/manifest.d.ts.map +1 -0
  25. package/dist/extensions/manifest.js +17 -0
  26. package/dist/extensions/manifest.js.map +1 -0
  27. package/dist/extensions/stubs.d.ts +27 -0
  28. package/dist/extensions/stubs.d.ts.map +1 -0
  29. package/dist/extensions/stubs.js +45 -0
  30. package/dist/extensions/stubs.js.map +1 -0
  31. package/dist/lib/audit.js +2 -2
  32. package/dist/lib/audit.js.map +1 -1
  33. package/dist/lib/brain-migrate.d.ts +21 -0
  34. package/dist/lib/brain-migrate.d.ts.map +1 -0
  35. package/dist/lib/brain-migrate.js +137 -0
  36. package/dist/lib/brain-migrate.js.map +1 -0
  37. package/dist/lib/paths.d.ts +27 -0
  38. package/dist/lib/paths.d.ts.map +1 -1
  39. package/dist/lib/paths.js +65 -0
  40. package/dist/lib/paths.js.map +1 -1
  41. package/dist/llm/call-log.d.ts +40 -0
  42. package/dist/llm/call-log.d.ts.map +1 -0
  43. package/dist/llm/call-log.js +35 -0
  44. package/dist/llm/call-log.js.map +1 -0
  45. package/dist/llm/complete.d.ts +6 -0
  46. package/dist/llm/complete.d.ts.map +1 -1
  47. package/dist/llm/complete.js +27 -0
  48. package/dist/llm/complete.js.map +1 -1
  49. package/dist/mcp-server.js +118 -2
  50. package/dist/mcp-server.js.map +1 -1
  51. package/dist/memory/file-backed.d.ts +4 -0
  52. package/dist/memory/file-backed.d.ts.map +1 -1
  53. package/dist/memory/file-backed.js +4 -0
  54. package/dist/memory/file-backed.js.map +1 -1
  55. package/dist/memory/vector-index.d.ts +4 -12
  56. package/dist/memory/vector-index.d.ts.map +1 -1
  57. package/dist/memory/vector-index.js +11 -93
  58. package/dist/memory/vector-index.js.map +1 -1
  59. package/dist/search/brain-docs.d.ts +17 -7
  60. package/dist/search/brain-docs.d.ts.map +1 -1
  61. package/dist/search/brain-docs.js +170 -52
  62. package/dist/search/brain-docs.js.map +1 -1
  63. package/dist/search/brain-rag.d.ts +45 -0
  64. package/dist/search/brain-rag.d.ts.map +1 -0
  65. package/dist/search/brain-rag.js +275 -0
  66. package/dist/search/brain-rag.js.map +1 -0
  67. package/dist/search/chunker.d.ts +24 -0
  68. package/dist/search/chunker.d.ts.map +1 -0
  69. package/dist/search/chunker.js +95 -0
  70. package/dist/search/chunker.js.map +1 -0
  71. package/dist/search/embedder.d.ts +16 -0
  72. package/dist/search/embedder.d.ts.map +1 -0
  73. package/dist/search/embedder.js +108 -0
  74. package/dist/search/embedder.js.map +1 -0
  75. package/dist/search/file-watcher.d.ts +11 -0
  76. package/dist/search/file-watcher.d.ts.map +1 -0
  77. package/dist/search/file-watcher.js +86 -0
  78. package/dist/search/file-watcher.js.map +1 -0
  79. package/dist/server.d.ts.map +1 -1
  80. package/dist/server.js +168 -20
  81. package/dist/server.js.map +1 -1
  82. package/dist/sessions/store.d.ts +9 -0
  83. package/dist/sessions/store.d.ts.map +1 -1
  84. package/dist/sessions/store.js.map +1 -1
  85. package/dist/settings.d.ts +26 -0
  86. package/dist/settings.d.ts.map +1 -1
  87. package/dist/settings.js +78 -2
  88. package/dist/settings.js.map +1 -1
  89. package/dist/tracing/init.d.ts +1 -1
  90. package/dist/tracing/init.d.ts.map +1 -1
  91. package/dist/utils/logger.js +2 -2
  92. package/dist/utils/logger.js.map +1 -1
  93. package/module-tiers.json +164 -0
  94. package/package.json +9 -13
  95. package/public/avatar/cache/1184385ec5522b57.mp4 +0 -0
  96. package/public/avatar/cache/1f15f6a1ebd7e439.mp4 +0 -0
  97. package/public/avatar/cache/2c7e47ff0bdeb8d1.mp4 +0 -0
  98. package/public/avatar/cache/5f308566f7abb8f2.mp4 +0 -0
  99. package/public/avatar/cache/62f9cfba848d724e.mp4 +0 -0
  100. package/public/avatar/cache/6d64e657e6bf2aab.mp4 +0 -0
  101. package/public/avatar/cache/763ad0349e0b6f26.mp4 +0 -0
  102. package/public/avatar/cache/81a516cfd461b2b9.mp4 +0 -0
  103. package/public/avatar/cache/9366de15fd6910ca.mp4 +0 -0
  104. package/public/avatar/cache/ade41a846b283895.mp4 +0 -0
  105. package/public/avatar/cache/b6066e5c65383eec.mp4 +0 -0
  106. package/public/avatar/cache/edadb75d37891fc7.mp4 +0 -0
  107. package/public/avatar/cache/f0ae159640621dd9.mp4 +0 -0
  108. package/public/avatar/cache/fc2e5419adf29d96.mp4 +0 -0
  109. package/public/index.html +379 -59
  110. package/dist/agents/autonomous.js +0 -749
  111. package/dist/agents/autonomous.js.map +0 -1
  112. package/dist/agents/commit.js +0 -113
  113. package/dist/agents/commit.js.map +0 -1
  114. package/dist/agents/continue.js +0 -158
  115. package/dist/agents/continue.js.map +0 -1
  116. package/dist/agents/cooldown.js +0 -397
  117. package/dist/agents/cooldown.js.map +0 -1
  118. package/dist/agents/dedup-guard.js +0 -131
  119. package/dist/agents/dedup-guard.js.map +0 -1
  120. package/dist/agents/feed.js +0 -176
  121. package/dist/agents/feed.js.map +0 -1
  122. package/dist/agents/governance.js +0 -292
  123. package/dist/agents/governance.js.map +0 -1
  124. package/dist/agents/governed-spawn.js +0 -192
  125. package/dist/agents/governed-spawn.js.map +0 -1
  126. package/dist/agents/heartbeat.js +0 -324
  127. package/dist/agents/heartbeat.js.map +0 -1
  128. package/dist/agents/instance-manager.js +0 -850
  129. package/dist/agents/instance-manager.js.map +0 -1
  130. package/dist/agents/issue-reporter.js +0 -123
  131. package/dist/agents/issue-reporter.js.map +0 -1
  132. package/dist/agents/issues.js +0 -141
  133. package/dist/agents/issues.js.map +0 -1
  134. package/dist/agents/locks.js +0 -234
  135. package/dist/agents/locks.js.map +0 -1
  136. package/dist/agents/memory.js +0 -93
  137. package/dist/agents/memory.js.map +0 -1
  138. package/dist/agents/monitor.js +0 -235
  139. package/dist/agents/monitor.js.map +0 -1
  140. package/dist/agents/orchestration.js +0 -715
  141. package/dist/agents/orchestration.js.map +0 -1
  142. package/dist/agents/recover.js +0 -166
  143. package/dist/agents/recover.js.map +0 -1
  144. package/dist/agents/reflection.js +0 -199
  145. package/dist/agents/reflection.js.map +0 -1
  146. package/dist/agents/runtime/bus.js +0 -174
  147. package/dist/agents/runtime/bus.js.map +0 -1
  148. package/dist/agents/runtime/config.js +0 -101
  149. package/dist/agents/runtime/config.js.map +0 -1
  150. package/dist/agents/runtime/driver.js +0 -214
  151. package/dist/agents/runtime/driver.js.map +0 -1
  152. package/dist/agents/runtime/errors.js +0 -40
  153. package/dist/agents/runtime/errors.js.map +0 -1
  154. package/dist/agents/runtime/index.js +0 -54
  155. package/dist/agents/runtime/index.js.map +0 -1
  156. package/dist/agents/runtime/lifecycle.js +0 -116
  157. package/dist/agents/runtime/lifecycle.js.map +0 -1
  158. package/dist/agents/runtime/manager.js +0 -948
  159. package/dist/agents/runtime/manager.js.map +0 -1
  160. package/dist/agents/runtime/registry.js +0 -195
  161. package/dist/agents/runtime/registry.js.map +0 -1
  162. package/dist/agents/runtime/resources.js +0 -146
  163. package/dist/agents/runtime/resources.js.map +0 -1
  164. package/dist/agents/runtime/types.js +0 -24
  165. package/dist/agents/runtime/types.js.map +0 -1
  166. package/dist/agents/spawn-policy.js +0 -202
  167. package/dist/agents/spawn-policy.js.map +0 -1
  168. package/dist/agents/spawn.js +0 -970
  169. package/dist/agents/spawn.js.map +0 -1
  170. package/dist/agents/triage.js +0 -81
  171. package/dist/agents/triage.js.map +0 -1
  172. package/dist/agents/workflow.js +0 -543
  173. package/dist/agents/workflow.js.map +0 -1
  174. package/dist/avatar/client.js +0 -172
  175. package/dist/avatar/client.js.map +0 -1
  176. package/dist/avatar/sidecar.js +0 -125
  177. package/dist/avatar/sidecar.js.map +0 -1
  178. package/dist/browser/sessions.js +0 -122
  179. package/dist/browser/sessions.js.map +0 -1
  180. package/dist/capabilities/definitions/browser.js +0 -242
  181. package/dist/capabilities/definitions/browser.js.map +0 -1
  182. package/dist/channels/whatsapp.js +0 -200
  183. package/dist/channels/whatsapp.js.map +0 -1
  184. package/dist/credentials/store.js +0 -189
  185. package/dist/credentials/store.js.map +0 -1
  186. package/dist/files/deep-index.js +0 -337
  187. package/dist/files/deep-index.js.map +0 -1
  188. package/dist/files/extract.js +0 -33
  189. package/dist/files/extract.js.map +0 -1
  190. package/dist/files/gdrive.js +0 -246
  191. package/dist/files/gdrive.js.map +0 -1
  192. package/dist/github/client.js +0 -408
  193. package/dist/github/client.js.map +0 -1
  194. package/dist/github/commit-analysis.js +0 -276
  195. package/dist/github/commit-analysis.js.map +0 -1
  196. package/dist/github/contributor-stats.js +0 -119
  197. package/dist/github/contributor-stats.js.map +0 -1
  198. package/dist/github/issue-sla.js +0 -220
  199. package/dist/github/issue-sla.js.map +0 -1
  200. package/dist/github/issue-triage.js +0 -286
  201. package/dist/github/issue-triage.js.map +0 -1
  202. package/dist/github/pr-readiness.js +0 -197
  203. package/dist/github/pr-readiness.js.map +0 -1
  204. package/dist/github/pr-review.js +0 -410
  205. package/dist/github/pr-review.js.map +0 -1
  206. package/dist/github/release-notes.js +0 -227
  207. package/dist/github/release-notes.js.map +0 -1
  208. package/dist/github/repo-health.js +0 -303
  209. package/dist/github/repo-health.js.map +0 -1
  210. package/dist/github/retry.js +0 -117
  211. package/dist/github/retry.js.map +0 -1
  212. package/dist/github/types.js +0 -8
  213. package/dist/github/types.js.map +0 -1
  214. package/dist/github/webhooks.js +0 -153
  215. package/dist/github/webhooks.js.map +0 -1
  216. package/dist/google/auth.js +0 -325
  217. package/dist/google/auth.js.map +0 -1
  218. package/dist/google/calendar-timer.js +0 -91
  219. package/dist/google/calendar-timer.js.map +0 -1
  220. package/dist/google/calendar.js +0 -270
  221. package/dist/google/calendar.js.map +0 -1
  222. package/dist/google/docs.js +0 -309
  223. package/dist/google/docs.js.map +0 -1
  224. package/dist/google/gmail-send.js +0 -219
  225. package/dist/google/gmail-send.js.map +0 -1
  226. package/dist/google/gmail-timer.js +0 -223
  227. package/dist/google/gmail-timer.js.map +0 -1
  228. package/dist/google/gmail.js +0 -470
  229. package/dist/google/gmail.js.map +0 -1
  230. package/dist/google/plugin.js +0 -169
  231. package/dist/google/plugin.js.map +0 -1
  232. package/dist/google/tasks-timer.js +0 -107
  233. package/dist/google/tasks-timer.js.map +0 -1
  234. package/dist/google/tasks.js +0 -331
  235. package/dist/google/tasks.js.map +0 -1
  236. package/dist/google/temporal.js +0 -176
  237. package/dist/google/temporal.js.map +0 -1
  238. package/dist/integrations/gate.js +0 -100
  239. package/dist/integrations/gate.js.map +0 -1
  240. package/dist/integrations/github.js +0 -331
  241. package/dist/integrations/github.js.map +0 -1
  242. package/dist/integrations/google-tasks.js +0 -432
  243. package/dist/integrations/google-tasks.js.map +0 -1
  244. package/dist/mdns.js +0 -110
  245. package/dist/mdns.js.map +0 -1
  246. package/dist/notifications/channel.js +0 -83
  247. package/dist/notifications/channel.js.map +0 -1
  248. package/dist/notifications/channels/adapter.js +0 -55
  249. package/dist/notifications/channels/adapter.js.map +0 -1
  250. package/dist/notifications/channels/index.js +0 -6
  251. package/dist/notifications/channels/index.js.map +0 -1
  252. package/dist/notifications/channels/log.js +0 -29
  253. package/dist/notifications/channels/log.js.map +0 -1
  254. package/dist/notifications/email.js +0 -72
  255. package/dist/notifications/email.js.map +0 -1
  256. package/dist/notifications/engine.js +0 -198
  257. package/dist/notifications/engine.js.map +0 -1
  258. package/dist/notifications/index.js +0 -24
  259. package/dist/notifications/index.js.map +0 -1
  260. package/dist/notifications/phone.js +0 -48
  261. package/dist/notifications/phone.js.map +0 -1
  262. package/dist/notifications/sms.js +0 -65
  263. package/dist/notifications/sms.js.map +0 -1
  264. package/dist/notifications/types.js +0 -14
  265. package/dist/notifications/types.js.map +0 -1
  266. package/dist/notifications/webhook.js +0 -65
  267. package/dist/notifications/webhook.js.map +0 -1
  268. package/dist/resend/inbox.js +0 -199
  269. package/dist/resend/inbox.js.map +0 -1
  270. package/dist/resend/webhooks.js +0 -244
  271. package/dist/resend/webhooks.js.map +0 -1
  272. package/dist/search/browse.js +0 -225
  273. package/dist/search/browse.js.map +0 -1
  274. package/dist/search/perplexity.js +0 -41
  275. package/dist/search/perplexity.js.map +0 -1
  276. package/dist/slack/channels.js +0 -277
  277. package/dist/slack/channels.js.map +0 -1
  278. package/dist/slack/client.js +0 -468
  279. package/dist/slack/client.js.map +0 -1
  280. package/dist/slack/retry.js +0 -100
  281. package/dist/slack/retry.js.map +0 -1
  282. package/dist/slack/types.js +0 -52
  283. package/dist/slack/types.js.map +0 -1
  284. package/dist/slack/webhooks.js +0 -285
  285. package/dist/slack/webhooks.js.map +0 -1
  286. package/dist/stt/client.js +0 -66
  287. package/dist/stt/client.js.map +0 -1
  288. package/dist/stt/sidecar.js +0 -115
  289. package/dist/stt/sidecar.js.map +0 -1
  290. package/dist/tracing/bridge.js +0 -70
  291. package/dist/tracing/bridge.js.map +0 -1
  292. package/dist/tracing/correlation.js +0 -49
  293. package/dist/tracing/correlation.js.map +0 -1
  294. package/dist/tracing/index.js +0 -18
  295. package/dist/tracing/index.js.map +0 -1
  296. package/dist/tracing/init.js +0 -81
  297. package/dist/tracing/init.js.map +0 -1
  298. package/dist/tracing/instrument.js +0 -145
  299. package/dist/tracing/instrument.js.map +0 -1
  300. package/dist/tracing/middleware.js +0 -69
  301. package/dist/tracing/middleware.js.map +0 -1
  302. package/dist/tracing/tracer.js +0 -327
  303. package/dist/tracing/tracer.js.map +0 -1
  304. package/dist/tts/client.js +0 -48
  305. package/dist/tts/client.js.map +0 -1
  306. package/dist/tts/sidecar.js +0 -148
  307. package/dist/tts/sidecar.js.map +0 -1
  308. package/dist/twilio/call.js +0 -79
  309. package/dist/twilio/call.js.map +0 -1
  310. package/dist/vault/matcher.js +0 -197
  311. package/dist/vault/matcher.js.map +0 -1
  312. package/dist/vault/personal.js +0 -163
  313. package/dist/vault/personal.js.map +0 -1
  314. package/dist/vault/policy.js +0 -159
  315. package/dist/vault/policy.js.map +0 -1
  316. package/dist/vault/store.js +0 -122
  317. package/dist/vault/store.js.map +0 -1
  318. package/dist/vault/transfer.js +0 -188
  319. package/dist/vault/transfer.js.map +0 -1
  320. package/dist/volumes/index.js +0 -2
  321. package/dist/volumes/index.js.map +0 -1
  322. package/dist/volumes/manager.js +0 -462
  323. package/dist/volumes/manager.js.map +0 -1
  324. package/dist/volumes/types.js +0 -8
  325. package/dist/volumes/types.js.map +0 -1
  326. package/dist/webhooks/config.js +0 -214
  327. package/dist/webhooks/config.js.map +0 -1
  328. package/dist/webhooks/event-log.js +0 -132
  329. package/dist/webhooks/event-log.js.map +0 -1
  330. package/dist/webhooks/handler.js +0 -103
  331. package/dist/webhooks/handler.js.map +0 -1
  332. package/dist/webhooks/handlers.js +0 -231
  333. package/dist/webhooks/handlers.js.map +0 -1
  334. package/dist/webhooks/index.js +0 -33
  335. package/dist/webhooks/index.js.map +0 -1
  336. package/dist/webhooks/mount.js +0 -400
  337. package/dist/webhooks/mount.js.map +0 -1
  338. package/dist/webhooks/registry.js +0 -143
  339. package/dist/webhooks/registry.js.map +0 -1
  340. package/dist/webhooks/relay.js +0 -53
  341. package/dist/webhooks/relay.js.map +0 -1
  342. package/dist/webhooks/retry.js +0 -270
  343. package/dist/webhooks/retry.js.map +0 -1
  344. package/dist/webhooks/router.js +0 -290
  345. package/dist/webhooks/router.js.map +0 -1
  346. package/dist/webhooks/twilio.js +0 -129
  347. package/dist/webhooks/twilio.js.map +0 -1
  348. package/dist/webhooks/types.js +0 -8
  349. package/dist/webhooks/types.js.map +0 -1
  350. package/dist/webhooks/verify.js +0 -154
  351. package/dist/webhooks/verify.js.map +0 -1
package/dist/agents/runtime/manager.js (deleted)
@@ -1,948 +0,0 @@
1
- /**
2
- * Agent Runtime Environment — RuntimeManager.
3
- *
4
- * The central orchestrator that ties together lifecycle management,
5
- * resource allocation, the event bus, registry persistence, and the
6
- * agent driver. Provides the high-level API for spawning, pausing,
7
- * resuming, and terminating agent instances.
8
- *
9
- * Usage:
10
- * const manager = new RuntimeManager(driver, configOverrides);
11
- * await manager.init();
12
- * const instance = await manager.spawn({ taskId, label, prompt, origin });
13
- * await manager.pause(instance.id);
14
- * await manager.resume(instance.id);
15
- * await manager.terminate(instance.id);
16
- * await manager.shutdown();
17
- */
18
- import { RuntimeError, ErrorCodes } from "./errors.js";
19
- import { loadRuntimeConfig, resolveInstanceConfig, resolveResources } from "./config.js";
20
- import { transition, shouldRetry, prepareRetry, isTerminal } from "./lifecycle.js";
21
- import { ResourcePool } from "./resources.js";
22
- import { RuntimeBus } from "./bus.js";
23
- import { AgentRegistry } from "./registry.js";
24
- import { logActivity } from "../../activity/log.js";
25
- import { createLogger } from "../../utils/logger.js";
26
- import { rememberTaskOutcome } from "../memory.js";
27
- import { readTask, readTaskOutput, updateTask } from "../store.js";
28
- import { updateBoardTaskState } from "../spawn.js";
29
- const log = createLogger("agent-runtime");
30
- // ---------------------------------------------------------------------------
31
- // RuntimeManager
32
- // ---------------------------------------------------------------------------
33
- export class RuntimeManager {
34
- config;
35
- bus;
36
- resources;
37
- registry;
38
- driver;
39
- monitorTimer = null;
40
- shutdownRequested = false;
41
- // ── Retry loop guards ──────────────────────────────────────────────────
42
- /** Prevents concurrent handleExit calls for the same instance. */
43
- handleExitGuard = new Set();
44
- /** Tracks instances with a pending retry timeout (prevents duplicate scheduling). */
45
- retryPending = new Set();
46
- // ── Global retry budget ────────────────────────────────────────────────
47
- /** Rolling window of retry timestamps across all instances. */
48
- globalRetryTimestamps = [];
49
- /**
50
- * Max retries across all instances within the budget window.
51
- * Scaled to 3× maxConcurrentAgents to avoid budget exhaustion when a full
52
- * batch fails (each agent gets maxRetries=2, so 5 agents = 10 retries
53
- * which previously exhausted the entire budget in one batch).
54
- */
55
- globalRetryBudget;
56
- /** Rolling window for the global retry budget (ms). */
57
- static GLOBAL_RETRY_WINDOW_MS = 5 * 60_000; // 5 minutes
58
- constructor(driver, configOverrides) {
59
- this.config = loadRuntimeConfig(configOverrides);
60
- this.driver = driver;
61
- this.bus = new RuntimeBus();
62
- this.resources = new ResourcePool(this.config);
63
- this.registry = new AgentRegistry(this.config.persistDir);
64
- // Scale retry budget: 3× max concurrent agents ensures one bad batch
65
- // doesn't exhaust the entire budget (5 agents × 2 retries = 10, budget = 15)
66
- this.globalRetryBudget = Math.max(10, this.config.maxConcurrentAgents * 3);
67
- }
68
- // -------------------------------------------------------------------------
69
- // Initialization & shutdown
70
- // -------------------------------------------------------------------------
71
- /** Initialize the runtime: load registry, recover agents, start monitor. */
72
- async init() {
73
- const t0 = Date.now();
74
- log.info("Initializing agent runtime", { driver: this.driver.name, maxAgents: this.config.maxConcurrentAgents });
75
- const t1 = Date.now();
76
- await this.registry.init();
77
- const registryMs = Date.now() - t1;
78
- const t2 = Date.now();
79
- await this.recoverAgents();
80
- const recoveryMs = Date.now() - t2;
81
- this.startMonitor();
82
- const totalMs = Date.now() - t0;
83
- logActivity({
84
- source: "agent",
85
- summary: "Agent runtime initialized",
86
- detail: `Driver: ${this.driver.name}, max agents: ${this.config.maxConcurrentAgents}, init ${totalMs}ms [registry:${registryMs}ms recovery:${recoveryMs}ms]`,
87
- });
88
- }
89
- /** Graceful shutdown: terminate active agents, stop monitor, clean up. */
90
- async shutdown(reason = "Runtime shutdown") {
91
- if (this.shutdownRequested)
92
- return;
93
- log.info("Shutting down agent runtime", { reason });
94
- this.shutdownRequested = true;
95
- this.bus.emitShutdown(reason);
96
- this.stopMonitor();
97
- // Terminate all active agents
98
- const active = this.registry.listActive();
99
- const terminations = active.map((inst) => this.terminate(inst.id, reason).catch(() => { }));
100
- await Promise.allSettled(terminations);
101
- // Clean up resources, retry guards, and bus
102
- this.resources.clear();
103
- this.handleExitGuard.clear();
104
- this.retryPending.clear();
105
- this.globalRetryTimestamps.length = 0;
106
- this.bus.destroy();
107
- logActivity({
108
- source: "agent",
109
- summary: "Agent runtime shut down",
110
- detail: `Reason: ${reason}, terminated ${active.length} agents`,
111
- });
112
- }
113
- // -------------------------------------------------------------------------
114
- // Core lifecycle operations
115
- // -------------------------------------------------------------------------
116
- /** Spawn a new agent instance. */
117
- async spawn(request) {
118
- log.info(`Spawning agent instance: ${request.label}`, { taskId: request.taskId, origin: request.origin });
119
- if (this.shutdownRequested) {
120
- throw new RuntimeError(ErrorCodes.SHUTDOWN_IN_PROGRESS, "Runtime is shutting down");
121
- }
122
- const instanceConfig = resolveInstanceConfig(this.config, request.config);
123
- const resourceAlloc = resolveResources(this.config, request.resources);
124
- // Check resource availability — queue if not available
125
- if (!this.resources.canAllocate(resourceAlloc)) {
126
- logActivity({
127
- source: "agent",
128
- summary: `Agent queued (resources exhausted): ${request.label}`,
129
- detail: `Active: ${this.resources.activeCount}/${this.config.maxConcurrentAgents}`,
130
- });
131
- await this.resources.enqueue(request, instanceConfig.priority);
132
- }
133
- // Create the instance
134
- const now = new Date().toISOString();
135
- const instance = {
136
- id: this.registry.generateId(),
137
- taskId: request.taskId,
138
- state: "initializing",
139
- cwd: request.cwd,
140
- config: instanceConfig,
141
- resources: resourceAlloc,
142
- metadata: {
143
- label: request.label,
144
- origin: request.origin,
145
- parentId: request.parentId,
146
- tags: request.tags ?? [],
147
- },
148
- retryCount: 0,
149
- createdAt: now,
150
- updatedAt: now,
151
- };
152
- // Store the prompt on the instance for the driver
153
- instance._prompt = request.prompt;
154
- // Register and allocate resources
155
- await this.registry.register(instance);
156
- this.resources.allocate(instance.id, resourceAlloc);
157
- logActivity({
158
- source: "agent",
159
- summary: `Spawning agent: ${request.label}`,
160
- detail: `Instance ${instance.id}, task ${request.taskId}`,
161
- });
162
- // Attempt to spawn via driver
163
- try {
164
- const pid = await this.driver.spawn(instance);
165
- instance.pid = pid;
166
- const event = transition(instance, "running", "Spawned successfully");
167
- await this.registry.update(instance.id, {
168
- state: instance.state,
169
- pid,
170
- updatedAt: instance.updatedAt,
171
- });
172
- this.bus.emitLifecycle(event);
173
- this.bus.emitSpawned(instance.id, pid);
174
- // Register exit handler with driver
175
- if ("onExit" in this.driver && typeof this.driver.onExit === "function") {
176
- this.driver
177
- .onExit(instance.id, (code) => this.handleExit(instance.id, code));
178
- }
179
- logActivity({
180
- source: "agent",
181
- summary: `Agent running: ${request.label}`,
182
- detail: `PID ${pid}, instance ${instance.id}`,
183
- });
184
- return instance;
185
- }
186
- catch (err) {
187
- const error = this.toAgentError(err, ErrorCodes.SPAWN_FAILED, true);
188
- const event = transition(instance, "failed", "Spawn failed", error);
189
- await this.registry.update(instance.id, {
190
- state: "failed",
191
- error,
192
- updatedAt: instance.updatedAt,
193
- });
194
- this.resources.release(instance.id);
195
- this.bus.emitLifecycle(event);
196
- // Check if retry is possible BEFORE emitting terminal failure.
197
- // Same fix as handleExit: prevent external listeners from firing
198
- // prematurely while retries are still pending.
199
- const retryDelay = shouldRetry(instance);
200
- if (retryDelay !== null) {
201
- logActivity({
202
- source: "agent",
203
- summary: `Agent spawn failed, will retry: ${request.label}`,
204
- detail: `Attempt ${instance.retryCount + 1}/${instance.config.maxRetries}, backoff ${retryDelay}ms`,
205
- });
206
- await this.maybeRetry(instance);
207
- }
208
- else {
209
- // Terminal failure — notify external listeners
210
- this.bus.emitFailed(instance.id, error);
211
- logActivity({
212
- source: "agent",
213
- summary: `Agent spawn failed (terminal): ${request.label}`,
214
- detail: err.message,
215
- });
216
- }
217
- return instance;
218
- }
219
- }
220
- /** Pause a running agent. */
221
- async pause(instanceId, reason) {
222
- const instance = this.requireInstance(instanceId);
223
- this.assertState(instance, "running");
224
- let checkpoint;
225
- try {
226
- checkpoint = await this.driver.pause(instance);
227
- }
228
- catch (err) {
229
- logActivity({
230
- source: "agent",
231
- summary: `Agent pause failed: ${instance.metadata.label}`,
232
- detail: err.message,
233
- });
234
- throw err;
235
- }
236
- if (checkpoint) {
237
- instance.checkpointData = checkpoint;
238
- }
239
- const event = transition(instance, "paused", reason ?? "Paused by request");
240
- await this.registry.update(instanceId, {
241
- state: "paused",
242
- checkpointData: instance.checkpointData,
243
- pausedAt: instance.pausedAt,
244
- updatedAt: instance.updatedAt,
245
- });
246
- this.resources.release(instanceId);
247
- this.bus.emitLifecycle(event);
248
- logActivity({
249
- source: "agent",
250
- summary: `Agent paused: ${instance.metadata.label}`,
251
- detail: checkpoint ? "Checkpoint saved" : "No checkpoint",
252
- });
253
- return instance;
254
- }
255
- /** Resume a paused agent. */
256
- async resume(instanceId) {
257
- const instance = this.requireInstance(instanceId);
258
- this.assertState(instance, "paused");
259
- // Check resource availability
260
- if (!this.resources.canAllocate(instance.resources)) {
261
- throw new RuntimeError(ErrorCodes.RESOURCE_EXHAUSTED, "Cannot resume: resources unavailable", true);
262
- }
263
- const resumeEvent = transition(instance, "resuming", "Resuming from pause");
264
- this.bus.emitLifecycle(resumeEvent);
265
- // Re-allocate resources
266
- this.resources.allocate(instanceId, instance.resources);
267
- try {
268
- const pid = await this.driver.resume(instance, instance.checkpointData);
269
- instance.pid = pid;
270
- instance.pausedAt = undefined;
271
- const runEvent = transition(instance, "running", "Resumed successfully");
272
- await this.registry.update(instanceId, {
273
- state: "running",
274
- pid,
275
- pausedAt: undefined,
276
- updatedAt: instance.updatedAt,
277
- });
278
- this.bus.emitLifecycle(runEvent);
279
- this.bus.emitSpawned(instanceId, pid);
280
- // Re-register exit handler
281
- if ("onExit" in this.driver && typeof this.driver.onExit === "function") {
282
- this.driver
283
- .onExit(instanceId, (code) => this.handleExit(instanceId, code));
284
- }
285
- logActivity({
286
- source: "agent",
287
- summary: `Agent resumed: ${instance.metadata.label}`,
288
- detail: `PID ${pid}, instance ${instanceId}`,
289
- });
290
- return instance;
291
- }
292
- catch (err) {
293
- const error = this.toAgentError(err, ErrorCodes.RESUME_FAILED, true);
294
- const failEvent = transition(instance, "failed", "Resume failed", error);
295
- await this.registry.update(instanceId, {
296
- state: "failed",
297
- error,
298
- updatedAt: instance.updatedAt,
299
- });
300
- this.resources.release(instanceId);
301
- this.bus.emitLifecycle(failEvent);
302
- this.bus.emitFailed(instanceId, error);
303
- logActivity({
304
- source: "agent",
305
- summary: `Agent resume failed: ${instance.metadata.label}`,
306
- detail: err.message,
307
- });
308
- return instance;
309
- }
310
- }
311
- /** Terminate an agent (from any active state). */
312
- async terminate(instanceId, reason) {
313
- const instance = this.requireInstance(instanceId);
314
- if (isTerminal(instance.state)) {
315
- return instance; // Already done
316
- }
317
- // Transition to terminating
318
- try {
319
- const termEvent = transition(instance, "terminating", reason ?? "Terminated by request");
320
- this.bus.emitLifecycle(termEvent);
321
- }
322
- catch {
323
- // If transition isn't valid from current state, force it
324
- instance.state = "terminating";
325
- instance.updatedAt = new Date().toISOString();
326
- }
327
- // Kill the process
328
- try {
329
- await this.driver.terminate(instance);
330
- }
331
- catch {
332
- // Best effort — process may already be dead
333
- }
334
- // Transition to terminated
335
- const finalEvent = transition(instance, "terminated", reason ?? "Terminated");
336
- // Skip disk persist — GC will delete this file within gcTtlMs.
337
- // On restart, driver.isAlive() returns false → recovery handles correctly.
338
- await this.registry.update(instanceId, {
339
- state: "terminated",
340
- terminatedAt: instance.terminatedAt,
341
- updatedAt: instance.updatedAt,
342
- }, true);
343
- this.resources.release(instanceId);
344
- this.bus.unsubscribe(instanceId);
345
- this.bus.emitLifecycle(finalEvent);
346
- // Update linked AgentTask
347
- await updateTask(instance.taskId, {
348
- status: "cancelled",
349
- finishedAt: new Date().toISOString(),
350
- }).catch(() => { });
351
- logActivity({
352
- source: "agent",
353
- summary: `Agent terminated: ${instance.metadata.label}`,
354
- detail: `Reason: ${reason ?? "requested"}, instance ${instanceId}`,
355
- });
356
- return instance;
357
- }
358
- // -------------------------------------------------------------------------
359
- // Queries
360
- // -------------------------------------------------------------------------
361
- /** Get an agent instance by ID. */
362
- getInstance(id) {
363
- return this.registry.get(id);
364
- }
365
- /** Get an instance by its linked task ID. */
366
- getByTaskId(taskId) {
367
- return this.registry.getByTaskId(taskId);
368
- }
369
- /** List all instances, optionally filtered. */
370
- listInstances(filter) {
371
- if (filter?.states) {
372
- return this.registry.list({ states: filter.states });
373
- }
374
- return this.registry.list();
375
- }
376
- /** List all active (non-terminal) instances. */
377
- listActive() {
378
- return this.registry.listActive();
379
- }
380
- /** Get a snapshot of current resource usage. */
381
- getResourceSnapshot() {
382
- return this.resources.snapshot();
383
- }
384
- /** Get instance counts by state. */
385
- getStateCounts() {
386
- return this.registry.countByState();
387
- }
388
- /** Remove a terminated instance from the registry and disk. */
389
- async removeInstance(id) {
390
- return this.registry.remove(id);
391
- }
392
- /** Remove multiple terminated instances in parallel. */
393
- async removeInstances(ids) {
394
- return this.registry.removeMany(ids);
395
- }
396
- /**
397
- * Remove multiple instances from in-memory registry only (no file I/O).
398
- * Returns the count removed and their persist file paths for caller-managed
399
- * batch deletion.
400
- */
401
- removeInstancesInMemory(ids) {
402
- return this.registry.removeManyInMemory(ids);
403
- }
404
- // -------------------------------------------------------------------------
405
- // Inter-agent messaging
406
- // -------------------------------------------------------------------------
407
- /** Send a message between agents. */
408
- sendMessage(from, to, type, payload) {
409
- return this.bus.send({ from, to, type, payload });
410
- }
411
- /** Send a request and await a correlated response. */
412
- async requestResponse(from, to, type, payload, timeoutMs) {
413
- return this.bus.request({ from, to, type, payload }, timeoutMs);
414
- }
415
- /** Subscribe an agent to receive messages. */
416
- subscribeAgent(agentId, handler) {
417
- this.bus.subscribe(agentId, handler);
418
- }
419
- // -------------------------------------------------------------------------
420
- // Event subscriptions (for external consumers)
421
- // -------------------------------------------------------------------------
422
- /** Subscribe to lifecycle events. */
423
- onLifecycle(handler) {
424
- this.bus.on("agent:lifecycle", handler);
425
- }
426
- /** Subscribe to agent error events. */
427
- onError(handler) {
428
- this.bus.on("agent:error", handler);
429
- }
430
- /** Subscribe to resource warning events. */
431
- onResourceWarning(handler) {
432
- this.bus.on("runtime:resource-warning", handler);
433
- }
434
- // -------------------------------------------------------------------------
435
- // Internal: process exit handling
436
- // -------------------------------------------------------------------------
437
- async handleExit(instanceId, code) {
438
- const instance = this.registry.get(instanceId);
439
- if (!instance || isTerminal(instance.state))
440
- return;
441
- // Guard: prevent concurrent handleExit for the same instance.
442
- // Monitor poll + driver exit handler can race, causing duplicate retry scheduling.
443
- if (this.handleExitGuard.has(instanceId)) {
444
- logActivity({
445
- source: "agent",
446
- summary: `handleExit skipped (concurrent call): ${instance.metadata.label}`,
447
- detail: `Instance ${instanceId}, exit code ${code} — another handleExit is already processing`,
448
- });
449
- return;
450
- }
451
- this.handleExitGuard.add(instanceId);
452
- try {
453
- await this.handleExitInner(instanceId, instance, code);
454
- }
455
- finally {
456
- this.handleExitGuard.delete(instanceId);
457
- }
458
- }
459
- async handleExitInner(instanceId, instance, code) {
460
- const output = await readTaskOutput(instance.taskId).catch(() => "");
461
- const resultSummary = output.trim().slice(0, 1000) || undefined;
462
- // Null exit code (signal/restart) with substantial output is treated as success,
463
- // matching the same logic in spawn.ts. Without this, RuntimeManager retries
464
- // agents that spawn.ts already marked as completed.
465
- const hasSubstantialOutput = output.trim().length > 100;
466
- const success = code === 0 || (code == null && hasSubstantialOutput);
467
- if (success) {
468
- const event = transition(instance, "completed", `Exited with code ${code}`);
469
- // Skip disk persist — GC will delete this file within gcTtlMs.
470
- // On restart, driver.isAlive() returns false → recovery handles correctly.
471
- await this.registry.update(instanceId, {
472
- state: "completed",
473
- updatedAt: instance.updatedAt,
474
- terminatedAt: instance.terminatedAt,
475
- }, true);
476
- this.resources.release(instanceId);
477
- this.bus.emitLifecycle(event);
478
- this.bus.emitCompleted(instanceId, code ?? undefined);
479
- }
480
- else {
481
- // Classify recoverability based on exit code and context:
482
- // - null + near timeout: timeout kill — not recoverable (retrying will timeout again)
483
- // - null (signal/OOM/killed): potentially recoverable (worth retrying)
484
- // - non-zero: check output for deterministic failure patterns
485
- const elapsed = Date.now() - new Date(instance.createdAt).getTime();
486
- const isTimeout = code === null && elapsed >= instance.config.timeoutMs * 0.9;
487
- const isDeterministicFailure = isTimeout || this.isDeterministicFailure(output, code);
488
- const recoverable = !isDeterministicFailure;
489
- if (isTimeout) {
490
- logActivity({
491
- source: "agent",
492
- summary: `Agent timed out: ${instance.metadata.label}`,
493
- detail: `Elapsed ${Math.round(elapsed / 1000)}s of ${Math.round(instance.config.timeoutMs / 1000)}s limit — will not retry`,
494
- });
495
- }
496
- const error = this.toAgentError(new Error(`Process exited with code ${code}`), ErrorCodes.DRIVER_ERROR, recoverable);
497
- const event = transition(instance, "failed", `Exited with code ${code}`, error);
498
- await this.registry.update(instanceId, {
499
- state: "failed",
500
- error,
501
- updatedAt: instance.updatedAt,
502
- });
503
- // Release resources before retry check (maybeRetry re-allocates if needed)
504
- this.resources.release(instanceId);
505
- this.bus.emitLifecycle(event);
506
- // Check if retry is possible BEFORE emitting terminal failure.
507
- // This prevents external listeners (spawn.ts batch tracking) from
508
- // prematurely processing the failure while retries are still pending,
509
- // which was causing retry loop fan-out: continuation spawned new agents
510
- // while the RuntimeManager was still retrying the original.
511
- const retryDelay = shouldRetry(instance);
512
- if (retryDelay !== null) {
513
- logActivity({
514
- source: "agent",
515
- summary: `Agent failed, will retry: ${instance.metadata.label}`,
516
- detail: `Attempt ${instance.retryCount + 1}/${instance.config.maxRetries}, backoff ${retryDelay}ms, exit code ${code}, recoverable=${recoverable}, instance ${instanceId}`,
517
- });
518
- await this.maybeRetry(instance);
519
- return; // Don't emit agent:failed or update task store — retry pending
520
- }
521
- // Terminal failure — no more retries, notify external listeners
522
- logActivity({
523
- source: "agent",
524
- summary: `Agent failed (terminal, no retries left): ${instance.metadata.label}`,
525
- detail: `Exit code ${code}, retryCount=${instance.retryCount}/${instance.config.maxRetries}, recoverable=${recoverable}, instance ${instanceId}`,
526
- });
527
- this.bus.emitFailed(instanceId, error);
528
- }
529
- // ── Finalize (reached only for terminal success or terminal failure) ──
530
- this.bus.unsubscribe(instanceId);
531
- const newState = success ? "completed" : "failed";
532
- await updateTask(instance.taskId, {
533
- status: newState,
534
- exitCode: code ?? undefined,
535
- finishedAt: new Date().toISOString(),
536
- resultSummary,
537
- }).catch(() => { });
538
- // Persist to episodic memory
539
- const task = await readTask(instance.taskId).catch(() => null);
540
- if (task) {
541
- rememberTaskOutcome({ ...task, status: newState, exitCode: code ?? undefined }, output).catch(() => { });
542
- }
543
- // Sync board task state (survives server restarts — inline spawn listeners don't)
544
- if (task?.boardTaskId) {
545
- updateBoardTaskState(task.boardTaskId, success
546
- ? { state: "done" }
547
- : { state: "todo", assignee: null }).catch(() => { });
548
- }
549
- logActivity({
550
- source: "agent",
551
- summary: `Agent ${newState}: ${instance.metadata.label}`,
552
- detail: `Exit code ${code}, instance ${instanceId}`,
553
- });
554
- }
555
- // -------------------------------------------------------------------------
556
- // Internal: retry logic
557
- // -------------------------------------------------------------------------
558
- /**
559
- * Emit terminal failure event and finalize task store + memory.
560
- * Called when all retries are exhausted or a non-recoverable error occurs
561
- * within the retry path. This ensures external listeners (spawn.ts batch
562
- * tracking) are notified and the task store reflects the final state.
563
- *
564
- * Without this, tasks that fail during retries would leave the task store
565
- * in a stale "running" state and batch continuation would never trigger.
566
- */
567
- async finalizeTerminalFailure(instance, error) {
568
- const finalError = error ?? instance.error ?? {
569
- code: ErrorCodes.MAX_RETRIES_EXCEEDED,
570
- message: "Agent failed after all retry attempts",
571
- timestamp: new Date().toISOString(),
572
- recoverable: false,
573
- };
574
- this.bus.emitFailed(instance.id, finalError);
575
- this.bus.unsubscribe(instance.id);
576
- // Update linked AgentTask
577
- const output = await readTaskOutput(instance.taskId).catch(() => "");
578
- const resultSummary = output.trim().slice(0, 1000) || undefined;
579
- await updateTask(instance.taskId, {
580
- status: "failed",
581
- finishedAt: new Date().toISOString(),
582
- resultSummary,
583
- }).catch(() => { });
584
- // Persist to episodic memory
585
- const task = await readTask(instance.taskId).catch(() => null);
586
- if (task) {
587
- rememberTaskOutcome({ ...task, status: "failed" }, output).catch(() => { });
588
- }
589
- logActivity({
590
- source: "agent",
591
- summary: `Agent failed (terminal): ${instance.metadata.label}`,
592
- detail: `${instance.retryCount} retries exhausted, instance ${instance.id}`,
593
- });
594
- }
595
- async maybeRetry(instance) {
596
- // Guard: prevent duplicate retry scheduling for the same instance.
597
- // This can happen when monitor poll and driver exit handler both trigger handleExit.
598
- if (this.retryPending.has(instance.id)) {
599
- logActivity({
600
- source: "agent",
601
- summary: `Retry already pending, skipping duplicate: ${instance.metadata.label}`,
602
- detail: `Instance ${instance.id}, retryCount=${instance.retryCount}`,
603
- });
604
- return;
605
- }
606
- const delay = shouldRetry(instance);
607
- if (delay === null) {
608
- // Retries exhausted — emit terminal failure for external listeners.
609
- // handleExit and spawn() skip emitting agent:failed when retry was
610
- // possible, so this is the terminal notification path.
611
- const terminalError = instance.error ?? {
612
- code: ErrorCodes.MAX_RETRIES_EXCEEDED,
613
- message: `Max retries (${instance.config.maxRetries}) exceeded after ${instance.retryCount} attempts`,
614
- timestamp: new Date().toISOString(),
615
- recoverable: false,
616
- };
617
- if (instance.retryCount > 0) {
618
- logActivity({
619
- source: "agent",
620
- summary: `Agent max retries exceeded: ${instance.metadata.label}`,
621
- detail: `${instance.retryCount}/${instance.config.maxRetries} attempts, instance ${instance.id}`,
622
- });
623
- }
624
- await this.finalizeTerminalFailure(instance, terminalError);
625
- return;
626
- }
627
- // Check global retry budget to prevent retry storms when many agents fail at once
628
- const now = Date.now();
629
- this.globalRetryTimestamps = this.globalRetryTimestamps.filter((t) => now - t < RuntimeManager.GLOBAL_RETRY_WINDOW_MS);
630
- if (this.globalRetryTimestamps.length >= this.globalRetryBudget) {
631
- const budgetError = {
632
- code: ErrorCodes.MAX_RETRIES_EXCEEDED,
633
- message: `Global retry budget exhausted: ${this.globalRetryTimestamps.length} retries in ${RuntimeManager.GLOBAL_RETRY_WINDOW_MS / 60_000}min window`,
634
- timestamp: new Date().toISOString(),
635
- recoverable: false,
636
- };
637
- logActivity({
638
- source: "agent",
639
- summary: `Global retry budget exhausted, failing: ${instance.metadata.label}`,
640
- detail: `${this.globalRetryTimestamps.length}/${this.globalRetryBudget} retries in ${RuntimeManager.GLOBAL_RETRY_WINDOW_MS / 60_000}min, instance ${instance.id}`,
641
- });
642
- await this.finalizeTerminalFailure(instance, budgetError);
643
- return;
644
- }
645
- // Mark this instance as having a pending retry
646
- this.retryPending.add(instance.id);
647
- this.globalRetryTimestamps.push(now);
648
- logActivity({
649
- source: "agent",
650
- summary: `Agent retrying in ${delay}ms: ${instance.metadata.label}`,
651
- detail: `Attempt ${instance.retryCount + 1}/${instance.config.maxRetries}, instance ${instance.id}, global retries: ${this.globalRetryTimestamps.length}/${this.globalRetryBudget}`,
652
- });
653
- setTimeout(async () => {
654
- this.retryPending.delete(instance.id);
655
- let resourcesAllocated = false;
656
- try {
657
- const retryEvent = prepareRetry(instance);
658
- this.bus.emitLifecycle(retryEvent);
659
- await this.registry.update(instance.id, {
660
- state: "initializing",
661
- retryCount: instance.retryCount,
662
- error: undefined,
663
- pid: undefined,
664
- terminatedAt: undefined,
665
- updatedAt: instance.updatedAt,
666
- });
667
- // Re-read the original task for the prompt
668
- const task = await readTask(instance.taskId).catch(() => null);
669
- if (task) {
670
- instance._prompt = task.prompt;
671
- }
672
- // Re-allocate and spawn
673
- if (!this.resources.canAllocate(instance.resources)) {
674
- // Resources unavailable — fail the retry instead of leaving a zombie
675
- const error = {
676
- code: ErrorCodes.RESOURCE_EXHAUSTED,
677
- message: `Resources unavailable for retry attempt ${instance.retryCount}`,
678
- timestamp: new Date().toISOString(),
679
- recoverable: false,
680
- };
681
- instance.state = "failed";
682
- instance.error = error;
683
- instance.updatedAt = new Date().toISOString();
684
- await this.registry.update(instance.id, {
685
- state: "failed",
686
- error,
687
- updatedAt: instance.updatedAt,
688
- });
689
- logActivity({
690
- source: "agent",
691
- summary: `Agent retry aborted (resources exhausted): ${instance.metadata.label}`,
692
- detail: `Attempt ${instance.retryCount}/${instance.config.maxRetries}, instance ${instance.id}`,
693
- });
694
- await this.finalizeTerminalFailure(instance, error);
695
- return;
696
- }
697
- this.resources.allocate(instance.id, instance.resources);
698
- resourcesAllocated = true;
699
- const pid = await this.driver.spawn(instance);
700
- instance.pid = pid;
701
- const runEvent = transition(instance, "running", "Retry spawned");
702
- await this.registry.update(instance.id, {
703
- state: "running",
704
- pid,
705
- updatedAt: instance.updatedAt,
706
- });
707
- this.bus.emitLifecycle(runEvent);
708
- this.bus.emitSpawned(instance.id, pid);
709
- logActivity({
710
- source: "agent",
711
- summary: `Agent retry spawned: ${instance.metadata.label}`,
712
- detail: `Attempt ${instance.retryCount}/${instance.config.maxRetries}, PID ${pid}, instance ${instance.id}`,
713
- });
714
- if ("onExit" in this.driver && typeof this.driver.onExit === "function") {
715
- this.driver
716
- .onExit(instance.id, (code) => this.handleExit(instance.id, code));
717
- }
718
- }
719
- catch (err) {
720
- // Clean up on retry failure: release resources and fail the instance
721
- // to prevent zombie agents stuck in "initializing" forever
722
- if (resourcesAllocated) {
723
- this.resources.release(instance.id);
724
- }
725
- const error = {
726
- code: ErrorCodes.SPAWN_FAILED,
727
- message: `Retry spawn failed: ${err.message}`,
728
- timestamp: new Date().toISOString(),
729
- recoverable: false,
730
- };
731
- instance.state = "failed";
732
- instance.error = error;
733
- instance.updatedAt = new Date().toISOString();
734
- await this.registry.update(instance.id, {
735
- state: "failed",
736
- error,
737
- updatedAt: instance.updatedAt,
738
- }).catch(() => { });
739
- logActivity({
740
- source: "agent",
741
- summary: `Agent retry spawn failed: ${instance.metadata.label}`,
742
- detail: `Attempt ${instance.retryCount}/${instance.config.maxRetries}: ${err.message}, instance ${instance.id}`,
743
- });
744
- await this.finalizeTerminalFailure(instance, error);
745
- }
746
- }, delay);
747
- }
748
- // -------------------------------------------------------------------------
749
- // Internal: recovery on startup
750
- // -------------------------------------------------------------------------
751
- async recoverAgents() {
752
- const instances = this.registry.listActive();
753
- // Skip recovery entirely when there's nothing to recover (cold start)
754
- if (instances.length === 0) {
755
- log.info("No active instances to recover");
756
- return;
757
- }
758
- let recovered = 0;
759
- let marked = 0;
760
- // Separate alive, dead, and initializing instances for parallel recovery
761
- const alive = [];
762
- const dead = [];
763
- const initializing = [];
764
- for (const instance of instances) {
765
- if (instance.state === "running" || instance.state === "resuming") {
766
- if (this.driver.isAlive(instance)) {
767
- alive.push(instance);
768
- }
769
- else {
770
- dead.push(instance);
771
- }
772
- }
773
- else if (instance.state === "initializing") {
774
- initializing.push(instance);
775
- }
776
- }
777
- // Count alive agents (no I/O needed)
778
- recovered = alive.length;
779
- for (const instance of alive) {
780
- logActivity({
781
- source: "agent",
782
- summary: `Recovered running agent: ${instance.metadata.label}`,
783
- detail: `PID ${instance.pid} still alive`,
784
- });
785
- }
786
- // Recover dead agents in parallel (each reads task output)
787
- if (dead.length > 0) {
788
- marked += dead.length;
789
- await Promise.all(dead.map(async (instance) => {
790
- const output = await readTaskOutput(instance.taskId).catch(() => "");
791
- const hasOutput = output.trim().length > 0;
792
- const finalState = hasOutput ? "completed" : "failed";
793
- if (hasOutput) {
794
- transition(instance, "completed", "Recovered after restart");
795
- }
796
- else {
797
- const error = {
798
- code: ErrorCodes.DRIVER_ERROR,
799
- message: "Process died while runtime was down",
800
- timestamp: new Date().toISOString(),
801
- recoverable: false,
802
- };
803
- transition(instance, "failed", "Died during downtime", error);
804
- }
805
- await this.registry.update(instance.id, {
806
- state: instance.state,
807
- error: instance.error,
808
- terminatedAt: instance.terminatedAt,
809
- updatedAt: instance.updatedAt,
810
- });
811
- logActivity({
812
- source: "agent",
813
- summary: `Recovered ${finalState} agent: ${instance.metadata.label}`,
814
- detail: `PID ${instance.pid} was dead`,
815
- });
816
- }));
817
- }
818
- // Recover initializing agents in parallel
819
- if (initializing.length > 0) {
820
- marked += initializing.length;
821
- await Promise.all(initializing.map(async (instance) => {
822
- const error = {
823
- code: ErrorCodes.DRIVER_ERROR,
824
- message: "Runtime restarted during initialization",
825
- timestamp: new Date().toISOString(),
826
- recoverable: true,
827
- };
828
- transition(instance, "failed", "Runtime restarted", error);
829
- await this.registry.update(instance.id, {
830
- state: "failed",
831
- error,
832
- updatedAt: instance.updatedAt,
833
- });
834
- await this.maybeRetry(instance);
835
- }));
836
- }
837
- if (recovered > 0 || marked > 0) {
838
- logActivity({
839
- source: "agent",
840
- summary: `Runtime recovery: ${recovered} alive, ${marked} resolved`,
841
- });
842
- }
843
- }
844
- // -------------------------------------------------------------------------
845
- // Internal: monitoring
846
- // -------------------------------------------------------------------------
847
- startMonitor() {
848
- if (this.monitorTimer)
849
- return;
850
- this.monitorTimer = setInterval(() => {
851
- this.monitorCycle().catch((err) => {
852
- logActivity({
853
- source: "agent",
854
- summary: "Runtime monitor error",
855
- detail: err.message,
856
- });
857
- });
858
- }, this.config.monitorIntervalMs);
859
- }
860
- stopMonitor() {
861
- if (this.monitorTimer) {
862
- clearInterval(this.monitorTimer);
863
- this.monitorTimer = null;
864
- }
865
- }
866
- async monitorCycle() {
867
- const running = this.registry.list({ state: "running" });
868
- for (const instance of running) {
869
- // Skip instances with pending retries or active handleExit processing
870
- // to prevent concurrent exit handling that causes duplicate retry scheduling
871
- if (this.retryPending.has(instance.id) || this.handleExitGuard.has(instance.id)) {
872
- continue;
873
- }
874
- if (!this.driver.isAlive(instance)) {
875
- // Process died without triggering exit handler (recovered PID)
876
- await this.handleExit(instance.id, null);
877
- }
878
- }
879
- // Emit resource warning if nearing capacity
880
- const snapshot = this.resources.snapshot();
881
- const agentRatio = snapshot.activeAgents / snapshot.maxAgents;
882
- const memoryRatio = snapshot.totalMemoryMB / snapshot.maxMemoryMB;
883
- if (agentRatio > 0.8 || memoryRatio > 0.8) {
884
- this.bus.emitResourceWarning(snapshot);
885
- }
886
- }
887
- // -------------------------------------------------------------------------
888
- // Helpers
889
- // -------------------------------------------------------------------------
890
- requireInstance(id) {
891
- const instance = this.registry.get(id);
892
- if (!instance) {
893
- throw new RuntimeError(ErrorCodes.AGENT_NOT_FOUND, `Agent instance not found: ${id}`);
894
- }
895
- return instance;
896
- }
897
- assertState(instance, expected) {
898
- if (instance.state !== expected) {
899
- throw new RuntimeError(ErrorCodes.INVALID_TRANSITION, `Expected agent state '${expected}', got '${instance.state}'`, false, { agentId: instance.id, actual: instance.state, expected });
900
- }
901
- }
902
- /**
903
- * Detect deterministic failures that won't succeed on retry.
904
- * Checks output for patterns like TypeScript errors, missing modules, etc.
905
- * A broader pattern set prevents wasting retry budget on non-recoverable errors.
906
- */
907
- isDeterministicFailure(output, code) {
908
- // null exit code = signal/OOM — worth retrying
909
- if (code === null)
910
- return false;
911
- // Check output for patterns that indicate a deterministic failure
912
- const tail = output.slice(-3000);
913
- const deterministicPatterns = [
914
- /error TS\d+:/i, // TypeScript compilation errors
915
- /SyntaxError:/, // JS/TS syntax errors
916
- /Cannot find module/, // Missing module (won't appear on retry)
917
- /Module not found/, // Same
918
- /ENOENT.*no such file/i, // Missing file
919
- /Permission denied/i, // Permission issues (won't self-resolve)
920
- /EACCES/, // Access denied (filesystem)
921
- /ENOSPC/i, // Disk full
922
- /Invalid API key/i, // Auth errors (won't self-resolve)
923
- /authentication failed/i, // Auth errors
924
- /unauthorized/i, // 401 errors
925
- /forbidden/i, // 403 errors
926
- / 402[:\s]/, // Payment required (insufficient credits)
927
- /insufficient.credits/i, // OpenRouter credit exhaustion
928
- /payment.required/i, // Billing errors
929
- /out of.*credits/i, // Credit exhaustion variant
930
- /can'?t afford/i, // OpenRouter "can't afford" message
931
- /ERR_INVALID_ARG_TYPE/, // Programming error
932
- /TypeError:.*is not a function/, // Programming error
933
- /ReferenceError:/, // Undefined variable
934
- /ENOMEM/i, // Out of memory (system-level)
935
- ];
936
- return deterministicPatterns.some((pattern) => pattern.test(tail));
937
- }
938
- toAgentError(err, code, recoverable) {
939
- const message = err instanceof Error ? err.message : String(err);
940
- return {
941
- code,
942
- message,
943
- timestamp: new Date().toISOString(),
944
- recoverable,
945
- };
946
- }
947
- }
948
- //# sourceMappingURL=manager.js.map