threadforge 0.1.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (358)
  1. package/README.md +69 -42
  2. package/bin/forge.js +2 -1058
  3. package/bin/host-commands.d.ts +2 -0
  4. package/bin/host-commands.d.ts.map +1 -0
  5. package/bin/host-commands.js +7 -8
  6. package/bin/platform-commands.d.ts +2 -0
  7. package/bin/platform-commands.d.ts.map +1 -0
  8. package/bin/platform-commands.js +118 -36
  9. package/dist/cli/base-command.d.ts +12 -0
  10. package/dist/cli/base-command.d.ts.map +1 -0
  11. package/dist/cli/base-command.js +25 -0
  12. package/dist/cli/base-command.js.map +1 -0
  13. package/dist/cli/commands/build.d.ts +10 -0
  14. package/dist/cli/commands/build.d.ts.map +1 -0
  15. package/dist/cli/commands/build.js +110 -0
  16. package/dist/cli/commands/build.js.map +1 -0
  17. package/dist/cli/commands/deploy.d.ts +12 -0
  18. package/dist/cli/commands/deploy.d.ts.map +1 -0
  19. package/dist/cli/commands/deploy.js +143 -0
  20. package/dist/cli/commands/deploy.js.map +1 -0
  21. package/dist/cli/commands/dev.d.ts +10 -0
  22. package/dist/cli/commands/dev.d.ts.map +1 -0
  23. package/dist/cli/commands/dev.js +138 -0
  24. package/dist/cli/commands/dev.js.map +1 -0
  25. package/dist/cli/commands/generate.d.ts +10 -0
  26. package/dist/cli/commands/generate.d.ts.map +1 -0
  27. package/dist/cli/commands/generate.js +76 -0
  28. package/dist/cli/commands/generate.js.map +1 -0
  29. package/dist/cli/commands/host.d.ts +8 -0
  30. package/dist/cli/commands/host.d.ts.map +1 -0
  31. package/dist/cli/commands/host.js +20 -0
  32. package/dist/cli/commands/host.js.map +1 -0
  33. package/dist/cli/commands/init.d.ts +16 -0
  34. package/dist/cli/commands/init.d.ts.map +1 -0
  35. package/dist/cli/commands/init.js +246 -0
  36. package/dist/cli/commands/init.js.map +1 -0
  37. package/dist/cli/commands/platform.d.ts +8 -0
  38. package/dist/cli/commands/platform.d.ts.map +1 -0
  39. package/dist/cli/commands/platform.js +20 -0
  40. package/dist/cli/commands/platform.js.map +1 -0
  41. package/dist/cli/commands/restart.d.ts +8 -0
  42. package/dist/cli/commands/restart.d.ts.map +1 -0
  43. package/dist/cli/commands/restart.js +13 -0
  44. package/dist/cli/commands/restart.js.map +1 -0
  45. package/dist/cli/commands/scaffold/frontend.d.ts +10 -0
  46. package/dist/cli/commands/scaffold/frontend.d.ts.map +1 -0
  47. package/dist/cli/commands/scaffold/frontend.js +130 -0
  48. package/dist/cli/commands/scaffold/frontend.js.map +1 -0
  49. package/dist/cli/commands/scaffold/react.d.ts +7 -0
  50. package/dist/cli/commands/scaffold/react.d.ts.map +1 -0
  51. package/dist/cli/commands/scaffold/react.js +12 -0
  52. package/dist/cli/commands/scaffold/react.js.map +1 -0
  53. package/dist/cli/commands/scale.d.ts +8 -0
  54. package/dist/cli/commands/scale.d.ts.map +1 -0
  55. package/dist/cli/commands/scale.js +13 -0
  56. package/dist/cli/commands/scale.js.map +1 -0
  57. package/dist/cli/commands/start.d.ts +10 -0
  58. package/dist/cli/commands/start.d.ts.map +1 -0
  59. package/dist/cli/commands/start.js +71 -0
  60. package/dist/cli/commands/start.js.map +1 -0
  61. package/dist/cli/commands/status.d.ts +11 -0
  62. package/dist/cli/commands/status.d.ts.map +1 -0
  63. package/dist/cli/commands/status.js +60 -0
  64. package/dist/cli/commands/status.js.map +1 -0
  65. package/dist/cli/commands/stop.d.ts +10 -0
  66. package/dist/cli/commands/stop.d.ts.map +1 -0
  67. package/dist/cli/commands/stop.js +89 -0
  68. package/dist/cli/commands/stop.js.map +1 -0
  69. package/dist/cli/util/config-discovery.d.ts +8 -0
  70. package/dist/cli/util/config-discovery.d.ts.map +1 -0
  71. package/dist/cli/util/config-discovery.js +70 -0
  72. package/dist/cli/util/config-discovery.js.map +1 -0
  73. package/dist/cli/util/config-patcher.d.ts +17 -0
  74. package/dist/cli/util/config-patcher.d.ts.map +1 -0
  75. package/dist/cli/util/config-patcher.js +439 -0
  76. package/dist/cli/util/config-patcher.js.map +1 -0
  77. package/dist/cli/util/frontend-dev.d.ts +8 -0
  78. package/dist/cli/util/frontend-dev.d.ts.map +1 -0
  79. package/dist/cli/util/frontend-dev.js +117 -0
  80. package/dist/cli/util/frontend-dev.js.map +1 -0
  81. package/dist/cli/util/process.d.ts +5 -0
  82. package/dist/cli/util/process.d.ts.map +1 -0
  83. package/dist/cli/util/process.js +17 -0
  84. package/dist/cli/util/process.js.map +1 -0
  85. package/dist/cli/util/templates.d.ts +10 -0
  86. package/dist/cli/util/templates.d.ts.map +1 -0
  87. package/dist/cli/util/templates.js +157 -0
  88. package/dist/cli/util/templates.js.map +1 -0
  89. package/dist/core/AlertSink.d.ts +83 -0
  90. package/dist/core/AlertSink.d.ts.map +1 -0
  91. package/dist/core/AlertSink.js +126 -0
  92. package/dist/core/AlertSink.js.map +1 -0
  93. package/dist/core/DirectMessageBus.d.ts +88 -0
  94. package/dist/core/DirectMessageBus.d.ts.map +1 -0
  95. package/dist/core/DirectMessageBus.js +352 -0
  96. package/dist/core/DirectMessageBus.js.map +1 -0
  97. package/dist/core/EndpointResolver.d.ts +111 -0
  98. package/dist/core/EndpointResolver.d.ts.map +1 -0
  99. package/dist/core/EndpointResolver.js +336 -0
  100. package/dist/core/EndpointResolver.js.map +1 -0
  101. package/dist/core/ForgeContext.d.ts +221 -0
  102. package/dist/core/ForgeContext.d.ts.map +1 -0
  103. package/dist/core/ForgeContext.js +1169 -0
  104. package/dist/core/ForgeContext.js.map +1 -0
  105. package/dist/core/ForgeEndpoints.d.ts +71 -0
  106. package/dist/core/ForgeEndpoints.d.ts.map +1 -0
  107. package/dist/core/ForgeEndpoints.js +442 -0
  108. package/dist/core/ForgeEndpoints.js.map +1 -0
  109. package/dist/core/ForgeHost.d.ts +82 -0
  110. package/dist/core/ForgeHost.d.ts.map +1 -0
  111. package/dist/core/ForgeHost.js +107 -0
  112. package/dist/core/ForgeHost.js.map +1 -0
  113. package/dist/core/ForgePlatform.d.ts +96 -0
  114. package/dist/core/ForgePlatform.d.ts.map +1 -0
  115. package/dist/core/ForgePlatform.js +136 -0
  116. package/dist/core/ForgePlatform.js.map +1 -0
  117. package/dist/core/ForgeWebSocket.d.ts +56 -0
  118. package/dist/core/ForgeWebSocket.d.ts.map +1 -0
  119. package/dist/core/ForgeWebSocket.js +415 -0
  120. package/dist/core/ForgeWebSocket.js.map +1 -0
  121. package/dist/core/Ingress.d.ts +329 -0
  122. package/dist/core/Ingress.d.ts.map +1 -0
  123. package/dist/core/Ingress.js +694 -0
  124. package/dist/core/Ingress.js.map +1 -0
  125. package/dist/core/Interceptors.d.ts +134 -0
  126. package/dist/core/Interceptors.d.ts.map +1 -0
  127. package/dist/core/Interceptors.js +416 -0
  128. package/dist/core/Interceptors.js.map +1 -0
  129. package/dist/core/Logger.d.ts +20 -0
  130. package/dist/core/Logger.d.ts.map +1 -0
  131. package/dist/core/Logger.js +77 -0
  132. package/dist/core/Logger.js.map +1 -0
  133. package/dist/core/MessageBus.d.ts +15 -0
  134. package/dist/core/MessageBus.d.ts.map +1 -0
  135. package/dist/core/MessageBus.js +18 -0
  136. package/dist/core/MessageBus.js.map +1 -0
  137. package/dist/core/Prometheus.d.ts +80 -0
  138. package/dist/core/Prometheus.d.ts.map +1 -0
  139. package/dist/core/Prometheus.js +332 -0
  140. package/dist/core/Prometheus.js.map +1 -0
  141. package/dist/core/RequestContext.d.ts +214 -0
  142. package/dist/core/RequestContext.d.ts.map +1 -0
  143. package/dist/core/RequestContext.js +556 -0
  144. package/dist/core/RequestContext.js.map +1 -0
  145. package/dist/core/Router.d.ts +45 -0
  146. package/dist/core/Router.d.ts.map +1 -0
  147. package/dist/core/Router.js +285 -0
  148. package/dist/core/Router.js.map +1 -0
  149. package/dist/core/RoutingStrategy.d.ts +116 -0
  150. package/dist/core/RoutingStrategy.d.ts.map +1 -0
  151. package/dist/core/RoutingStrategy.js +306 -0
  152. package/dist/core/RoutingStrategy.js.map +1 -0
  153. package/dist/core/RpcConfig.d.ts +72 -0
  154. package/dist/core/RpcConfig.d.ts.map +1 -0
  155. package/dist/core/RpcConfig.js +127 -0
  156. package/dist/core/RpcConfig.js.map +1 -0
  157. package/dist/core/SignatureCache.d.ts +81 -0
  158. package/dist/core/SignatureCache.d.ts.map +1 -0
  159. package/dist/core/SignatureCache.js +172 -0
  160. package/dist/core/SignatureCache.js.map +1 -0
  161. package/dist/core/StaticFileServer.d.ts +34 -0
  162. package/dist/core/StaticFileServer.d.ts.map +1 -0
  163. package/dist/core/StaticFileServer.js +497 -0
  164. package/dist/core/StaticFileServer.js.map +1 -0
  165. package/dist/core/Supervisor.d.ts +198 -0
  166. package/dist/core/Supervisor.d.ts.map +1 -0
  167. package/dist/core/Supervisor.js +1418 -0
  168. package/dist/core/Supervisor.js.map +1 -0
  169. package/dist/core/ThreadAllocator.d.ts +52 -0
  170. package/dist/core/ThreadAllocator.d.ts.map +1 -0
  171. package/dist/core/ThreadAllocator.js +174 -0
  172. package/dist/core/ThreadAllocator.js.map +1 -0
  173. package/dist/core/WorkerChannelManager.d.ts +130 -0
  174. package/dist/core/WorkerChannelManager.d.ts.map +1 -0
  175. package/dist/core/WorkerChannelManager.js +956 -0
  176. package/dist/core/WorkerChannelManager.js.map +1 -0
  177. package/dist/core/config-enums.d.ts +41 -0
  178. package/dist/core/config-enums.d.ts.map +1 -0
  179. package/dist/core/config-enums.js +59 -0
  180. package/dist/core/config-enums.js.map +1 -0
  181. package/dist/core/config.d.ts +159 -0
  182. package/dist/core/config.d.ts.map +1 -0
  183. package/dist/core/config.js +694 -0
  184. package/dist/core/config.js.map +1 -0
  185. package/dist/core/host-config.d.ts +146 -0
  186. package/dist/core/host-config.d.ts.map +1 -0
  187. package/dist/core/host-config.js +312 -0
  188. package/dist/core/host-config.js.map +1 -0
  189. package/dist/core/ipc-errors.d.ts +27 -0
  190. package/dist/core/ipc-errors.d.ts.map +1 -0
  191. package/dist/core/ipc-errors.js +36 -0
  192. package/dist/core/ipc-errors.js.map +1 -0
  193. package/dist/core/network-utils.d.ts +35 -0
  194. package/dist/core/network-utils.d.ts.map +1 -0
  195. package/dist/core/network-utils.js +145 -0
  196. package/dist/core/network-utils.js.map +1 -0
  197. package/dist/core/platform-config.d.ts +142 -0
  198. package/dist/core/platform-config.d.ts.map +1 -0
  199. package/dist/core/platform-config.js +299 -0
  200. package/dist/core/platform-config.js.map +1 -0
  201. package/dist/decorators/ServiceProxy.d.ts +175 -0
  202. package/dist/decorators/ServiceProxy.d.ts.map +1 -0
  203. package/dist/decorators/ServiceProxy.js +969 -0
  204. package/dist/decorators/ServiceProxy.js.map +1 -0
  205. package/dist/decorators/index.d.ts +146 -0
  206. package/dist/decorators/index.d.ts.map +1 -0
  207. package/dist/decorators/index.js +545 -0
  208. package/dist/decorators/index.js.map +1 -0
  209. package/dist/deploy/NginxGenerator.d.ts +165 -0
  210. package/dist/deploy/NginxGenerator.d.ts.map +1 -0
  211. package/dist/deploy/NginxGenerator.js +781 -0
  212. package/dist/deploy/NginxGenerator.js.map +1 -0
  213. package/dist/deploy/PlatformManifestGenerator.d.ts +43 -0
  214. package/dist/deploy/PlatformManifestGenerator.d.ts.map +1 -0
  215. package/dist/deploy/PlatformManifestGenerator.js +80 -0
  216. package/dist/deploy/PlatformManifestGenerator.js.map +1 -0
  217. package/dist/deploy/RouteManifestGenerator.d.ts +42 -0
  218. package/dist/deploy/RouteManifestGenerator.d.ts.map +1 -0
  219. package/dist/deploy/RouteManifestGenerator.js +105 -0
  220. package/dist/deploy/RouteManifestGenerator.js.map +1 -0
  221. package/dist/deploy/index.d.ts +210 -0
  222. package/dist/deploy/index.d.ts.map +1 -0
  223. package/dist/deploy/index.js +918 -0
  224. package/dist/deploy/index.js.map +1 -0
  225. package/dist/frontend/FrontendDevLifecycle.d.ts +26 -0
  226. package/dist/frontend/FrontendDevLifecycle.d.ts.map +1 -0
  227. package/dist/frontend/FrontendDevLifecycle.js +60 -0
  228. package/dist/frontend/FrontendDevLifecycle.js.map +1 -0
  229. package/dist/frontend/FrontendPluginOrchestrator.d.ts +64 -0
  230. package/dist/frontend/FrontendPluginOrchestrator.d.ts.map +1 -0
  231. package/dist/frontend/FrontendPluginOrchestrator.js +167 -0
  232. package/dist/frontend/FrontendPluginOrchestrator.js.map +1 -0
  233. package/dist/frontend/SiteResolver.d.ts +33 -0
  234. package/dist/frontend/SiteResolver.d.ts.map +1 -0
  235. package/dist/frontend/SiteResolver.js +53 -0
  236. package/dist/frontend/SiteResolver.js.map +1 -0
  237. package/dist/frontend/StaticMountRegistry.d.ts +36 -0
  238. package/dist/frontend/StaticMountRegistry.d.ts.map +1 -0
  239. package/dist/frontend/StaticMountRegistry.js +94 -0
  240. package/dist/frontend/StaticMountRegistry.js.map +1 -0
  241. package/dist/frontend/index.d.ts +7 -0
  242. package/dist/frontend/index.d.ts.map +1 -0
  243. package/{src → dist}/frontend/index.js +4 -2
  244. package/dist/frontend/index.js.map +1 -0
  245. package/dist/frontend/pathUtils.d.ts +8 -0
  246. package/dist/frontend/pathUtils.d.ts.map +1 -0
  247. package/dist/frontend/pathUtils.js +17 -0
  248. package/dist/frontend/pathUtils.js.map +1 -0
  249. package/dist/frontend/plugins/index.d.ts +2 -0
  250. package/dist/frontend/plugins/index.d.ts.map +1 -0
  251. package/{src → dist}/frontend/plugins/index.js +1 -1
  252. package/dist/frontend/plugins/index.js.map +1 -0
  253. package/dist/frontend/plugins/viteFrontend.d.ts +51 -0
  254. package/dist/frontend/plugins/viteFrontend.d.ts.map +1 -0
  255. package/dist/frontend/plugins/viteFrontend.js +134 -0
  256. package/dist/frontend/plugins/viteFrontend.js.map +1 -0
  257. package/dist/frontend/types.d.ts +25 -0
  258. package/dist/frontend/types.d.ts.map +1 -0
  259. package/dist/frontend/types.js +2 -0
  260. package/dist/frontend/types.js.map +1 -0
  261. package/dist/index.d.ts +17 -0
  262. package/dist/index.d.ts.map +1 -0
  263. package/dist/index.js +32 -0
  264. package/dist/index.js.map +1 -0
  265. package/dist/internals.d.ts +21 -0
  266. package/dist/internals.d.ts.map +1 -0
  267. package/{src → dist}/internals.js +12 -14
  268. package/dist/internals.js.map +1 -0
  269. package/dist/plugins/PluginManager.d.ts +209 -0
  270. package/dist/plugins/PluginManager.d.ts.map +1 -0
  271. package/dist/plugins/PluginManager.js +365 -0
  272. package/dist/plugins/PluginManager.js.map +1 -0
  273. package/dist/plugins/ScopedPostgres.d.ts +78 -0
  274. package/dist/plugins/ScopedPostgres.d.ts.map +1 -0
  275. package/dist/plugins/ScopedPostgres.js +190 -0
  276. package/dist/plugins/ScopedPostgres.js.map +1 -0
  277. package/dist/plugins/ScopedRedis.d.ts +88 -0
  278. package/dist/plugins/ScopedRedis.d.ts.map +1 -0
  279. package/dist/plugins/ScopedRedis.js +169 -0
  280. package/dist/plugins/ScopedRedis.js.map +1 -0
  281. package/dist/plugins/index.d.ts +289 -0
  282. package/dist/plugins/index.d.ts.map +1 -0
  283. package/dist/plugins/index.js +1942 -0
  284. package/dist/plugins/index.js.map +1 -0
  285. package/dist/plugins/types.d.ts +59 -0
  286. package/dist/plugins/types.d.ts.map +1 -0
  287. package/dist/plugins/types.js +2 -0
  288. package/dist/plugins/types.js.map +1 -0
  289. package/dist/registry/ServiceRegistry.d.ts +305 -0
  290. package/dist/registry/ServiceRegistry.d.ts.map +1 -0
  291. package/dist/registry/ServiceRegistry.js +735 -0
  292. package/dist/registry/ServiceRegistry.js.map +1 -0
  293. package/dist/scaling/ScaleAdvisor.d.ts +214 -0
  294. package/dist/scaling/ScaleAdvisor.d.ts.map +1 -0
  295. package/dist/scaling/ScaleAdvisor.js +526 -0
  296. package/dist/scaling/ScaleAdvisor.js.map +1 -0
  297. package/dist/services/Service.d.ts +164 -0
  298. package/dist/services/Service.d.ts.map +1 -0
  299. package/dist/services/Service.js +106 -0
  300. package/dist/services/Service.js.map +1 -0
  301. package/dist/services/worker-bootstrap.d.ts +15 -0
  302. package/dist/services/worker-bootstrap.d.ts.map +1 -0
  303. package/dist/services/worker-bootstrap.js +744 -0
  304. package/dist/services/worker-bootstrap.js.map +1 -0
  305. package/dist/templates/auth-service.d.ts +42 -0
  306. package/dist/templates/auth-service.d.ts.map +1 -0
  307. package/dist/templates/auth-service.js +54 -0
  308. package/dist/templates/auth-service.js.map +1 -0
  309. package/dist/templates/identity-service.d.ts +50 -0
  310. package/dist/templates/identity-service.d.ts.map +1 -0
  311. package/dist/templates/identity-service.js +62 -0
  312. package/dist/templates/identity-service.js.map +1 -0
  313. package/dist/types/contract.d.ts +120 -0
  314. package/dist/types/contract.d.ts.map +1 -0
  315. package/dist/types/contract.js +69 -0
  316. package/dist/types/contract.js.map +1 -0
  317. package/package.json +79 -20
  318. package/src/core/DirectMessageBus.js +0 -364
  319. package/src/core/EndpointResolver.js +0 -259
  320. package/src/core/ForgeContext.js +0 -2236
  321. package/src/core/ForgeHost.js +0 -122
  322. package/src/core/ForgePlatform.js +0 -145
  323. package/src/core/Ingress.js +0 -768
  324. package/src/core/Interceptors.js +0 -420
  325. package/src/core/MessageBus.js +0 -321
  326. package/src/core/Prometheus.js +0 -305
  327. package/src/core/RequestContext.js +0 -413
  328. package/src/core/RoutingStrategy.js +0 -330
  329. package/src/core/Supervisor.js +0 -1349
  330. package/src/core/ThreadAllocator.js +0 -196
  331. package/src/core/WorkerChannelManager.js +0 -879
  332. package/src/core/config.js +0 -637
  333. package/src/core/host-config.js +0 -311
  334. package/src/core/network-utils.js +0 -166
  335. package/src/core/platform-config.js +0 -308
  336. package/src/decorators/ServiceProxy.js +0 -904
  337. package/src/decorators/index.js +0 -571
  338. package/src/deploy/NginxGenerator.js +0 -865
  339. package/src/deploy/PlatformManifestGenerator.js +0 -96
  340. package/src/deploy/RouteManifestGenerator.js +0 -112
  341. package/src/deploy/index.js +0 -984
  342. package/src/frontend/FrontendDevLifecycle.js +0 -65
  343. package/src/frontend/FrontendPluginOrchestrator.js +0 -187
  344. package/src/frontend/SiteResolver.js +0 -63
  345. package/src/frontend/StaticMountRegistry.js +0 -90
  346. package/src/frontend/plugins/viteFrontend.js +0 -79
  347. package/src/frontend/types.js +0 -35
  348. package/src/index.js +0 -58
  349. package/src/plugins/PluginManager.js +0 -537
  350. package/src/plugins/ScopedPostgres.js +0 -192
  351. package/src/plugins/ScopedRedis.js +0 -142
  352. package/src/plugins/index.js +0 -1756
  353. package/src/registry/ServiceRegistry.js +0 -797
  354. package/src/scaling/ScaleAdvisor.js +0 -442
  355. package/src/services/Service.js +0 -195
  356. package/src/services/worker-bootstrap.js +0 -679
  357. package/src/templates/auth-service.js +0 -65
  358. package/src/templates/identity-service.js +0 -75
package/dist/core/Supervisor.js
@@ -0,0 +1,1418 @@
1
+ import cluster from "node:cluster";
2
+ import { timingSafeEqual } from "node:crypto";
3
+ import { EventEmitter } from "node:events";
4
+ import fs from "node:fs";
5
+ import { createServer } from "node:http";
6
+ import { createServer as createNetServer } from "node:net";
7
+ import { tmpdir } from "node:os";
8
+ import path from "node:path";
9
+ import { fileURLToPath } from "node:url";
10
+ import { ServiceRegistry } from "../registry/ServiceRegistry.js";
11
+ import { ScaleAdvisor } from "../scaling/ScaleAdvisor.js";
12
+ import { AlertSink } from "./AlertSink.js";
13
+ import { RegistryMode, ServiceMode, ServiceType, } from "./config-enums.js";
14
+ import { DirectMessageBus } from "./DirectMessageBus.js";
15
+ import { IPC_PROTOCOL_VERSION, isExpectedIpcError } from "./ipc-errors.js";
16
+ import { ThreadAllocator } from "./ThreadAllocator.js";
17
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
18
+ const WORKER_BOOTSTRAP = path.join(__dirname, "..", "services", "worker-bootstrap.js");
19
+ // L1: Restart policy constants
20
+ const RESTART_BASE_BACKOFF_MS = 2000;
21
+ const RESTART_MAX_BACKOFF_MS = 60000;
22
+ const MAX_RESTARTS_PER_WINDOW = 5;
23
+ const RESTART_WINDOW_MS = 300000;
24
+ // L5: Rate-limit restart warnings — one per 5s per group
25
+ const RESTART_WARNING_INTERVAL_MS = 5000;
26
+ // C5: Overall shutdown deadline
27
+ const SHUTDOWN_DEADLINE_MS = 25000;
28
+ // C2: Forbidden env keys
29
+ const FORBIDDEN_ENV_KEYS = new Set(["PATH", "LD_PRELOAD", "LD_LIBRARY_PATH", "NODE_OPTIONS", "NODE_EXTRA_CA_CERTS"]);
30
+ const VALID_ENV_KEY = /^[A-Z_][A-Z0-9_]*$/i;
31
+ // C3: Max env var size before file fallback
32
+ const MAX_ENDPOINT_ENV_SIZE = 65536;
33
+ /**
34
+ * Supervisor v2
35
+ *
36
+ * Key differences from v1:
37
+ *
38
+ * - Understands service types (edge/internal/background)
39
+ * - Only edge services get HTTP servers
40
+ * - Colocated services share a process (same event loop)
41
+ * - Channels are dependency-based, not full mesh
42
+ * - Thread allocation is per process group, not per service
43
+ */
44
+ export class Supervisor extends EventEmitter {
45
+ config;
46
+ services;
47
+ groups;
48
+ channels;
49
+ options;
50
+ allocator;
51
+ messageBus;
52
+ registry;
53
+ scaleAdvisor;
54
+ alertSink;
55
+ plugins;
56
+ _pluginEnv;
57
+ _failedPlugins;
58
+ workerMap;
59
+ groupWorkers;
60
+ allocation;
61
+ _metricsServer;
62
+ _shuttingDown;
63
+ _restartHistory;
64
+ _scalingDown;
65
+ _pendingRestarts;
66
+ _killTimers;
67
+ _restartWarningTimes;
68
+ _endpointTempFile;
69
+ _sitesTempFile;
70
+ _metricsRequestSeq;
71
+ _pendingMetricsSnapshots;
72
+ _groupReadyWorkers;
73
+ _groupReadyLogged;
74
+ _nextWorkerIndex;
75
+ _heartbeatInterval;
76
+ _lastHeartbeat;
77
+ _workersReady;
78
+ _workerRestartCount;
79
+ _workerErrorGuards;
80
+ _cachedEndpointJson;
81
+ _pidFilePath;
82
+ constructor(config, options = {}) {
83
+ super();
84
+ this.config = config;
85
+ this.services = config.services;
86
+ this.groups = config.groups;
87
+ this.channels = config.channels; // declared dependency channels
88
+ this.options = options;
89
+ this.allocator = new ThreadAllocator({
90
+ cpus: options.cpus,
91
+ reserved: options.reserved,
92
+ });
93
+ this.messageBus = new DirectMessageBus();
94
+ // Service registry — starts embedded, upgrades to multicast/external
95
+ this.registry = new ServiceRegistry({
96
+ mode: options.registryMode ?? RegistryMode.EMBEDDED,
97
+ host: options.host,
98
+ httpBasePort: options.httpBasePort ?? 4000,
99
+ });
100
+ // Scale advisor — monitors health and recommends actions
101
+ this.scaleAdvisor = new ScaleAdvisor(this.registry, {
102
+ evaluationIntervalMs: options.evaluationIntervalMs ?? 30000,
103
+ });
104
+ // Alert sink — delivers critical event alerts via webhook or custom sink
105
+ const alertsConfig = config.alerts;
106
+ this.alertSink = new AlertSink(alertsConfig ?? {});
107
+ // Log scaling recommendations
108
+ this.scaleAdvisor.on("recommendation", (rec) => {
109
+ const icon = {
110
+ scale_up: "\u2191",
111
+ migrate: "\u2192",
112
+ split_out: "\u229E",
113
+ scale_down: "\u2193",
114
+ };
115
+ console.log(`\n ${icon[rec.action] ?? "\u2022"} SCALE: ${rec.service} \u2014 ${rec.action}`);
116
+ console.log(` ${rec.reason}`);
117
+ if (rec.details.command)
118
+ console.log(` Run: ${rec.details.command}`);
119
+ });
120
+ // Plugins
121
+ this.plugins = (config.plugins ?? []);
122
+ this._pluginEnv = {};
123
+ this._failedPlugins = new Set();
124
+ this.workerMap = new Map();
125
+ this.groupWorkers = new Map();
126
+ this.allocation = new Map();
127
+ this._metricsServer = null;
128
+ this._shuttingDown = false;
129
+ this._restartHistory = new Map();
130
+ this._scalingDown = new Set();
131
+ this._pendingRestarts = new Map();
132
+ this._killTimers = new Map();
133
+ this._restartWarningTimes = new Map();
134
+ this._endpointTempFile = null;
135
+ this._sitesTempFile = null;
136
+ this._metricsRequestSeq = 0;
137
+ this._pendingMetricsSnapshots = new Map();
138
+ this._groupReadyWorkers = new Map();
139
+ this._groupReadyLogged = new Set();
140
+ this._nextWorkerIndex = {};
141
+ this._heartbeatInterval = null;
142
+ this._lastHeartbeat = new Map();
143
+ this._workersReady = new Map();
144
+ this._workerRestartCount = 0;
145
+ this._workerErrorGuards = new WeakSet();
146
+ this._cachedEndpointJson = undefined;
147
+ this._pidFilePath = null;
148
+ }
149
+ async start() {
150
+ if (!cluster.isPrimary) {
151
+ throw new Error("Supervisor.start() must be called from the primary process");
152
+ }
153
+ // S6: Reject placeholder JWT_SECRET in production
154
+ if (process.env.NODE_ENV === "production" && process.env.JWT_SECRET === "CHANGE_ME_BEFORE_DEPLOY") {
155
+ console.error('FATAL: JWT_SECRET is set to the placeholder value "CHANGE_ME_BEFORE_DEPLOY". Set a real secret before deploying.');
156
+ process.exit(1);
157
+ }
158
+ // M-1 Security: Warn if FORGE_INTERNAL_SECRET is not set in production
159
+ if (process.env.NODE_ENV === "production" && !process.env.FORGE_INTERNAL_SECRET) {
160
+ console.error("\u26a0 WARNING: FORGE_INTERNAL_SECRET is not set. Internal endpoints (/__forge/*) will reject requests in production without a valid HMAC signature.");
161
+ }
162
+ // Register signal handlers early so signals during startup are caught
163
+ process.once("SIGTERM", () => this.shutdown());
164
+ process.once("SIGINT", () => this.shutdown());
165
+ // SUP-H1: Handle SIGQUIT for graceful shutdown instead of default core dump
166
+ process.once("SIGQUIT", () => this.shutdown());
167
+ // Preflight check: fail fast with a single clear error before forking workers.
168
+ await this._assertStartupPortsAvailable();
169
+ // Validate and collect plugin env vars before forking workers
170
+ this._failedPlugins = new Set();
171
+ if (this.plugins.length > 0) {
172
+ for (const plugin of this.plugins) {
173
+ const pName = plugin.name ?? "unknown";
174
+ try {
175
+ if (plugin.validate) {
176
+ await plugin.validate();
177
+ }
178
+ if (plugin.env) {
179
+ Object.assign(this._pluginEnv, plugin.env());
180
+ }
181
+ }
182
+ catch (err) {
183
+ console.warn(` \u26a0 Plugin "${pName}" unavailable: ${err.message}`);
184
+ this._failedPlugins.add(pName);
185
+ }
186
+ }
187
+ const available = this.plugins.filter((p) => !this._failedPlugins.has(p.name ?? "unknown"));
188
+ if (available.length > 0) {
189
+ console.log(` Plugins: ${available.map((p) => p.name).join(", ")}`);
190
+ }
191
+ if (this._failedPlugins.size > 0) {
192
+ console.warn(` Failed plugins: ${[...this._failedPlugins].join(", ")}`);
193
+ }
194
+ }
195
+ console.log(this._banner());
196
+ // Allocate threads per process group (not per service)
197
+ this._allocateGroups();
198
+ // Display allocation
199
+ this._printAllocation();
200
+ cluster.setupPrimary({
201
+ exec: WORKER_BOOTSTRAP,
202
+ silent: false,
203
+ });
204
+ // Register exit handler BEFORE forking so early crashes are caught
205
+ cluster.on("exit", (worker, code, signal) => {
206
+ this._handleWorkerExit(worker, code, signal);
207
+ });
208
+ // REG-H3: Start registry BEFORE building endpoint map so multicast discovery
209
+ // can contribute to the initial snapshot workers receive at fork time.
210
+ await this.registry.start();
211
+ this.scaleAdvisor.start();
212
+ // Register all local services in the registry before building endpoint map
213
+ for (const [name, svc] of Object.entries(this.services)) {
214
+ if (svc.type === ServiceType.REMOTE)
215
+ continue;
216
+ const groupName = svc.group ?? `_isolated:${name}`;
217
+ this.registry.register({
218
+ name,
219
+ ports: { http: svc.port },
220
+ udsPath: null,
221
+ workers: this.allocation.get(groupName) ?? 1,
222
+ contract: {
223
+ methods: [], // populated by worker after loading class
224
+ events: [],
225
+ },
226
+ metadata: {
227
+ group: groupName,
228
+ },
229
+ });
230
+ }
231
+ // P21: Pre-serialize endpoint map once for all worker forks
232
+ // REG-H3: Now built AFTER registry start + local registration so the map includes discovered services
233
+ this._cachedEndpointJson = JSON.stringify(this._buildEndpointMap());
234
+ // Fork workers for each process group
235
+ for (const [groupName, group] of Object.entries(this.groups)) {
236
+ const threadCount = this.allocation.get(groupName) ?? 1;
237
+ this.groupWorkers.set(groupName, []);
238
+ this._groupReadyWorkers.set(groupName, new Set());
239
+ for (let i = 0; i < threadCount; i++) {
240
+ this._forkGroupWorker(groupName, group, i);
241
+ }
242
+ }
243
+ // SUP-H3: Start heartbeat monitor on a safety timer in case some groups never become ready.
244
+ // This prevents a stuck group from blocking health monitoring for all other groups.
245
+ const heartbeatSafetyTimer = setTimeout(() => {
246
+ if (!this._heartbeatInterval && !this._shuttingDown) {
247
+ console.warn(" ⚠ Starting heartbeat monitor via safety timer (not all groups ready after 60s)");
248
+ this._startHeartbeatMonitor();
249
+ }
250
+ }, 60_000);
251
+ if (typeof heartbeatSafetyTimer.unref === "function")
252
+ heartbeatSafetyTimer.unref();
253
+ await this._startMetricsServer();
254
+ // REG-H1: Push topology updates to workers when registry discovers new/removed services
255
+ const pushEndpointUpdate = () => {
256
+ const endpointMap = this._buildEndpointMap();
257
+ const json = JSON.stringify(endpointMap);
258
+ // Skip if nothing changed
259
+ if (json === this._cachedEndpointJson)
260
+ return;
261
+ this._cachedEndpointJson = json;
262
+ const message = { type: "forge:endpoint-update", endpoints: endpointMap };
263
+ const clusterWorkers = cluster.workers ?? {};
264
+ for (const id of Object.keys(clusterWorkers)) {
265
+ const worker = clusterWorkers[Number(id)];
266
+ if (worker) {
267
+ this._sendWorkerMessage(worker, message, "endpoint update");
268
+ }
269
+ }
270
+ };
271
+ this.registry.on("discovered", pushEndpointUpdate);
272
+ this.registry.on("removed", pushEndpointUpdate);
273
+ this.registry.on("deregistered", pushEndpointUpdate);
274
+ // REG-H2: Push health status changes to workers so EndpointResolver can filter
275
+ const pushHealthUpdate = (reg) => {
276
+ if (!reg.ports?.http)
277
+ return;
278
+ const message = {
279
+ type: "forge:health-update",
280
+ host: reg.host,
281
+ port: reg.ports.http,
282
+ status: reg.health.status,
283
+ };
284
+ const clusterWorkers = cluster.workers ?? {};
285
+ for (const id of Object.keys(clusterWorkers)) {
286
+ const worker = clusterWorkers[Number(id)];
287
+ if (worker) {
288
+ this._sendWorkerMessage(worker, message, "health update");
289
+ }
290
+ }
291
+ };
292
+ this.registry.on("unhealthy", pushHealthUpdate);
293
+ // Print channel topology
294
+ this._printTopology();
295
+ // H4: Write PID file so `forge stop` can find us without the metrics endpoint
296
+ this._pidFilePath = path.join(process.cwd(), ".forge.pid");
297
+ try {
298
+ fs.writeFileSync(this._pidFilePath, String(process.pid));
299
+ }
300
+ catch {
301
+ /* ignore: PID file write failure is non-fatal */
302
+ }
303
+ console.log(`\n \u26a1 ThreadForge runtime started\n`);
304
+ }
305
+ /**
306
+ * Allocate threads per process group.
307
+ *
308
+ * Each group gets threads based on the highest weight of its member
309
+ * services. Colocated services share their group's allocation.
310
+ */
311
+ _allocateGroups() {
312
+ // Build a services-like map for the allocator, keyed by group name
313
+ const groupConfigs = {};
314
+ for (const [groupName, group] of Object.entries(this.groups)) {
315
+ groupConfigs[groupName] = {
316
+ name: groupName,
317
+ port: group.port ?? 0,
318
+ threads: group.threads === 0 ? "auto" : group.threads,
319
+ weight: group.weight || 1,
320
+ mode: ServiceMode.CLUSTER,
321
+ };
322
+ }
323
+ this.allocation = this.allocator.allocate(groupConfigs);
324
+ }
325
+ /**
326
+ * Fork a worker for a process group.
327
+ *
328
+ * The worker will load ALL services in the group within a single
329
+ * process. Colocated services communicate via direct function calls.
330
+ */
331
+ _forkGroupWorker(groupName, group, workerIndex) {
332
+ const serviceNames = group.services.map((s) => s.name);
333
+ const edgeService = group.services.find((s) => s.type === ServiceType.EDGE);
334
+ // Build comma-separated entry points for all services in the group
335
+ const entries = group.services.map((s) => `${s.name}=${s.entry}`).join(",");
336
+ const configuredHost = (() => {
337
+ if (typeof this.options.host === "string" && this.options.host.trim()) {
338
+ return this.options.host.trim();
339
+ }
340
+ if (typeof this.config.host === "string" && this.config.host.trim()) {
341
+ return this.config.host.trim();
342
+ }
343
+ return null;
344
+ })();
345
+ const env = {
346
+ ...process.env,
347
+ ...this._pluginEnv,
348
+ FORGE_GROUP_NAME: groupName,
349
+ FORGE_SERVICE_ENTRIES: entries,
350
+ FORGE_SERVICE_NAMES: serviceNames.join(","),
351
+ FORGE_PORT: edgeService ? String(edgeService.port) : "0", // 0 = no HTTP
352
+ FORGE_WORKER_ID: String(workerIndex),
353
+ FORGE_THREAD_COUNT: String(this.allocation.get(groupName) ?? 1),
354
+ FORGE_MODE: ServiceMode.CLUSTER,
355
+ FORGE_SERVICE_TYPES: group.services.map((s) => `${s.name}=${s.type}`).join(","),
356
+ // Port map for HTTP-based service-to-service calls (backward compat)
357
+ FORGE_SERVICE_PORTS: JSON.stringify(Object.fromEntries(Object.entries(this.services)
358
+ .filter(([, s]) => s.port)
359
+ .map(([name, s]) => [name, s.port]))),
360
+ // Full endpoint topology — includes remote hosts for multi-machine
361
+ // S10: Endpoint map may contain internal IPs — treat as trusted internal config, not user input.
362
+ // P21: Use cached JSON serialization; C3: File fallback when JSON exceeds 64KB
363
+ ...(() => {
364
+ const json = this._cachedEndpointJson ?? JSON.stringify(this._buildEndpointMap());
365
+ if (json.length > MAX_ENDPOINT_ENV_SIZE) {
366
+ if (!this._endpointTempFile) {
367
+ const tempFile = path.join(tmpdir(), `forge-endpoints-${process.pid}.json`);
368
+ // M-SEC-5: Restrict temp file permissions — contains internal topology
369
+ fs.writeFileSync(tempFile, json, { encoding: "utf8", mode: 0o600 });
370
+ this._endpointTempFile = tempFile;
371
+ }
372
+ return { FORGE_SERVICE_ENDPOINTS_FILE: this._endpointTempFile };
373
+ }
374
+ return { FORGE_SERVICE_ENDPOINTS: json };
375
+ })(),
376
+ ...(() => {
377
+ const configWithSites = this.config;
378
+ const sites = configWithSites._sites ?? this.config.sites;
379
+ const json = sites && Object.keys(sites).length > 0 ? JSON.stringify(sites) : "";
380
+ if (!json)
381
+ return { FORGE_SITES: "" };
382
+ if (json.length > MAX_ENDPOINT_ENV_SIZE) {
383
+ if (!this._sitesTempFile) {
384
+ const tempFile = path.join(tmpdir(), `forge-sites-${process.pid}.json`);
385
+ fs.writeFileSync(tempFile, json, { encoding: "utf8", mode: 0o600 });
386
+ this._sitesTempFile = tempFile;
387
+ }
388
+ return { FORGE_SITES_FILE: this._sitesTempFile };
389
+ }
390
+ return { FORGE_SITES: json };
391
+ })(),
392
+ // Registry mode and host for dynamic discovery
393
+ FORGE_REGISTRY_MODE: this.options.registryMode ?? RegistryMode.EMBEDDED,
394
+ // Plugin config — which plugins each service uses
395
+ FORGE_PLUGINS: JSON.stringify(this.plugins.map((p) => p.name)),
396
+ FORGE_CONFIG_PATH: this.config._configUrl ?? "",
397
+ FORGE_SERVICE_PLUGINS: JSON.stringify(Object.fromEntries(group.services.map((s) => [s.name, s.plugins ?? null]))),
398
+ FORGE_CHANNELS: JSON.stringify(this.channels.filter((ch) => serviceNames.includes(ch.from) || serviceNames.includes(ch.to))),
399
+ FORGE_INGRESS: this.config.ingress ? JSON.stringify(this.config.ingress) : "",
400
+ FORGE_SERVICE_RPC: JSON.stringify(Object.fromEntries(group.services
401
+ .filter((s) => s.rpc || s.rpcTargets)
402
+ .map((s) => [s.name, { rpc: s.rpc, rpcTargets: s.rpcTargets }]))),
403
+ };
404
+ if (configuredHost) {
405
+ env.FORGE_HOST = configuredHost;
406
+ }
407
+ if (this.config._isHostMode) {
408
+ const configAny = this.config;
409
+ env.FORGE_HOST_META = configAny._hostMetaJSON ?? JSON.stringify(configAny._hostMeta);
410
+ }
411
+ if (this.config._isPlatformMode) {
412
+ env.FORGE_PLATFORM_MODE = "1";
413
+ }
414
+ // C2: Set per-service env overrides with validation
415
+ for (const svc of group.services) {
416
+ for (const [key, value] of Object.entries(svc.env)) {
417
+ if (!VALID_ENV_KEY.test(key)) {
418
+ throw new Error(`Service "${svc.name}": invalid env key "${key}" \u2014 must match /^[A-Z_][A-Z0-9_]*$/i`);
419
+ }
420
+ if (FORBIDDEN_ENV_KEYS.has(key.toUpperCase())) {
421
+ throw new Error(`Service "${svc.name}": env key "${key}" is forbidden (security risk)`);
422
+ }
423
+ env[`FORGE_ENV_${svc.name.toUpperCase()}_${key}`] = value;
424
+ }
425
+ }
426
+ const worker = cluster.fork(env);
427
+ this._attachWorkerErrorGuard(worker, groupName, serviceNames, workerIndex);
428
+ this.workerMap.set(worker.id, {
429
+ groupName,
430
+ services: serviceNames,
431
+ workerId: workerIndex,
432
+ });
433
+ // H-5: Initialize heartbeat timestamp so the monitor doesn't kill
434
+ // workers forked after startup (restarts, scale-up)
435
+ this._lastHeartbeat.set(worker.id, Date.now());
436
+ const workers = this.groupWorkers.get(groupName) ?? [];
437
+ workers.push(worker.id);
438
+ this.groupWorkers.set(groupName, workers);
439
+ // Register with message bus — using service names, not group name
440
+ // so IPC addressing is still by service name
441
+ for (const svcName of serviceNames) {
442
+ this.messageBus.registerWorker(svcName, worker, "cluster");
443
+ }
444
+ worker.on("message", (msg) => this._handleWorkerMessage(worker, msg));
445
+ return worker;
446
+ }
447
+ _attachWorkerErrorGuard(worker, groupName, serviceNames, workerIndex) {
448
+ if (!worker || this._workerErrorGuards.has(worker))
449
+ return;
450
+ this._workerErrorGuards.add(worker);
451
+ worker.on("error", (err) => {
452
+ if (isExpectedIpcError(err))
453
+ return;
454
+ if (this._shuttingDown)
455
+ return;
456
+ console.error(` \u26a0 Worker ${groupName}[${workerIndex}] (${serviceNames.join("+")}) IPC error: ${err?.message ?? err}`);
457
+ });
458
+ }
459
+ _sendWorkerMessage(worker, message, label = "worker message") {
460
+ if (!worker)
461
+ return false;
462
+ if (typeof worker.isDead === "function" && worker.isDead())
463
+ return false;
464
+ if (typeof worker.isConnected === "function" && !worker.isConnected())
465
+ return false;
466
+ if (worker.process?.connected === false)
467
+ return false;
468
+ try {
469
+ worker.send(message);
470
+ return true;
471
+ }
472
+ catch (err) {
473
+ if (!isExpectedIpcError(err)) {
474
+ console.error(` \u26a0 Failed to send ${label}: ${err?.message ?? err}`);
475
+ }
476
+ return false;
477
+ }
478
+ }
479
+ _handleWorkerMessage(worker, msg) {
480
+ if (msg?.type === "forge:group-ready") {
481
+ // IPC version handshake: fail fast on protocol mismatch
482
+ const workerVersion = msg.ipcVersion;
483
+ if (workerVersion !== undefined && workerVersion !== IPC_PROTOCOL_VERSION) {
484
+ const info = this.workerMap.get(worker.id);
485
+ const groupName = info?.groupName ?? "unknown";
486
+ console.error(` FATAL: Worker ${groupName}[${info?.workerId ?? "?"}] IPC protocol version mismatch: ` +
487
+ `worker=${workerVersion}, supervisor=${IPC_PROTOCOL_VERSION}. ` +
488
+ `This usually means mismatched ThreadForge versions. ` +
489
+ `Restart all processes with the same version.`);
490
+ try {
491
+ worker.process.kill("SIGKILL");
492
+ }
493
+ catch {
494
+ /* ignore: worker may already be dead */
495
+ }
496
+ return;
497
+ }
498
+ // O1: Mark worker as ready for the /health/ready readiness probe
499
+ this._workersReady.set(worker.id, true);
500
+ const info = this.workerMap.get(worker.id);
501
+ if (!info)
502
+ return;
503
+ const groupName = info.groupName;
504
+ const readySet = this._groupReadyWorkers.get(groupName) ?? new Set();
505
+ readySet.add(worker.id);
506
+ this._groupReadyWorkers.set(groupName, readySet);
507
+ const expected = this.allocation.get(groupName) ?? 1;
508
+ if (readySet.size >= expected && !this._groupReadyLogged.has(groupName)) {
509
+ this._groupReadyLogged.add(groupName);
510
+ const group = this.groups[groupName];
511
+ const edgeService = group?.services?.find((s) => s.type === ServiceType.EDGE);
512
+ const portLabel = edgeService?.port ? ` on port ${edgeService.port}` : "";
513
+ const svcLabel = group?.services?.map((s) => s.name).join(", ") ?? groupName;
514
+ console.log(` \u2713 ${svcLabel}: ${expected} workers ready${portLabel}`);
515
+ // H-5: Start heartbeat monitor once all groups are ready
516
+ if (!this._heartbeatInterval) {
517
+ const allReady = [...this._groupReadyWorkers.entries()].every(([gn, set]) => set.size >= (this.allocation.get(gn) ?? 1));
518
+ if (allReady) {
519
+ this._startHeartbeatMonitor();
520
+ }
521
+ }
522
+ }
523
+ return;
524
+ }
525
+ if (msg?.type === "forge:fatal-error") {
526
+ const info = this.workerMap.get(worker.id);
527
+ const groupName = info?.groupName ?? "unknown";
528
+ const workerId = info?.workerId ?? "?";
529
+ console.error(` \u2716 Worker ${groupName}[${workerId}] fatal error: ${msg.error} - ${msg.message}`);
530
+ if (msg.port) {
531
+ console.error(` \u2716 Failed to bind to port ${msg.port}. Check permissions or port availability.`);
532
+ }
533
+ return;
534
+ }
535
+ // H-5: Track heartbeat responses from workers
536
+ // Workers respond to forge:health-check with forge:health-response,
537
+ // so we accept both message types for heartbeat tracking.
538
+ if (msg?.type === "forge:heartbeat-response" || msg?.type === "forge:health-response") {
539
+ this._lastHeartbeat.set(worker.id, Date.now());
540
+ return;
541
+ }
542
+ if (msg?.type === "forge:metrics-snapshot-response" && msg.requestId) {
543
+ const pending = this._pendingMetricsSnapshots.get(msg.requestId);
544
+ if (!pending)
545
+ return;
546
+ if (typeof msg.metrics === "string" && msg.metrics.trim().length > 0) {
547
+ pending.chunks.push(msg.metrics);
548
+ }
549
+ if (msg.error) {
550
+ pending.chunks.push(`# Worker ${worker.id} metrics error: ${msg.error}`);
551
+ }
552
+ pending.expected.delete(worker.id);
553
+ if (pending.expected.size === 0) {
554
+ pending.finish();
555
+ }
556
+ }
557
+ }
558
+ _mergePrometheusExpositions(expositions) {
559
+ const lines = [];
560
+ const seenMeta = new Set();
561
+ for (const chunk of expositions) {
562
+ if (typeof chunk !== "string")
563
+ continue;
564
+ for (const rawLine of chunk.split(/\r?\n/)) {
565
+ const line = rawLine.trimEnd();
566
+ if (!line)
567
+ continue;
568
+ if (line.startsWith("# HELP ") || line.startsWith("# TYPE ")) {
569
+ if (seenMeta.has(line))
570
+ continue;
571
+ seenMeta.add(line);
572
+ }
573
+ lines.push(line);
574
+ }
575
+ }
576
+ if (lines.length === 0) {
577
+ return "# No worker metrics available\n";
578
+ }
579
+ return `${lines.join("\n")}\n`;
580
+ }
581
+ _collectMetricsSnapshot(timeoutMs = 1000) {
582
+ const clusterWorkers = cluster.workers ?? {};
583
+ const activeWorkers = Object.values(clusterWorkers).filter((worker) => worker != null && !worker.isDead());
584
+ if (activeWorkers.length === 0) {
585
+ return Promise.resolve("# No worker metrics available\n");
586
+ }
587
+ const requestId = `metrics-${process.pid}-${Date.now()}-${++this._metricsRequestSeq}`;
588
+ return new Promise((resolve) => {
589
+ const expected = new Set(activeWorkers.map((worker) => worker.id));
590
+ const chunks = [];
591
+ let finished = false;
592
+ const finish = () => {
593
+ if (finished)
594
+ return;
595
+ finished = true;
596
+ const pending = this._pendingMetricsSnapshots.get(requestId);
597
+ if (pending?.timer)
598
+ clearTimeout(pending.timer);
599
+ this._pendingMetricsSnapshots.delete(requestId);
600
+ resolve(this._mergePrometheusExpositions(chunks));
601
+ };
602
+ const timer = setTimeout(finish, timeoutMs);
603
+ if (typeof timer.unref === "function")
604
+ timer.unref();
605
+ this._pendingMetricsSnapshots.set(requestId, { expected, chunks, timer, finish });
606
+ for (const worker of activeWorkers) {
607
+ const sent = this._sendWorkerMessage(worker, { type: "forge:metrics-snapshot", requestId }, "metrics snapshot request");
608
+ if (!sent)
609
+ expected.delete(worker.id);
610
+ }
611
+ if (expected.size === 0) {
612
+ finish();
613
+ }
614
+ });
615
+ }
616
+ _handleWorkerExit(worker, code, signal) {
617
+ const info = this.workerMap.get(worker.id);
618
+ if (!info)
619
+ return;
620
+ const { groupName, services, workerId } = info;
621
+ // CR-1: Find the worker's slot index in the group before removing it
622
+ const workers = this.groupWorkers.get(groupName) ?? [];
623
+ const workerSlotIndex = workers.indexOf(worker.id);
624
+ // SUP-H2: Guard against -1 index — use worker.id as unique key fallback
625
+ const slotKey = workerSlotIndex !== -1 ? `slot${workerSlotIndex}` : `wid${worker.id}`;
626
+ // CR-2: Always perform cleanup even during shutdown — only skip restart/fork logic
627
+ this.workerMap.delete(worker.id);
628
+ this._groupReadyWorkers.get(groupName)?.delete(worker.id);
629
+ this._lastHeartbeat.delete(worker.id);
630
+ this._workersReady.delete(worker.id);
631
+ // RT-H3: Clear any pending SIGKILL timer for this worker
632
+ const killTimer = this._killTimers.get(worker.id);
633
+ if (killTimer) {
634
+ clearTimeout(killTimer);
635
+ this._killTimers.delete(worker.id);
636
+ }
637
+ if (workerSlotIndex !== -1)
638
+ workers.splice(workerSlotIndex, 1);
639
+ // Unregister from message bus
640
+ for (let i = 0; i < services.length; i++) {
641
+ const svcName = services[i];
642
+ this.messageBus.unregisterWorker(svcName, worker.id, {
643
+ suppressBroadcast: i < services.length - 1,
644
+ });
645
+ }
646
+ // SUP-M1: Invalidate cached endpoint map so restarted workers get fresh topology
647
+ this._cachedEndpointJson = undefined;
648
+ if (this._endpointTempFile) {
649
+ try {
650
+ fs.unlinkSync(this._endpointTempFile);
651
+ }
652
+ catch { /* ignore */ }
653
+ this._endpointTempFile = null;
654
+ }
655
+ // CR-2: During shutdown, only do cleanup (above) — skip restart/fork logic
656
+ if (this._shuttingDown)
657
+ return;
658
+ // If this worker was intentionally removed during scale-down, don't restart
659
+ if (this._scalingDown.has(worker.id)) {
660
+ this._scalingDown.delete(worker.id);
661
+ // MED-4: Clean up restart history for removed worker
662
+ const cooldownKey = `${groupName}:${slotKey}`;
663
+ this._restartHistory.delete(cooldownKey);
664
+ console.log(` \u2193 Worker ${groupName}[${workerId}] (${services.join("+")}) removed (scale-down)`);
665
+ return;
666
+ }
667
+ // Exit code 100 indicates fatal configuration error (e.g., EPERM on port bind)
668
+ // Don't restart — log clear message and stop
669
+ if (code === 100) {
670
+ console.error(` \u2716 Worker ${groupName}[${workerId}] (${services.join("+")}) failed with fatal error \u2014 not restarting`);
671
+ console.error(` \u2716 Check worker logs above for details (likely port permission issue)`);
672
+ // Clean up restart history to prevent future attempts
673
+ const cooldownKey = `${groupName}:${slotKey}`;
674
+ this._restartHistory.delete(cooldownKey);
675
+ this._pendingRestarts.delete(cooldownKey);
676
+ return;
677
+ }
678
+ const reason = signal ? `signal ${signal}` : `code ${code}`;
679
+ // Alert: worker crash
680
+ this.alertSink.emit("worker.crash", `Worker ${groupName}[${workerId}] (${services.join("+")}) exited: ${reason}`, "warning", { groupName, workerId, services, reason, code, signal });
681
+ // L5: Rate-limit restart warnings per group
682
+ const now = Date.now();
683
+ const lastWarning = this._restartWarningTimes.get(groupName) ?? 0;
684
+ if (now - lastWarning >= RESTART_WARNING_INTERVAL_MS) {
685
+ console.error(` \u26a0 Worker ${groupName}[${workerId}] (${services.join("+")}) exited: ${reason}`);
686
+ this._restartWarningTimes.set(groupName, now);
687
+ }
688
+ // CR-1: Key by worker slot index (not cluster worker.id) so restart history persists across restarts
689
+ const cooldownKey = `${groupName}:${slotKey}`;
690
+ const history = this._restartHistory.get(cooldownKey) ?? { count: 0, firstRestart: now, lastRestart: 0 };
691
+ // Reset counter if outside the restart window
692
+ if (now - history.firstRestart > RESTART_WINDOW_MS) {
693
+ history.count = 0;
694
+ history.firstRestart = now;
695
+ }
696
+ if (history.count >= MAX_RESTARTS_PER_WINDOW) {
697
+ console.error(` \u26a0 ${groupName}[${workerId}] exceeded max restarts (${MAX_RESTARTS_PER_WINDOW} in ${RESTART_WINDOW_MS / 60000}min), not restarting`);
698
+ // Alert: restart limit exhausted — critical because the worker will not be restarted
699
+ this.alertSink.emit("worker.restart_limit", `Worker ${groupName}[${workerId}] (${services.join("+")}) exceeded max restarts (${MAX_RESTARTS_PER_WINDOW}/${RESTART_WINDOW_MS / 60000}min) — not restarting`, "critical", { groupName, workerId, services, restartCount: history.count, windowMs: RESTART_WINDOW_MS });
700
+ this._restartHistory.delete(cooldownKey);
701
+ return;
702
+ }
703
+ // Exponential backoff with constants
704
+ const backoffMs = Math.min(RESTART_BASE_BACKOFF_MS * 2 ** history.count, RESTART_MAX_BACKOFF_MS);
705
+ const timeSinceLast = now - history.lastRestart;
706
+ if (timeSinceLast < backoffMs) {
707
+ const remaining = backoffMs - timeSinceLast;
708
+ console.log(` \u21bb Delaying restart for ${groupName}[${workerId}] (${remaining}ms remaining in backoff)`);
709
+ // Cancel any existing pending restart for this slot
710
+ const existingTimer = this._pendingRestarts.get(cooldownKey);
711
+ if (existingTimer)
712
+ clearTimeout(existingTimer);
713
+ const timer = setTimeout(() => {
714
+ this._pendingRestarts.delete(cooldownKey);
715
+ if (this._shuttingDown)
716
+ return;
717
+ if (!this.groups[groupName])
718
+ return;
719
+ history.count++;
720
+ history.lastRestart = Date.now();
721
+ this._restartHistory.set(cooldownKey, history);
722
+ console.log(` \u21bb Restarting ${groupName}[${workerId}] (attempt ${history.count}/${MAX_RESTARTS_PER_WINDOW}, backoff ${backoffMs}ms)...`);
723
+ // O15: Track worker restart metric
724
+ this._workerRestartCount++;
725
+ this._forkGroupWorker(groupName, this.groups[groupName], workerId);
726
+ }, remaining);
727
+ timer.unref();
728
+ this._pendingRestarts.set(cooldownKey, timer);
729
+ return;
730
+ }
731
+ if (!this.groups[groupName])
732
+ return;
733
+ history.count++;
734
+ history.lastRestart = now;
735
+ this._restartHistory.set(cooldownKey, history);
736
+ console.log(` \u21bb Restarting ${groupName}[${workerId}] (attempt ${history.count}/${MAX_RESTARTS_PER_WINDOW}, backoff ${backoffMs}ms)...`);
737
+ // O15: Track worker restart metric
738
+ this._workerRestartCount++;
739
+ if (this._shuttingDown)
740
+ return;
741
+ this._forkGroupWorker(groupName, this.groups[groupName], workerId);
742
+ }
743
+ _startupPortsToCheck() {
744
+ const targets = [];
745
+ const seen = new Set();
746
+ for (const [name, svc] of Object.entries(this.services)) {
747
+ if (svc?.type !== ServiceType.EDGE)
748
+ continue;
749
+ if (!Number.isInteger(svc.port) || !svc.port || svc.port <= 0)
750
+ continue;
751
+ if (seen.has(svc.port))
752
+ continue;
753
+ seen.add(svc.port);
754
+ targets.push({ port: svc.port, purpose: `service "${name}"` });
755
+ }
756
+ return targets;
757
+ }
758
+ _isPortAvailable(port, host = "127.0.0.1") {
759
+ if (!Number.isInteger(port) || port <= 0)
760
+ return Promise.resolve(true);
761
+ return new Promise((resolve) => {
762
+ const probe = createNetServer();
763
+ let settled = false;
764
+ const finish = (available) => {
765
+ if (settled)
766
+ return;
767
+ settled = true;
768
+ if (available) {
769
+ probe.close(() => resolve(true));
770
+ }
771
+ else {
772
+ resolve(false);
773
+ }
774
+ };
775
+ probe.once("error", (err) => {
776
+ if (err.code === "EADDRINUSE" || err.code === "EACCES" || err.code === "EPERM") {
777
+ finish(false);
778
+ return;
779
+ }
780
+ finish(false);
781
+ });
782
+ probe.once("listening", () => finish(true));
783
+ probe.listen(port, host);
784
+ });
785
+ }
786
+ async _assertStartupPortsAvailable() {
787
+ const targets = this._startupPortsToCheck();
788
+ for (const { port, purpose } of targets) {
789
+ const available = await this._isPortAvailable(port);
790
+ if (!available) {
791
+ throw new Error(`Startup preflight failed: port ${port} (${purpose}) is unavailable ` +
792
+ `(already in use or permission denied).`);
793
+ }
794
+ }
795
+ }
796
+ async scale(groupName, newCount) {
797
+ const group = this.groups[groupName];
798
+ if (!group)
799
+ throw new Error(`Unknown group: ${groupName}`);
800
+ // M-2: Bounds checking for newCount
801
+ if (newCount < 1 || newCount > 64) {
802
+ throw new Error(`Invalid worker count ${newCount} for group "${groupName}": must be between 1 and 64`);
803
+ }
804
+ const currentIds = this.groupWorkers.get(groupName) ?? [];
805
+ const currentCount = currentIds.length;
806
+ if (newCount === currentCount)
807
+ return;
808
+ if (newCount > currentCount) {
809
+ const toAdd = newCount - currentCount;
810
+ console.log(` \u2191 Scaling ${groupName} ${currentCount} \u2192 ${newCount} (+${toAdd})`);
811
+ // H-3: Reset groupReadyLogged so ready message is logged again for new workers
812
+ this._groupReadyLogged.delete(groupName);
813
+ // Use a monotonic counter to avoid index collisions after scale-down + scale-up
814
+ if (this._nextWorkerIndex[groupName] === undefined) {
815
+ this._nextWorkerIndex[groupName] = currentCount;
816
+ }
817
+ for (let i = 0; i < toAdd; i++) {
818
+ const workerIndex = this._nextWorkerIndex[groupName]++;
819
+ // Clear any stale restart history so new workers don't inherit crash counts
820
+ this._restartHistory.delete(`${groupName}:${workerIndex}`);
821
+ this._forkGroupWorker(groupName, group, workerIndex);
822
+ }
823
+ }
824
+ else {
825
+ const toRemove = currentCount - newCount;
826
+ console.log(` \u2193 Scaling ${groupName} ${currentCount} \u2192 ${newCount} (-${toRemove})`);
827
+ for (let i = 0; i < toRemove; i++) {
828
+ const wid = currentIds[currentIds.length - 1 - i];
829
+ this._scalingDown.add(wid);
830
+ const clusterWorkers = cluster.workers ?? {};
831
+ const worker = clusterWorkers[wid];
832
+ if (worker) {
833
+ // SUP-H5: Use worker.disconnect() for proper IPC channel teardown
834
+ // instead of direct SIGTERM which bypasses cluster lifecycle
835
+ try {
836
+ worker.disconnect();
837
+ }
838
+ catch {
839
+ /* ignore: worker may already be disconnected */
840
+ }
841
+ // RT-H3: Force SIGKILL after 10s if worker hasn't exited
842
+ const killTimer = setTimeout(() => {
843
+ this._killTimers.delete(wid);
844
+ try {
845
+ if (!worker.isDead()) {
846
+ console.error(` \u26a0 Worker ${wid} did not exit after SIGTERM, sending SIGKILL`);
847
+ worker.process.kill("SIGKILL");
848
+ }
849
+ }
850
+ catch {
851
+ /* ignore: worker may have already exited between isDead() check and kill */
852
+ }
853
+ }, 10_000);
854
+ killTimer.unref();
855
+ this._killTimers.set(wid, killTimer);
856
+ // H3: Clean up kill timer if worker exits before SIGKILL fires
857
+ worker.once("exit", () => {
858
+ const t = this._killTimers.get(wid);
859
+ if (t) {
860
+ clearTimeout(t);
861
+ this._killTimers.delete(wid);
862
+ }
863
+ });
864
+ }
865
+ }
866
+ }
867
+ this.allocation.set(groupName, newCount);
868
+ }
869
+ /**
870
+ * H-5: Start heartbeat monitor — checks worker health every 30s.
871
+ * Warns after 60s of silence, kills after 90s.
872
+ */
873
+ _startHeartbeatMonitor() {
874
+ // Initialize heartbeat timestamps for all current workers
875
+ const now = Date.now();
876
+ for (const wid of this.workerMap.keys()) {
877
+ this._lastHeartbeat.set(wid, now);
878
+ }
879
+ this._heartbeatInterval = setInterval(() => {
880
+ if (this._shuttingDown)
881
+ return;
882
+ // Request health checks from message bus if available
883
+ if (typeof this.messageBus.requestHealthChecks === "function") {
884
+ this.messageBus.requestHealthChecks();
885
+ }
886
+ const checkTime = Date.now();
887
+ for (const [wid, info] of this.workerMap) {
888
+ const lastSeen = this._lastHeartbeat.get(wid) ?? 0;
889
+ const elapsed = checkTime - lastSeen;
890
+ if (elapsed > 90_000) {
891
+ // 90s without response — kill the worker
892
+ console.error(` \u2716 Worker ${info.groupName}[${info.workerId}] unresponsive for ${Math.round(elapsed / 1000)}s \u2014 sending SIGKILL`);
893
+ // Alert: heartbeat timeout — critical because the worker is being force-killed
894
+ this.alertSink.emit("worker.heartbeat_timeout", `Worker ${info.groupName}[${info.workerId}] (${info.services.join("+")}) unresponsive for ${Math.round(elapsed / 1000)}s — sending SIGKILL`, "critical", { groupName: info.groupName, workerId: info.workerId, services: info.services, elapsedMs: elapsed });
895
+ try {
896
+ const clusterWorkers = cluster.workers ?? {};
897
+ const w = clusterWorkers[wid];
898
+ if (w && !w.isDead()) {
899
+ w.process.kill("SIGKILL");
900
+ }
901
+ }
902
+ catch {
903
+ /* ignore: worker may have already exited between isDead() check and kill */
904
+ }
905
+ }
906
+ else if (elapsed > 60_000) {
907
+ // 60s without response — log a warning
908
+ console.warn(` \u26a0 Worker ${info.groupName}[${info.workerId}] no heartbeat for ${Math.round(elapsed / 1000)}s`);
909
+ }
910
+ }
911
+ }, 30_000);
912
+ this._heartbeatInterval.unref();
913
+ }
914
+ _stopHeartbeatMonitor() {
915
+ if (this._heartbeatInterval) {
916
+ clearInterval(this._heartbeatInterval);
917
+ this._heartbeatInterval = null;
918
+ }
919
+ }
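The worker-side half of this heartbeat protocol is not part of this diff; the sketch below is a hypothetical worker that answers a health-check request over the normal cluster IPC channel. The message type names are invented for illustration and are not ThreadForge's actual bus protocol:

  // Hypothetical worker-side heartbeat reply (message names are illustrative only).
  process.on("message", (msg) => {
    if (msg && msg.type === "health:check") {
      // Reply promptly so the supervisor's _lastHeartbeat entry stays fresh;
      // per the monitor above, silence > 60s warns and > 90s triggers SIGKILL.
      process.send?.({ type: "health:ok", pid: process.pid, uptime: process.uptime() });
    }
  });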
920
+ async shutdown() {
921
+ if (this._shuttingDown)
922
+ return;
923
+ this._shuttingDown = true;
924
+ // H-5: Stop heartbeat monitor
925
+ this._stopHeartbeatMonitor();
926
+ // Resolve in-flight /metrics scrapes so callers don't hang during shutdown
927
+ for (const pending of this._pendingMetricsSnapshots.values()) {
928
+ clearTimeout(pending.timer);
929
+ pending.finish();
930
+ }
931
+ this._pendingMetricsSnapshots.clear();
932
+ // Cancel any pending delayed restarts
933
+ for (const timer of this._pendingRestarts.values()) {
934
+ clearTimeout(timer);
935
+ }
936
+ this._pendingRestarts.clear();
937
+ // Cancel any pending SIGKILL timers from scale-down
938
+ for (const [, timer] of this._killTimers) {
939
+ clearTimeout(timer);
940
+ }
941
+ this._killTimers.clear();
942
+ console.log("\n Shutting down ThreadForge...\n");
943
+ // C5: Overall shutdown deadline — each phase races against remaining time
944
+ const deadlineStart = Date.now();
945
+ const withDeadline = (promise, label) => {
946
+ const remaining = SHUTDOWN_DEADLINE_MS - (Date.now() - deadlineStart);
947
+ if (remaining <= 0) {
948
+ console.warn(` \u26a0 Shutdown deadline exceeded during: ${label} \u2014 skipping`);
949
+ return Promise.resolve();
950
+ }
951
+ return Promise.race([
952
+ promise,
953
+ new Promise((resolve) => {
954
+ const t = setTimeout(() => {
955
+ console.warn(` \u26a0 Shutdown phase "${label}" exceeded deadline \u2014 skipping`);
956
+ resolve();
957
+ }, remaining);
958
+ t.unref();
959
+ }),
960
+ ]);
961
+ };
962
+ // Close metrics server first so health checks fail during shutdown
963
+ if (this._metricsServer) {
964
+ await withDeadline(new Promise((resolve) => this._metricsServer.close(() => resolve())), "metrics server close");
965
+ this._metricsServer = null;
966
+ }
967
+ // Step 1: Send graceful shutdown message to each worker.
968
+ //
969
+ // H5: Intentionally NO SIGTERM here. Graceful shutdown uses only IPC
970
+ // (forge:shutdown) + worker.disconnect(). SIGTERM would race with the IPC
971
+ // handler — the worker might receive SIGTERM before it finishes draining
972
+ // HTTP connections triggered by the forge:shutdown message. Scale-down
973
+ // likewise uses disconnect() with a delayed SIGKILL fallback rather than
974
+ // SIGTERM. SIGKILL is used only as a last resort in Step 5 if
975
+ // workers refuse to exit after disconnect.
976
+ const clusterWorkers = cluster.workers ?? {};
977
+ for (const id of Object.keys(clusterWorkers)) {
978
+ const worker = clusterWorkers[Number(id)];
979
+ if (worker) {
980
+ this._sendWorkerMessage(worker, { type: "forge:shutdown" }, "shutdown signal");
981
+ }
982
+ }
983
+ // Step 2: Wait for workers to drain HTTP connections and exit
984
+ // H-CORE-3: Unref timers so they don't keep the process alive after workers exit
985
+ await withDeadline(new Promise((resolve) => {
986
+ const check = setInterval(() => {
987
+ const cw = cluster.workers ?? {};
988
+ const alive = Object.keys(cw).filter((id) => {
989
+ const w = cw[Number(id)];
990
+ return w && !w.isDead();
991
+ });
992
+ if (alive.length === 0) {
993
+ clearInterval(check);
994
+ resolve();
995
+ }
996
+ }, 200);
997
+ check.unref();
998
+ const fallback = setTimeout(() => {
999
+ clearInterval(check);
1000
+ resolve();
1001
+ }, 10000);
1002
+ fallback.unref();
1003
+ }), "graceful drain");
1004
+ // Collect all worker PIDs before disconnect (workers may leave cluster.workers after disconnect)
1005
+ // SUP-M2: Listen for exits to remove PIDs, preventing SIGKILL of recycled PIDs
1006
+ const workerPids = new Set();
1007
+ const cw2 = cluster.workers ?? {};
1008
+ for (const id of Object.keys(cw2)) {
1009
+ const w = cw2[Number(id)];
1010
+ if (w?.process?.pid) {
1011
+ const pid = w.process.pid;
1012
+ workerPids.add(pid);
1013
+ w.once("exit", () => { workerPids.delete(pid); });
1014
+ }
1015
+ }
1016
+ // Step 3: Disconnect remaining workers
1017
+ const cw3 = cluster.workers ?? {};
1018
+ for (const id of Object.keys(cw3)) {
1019
+ const worker = cw3[Number(id)];
1020
+ if (worker && !worker.isDead()) {
1021
+ try {
1022
+ worker.disconnect();
1023
+ }
1024
+ catch {
1025
+ /* ignore: cleanup — worker may already be dead or disconnected */
1026
+ }
1027
+ }
1028
+ }
1029
+ // Step 4: Wait for disconnect to complete
1030
+ await withDeadline(new Promise((resolve) => {
1031
+ const check = setInterval(() => {
1032
+ const cw4 = cluster.workers ?? {};
1033
+ const alive = Object.keys(cw4).filter((id) => {
1034
+ const w = cw4[Number(id)];
1035
+ return w && !w.isDead();
1036
+ });
1037
+ if (alive.length === 0) {
1038
+ clearInterval(check);
1039
+ resolve();
1040
+ }
1041
+ }, 200);
1042
+ check.unref();
1043
+ const fallback = setTimeout(() => {
1044
+ clearInterval(check);
1045
+ resolve();
1046
+ }, 5000);
1047
+ fallback.unref();
1048
+ }), "disconnect");
1049
+ // Step 5: Force kill any remaining workers
1050
+ const cw5 = cluster.workers ?? {};
1051
+ for (const id of Object.keys(cw5)) {
1052
+ const worker = cw5[Number(id)];
1053
+ if (worker && !worker.isDead()) {
1054
+ console.error(` \u26a0 Forcefully killing worker ${id}...`);
1055
+ worker.process.kill("SIGKILL");
1056
+ }
1057
+ }
1058
+ // SUP-M2: Kill workers that disconnected from cluster but may still be alive.
1059
+ // PIDs are removed from workerPids via exit listeners, so recycled PIDs are safe.
1060
+ for (const pid of workerPids) {
1061
+ try {
1062
+ process.kill(pid, 0);
1063
+ process.kill(pid, "SIGKILL");
1064
+ }
1065
+ catch {
1066
+ /* ignore: cleanup — process already exited */
1067
+ }
1068
+ }
1069
+ // C3: Clean up temp endpoint file if created
1070
+ if (this._endpointTempFile) {
1071
+ try {
1072
+ fs.unlinkSync(this._endpointTempFile);
1073
+ }
1074
+ catch {
1075
+ /* ignore: cleanup — temp file may already be removed */
1076
+ }
1077
+ this._endpointTempFile = null;
1078
+ }
1079
+ if (this._sitesTempFile) {
1080
+ try {
1081
+ fs.unlinkSync(this._sitesTempFile);
1082
+ }
1083
+ catch {
1084
+ /* ignore: cleanup — temp file may already be removed */
1085
+ }
1086
+ this._sitesTempFile = null;
1087
+ }
1088
+ // H4: Clean up PID file
1089
+ if (this._pidFilePath) {
1090
+ try {
1091
+ fs.unlinkSync(this._pidFilePath);
1092
+ }
1093
+ catch {
1094
+ /* ignore: cleanup — PID file may already be removed */
1095
+ }
1096
+ this._pidFilePath = null;
1097
+ }
1098
+ console.log(" All workers stopped. Goodbye.\n");
1099
+ this.messageBus.cleanup();
1100
+ this.scaleAdvisor.stop();
1101
+ // Flush pending alerts before stopping
1102
+ await this.alertSink.stop();
1103
+ // O14: Add deadline to registry.stop() to prevent hanging
1104
+ try {
1105
+ await Promise.race([this.registry.stop(), new Promise((resolve) => setTimeout(resolve, 5000))]);
1106
+ }
1107
+ catch (err) {
1108
+ console.error(` \u26a0 Registry stop failed: ${err.message}`);
1109
+ }
1110
+ // Let the caller decide whether to exit — don't force process.exit here
1111
+ // so tests and CLI wrappers can run post-shutdown cleanup
1112
+ }
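The withDeadline helper inside shutdown() is a general pattern: each phase races against whatever remains of a single shared budget, so a slow early phase automatically shortens the later ones instead of letting every phase claim a full timeout. A standalone sketch of that pattern; the 30s budget is an assumption standing in for SHUTDOWN_DEADLINE_MS, which is defined elsewhere in this file:

  // Sketch of the shared-deadline pattern used by shutdown().
  const TOTAL_BUDGET_MS = 30_000; // illustrative; the real value comes from SHUTDOWN_DEADLINE_MS

  async function runPhases(phases) {
    const start = Date.now();
    for (const [label, run] of phases) {
      const remaining = TOTAL_BUDGET_MS - (Date.now() - start);
      if (remaining <= 0) {
        console.warn(`deadline exceeded before "${label}", skipping`);
        continue;
      }
      // Race the phase against only the *remaining* budget, with an unref'd timer.
      await Promise.race([
        run(),
        new Promise((resolve) => setTimeout(resolve, remaining).unref()),
      ]);
    }
  }

  // Usage (phase functions are illustrative):
  // await runPhases([["graceful drain", drainWorkers], ["registry stop", stopRegistry]]);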
1113
+ async _startMetricsServer() {
1114
+ // Allow metricsPort: null or false to disable metrics entirely
1115
+ if (this.config.metricsPort === null || this.config.metricsPort === false) {
1116
+ console.log(` \ud83d\udcca Metrics: disabled`);
1117
+ return;
1118
+ }
1119
+ // Safety fallback to 9090 (config layer should already provide this default)
1120
+ const port = this.config.metricsPort ?? 9090;
1121
+ return new Promise((resolve) => {
1122
+ this._metricsServer = createServer((req, res) => {
1123
+ const reqPath = new URL(req.url ?? "/", "http://localhost").pathname;
1124
+ // Let registry handle its endpoints first
1125
+ if (this.registry.httpHandler(req, res))
1126
+ return;
1127
+ // S7: Auth gate for sensitive supervisor endpoints (matches worker-level FORGE_METRICS_TOKEN)
1128
+ // SEC-C2: Include registry endpoints — they expose full service topology
1129
+ const sensitiveEndpoints = ["/status", "/metrics", "/scaling", "/_forge/topology", "/_forge/resolve"];
1130
+ if (sensitiveEndpoints.includes(reqPath)) {
1131
+ const metricsToken = process.env.FORGE_METRICS_TOKEN;
1132
+ if (metricsToken) {
1133
+ const auth = req.headers.authorization ?? "";
1134
+ const expected = `Bearer ${metricsToken}`;
1135
+ if (auth.length !== expected.length || !timingSafeEqual(Buffer.from(auth), Buffer.from(expected))) {
1136
+ res.writeHead(401, { "Content-Type": "application/json" });
1137
+ res.end(JSON.stringify({ error: "Unauthorized" }));
1138
+ return;
1139
+ }
1140
+ }
1141
+ }
1142
+ if (reqPath === "/status") {
1143
+ res.writeHead(200, { "Content-Type": "application/json" });
1144
+ res.end(JSON.stringify(this._status(), null, 2));
1145
+ }
1146
+ else if (reqPath === "/metrics") {
1147
+ this._collectMetricsSnapshot()
1148
+ .then((payload) => {
1149
+ // O15: Prepend supervisor-level restart counter
1150
+ const supervisorMetrics = `# HELP forge_worker_restarts_total Total number of worker restarts\n` +
1151
+ `# TYPE forge_worker_restarts_total counter\n` +
1152
+ `forge_worker_restarts_total ${this._workerRestartCount}\n`;
1153
+ res.writeHead(200, { "Content-Type": "text/plain; version=0.0.4; charset=utf-8" });
1154
+ res.end(supervisorMetrics + payload);
1155
+ })
1156
+ .catch((err) => {
1157
+ res.writeHead(500, { "Content-Type": "text/plain; charset=utf-8" });
1158
+ res.end(`# metrics collection failed: ${err.message}\n`);
1159
+ });
1160
+ return;
1161
+ }
1162
+ else if (reqPath === "/health" || reqPath === "/health/ready") {
1163
+ // O1: Readiness probe — 200 only when ALL workers have reported ready
1164
+ const totalWorkers = this.workerMap.size;
1165
+ const readyWorkers = [...this._workersReady.values()].filter(Boolean).length;
1166
+ if (totalWorkers > 0 && readyWorkers >= totalWorkers) {
1167
+ res.writeHead(200, { "Content-Type": "application/json" });
1168
+ res.end(JSON.stringify({ status: "ready", ready: readyWorkers, total: totalWorkers }));
1169
+ }
1170
+ else {
1171
+ res.writeHead(503, { "Content-Type": "application/json" });
1172
+ res.end(JSON.stringify({ status: "starting", ready: readyWorkers, total: totalWorkers }));
1173
+ }
1174
+ }
1175
+ else if (reqPath === "/health/live") {
1176
+ // O1: Liveness probe — always 200 if process is running
1177
+ res.writeHead(200, { "Content-Type": "text/plain" });
1178
+ res.end("ok");
1179
+ }
1180
+ else if (reqPath === "/scaling") {
1181
+ res.writeHead(200, { "Content-Type": "text/plain" });
1182
+ res.end(this.scaleAdvisor.report());
1183
+ }
1184
+ else {
1185
+ res.writeHead(404);
1186
+ res.end("Not found");
1187
+ }
1188
+ });
1189
+ this._metricsServer.on("error", (err) => {
1190
+ // Enhanced error message with actionable guidance
1191
+ console.warn(` \u26a0 Metrics server failed to bind port ${port}: ${err.message}`);
1192
+ console.warn(` To fix: Set metricsPort to a different port in your config, or set metricsPort: null to disable metrics.`);
1193
+ console.warn(` Example: defineServices(services, { metricsPort: 9091 }) or { metricsPort: null }`);
1194
+ this._metricsServer = null;
1195
+ resolve(); // non-fatal — supervisor continues without metrics
1196
+ });
1197
+ // Set timeouts to prevent slowloris attacks
1198
+ this._metricsServer.timeout = 5000;
1199
+ this._metricsServer.requestTimeout = 5000;
1200
+ this._metricsServer.headersTimeout = 3000;
1201
+ // RT-H4: Bind to localhost only — endpoints are unauthenticated unless FORGE_METRICS_TOKEN is set
1202
+ // C2: Allow override via FORGE_METRICS_BIND for containers (e.g. 0.0.0.0)
1203
+ const bindAddr = process.env.FORGE_METRICS_BIND || "127.0.0.1";
1204
+ // SEC-C2: Warn when metrics are exposed without auth
1205
+ if (bindAddr !== "127.0.0.1" && bindAddr !== "::1" && !process.env.FORGE_METRICS_TOKEN) {
1206
+ console.warn(` \u26a0 Metrics server binding to ${bindAddr} without FORGE_METRICS_TOKEN \u2014 topology and metrics are publicly accessible`);
1207
+ console.warn(` Set FORGE_METRICS_TOKEN=<secret> to require Bearer auth on sensitive endpoints`);
1208
+ }
1209
+ this._metricsServer.listen(port, bindAddr, () => {
1210
+ console.log(` \ud83d\udcca Metrics: http://${bindAddr}:${port}/status (Prometheus: /metrics)`);
1211
+ resolve();
1212
+ });
1213
+ });
1214
+ }
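When FORGE_METRICS_TOKEN is set, /status, /metrics, /scaling and the registry endpoints require a matching Bearer token. A small client sketch, assuming the default bind address and port shown above and Node 18+ where fetch is global:

  // Sketch: scraping the supervisor's metrics endpoint with Bearer auth.
  const token = process.env.FORGE_METRICS_TOKEN;
  const res = await fetch("http://127.0.0.1:9090/metrics", {
    headers: token ? { authorization: `Bearer ${token}` } : {},
  });
  if (res.status === 401) {
    console.error("supervisor rejected the scrape: token missing or wrong");
  } else {
    // Prometheus text format, prefixed with forge_worker_restarts_total.
    console.log(await res.text());
  }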
1215
+ _status() {
1216
+ const groups = [];
1217
+ const liveWorkersByService = {};
1218
+ for (const [groupName, workerIds] of this.groupWorkers) {
1219
+ const group = this.groups[groupName];
1220
+ const clusterWorkers = cluster.workers ?? {};
1221
+ const pids = workerIds
1222
+ .map((wid) => clusterWorkers[wid]?.process?.pid)
1223
+ .filter((pid) => pid != null);
1224
+ const liveWorkers = workerIds.length;
1225
+ for (const svc of group.services) {
1226
+ liveWorkersByService[svc.name] = liveWorkers;
1227
+ }
1228
+ groups.push({
1229
+ group: groupName,
1230
+ services: group.services.map((s) => ({
1231
+ name: s.name,
1232
+ type: s.type,
1233
+ port: s.port,
1234
+ })),
1235
+ workers: liveWorkers,
1236
+ pids,
1237
+ });
1238
+ }
1239
+ const topology = this.registry.topology();
1240
+ for (const [serviceName, liveWorkers] of Object.entries(liveWorkersByService)) {
1241
+ const existing = Array.isArray(topology[serviceName]) ? topology[serviceName] : [];
1242
+ let hasLocalEntry = false;
1243
+ const updated = existing.map((entry) => {
1244
+ const isLocalEntry = entry?.nodeId === this.registry.nodeId || entry?.transport === "local" || entry?.transport === "colocated";
1245
+ if (!isLocalEntry)
1246
+ return entry;
1247
+ hasLocalEntry = true;
1248
+ return {
1249
+ ...entry,
1250
+ workers: liveWorkers,
1251
+ status: liveWorkers > 0 ? (entry.status ?? "healthy") : "unhealthy",
1252
+ };
1253
+ });
1254
+ if (!hasLocalEntry) {
1255
+ updated.unshift({
1256
+ nodeId: this.registry.nodeId,
1257
+ host: this.registry.host,
1258
+ transport: "local",
1259
+ status: liveWorkers > 0 ? "healthy" : "unhealthy",
1260
+ cpu: 0,
1261
+ workers: liveWorkers,
1262
+ });
1263
+ }
1264
+ topology[serviceName] = updated;
1265
+ }
1266
+ const clusterWorkers = cluster.workers ?? {};
1267
+ const status = {
1268
+ supervisorPid: process.pid,
1269
+ uptime: process.uptime(),
1270
+ totalCpus: this.allocator.totalCpus,
1271
+ nodeId: this.registry.nodeId,
1272
+ host: this.registry.host,
1273
+ registryMode: this.registry.mode,
1274
+ processGroups: groups,
1275
+ channels: this.channels,
1276
+ totalProcesses: Object.keys(clusterWorkers).length,
1277
+ totalServices: Object.keys(this.services).length,
1278
+ remoteServices: Object.values(this.services).filter((s) => s.type === ServiceType.REMOTE).length,
1279
+ portsUsed: Object.values(this.services)
1280
+ .filter((s) => s.port)
1281
+ .map((s) => s.port),
1282
+ messageBus: this.messageBus.stats(),
1283
+ topology,
1284
+ scalingRecommendations: this.scaleAdvisor.recommendations,
1285
+ };
1286
+ const configAny = this.config;
1287
+ if (configAny?._hostMeta && typeof configAny._hostMeta === "object") {
1288
+ const projects = {};
1289
+ for (const [projectId, meta] of Object.entries(configAny._hostMeta)) {
1290
+ const projectGroups = groups.filter((pg) => pg.services.some((svc) => Array.isArray(meta.services) && meta.services.includes(svc.name)));
1291
+ projects[projectId] = {
1292
+ domain: meta.domain ?? null,
1293
+ services: Array.isArray(meta.services) ? meta.services.length : 0,
1294
+ workers: projectGroups.reduce((sum, pg) => sum + pg.workers, 0),
1295
+ schema: meta.schema ?? null,
1296
+ keyPrefix: meta.keyPrefix ?? null,
1297
+ };
1298
+ }
1299
+ status.hostMode = true;
1300
+ status.domain = configAny._hostDomain ?? null;
1301
+ status.projects = projects;
1302
+ }
1303
+ if (configAny?._isPlatformMode) {
1304
+ status.platformMode = true;
1305
+ }
1306
+ return status;
1307
+ }
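For orientation, a trimmed and entirely invented example of the JSON that _status() serves at /status (field names match the code above; all values are made up):

  // Illustrative (trimmed) /status payload; values are invented.
  const exampleStatus = {
    supervisorPid: 41230,
    uptime: 512.7,
    totalCpus: 8,
    nodeId: "node-a1",
    registryMode: "local",
    processGroups: [
      { group: "web", services: [{ name: "web", type: "edge", port: 3000 }], workers: 4, pids: [41233, 41234, 41235, 41236] },
    ],
    totalProcesses: 4,
    totalServices: 3,
    portsUsed: [3000],
  };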
1308
+ /**
1309
+ * Build endpoint map for workers.
1310
+ *
1311
+ * Maps each service to { host, port, remote } or an array of endpoints
1312
+ * for multi-instance services. Remote services get their address parsed
1313
+ * into host/port. Local services get host: '127.0.0.1'.
1314
+ */
1315
+ _buildEndpointMap() {
1316
+ const endpoints = {};
1317
+ for (const [name, svc] of Object.entries(this.services)) {
1318
+ if (svc.type === ServiceType.REMOTE) {
1319
+ // Parse address: "http://host:port" or "host:port"
1320
+ const parsed = this._parseAddress(svc.address, name);
1321
+ if (parsed) {
1322
+ endpoints[name] = { host: parsed.host, port: parsed.port, remote: true };
1323
+ }
1324
+ }
1325
+ else if (svc.port) {
1326
+ endpoints[name] = { host: "127.0.0.1", port: svc.port, remote: false };
1327
+ }
1328
+ else if (svc.type === ServiceType.INTERNAL || svc.type === ServiceType.BACKGROUND) {
1329
+ // Include internal/background services so workers can use DirectMessageBus
1330
+ // for cross-group calls instead of falling back to supervisor IPC
1331
+ const groupName = svc.group ?? `_isolated:${name}`;
1332
+ const messageBusAny = this.messageBus;
1333
+ const socketPath = messageBusAny.getSocketPath?.(name);
1334
+ endpoints[name] = { host: "127.0.0.1", remote: false, uds: socketPath ?? null, group: groupName };
1335
+ }
1336
+ }
1337
+ return endpoints;
1338
+ }
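Each service ends up with one of three endpoint shapes; an illustrative map (service names, address, and the UDS path are invented):

  // Illustrative endpoint map produced by _buildEndpointMap().
  const exampleEndpoints = {
    api:    { host: "203.0.113.10", port: 8080, remote: true },  // REMOTE, parsed from svc.address
    web:    { host: "127.0.0.1", port: 3000, remote: false },    // local service with an assigned port
    mailer: { host: "127.0.0.1", remote: false,                  // INTERNAL/BACKGROUND service:
              uds: "/tmp/forge-mailer.sock",                     //   UDS path if the message bus provides one,
              group: "_isolated:mailer" },                       //   otherwise uds is null
  };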
1339
+ /**
1340
+ * Parse a service address string into host/port.
1341
+ * Supports: "http://host:port", "host:port", and other URL schemes
1342
+ */
1343
+ _parseAddress(address, serviceName) {
1344
+ if (!address)
1345
+ return null;
1346
+ try {
1347
+ // Try as URL first (handles http://, https://, etc.)
1348
+ if (address.includes("://")) {
1349
+ const url = new URL(address);
1350
+ return {
1351
+ host: url.hostname,
1352
+ port: parseInt(url.port, 10) || (url.protocol === "https:" ? 443 : 80),
1353
+ };
1354
+ }
1355
+ // Plain host:port
1356
+ const [host, portStr] = address.split(":");
1357
+ const port = parseInt(portStr, 10);
1358
+ if (host && port)
1359
+ return { host, port };
1360
+ }
1361
+ catch (err) {
1362
+ console.warn(` \u26a0 Failed to parse address "${address}" for service "${serviceName ?? "unknown"}": ${err.message}`);
1363
+ }
1364
+ console.warn(` \u26a0 Invalid address "${address}" for service "${serviceName ?? "unknown"}" \u2014 skipping`);
1365
+ return null;
1366
+ }
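A few examples of what the parser above returns; all inputs are invented:

  // Examples of _parseAddress behaviour:
  // "https://billing.internal"  -> { host: "billing.internal", port: 443 }  (scheme default when no port)
  // "http://10.0.0.5:8081"      -> { host: "10.0.0.5", port: 8081 }
  // "redis-primary:6379"        -> { host: "redis-primary", port: 6379 }
  // "not-a-port:abc"            -> null (warning logged, service skipped)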
1367
+ _printAllocation() {
1368
+ console.log("");
1369
+ console.log(" \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510");
1370
+ console.log(" \u2502 Process Group \u2502 Services \u2502 Workers \u2502 Port \u2502");
1371
+ console.log(" \u251c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2524");
1372
+ for (const [groupName, group] of Object.entries(this.groups)) {
1373
+ const name = groupName.replace("_isolated:", "").padEnd(16);
1374
+ const svcList = group.services
1375
+ .map((s) => {
1376
+ const badge = s.type === ServiceType.EDGE ? "\u26a1" : s.type === ServiceType.BACKGROUND ? "\u23f0" : "\u25cb";
1377
+ return `${badge} ${s.name}`;
1378
+ })
1379
+ .join(", ");
1380
+ const svcs = svcList.substring(0, 25).padEnd(25);
1381
+ const threads = String(this.allocation.get(groupName) ?? 1).padEnd(7);
1382
+ const port = group.port ? String(group.port).padEnd(6) : " \u2014 ";
1383
+ console.log(` \u2502 ${name} \u2502 ${svcs} \u2502 ${threads} \u2502 ${port} \u2502`);
1384
+ }
1385
+ console.log(" \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518");
1386
+ let totalProcesses = 0;
1387
+ for (const [, count] of this.allocation)
1388
+ totalProcesses += count;
1389
+ const edgePorts = Object.values(this.services).filter((s) => s.port).length;
1390
+ console.log(` Processes: ${totalProcesses} | Services: ${Object.keys(this.services).length} | Ports: ${edgePorts} | CPUs: ${this.allocator.totalCpus}`);
1391
+ }
1392
+ _printTopology() {
1393
+ if (this.channels.length === 0)
1394
+ return;
1395
+ console.log("");
1396
+ console.log(" Channels:");
1397
+ for (const ch of this.channels) {
1398
+ console.log(` ${ch.from} \u2194 ${ch.to}`);
1399
+ }
1400
+ console.log(` Total: ${this.channels.length} channels (dependency-based)`);
1401
+ }
1402
+ _banner() {
1403
+ let version = "0.0.0";
1404
+ try {
1405
+ const pkg = JSON.parse(fs.readFileSync(new URL("../../package.json", import.meta.url), "utf8"));
1406
+ version = pkg.version;
1407
+ }
1408
+ catch {
1409
+ /* ignore: fallback to default version if package.json is unreadable */
1410
+ }
1411
+ return `
1412
+ \u2554\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2557
1413
+ \u2551 \u26a1 ThreadForge v${version.padEnd(12)}\u2551
1414
+ \u2551 Multi-threaded Service Runtime \u2551
1415
+ \u255a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u255d`;
1416
+ }
1417
+ }
1418
+ //# sourceMappingURL=Supervisor.js.map