crewly 1.8.9 → 1.8.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (241)
  1. package/config/constants.d.ts.map +1 -0
  2. package/config/index.d.ts.map +1 -0
  3. package/config/roles/_common/memory-instructions.md +6 -5
  4. package/config/roles/_common/wiki-instructions.md +49 -0
  5. package/config/roles/architect/prompt.md +2 -2
  6. package/config/roles/backend-developer/prompt.md +2 -2
  7. package/config/roles/designer/prompt.md +2 -2
  8. package/config/roles/developer/prompt.md +2 -2
  9. package/config/roles/frontend-developer/prompt.md +2 -2
  10. package/config/roles/fullstack-dev/prompt.md +2 -2
  11. package/config/roles/generalist/prompt.md +2 -2
  12. package/config/roles/ops/prompt.md +2 -2
  13. package/config/roles/orchestrator/prompt.md +135 -11
  14. package/config/roles/product-manager/prompt.md +2 -2
  15. package/config/roles/qa/prompt.md +2 -2
  16. package/config/roles/qa-engineer/prompt.md +2 -2
  17. package/config/roles/researcher/prompt.md +15 -6
  18. package/config/roles/sales/prompt.md +2 -2
  19. package/config/roles/support/prompt.md +2 -2
  20. package/config/roles/team-leader/prompt.md +17 -2
  21. package/config/roles/tpm/prompt.md +2 -2
  22. package/config/roles/ux-designer/prompt.md +2 -2
  23. package/config/skills/orchestrator/wiki-cleanup/SKILL.md +89 -0
  24. package/config/skills/orchestrator/wiki-cleanup/execute.sh +139 -0
  25. package/config/skills/orchestrator/wiki-lint/SKILL.md +75 -0
  26. package/config/skills/orchestrator/wiki-lint/execute.sh +66 -0
  27. package/config/skills/orchestrator/wiki-migrate/SKILL.md +103 -0
  28. package/config/skills/orchestrator/wiki-migrate/execute.sh +82 -0
  29. package/config/skills/orchestrator/wiki-process-queue/SKILL.md +9 -1
  30. package/dist/backend/backend/src/controllers/task-management/task-management.controller.d.ts +169 -0
  31. package/dist/backend/backend/src/controllers/task-management/task-management.controller.d.ts.map +1 -0
  32. package/dist/backend/backend/src/controllers/task-management/task-management.controller.js +1779 -0
  33. package/dist/backend/backend/src/controllers/task-management/task-management.controller.js.map +1 -0
  34. package/dist/backend/backend/src/controllers/task-pool/task-pool.controller.d.ts +18 -0
  35. package/dist/backend/backend/src/controllers/task-pool/task-pool.controller.d.ts.map +1 -1
  36. package/dist/backend/backend/src/controllers/task-pool/task-pool.controller.js +63 -0
  37. package/dist/backend/backend/src/controllers/task-pool/task-pool.controller.js.map +1 -1
  38. package/dist/backend/backend/src/controllers/task-pool/task-pool.routes.d.ts.map +1 -1
  39. package/dist/backend/backend/src/controllers/task-pool/task-pool.routes.js +5 -1
  40. package/dist/backend/backend/src/controllers/task-pool/task-pool.routes.js.map +1 -1
  41. package/dist/backend/backend/src/controllers/wiki/wiki.controller.d.ts +109 -0
  42. package/dist/backend/backend/src/controllers/wiki/wiki.controller.d.ts.map +1 -1
  43. package/dist/backend/backend/src/controllers/wiki/wiki.controller.js +418 -4
  44. package/dist/backend/backend/src/controllers/wiki/wiki.controller.js.map +1 -1
  45. package/dist/backend/backend/src/controllers/wiki/wiki.routes.d.ts.map +1 -1
  46. package/dist/backend/backend/src/controllers/wiki/wiki.routes.js +11 -1
  47. package/dist/backend/backend/src/controllers/wiki/wiki.routes.js.map +1 -1
  48. package/dist/backend/backend/src/index.d.ts.map +1 -1
  49. package/dist/backend/backend/src/index.js +64 -0
  50. package/dist/backend/backend/src/index.js.map +1 -1
  51. package/dist/backend/backend/src/index.js.orc-bak-20260529 +3130 -0
  52. package/dist/backend/backend/src/services/agent/crewly-agent/agent-runner.service.d.ts +513 -0
  53. package/dist/backend/backend/src/services/agent/crewly-agent/agent-runner.service.d.ts.map +1 -0
  54. package/dist/backend/backend/src/services/agent/crewly-agent/agent-runner.service.js +1568 -0
  55. package/dist/backend/backend/src/services/agent/crewly-agent/agent-runner.service.js.map +1 -0
  56. package/dist/backend/backend/src/services/agent/crewly-agent/agent-worker.d.ts +86 -0
  57. package/dist/backend/backend/src/services/agent/crewly-agent/agent-worker.d.ts.map +1 -0
  58. package/dist/backend/backend/src/services/agent/crewly-agent/agent-worker.js +147 -0
  59. package/dist/backend/backend/src/services/agent/crewly-agent/agent-worker.js.map +1 -0
  60. package/dist/backend/backend/src/services/agent/crewly-agent/api-client.d.ts +68 -0
  61. package/dist/backend/backend/src/services/agent/crewly-agent/api-client.d.ts.map +1 -0
  62. package/dist/backend/backend/src/services/agent/crewly-agent/api-client.js +131 -0
  63. package/dist/backend/backend/src/services/agent/crewly-agent/api-client.js.map +1 -0
  64. package/dist/backend/backend/src/services/agent/crewly-agent/audit-log.service.d.ts +130 -0
  65. package/dist/backend/backend/src/services/agent/crewly-agent/audit-log.service.d.ts.map +1 -0
  66. package/dist/backend/backend/src/services/agent/crewly-agent/audit-log.service.js +263 -0
  67. package/dist/backend/backend/src/services/agent/crewly-agent/audit-log.service.js.map +1 -0
  68. package/dist/backend/backend/src/services/agent/crewly-agent/audit-trail.service.d.ts +74 -0
  69. package/dist/backend/backend/src/services/agent/crewly-agent/audit-trail.service.d.ts.map +1 -0
  70. package/dist/backend/backend/src/services/agent/crewly-agent/audit-trail.service.js +140 -0
  71. package/dist/backend/backend/src/services/agent/crewly-agent/audit-trail.service.js.map +1 -0
  72. package/dist/backend/backend/src/services/agent/crewly-agent/auditor-tools.d.ts +29 -0
  73. package/dist/backend/backend/src/services/agent/crewly-agent/auditor-tools.d.ts.map +1 -0
  74. package/dist/backend/backend/src/services/agent/crewly-agent/auditor-tools.js +279 -0
  75. package/dist/backend/backend/src/services/agent/crewly-agent/auditor-tools.js.map +1 -0
  76. package/dist/backend/backend/src/services/agent/crewly-agent/crewly-agent-runtime.service.d.ts +340 -0
  77. package/dist/backend/backend/src/services/agent/crewly-agent/crewly-agent-runtime.service.d.ts.map +1 -0
  78. package/dist/backend/backend/src/services/agent/crewly-agent/crewly-agent-runtime.service.js +1176 -0
  79. package/dist/backend/backend/src/services/agent/crewly-agent/crewly-agent-runtime.service.js.map +1 -0
  80. package/dist/backend/backend/src/services/agent/crewly-agent/deepseek-sse-transform.d.ts +79 -0
  81. package/dist/backend/backend/src/services/agent/crewly-agent/deepseek-sse-transform.d.ts.map +1 -0
  82. package/dist/backend/backend/src/services/agent/crewly-agent/deepseek-sse-transform.js +145 -0
  83. package/dist/backend/backend/src/services/agent/crewly-agent/deepseek-sse-transform.js.map +1 -0
  84. package/dist/backend/backend/src/services/agent/crewly-agent/env-isolation.service.d.ts +79 -0
  85. package/dist/backend/backend/src/services/agent/crewly-agent/env-isolation.service.d.ts.map +1 -0
  86. package/dist/backend/backend/src/services/agent/crewly-agent/env-isolation.service.js +218 -0
  87. package/dist/backend/backend/src/services/agent/crewly-agent/env-isolation.service.js.map +1 -0
  88. package/dist/backend/backend/src/services/agent/crewly-agent/index.d.ts +16 -0
  89. package/dist/backend/backend/src/services/agent/crewly-agent/index.d.ts.map +1 -0
  90. package/dist/backend/backend/src/services/agent/crewly-agent/index.js +16 -0
  91. package/dist/backend/backend/src/services/agent/crewly-agent/index.js.map +1 -0
  92. package/dist/backend/backend/src/services/agent/crewly-agent/mcp-tool-bridge.d.ts +135 -0
  93. package/dist/backend/backend/src/services/agent/crewly-agent/mcp-tool-bridge.d.ts.map +1 -0
  94. package/dist/backend/backend/src/services/agent/crewly-agent/mcp-tool-bridge.js +185 -0
  95. package/dist/backend/backend/src/services/agent/crewly-agent/mcp-tool-bridge.js.map +1 -0
  96. package/dist/backend/backend/src/services/agent/crewly-agent/model-manager.d.ts +141 -0
  97. package/dist/backend/backend/src/services/agent/crewly-agent/model-manager.d.ts.map +1 -0
  98. package/dist/backend/backend/src/services/agent/crewly-agent/model-manager.js +310 -0
  99. package/dist/backend/backend/src/services/agent/crewly-agent/model-manager.js.map +1 -0
  100. package/dist/backend/backend/src/services/agent/crewly-agent/output-filter.service.d.ts +91 -0
  101. package/dist/backend/backend/src/services/agent/crewly-agent/output-filter.service.d.ts.map +1 -0
  102. package/dist/backend/backend/src/services/agent/crewly-agent/output-filter.service.js +143 -0
  103. package/dist/backend/backend/src/services/agent/crewly-agent/output-filter.service.js.map +1 -0
  104. package/dist/backend/backend/src/services/agent/crewly-agent/prompt-guard.service.d.ts +103 -0
  105. package/dist/backend/backend/src/services/agent/crewly-agent/prompt-guard.service.d.ts.map +1 -0
  106. package/dist/backend/backend/src/services/agent/crewly-agent/prompt-guard.service.js +256 -0
  107. package/dist/backend/backend/src/services/agent/crewly-agent/prompt-guard.service.js.map +1 -0
  108. package/dist/backend/backend/src/services/agent/crewly-agent/rate-limiter.d.ts +143 -0
  109. package/dist/backend/backend/src/services/agent/crewly-agent/rate-limiter.d.ts.map +1 -0
  110. package/dist/backend/backend/src/services/agent/crewly-agent/rate-limiter.js +264 -0
  111. package/dist/backend/backend/src/services/agent/crewly-agent/rate-limiter.js.map +1 -0
  112. package/dist/backend/backend/src/services/agent/crewly-agent/smoke-test.d.ts +13 -0
  113. package/dist/backend/backend/src/services/agent/crewly-agent/smoke-test.d.ts.map +1 -0
  114. package/dist/backend/backend/src/services/agent/crewly-agent/smoke-test.js +91 -0
  115. package/dist/backend/backend/src/services/agent/crewly-agent/smoke-test.js.map +1 -0
  116. package/dist/backend/backend/src/services/agent/crewly-agent/tool-registry.d.ts +135 -0
  117. package/dist/backend/backend/src/services/agent/crewly-agent/tool-registry.d.ts.map +1 -0
  118. package/dist/backend/backend/src/services/agent/crewly-agent/tool-registry.js +1937 -0
  119. package/dist/backend/backend/src/services/agent/crewly-agent/tool-registry.js.map +1 -0
  120. package/dist/backend/backend/src/services/ai/prompt-builder.service.js +1 -1
  121. package/dist/backend/backend/src/services/autonomous/auto-assign.service.d.ts +429 -0
  122. package/dist/backend/backend/src/services/autonomous/auto-assign.service.d.ts.map +1 -0
  123. package/dist/backend/backend/src/services/autonomous/auto-assign.service.js +852 -0
  124. package/dist/backend/backend/src/services/autonomous/auto-assign.service.js.map +1 -0
  125. package/dist/backend/backend/src/services/project/task-tracking.service.d.ts +171 -0
  126. package/dist/backend/backend/src/services/project/task-tracking.service.d.ts.map +1 -0
  127. package/dist/backend/backend/src/services/project/task-tracking.service.js +725 -0
  128. package/dist/backend/backend/src/services/project/task-tracking.service.js.map +1 -0
  129. package/dist/backend/backend/src/services/reconciler/reconciler-data-provider.d.ts.map +1 -1
  130. package/dist/backend/backend/src/services/reconciler/reconciler-data-provider.js +50 -0
  131. package/dist/backend/backend/src/services/reconciler/reconciler-data-provider.js.map +1 -1
  132. package/dist/backend/backend/src/services/task-pool/task-pool.service.d.ts +19 -0
  133. package/dist/backend/backend/src/services/task-pool/task-pool.service.d.ts.map +1 -1
  134. package/dist/backend/backend/src/services/task-pool/task-pool.service.js +45 -0
  135. package/dist/backend/backend/src/services/task-pool/task-pool.service.js.map +1 -1
  136. package/dist/backend/backend/src/services/v3/agent-auto-claim.service.d.ts.map +1 -1
  137. package/dist/backend/backend/src/services/v3/agent-auto-claim.service.js +34 -1
  138. package/dist/backend/backend/src/services/v3/agent-auto-claim.service.js.map +1 -1
  139. package/dist/backend/backend/src/services/v3/project-task-watcher.service.d.ts +118 -0
  140. package/dist/backend/backend/src/services/v3/project-task-watcher.service.d.ts.map +1 -0
  141. package/dist/backend/backend/src/services/v3/project-task-watcher.service.js +326 -0
  142. package/dist/backend/backend/src/services/v3/project-task-watcher.service.js.map +1 -0
  143. package/dist/backend/backend/src/services/wiki/wiki-backlinks.service.d.ts +72 -0
  144. package/dist/backend/backend/src/services/wiki/wiki-backlinks.service.d.ts.map +1 -0
  145. package/dist/backend/backend/src/services/wiki/wiki-backlinks.service.js +186 -0
  146. package/dist/backend/backend/src/services/wiki/wiki-backlinks.service.js.map +1 -0
  147. package/dist/backend/backend/src/services/wiki/wiki-bookkeep-trigger.service.d.ts +4 -1
  148. package/dist/backend/backend/src/services/wiki/wiki-bookkeep-trigger.service.d.ts.map +1 -1
  149. package/dist/backend/backend/src/services/wiki/wiki-bookkeep-trigger.service.js +24 -1
  150. package/dist/backend/backend/src/services/wiki/wiki-bookkeep-trigger.service.js.map +1 -1
  151. package/dist/backend/backend/src/services/wiki/wiki-chat-subscriber.service.d.ts +74 -0
  152. package/dist/backend/backend/src/services/wiki/wiki-chat-subscriber.service.d.ts.map +1 -0
  153. package/dist/backend/backend/src/services/wiki/wiki-chat-subscriber.service.js +154 -0
  154. package/dist/backend/backend/src/services/wiki/wiki-chat-subscriber.service.js.map +1 -0
  155. package/dist/backend/backend/src/services/wiki/wiki-cleanup.service.d.ts +160 -0
  156. package/dist/backend/backend/src/services/wiki/wiki-cleanup.service.d.ts.map +1 -0
  157. package/dist/backend/backend/src/services/wiki/wiki-cleanup.service.js +399 -0
  158. package/dist/backend/backend/src/services/wiki/wiki-cleanup.service.js.map +1 -0
  159. package/dist/backend/backend/src/services/wiki/wiki-lint.service.d.ts +182 -0
  160. package/dist/backend/backend/src/services/wiki/wiki-lint.service.d.ts.map +1 -0
  161. package/dist/backend/backend/src/services/wiki/wiki-lint.service.js +505 -0
  162. package/dist/backend/backend/src/services/wiki/wiki-lint.service.js.map +1 -0
  163. package/dist/backend/backend/src/services/wiki/wiki-migrate.service.d.ts +232 -0
  164. package/dist/backend/backend/src/services/wiki/wiki-migrate.service.d.ts.map +1 -0
  165. package/dist/backend/backend/src/services/wiki/wiki-migrate.service.js +1416 -0
  166. package/dist/backend/backend/src/services/wiki/wiki-migrate.service.js.map +1 -0
  167. package/dist/backend/backend/src/services/wiki/wiki-recent.service.d.ts +51 -0
  168. package/dist/backend/backend/src/services/wiki/wiki-recent.service.d.ts.map +1 -0
  169. package/dist/backend/backend/src/services/wiki/wiki-recent.service.js +102 -0
  170. package/dist/backend/backend/src/services/wiki/wiki-recent.service.js.map +1 -0
  171. package/dist/backend/backend/src/services/wiki/wiki-reflect-trigger.service.d.ts +84 -0
  172. package/dist/backend/backend/src/services/wiki/wiki-reflect-trigger.service.d.ts.map +1 -0
  173. package/dist/backend/backend/src/services/wiki/wiki-reflect-trigger.service.js +156 -0
  174. package/dist/backend/backend/src/services/wiki/wiki-reflect-trigger.service.js.map +1 -0
  175. package/dist/backend/backend/src/services/wiki/wiki-search.service.d.ts +90 -0
  176. package/dist/backend/backend/src/services/wiki/wiki-search.service.d.ts.map +1 -0
  177. package/dist/backend/backend/src/services/wiki/wiki-search.service.js +190 -0
  178. package/dist/backend/backend/src/services/wiki/wiki-search.service.js.map +1 -0
  179. package/dist/backend/backend/src/services/wiki/wiki-workitem-bridge.service.d.ts +164 -0
  180. package/dist/backend/backend/src/services/wiki/wiki-workitem-bridge.service.d.ts.map +1 -0
  181. package/dist/backend/backend/src/services/wiki/wiki-workitem-bridge.service.js +675 -0
  182. package/dist/backend/backend/src/services/wiki/wiki-workitem-bridge.service.js.map +1 -0
  183. package/dist/backend/backend/src/services/workflow/cron-task.service.d.ts.map +1 -1
  184. package/dist/backend/backend/src/services/workflow/cron-task.service.js +65 -0
  185. package/dist/backend/backend/src/services/workflow/cron-task.service.js.map +1 -1
  186. package/dist/backend/backend/src/types/auto-assign.types.d.ts +271 -0
  187. package/dist/backend/backend/src/types/auto-assign.types.d.ts.map +1 -0
  188. package/dist/backend/backend/src/types/auto-assign.types.js +136 -0
  189. package/dist/backend/backend/src/types/auto-assign.types.js.map +1 -0
  190. package/dist/backend/backend/src/types/cron-task.types.d.ts +16 -1
  191. package/dist/backend/backend/src/types/cron-task.types.d.ts.map +1 -1
  192. package/dist/backend/backend/src/utils/esm-require.utils.d.ts +111 -0
  193. package/dist/backend/backend/src/utils/esm-require.utils.d.ts.map +1 -0
  194. package/dist/backend/backend/src/utils/esm-require.utils.js +124 -0
  195. package/dist/backend/backend/src/utils/esm-require.utils.js.map +1 -0
  196. package/dist/cli/backend/src/services/ai/prompt-modules/prompt-module.interface.d.ts +220 -0
  197. package/dist/cli/backend/src/services/ai/prompt-modules/prompt-module.interface.d.ts.map +1 -0
  198. package/dist/cli/backend/src/services/ai/prompt-modules/prompt-module.interface.js +37 -0
  199. package/dist/cli/backend/src/services/ai/prompt-modules/prompt-module.interface.js.map +1 -0
  200. package/dist/cli/backend/src/services/knowledge/fts5-search-strategy.d.ts +56 -0
  201. package/dist/cli/backend/src/services/knowledge/fts5-search-strategy.d.ts.map +1 -0
  202. package/dist/cli/backend/src/services/knowledge/fts5-search-strategy.js +91 -0
  203. package/dist/cli/backend/src/services/knowledge/fts5-search-strategy.js.map +1 -0
  204. package/dist/cli/backend/src/services/knowledge/learnings-index.service.d.ts +159 -0
  205. package/dist/cli/backend/src/services/knowledge/learnings-index.service.d.ts.map +1 -0
  206. package/dist/cli/backend/src/services/knowledge/learnings-index.service.js +304 -0
  207. package/dist/cli/backend/src/services/knowledge/learnings-index.service.js.map +1 -0
  208. package/dist/cli/backend/src/services/knowledge/wiki-compiler.service.d.ts +115 -0
  209. package/dist/cli/backend/src/services/knowledge/wiki-compiler.service.d.ts.map +1 -0
  210. package/dist/cli/backend/src/services/knowledge/wiki-compiler.service.js +215 -0
  211. package/dist/cli/backend/src/services/knowledge/wiki-compiler.service.js.map +1 -0
  212. package/dist/cli/backend/src/services/memory/embedding-provider.d.ts +78 -0
  213. package/dist/cli/backend/src/services/memory/embedding-provider.d.ts.map +1 -0
  214. package/dist/cli/backend/src/services/memory/embedding-provider.js +179 -0
  215. package/dist/cli/backend/src/services/memory/embedding-provider.js.map +1 -0
  216. package/dist/cli/backend/src/services/memory/vector-store.service.d.ts +331 -0
  217. package/dist/cli/backend/src/services/memory/vector-store.service.d.ts.map +1 -0
  218. package/dist/cli/backend/src/services/memory/vector-store.service.js +814 -0
  219. package/dist/cli/backend/src/services/memory/vector-store.service.js.map +1 -0
  220. package/dist/cli/backend/src/services/project/task-tracking.service.d.ts +171 -0
  221. package/dist/cli/backend/src/services/project/task-tracking.service.d.ts.map +1 -0
  222. package/dist/cli/backend/src/services/project/task-tracking.service.js +725 -0
  223. package/dist/cli/backend/src/services/project/task-tracking.service.js.map +1 -0
  224. package/dist/cli/backend/src/services/task-pool/task-pool.service.d.ts +19 -0
  225. package/dist/cli/backend/src/services/task-pool/task-pool.service.d.ts.map +1 -1
  226. package/dist/cli/backend/src/services/task-pool/task-pool.service.js +45 -0
  227. package/dist/cli/backend/src/services/task-pool/task-pool.service.js.map +1 -1
  228. package/dist/cli/backend/src/types/auto-assign.types.d.ts +271 -0
  229. package/dist/cli/backend/src/types/auto-assign.types.d.ts.map +1 -0
  230. package/dist/cli/backend/src/types/auto-assign.types.js +136 -0
  231. package/dist/cli/backend/src/types/auto-assign.types.js.map +1 -0
  232. package/dist/cli/cli/src/index.js +0 -0
  233. package/frontend/dist/assets/{index-db3f5041.css → index-068bb4f6.css} +10 -1
  234. package/frontend/dist/assets/index-c24ceb15.js +4960 -0
  235. package/frontend/dist/index.html +2 -2
  236. package/package.json +1 -1
  237. package/config/skills/agent/core/query-knowledge/SKILL.md +0 -87
  238. package/config/skills/agent/core/query-knowledge/execute.sh +0 -30
  239. package/config/skills/orchestrator/query-knowledge/SKILL.md +0 -75
  240. package/config/skills/orchestrator/query-knowledge/execute.sh +0 -30
  241. package/frontend/dist/assets/index-cc115bb4.js +0 -4926
@@ -0,0 +1,3130 @@
1
+ #!/usr/bin/env node
2
+ // Load environment variables from .env file BEFORE any other imports
3
+ // This ensures env vars are available when services initialize
4
+ import dotenv from 'dotenv';
5
+ import path from 'path';
6
+ // Load .env from project root
7
+ dotenv.config({ path: path.resolve(process.cwd(), '.env') });
8
+ import express from 'express';
9
+ import { createServer } from 'http';
10
+ import { Server as SocketIOServer } from 'socket.io';
11
+ import cors from 'cors';
12
+ import helmet from 'helmet';
13
+ import morgan from 'morgan';
14
+ import os from 'os';
15
+ import { fileURLToPath } from 'url';
16
+ import { StorageService, TmuxService, SchedulerService, MessageSchedulerService, ActivityMonitorService, TeamActivityWebSocketService, TeamsJsonWatcherService, } from './services/index.js';
17
+ import { getSessionBackend, getSessionBackendSync, getSessionStatePersistence, destroySessionBackend, PtySessionBackend, } from './services/session/index.js';
18
+ import { ApiController } from './controllers/api.controller.js';
19
+ import { createApiRoutes } from './routes/api.routes.js';
20
+ import { TerminalGateway, setTerminalGateway } from './websocket/terminal.gateway.js';
21
+ import { initializeChatGateway } from './websocket/chat.gateway.js';
22
+ import { LoggerService } from './services/core/logger.service.js';
23
+ import { CREWLY_CONSTANTS, ORCHESTRATOR_SESSION_NAME, ORCHESTRATOR_ROLE, ORCHESTRATOR_WINDOW_NAME, MESSAGE_QUEUE_CONSTANTS, RUNTIME_TYPES, AUDITOR_CONSTANTS, AUDITOR_SCHEDULER_CONSTANTS, } from './constants.js';
24
+ import { getSettingsService } from './services/settings/index.js';
25
+ import { MemoryService } from './services/memory/memory.service.js';
26
+ import { getImprovementStartupService } from './services/orchestrator/improvement-startup.service.js';
27
+ import { initializeSlackIfConfigured, shutdownSlack } from './services/slack/index.js';
28
+ import { resolveTeamByIdOrSlug, slugifyTeamName } from './services/workflow/team-identifier-resolver.js';
29
+ import { initializeWhatsAppIfConfigured, shutdownWhatsApp } from './services/whatsapp/index.js';
30
+ import { initializeGoogleChatIfConfigured } from './services/messaging/google-chat-initializer.js';
31
+ import { initializeTelegramIfConfigured, shutdownTelegram } from './services/telegram/index.js';
32
+ import { initializeCloudIfConfigured } from './services/cloud/cloud-initializer.js';
33
+ import { MessageQueueService, QueueProcessorService, ResponseRouterService } from './services/messaging/index.js';
34
+ import { ThreadStatusQueueService } from './services/messaging/thread-status-queue.service.js';
35
+ import { EventBusService } from './services/event-bus/index.js';
36
+ import { EventToWorkItemBridge } from './services/event-bus/event-to-workitem-bridge.service.js';
37
+ import { AutoLearningSubscriber } from './services/memory/auto-learning.subscriber.js';
38
+ import { MilestoneNotificationSubscriber } from './services/notification/milestone-notification.subscriber.js';
39
+ import { RequestSlaSubscriber, setRequestSlaSubscriber, } from './services/v3/request-sla.subscriber.js';
40
+ import { RequestDecomposeSubscriber, setRequestDecomposeSubscriber, } from './services/v3/request-decompose.subscriber.js';
41
+ import { RequestStatusUpdateSubscriber } from './services/v3/request-status-update.subscriber.js';
42
+ import { RequestCascadeSubscriber } from './services/v3/request-cascade.subscriber.js';
43
+ import { setRequestServiceEventBus, RequestService } from './services/v3/request.service.js';
44
+ import { getSlackService } from './services/slack/slack.service.js';
45
+ import { SlackThreadStoreService, setSlackThreadStore, getSlackThreadStore } from './services/slack/slack-thread-store.service.js';
46
+ import { GoogleChatThreadStoreService, setGchatThreadStore } from './services/messaging/gchat-thread-store.service.js';
47
+ import { SlackImageService, setSlackImageService } from './services/slack/slack-image.service.js';
48
+ import { NotifyReconciliationService } from './services/slack/notify-reconciliation.service.js';
49
+ import { setEventBusService as setEventBusControllerService } from './controllers/event-bus/event-bus.controller.js';
50
+ import { setTeamControllerEventBusService } from './controllers/team/team.controller.js';
51
+ import { SkillCatalogService } from './services/skill/skill-catalog.service.js';
52
+ import { setMessageQueueService as setChatMessageQueueService, setThreadStatusQueueService as setChatThreadStatusQueueService } from './controllers/chat/chat.controller.js';
53
+ import { setMessageQueueService as setMessagingControllerQueueService } from './controllers/messaging/messaging.controller.js';
54
+ import { SystemResourceAlertService } from './services/monitoring/system-resource-alert.service.js';
55
+ import { TokenUsageService } from './services/monitoring/token-usage.service.js';
56
+ import { agentHeartbeatMiddleware } from './middleware/agent-heartbeat.middleware.js';
57
+ import { RedisCacheService } from './services/cache/redis-cache.service.js';
58
+ import { OrchestratorRestartService } from './services/orchestrator/orchestrator-restart.service.js';
59
+ import { setOrchestratorSetupDependencies } from './services/orchestrator/orchestrator-setup.service.js';
60
+ import { IdleDetectionService } from './services/agent/idle-detection.service.js';
61
+ import { AgentSuspendService } from './services/agent/agent-suspend.service.js';
62
+ import { AgentHeartbeatMonitorService } from './services/agent/agent-heartbeat-monitor.service.js';
63
+ import { OrchestratorHeartbeatMonitorService } from './services/orchestrator/orchestrator-heartbeat-monitor.service.js';
64
+ import { RuntimeExitMonitorService } from './services/agent/runtime-exit-monitor.service.js';
65
+ import { ContextWindowMonitorService } from './services/agent/context-window-monitor.service.js';
66
+ import { OAuthReloginMonitorService } from './services/agent/oauth-relogin-monitor.service.js';
67
+ import { findPackageRoot } from './utils/package-root.js';
68
+ import { isNativeBindingFatalError } from './utils/native-binding.utils.js';
69
+ import { VersionCheckService } from './services/system/version-check.service.js';
70
+ import { LogRotationService } from './services/session/log-rotation.service.js';
71
+ import { AuditorSchedulerService } from './services/agent/auditor-scheduler.service.js';
72
+ import { setAuditorSchedulerService } from './controllers/auditor/auditor.controller.js';
73
+ import { AddonLoaderService } from './services/addon/addon-loader.service.js';
74
+ import { CronTaskService } from './services/workflow/cron-task.service.js';
75
+ import { ReconcilerService } from './services/reconciler/reconciler.service.js';
76
+ import { LiveReconcilerDataProvider } from './services/reconciler/reconciler-data-provider.js';
77
+ import { setReconcilerService } from './controllers/reconciler/reconciler.controller.js';
78
+ import { FissionGuardService } from './services/fission/fission-guard.service.js';
79
+ import { setFissionGuardService } from './controllers/fission/fission.controller.js';
80
+ import { TaskPoolService } from './services/task-pool/task-pool.service.js';
81
+ import { ProjectMemoryService } from './services/memory/project-memory.service.js';
82
+ import { TaskHistorySubscriber } from './services/memory/task-history.subscriber.js';
83
+ import { TeamHealthWatchdogService, LiveTeamHealthDataProvider, loadTeamHealthConfig, setTeamHealthWatchdogSingleton, getTeamHealthWatchdogSingleton, } from './services/team-health/index.js';
84
+ // ESM __dirname equivalent using import.meta.url
85
+ const __filename = fileURLToPath(import.meta.url);
86
+ const __dirname = path.dirname(__filename);
87
/**
 * Parses a base-10 integer from a configuration value (typically an
 * environment variable), falling back to a default when the value is
 * missing or unusable.
 *
 * Outcomes:
 * - `undefined`, `null` and `''` mean "not set": the default is returned
 *   without logging.
 * - A value with no leading integer (or one that parses to a non-finite
 *   number) logs a warning and returns the default.
 * - A value whose trimmed text differs from the parsed integer
 *   (e.g. "3000abc", "08080") logs a warning but still returns the parsed
 *   integer.
 *
 * @param value - The value to parse; normally a string, or undefined when unset
 * @param defaultValue - Returned when the value is missing or cannot be parsed
 * @param envVarName - Optional name of the environment variable, used only in log output
 * @returns The parsed integer, or `defaultValue` when parsing fails
 */
function parseIntWithFallback(value, defaultValue, envVarName) {
    if (value === undefined || value === null || value === '') {
        return defaultValue;
    }
    // The logger is resolved lazily so the common (valid) path never touches it.
    const warn = (message, details) => {
        LoggerService.getInstance().createComponentLogger('ConfigParser').warn(message, details);
    };
    // Normalise to a string so that a non-string value (e.g. a number supplied
    // programmatically) does not throw on `.trim()` further down.
    const text = String(value);
    const parsed = Number.parseInt(text, 10);
    // Number.isFinite rejects NaN (no leading integer) as well as any
    // non-finite result, so a separate NaN check is unnecessary.
    if (!Number.isFinite(parsed)) {
        warn('Invalid numeric environment variable value, using default', {
            envVar: envVarName,
            value,
            defaultValue,
        });
        return defaultValue;
    }
    // parseInt silently ignores trailing garbage ("3000abc" -> 3000). Surface
    // that to the operator, but keep the parsed value as the result.
    if (String(parsed) !== text.trim()) {
        warn('Environment variable contains non-numeric characters, using parsed value', {
            envVar: envVarName,
            originalValue: value,
            parsedValue: parsed,
        });
    }
    return parsed;
}
122
+ export class CrewlyServer {
123
+ app;
124
+ httpServer;
125
+ io;
126
+ config;
127
+ logger = LoggerService.getInstance().createComponentLogger('CrewlyServer');
128
+ storageService;
129
+ tmuxService;
130
+ schedulerService;
131
+ messageSchedulerService;
132
+ activityMonitorService;
133
+ teamActivityWebSocketService;
134
+ teamsJsonWatcherService;
135
+ apiController;
136
+ terminalGateway;
137
+ messageQueueService;
138
+ queueProcessorService;
139
+ threadStatusQueueService;
140
+ eventBusService;
141
+ /** BRIDGE-1: subscribes to autonomy events and creates WorkItems. */
142
+ eventToWorkItemBridge = null;
143
+ /** LEARN-1: subscribes to terminal task / mission:replanned events and auto-records learnings. */
144
+ autoLearningSubscriber = null;
145
+ // DF-1 #438 — symmetric to AutoLearningSubscriber; surfaces milestones
146
+ // to orc's chat queue.
147
+ milestoneNotificationSubscriber = null;
148
+ /** INBOUND-1: subscribes to request:created and tracks 5/10 min SLA on respond_to_user WIs. */
149
+ requestSlaSubscriber = null;
150
+ /** Pipeline-#4 follow-up: subscribes to request:created and auto-decomposes actionable L2 Requests via plan() → addToPool. */
151
+ requestDecomposeSubscriber = null;
152
+ requestStatusUpdateSubscriber = null;
153
+ requestCascadeSubscriber = null;
154
+ notifyReconciliationService;
155
+ systemResourceAlertService;
156
+ reconcilerService = null;
157
+ teamHealthWatchdog = null;
158
+ // Chat MVP Phase 1 — initialized lazily in `start()` after the HTTP
159
+ // server is created. Kept as fields so the shutdown path can close
160
+ // them cleanly and tests can reach in with a reference.
161
+ chatV2Gateway = null;
162
+ chatV2Dispatcher = null;
163
+ // Shutdown state
164
+ isShuttingDown = false;
165
+ healthMonitoringInterval = null;
166
+ constructor(config) {
167
+ // Resolve ~ to actual home directory
168
+ const resolveHomePath = (inputPath) => {
169
+ if (inputPath.startsWith('~/')) {
170
+ return path.join(os.homedir(), inputPath.slice(2));
171
+ }
172
+ if (inputPath === '~') {
173
+ return os.homedir();
174
+ }
175
+ return inputPath;
176
+ };
177
+ const defaultAgentmuxHome = config?.crewlyHome || process.env.CREWLY_HOME || '~/.crewly';
178
+ this.config = {
179
+ webPort: config?.webPort || parseIntWithFallback(process.env.WEB_PORT, 8787, 'WEB_PORT'),
180
+ crewlyHome: resolveHomePath(defaultAgentmuxHome),
181
+ defaultCheckInterval: config?.defaultCheckInterval ||
182
+ parseIntWithFallback(process.env.DEFAULT_CHECK_INTERVAL, 30, 'DEFAULT_CHECK_INTERVAL'),
183
+ autoCommitInterval: config?.autoCommitInterval || parseIntWithFallback(process.env.AUTO_COMMIT_INTERVAL, 30, 'AUTO_COMMIT_INTERVAL'),
184
+ headless: config?.headless ?? process.env.CREWLY_HEADLESS === 'true',
185
+ };
186
+ this.app = express();
187
+ this.httpServer = createServer(this.app);
188
+ this.io = new SocketIOServer(this.httpServer, {
189
+ cors: {
190
+ origin: process.env.NODE_ENV === 'production'
191
+ ? ['https://crewlyai.com', 'https://www.crewlyai.com']
192
+ : '*',
193
+ methods: ['GET', 'POST'],
194
+ },
195
+ // Configure ping/pong to keep connections alive
196
+ pingInterval: 10000, // Send ping every 10 seconds
197
+ pingTimeout: 5000, // Wait 5 seconds for pong response
198
+ // Prefer WebSocket transport for lower latency
199
+ transports: ['websocket', 'polling'],
200
+ // Allow transport upgrade from polling to websocket
201
+ allowUpgrades: true,
202
+ // Increase buffer size for large terminal output
203
+ maxHttpBufferSize: 5 * 1024 * 1024, // 5MB
204
+ perMessageDeflate: false,
205
+ // CRITICAL: Prevent Engine.IO from destroying non-matching upgrade requests.
206
+ // Crewly in Chrome (BrowserBridgeService) shares this httpServer and handles /ws/browser upgrades.
207
+ // Without this, Engine.IO sets a 1-second timer to socket.end() any upgrade
208
+ // that doesn't match /socket.io/ — killing Crewly in Chrome connections before
209
+ // any data is exchanged (manifests as "Invalid frame header" errors).
210
+ destroyUpgrade: false,
211
+ });
212
+ this.initializeServices();
213
+ this.configureMiddleware();
214
+ this.configureRoutes();
215
+ this.configureWebSocket();
216
+ }
217
+ initializeServices() {
218
+ this.storageService = StorageService.getInstance(this.config.crewlyHome);
219
+ this.tmuxService = new TmuxService();
220
+ this.schedulerService = new SchedulerService(this.storageService);
221
+ this.messageSchedulerService = new MessageSchedulerService(this.tmuxService, this.storageService);
222
+ this.activityMonitorService = ActivityMonitorService.getInstance();
223
+ // V3-only as of spec 2026-05-06-task-management-v1-deprecation.md.
224
+ // TaskTrackingService is deleted; in-progress task data and lifecycle
225
+ // events come from TaskPoolService + EventBusService respectively.
226
+ this.teamActivityWebSocketService = new TeamActivityWebSocketService(this.storageService, this.tmuxService);
227
+ this.teamsJsonWatcherService = new TeamsJsonWatcherService();
228
+ this.apiController = new ApiController(this.storageService, this.tmuxService, this.schedulerService, this.messageSchedulerService);
229
+ // Wire up reliable delivery: both schedulers use AgentRegistrationService
230
+ // for retry + progressive verification + background stuck-detection
231
+ this.messageSchedulerService.setAgentRegistrationService(this.apiController.agentRegistrationService);
232
+ this.schedulerService.setAgentRegistrationService(this.apiController.agentRegistrationService);
233
+ // Initialize message queue services (with disk persistence)
234
+ // NOTE: Must be created before services that depend on them (scheduler, thread status queue)
235
+ this.messageQueueService = new MessageQueueService(this.config.crewlyHome);
236
+ const responseRouter = new ResponseRouterService();
237
+ this.queueProcessorService = new QueueProcessorService(this.messageQueueService, responseRouter, this.apiController.agentRegistrationService);
238
+ // Initialize event bus service for agent lifecycle pub/sub
239
+ // NOTE: Must be created before services that depend on it (agent registration, thread status queue)
240
+ this.eventBusService = new EventBusService();
241
+ this.eventBusService.setMessageQueueService(this.messageQueueService);
242
+ // Now wire services that depend on messageQueueService and eventBusService
243
+ this.schedulerService.setMessageQueueService(this.messageQueueService);
244
+ this.schedulerService.setActivityMonitor(this.activityMonitorService);
245
+ // #167: Wire scheduler into agent registration for DLQ drain on activation
246
+ this.apiController.agentRegistrationService.setSchedulerService(this.schedulerService);
247
+ // Architecture Upgrade Phase 6: Wire EventBusService for standing task subscriptions
248
+ this.apiController.agentRegistrationService.setEventBusService(this.eventBusService);
249
+ this.terminalGateway = new TerminalGateway(this.io);
250
+ // Set terminal gateway singleton for chat integration
251
+ setTerminalGateway(this.terminalGateway);
252
+ // Initialize ChatGateway for chat message forwarding
253
+ // This sets up the event listeners that forward chat messages to WebSocket clients
254
+ initializeChatGateway(this.io).catch((error) => {
255
+ this.logger.error('Failed to initialize ChatGateway', {
256
+ error: error instanceof Error ? error.message : String(error),
257
+ });
258
+ });
259
+ // Connect WebSocket service to terminal gateway for broadcasting
260
+ this.teamActivityWebSocketService.setTerminalGateway(this.terminalGateway);
261
+ // Connect teams.json watcher to team activity service for real-time updates
262
+ this.teamsJsonWatcherService.setTeamActivityService(this.teamActivityWebSocketService);
263
+ // Initialize thread status queue for tracking inbound message lifecycle
264
+ this.threadStatusQueueService = new ThreadStatusQueueService(this.config.crewlyHome);
265
+ responseRouter.setThreadStatusQueue(this.threadStatusQueueService);
266
+ this.queueProcessorService.setThreadStatusQueue(this.threadStatusQueueService);
267
+ // INBOUND-1.f1: Wire EventBus into the TaskPool singleton so addToPool
268
+ // can publish `workitem:queued` events. Must run before any code path
269
+ // triggers addToPool — the slack listener / TaskPool router below both
270
+ // depend on this for the auto-close path b chain. Idempotent.
271
+ TaskPoolService.getInstance().setEventBusService(this.eventBusService);
272
+ // Memory: TaskHistorySubscriber listens on the bus for
273
+ // task:done_by_worker / task:rejected / task:cancelled and writes
274
+ // the resulting TaskHistoryEntry into ProjectMemory. This is the
275
+ // load-bearing write path behind "who in my team has done X?" —
276
+ // the orchestrator queries via recall(capability:...). Must run
277
+ // AFTER TaskPoolService is wired to the bus (above) so the events
278
+ // it publishes have a subscriber to consume them.
279
+ const taskHistorySubscriber = new TaskHistorySubscriber({
280
+ eventBus: this.eventBusService,
281
+ projectMemoryService: ProjectMemoryService.getInstance(),
282
+ taskPoolService: TaskPoolService.getInstance(),
283
+ });
284
+ taskHistorySubscriber.start();
285
+ // P1 Bug B (Pool umbrella WI 72ca743a): Wire RequestService into the
286
+ // TaskPool singleton so addToPool intrinsically links new WIs into
287
+ // their parent Request.workItemIds[] — independent of the
288
+ // subscriber-driven path (request-sla.subscriber, V3DataService).
289
+ // Pre-fix, manual / programmatic / cron callers that bypassed the
290
+ // event chain left Requests with empty workItemIds[]. The linker is
291
+ // idempotent (request.service.ts:328 short-circuits on duplicate id)
292
+ // so subscriber-driven linking stays as belt-and-suspenders.
293
+ TaskPoolService.getInstance().setRequestService(RequestService.getInstance());
294
+ // P1 Bug C (Pool umbrella WI 72ca743a, sub-WI Bug C): Wire the inverse
295
+ // dependency — RequestService → TaskPool — so RequestService.update
296
+ // can refuse `Request → done` when any child WorkItem is still in a
297
+ // non-terminal state. Bug B (above) makes Request.workItemIds[]
298
+ // authoritative on every addToPool; Bug C makes the closure honor
299
+ // that data. The setter is duck-typed on IWorkItemQueryable so
300
+ // neither side needs a static import of the other.
301
+ RequestService.getInstance().setTaskPoolService(TaskPoolService.getInstance());
302
+ // Atlas 2026-05-23 fix: wire the agent-liveness gate so claimFromPool /
303
+ // claimSpecificItem refuse to put a WI into `running` when the requesting
304
+ // agent's session is dead. delegate-task's "self-heal fix #1" used to
305
+ // pre-claim WIs for inactive targets, which short-circuited the
306
+ // reconciler's wake-rule and left WIs blocked indefinitely. With this
307
+ // probe wired, rejected pre-claims leave the WI in `queued` so the
308
+ // reconciler can fire detectUnclaimedTasks → start-agent → the agent
309
+ // auto-claims when it boots. The probe is the same lightweight check
310
+ // (PTY session exists + child process alive) used by chat-v2 and slack.
311
+ // Wrapped in async-IIFE because initializeServices() is sync.
312
+ void (async () => {
313
+ const { isAgentActive } = await import('./services/orchestrator/orchestrator-status.service.js');
314
+ TaskPoolService.getInstance().setIsAgentActive(isAgentActive);
315
+ })();
316
+ // Wire Task Pool router so [TASK]-prefixed messages route through the pool
317
+ this.queueProcessorService.setTaskPoolRouter(async (messageContent, targetSession) => {
318
+ const { createWorkItem } = await import('./types/v2/work-item.types.js');
319
+ const taskPool = TaskPoolService.getInstance();
320
+ const workItem = createWorkItem({
321
+ type: 'delegate',
322
+ owner: 'orchestrator',
323
+ target: targetSession,
324
+ title: messageContent.slice(0, 100),
325
+ description: messageContent,
326
+ });
327
+ await taskPool.addToPool(workItem);
328
+ const claimed = await taskPool.claimFromPool(targetSession);
329
+ return claimed !== null;
330
+ });
331
+ // Wire thread status queue with scheduler and event bus for follow-up tracking
332
+ this.threadStatusQueueService.setSchedulerService(this.schedulerService);
333
+ this.threadStatusQueueService.setEventBusService(this.eventBusService);
334
+ // Wire queue service into controllers
335
+ setChatMessageQueueService(this.messageQueueService);
336
+ setChatThreadStatusQueueService(this.threadStatusQueueService);
337
+ setMessagingControllerQueueService(this.messageQueueService);
338
+ // LLM-wiki bookkeep trigger (Steve 2026-05-22 design point #5):
339
+ // every 30 minutes (configurable via CREWLY_WIKI_BOOKKEEP_INTERVAL_MS),
340
+ // scan every known vault. When recentMd ≥ threshold OR there are
341
+ // duplicate clusters, enqueue a [BOOKKEEP] message to ORC so it can
342
+ // run wiki-bookkeep + decide which pages to consolidate.
343
+ void (async () => {
344
+ try {
345
+ const { WikiBookkeepTriggerService } = await import('./services/wiki/wiki-bookkeep-trigger.service.js');
346
+ const intervalMs = Number(process.env['CREWLY_WIKI_BOOKKEEP_INTERVAL_MS'] ?? 30 * 60 * 1000);
347
+ const debounceMs = Number(process.env['CREWLY_WIKI_BOOKKEEP_DEBOUNCE_MS'] ?? 6 * 3600 * 1000);
348
+ const trigger = new WikiBookkeepTriggerService({
349
+ intervalMs,
350
+ debounceMs,
351
+ fireFn: async (vaultPath, report) => {
352
+ if (!this.messageQueueService)
353
+ return;
354
+ const summary = `[BOOKKEEP] vault=${vaultPath} | ${report.recentMdCount} new md(s) in last ${report.windowDays}d (threshold ${report.threshold}) | duplicates=${report.duplicateCandidates.length} | pending-queue=${report.queue.pending}. Run wiki-bookkeep to drain.`;
355
+ this.messageQueueService.enqueue({
356
+ content: summary,
357
+ conversationId: 'system:wiki-bookkeep',
358
+ source: 'system_event',
359
+ });
360
+ },
361
+ });
362
+ WikiBookkeepTriggerService.setInstance(trigger);
363
+ trigger.start();
364
+ }
365
+ catch (bookkeepErr) {
366
+ this.logger.warn('Wiki bookkeep trigger failed to start (non-fatal)', {
367
+ error: bookkeepErr.message,
368
+ });
369
+ }
370
+ // LLM-wiki reflect trigger (2026-05-24): if a vault has had zero
371
+ // wiki-queue-add fires in the last `quietWindowMs`, ping ORC with
372
+ // a `[REFLECT-WIKI]` message so it sweeps recent conversation
373
+ // for worth-saving content. Solves the "the queue is never used"
374
+ // problem found during the 2026-05-24 audit.
375
+ try {
376
+ const { WikiReflectTriggerService } = await import('./services/wiki/wiki-reflect-trigger.service.js');
377
+ const reflectInterval = Number(process.env['CREWLY_WIKI_REFLECT_INTERVAL_MS'] ?? 60 * 60 * 1000);
378
+ const reflectQuiet = Number(process.env['CREWLY_WIKI_REFLECT_QUIET_WINDOW_MS'] ?? 4 * 60 * 60 * 1000);
379
+ const reflectDebounce = Number(process.env['CREWLY_WIKI_REFLECT_DEBOUNCE_MS'] ?? 4 * 60 * 60 * 1000);
380
+ const reflectTrigger = new WikiReflectTriggerService({
381
+ intervalMs: reflectInterval,
382
+ quietWindowMs: reflectQuiet,
383
+ debounceMs: reflectDebounce,
384
+ fireFn: async (meta) => {
385
+ if (!this.messageQueueService)
386
+ return;
387
+ const lastAddText = meta.msSinceLastQueueAdd === Number.POSITIVE_INFINITY
388
+ ? 'never'
389
+ : `${Math.floor(meta.msSinceLastQueueAdd / (60 * 60 * 1000))}h ago`;
390
+ const summary = `[REFLECT-WIKI] vault=${meta.vaultPath} | last wiki-queue-add: ${lastAddText} | total queue items: ${meta.totalQueueItems}. Sweep the recent conversation for worth-saving content (decisions, customer facts, learnings) and call wiki-queue-add for each, OR reply "nothing this period" if there genuinely is nothing.`;
391
+ this.messageQueueService.enqueue({
392
+ content: summary,
393
+ conversationId: 'system:wiki-reflect',
394
+ source: 'system_event',
395
+ });
396
+ },
397
+ });
398
+ WikiReflectTriggerService.setInstance(reflectTrigger);
399
+ reflectTrigger.start();
400
+ }
401
+ catch (reflectErr) {
402
+ this.logger.warn('Wiki reflect trigger failed to start (non-fatal)', {
403
+ error: reflectErr.message,
404
+ });
405
+ }
406
+ // LLM-wiki → WorkItem bridge (2026-05-27): pending wiki queue
407
+ // items + legacy migration candidates become claimable
408
+ // WorkItems in the V3 pool. Replaces the bookkeep/reflect
409
+ // "shouldFire ≥ threshold" model — even 1 pending item now
410
+ // surfaces in /work-items and idle agents drain it through the
411
+ // standard claim loop. PR-1: target=crewly-orc for both kinds
412
+ // because the wiki-process-queue / wiki-migrate skills live
413
+ // under `config/skills/orchestrator/` today.
414
+ try {
415
+ const { WikiWorkItemBridgeService } = await import('./services/wiki/wiki-workitem-bridge.service.js');
416
+ const intervalMs = Number(process.env['CREWLY_WIKI_BRIDGE_INTERVAL_MS'] ?? 10 * 60 * 1000);
417
+ const targetAgent = process.env['CREWLY_WIKI_BRIDGE_TARGET'] ?? 'crewly-orc';
418
+ const maxCreatesPerTick = Number(process.env['CREWLY_WIKI_BRIDGE_MAX_PER_TICK'] ?? 2);
419
+ const cooldownMs = Number(process.env['CREWLY_WIKI_BRIDGE_COOLDOWN_MS'] ?? 30 * 60 * 1000);
420
+ const bridge = new WikiWorkItemBridgeService({
421
+ intervalMs,
422
+ targetAgent,
423
+ maxCreatesPerTick,
424
+ cooldownMs,
425
+ });
426
+ WikiWorkItemBridgeService.setInstance(bridge);
427
+ bridge.start();
428
+ }
429
+ catch (bridgeErr) {
430
+ this.logger.warn('Wiki work-item bridge failed to start (non-fatal)', {
431
+ error: bridgeErr.message,
432
+ });
433
+ }
434
+ // ORC delivery enforcer (2026-05-23 incident fix): watches for
435
+ // agent `[DONE]` posts to slack threads that ORC hasn't yet
436
+ // forwarded via reply-slack. Fires `[DELIVER_REQUIRED]` nudges
437
+ // at 3 / 10 / 30 min after the agent finished, until ORC
438
+ // actually delivers OR the budget is exhausted.
439
+ try {
440
+ const { OrcDeliveryEnforcerService } = await import('./services/orc/orc-delivery-enforcer.service.js');
441
+ const enforcer = new OrcDeliveryEnforcerService({
442
+ reminderSink: ({ conversationId, text }) => {
443
+ if (!this.messageQueueService)
444
+ return;
445
+ this.messageQueueService.enqueue({
446
+ content: text,
447
+ conversationId,
448
+ source: 'system_event',
449
+ });
450
+ },
451
+ });
452
+ OrcDeliveryEnforcerService.setInstance(enforcer);
453
+ enforcer.start();
454
+ }
455
+ catch (enforcerErr) {
456
+ this.logger.warn('OrcDeliveryEnforcer failed to start (non-fatal)', {
457
+ error: enforcerErr.message,
458
+ });
459
+ }
460
+ })();
461
+ // Initialize system resource alert service for proactive monitoring
462
+ this.systemResourceAlertService = new SystemResourceAlertService();
463
+ this.teamsJsonWatcherService.setEventBusService(this.eventBusService);
464
+ this.activityMonitorService.setEventBusService(this.eventBusService);
465
+ setEventBusControllerService(this.eventBusService);
466
+ setTeamControllerEventBusService(this.eventBusService);
467
+ // Wire team-activity-websocket to EventBus so it reacts to V3
468
+ // WorkItem lifecycle events (replaces the legacy
469
+ // TaskTrackingService.on('task_workflow_event') bridge that was
470
+ // deleted with the v1 task-management subsystem).
471
+ this.teamActivityWebSocketService.setEventBus(this.eventBusService);
472
+ // V3-only autonomy: AgentAutoClaimService (started later in boot)
473
+ // is the single autonomy loop. The legacy AutoAssignService has
474
+ // been retired — see spec 2026-05-06-task-management-v1-deprecation.md.
475
+ // BRIDGE-1: subscribe to autonomy events (task:done_by_worker,
476
+ // task:rejected, task:blocked, team:all_tasks_done, mission:*) and
477
+ // create the appropriate WorkItem(s) — verification WI for TL on
478
+ // done_by_worker, retry WI / escalation WI on rejected, review WI on
479
+ // blocked / mission events. See `event-to-workitem-bridge.service.ts`
480
+ // for idempotency contract + retry cap + cron-recursion guard.
481
+ this.eventToWorkItemBridge = EventToWorkItemBridge.boot(this.eventBusService);
482
+ this.eventToWorkItemBridge.start();
483
+ // LEARN-1: subscribe to terminal task / mission:replanned events and
484
+ // auto-record a learning entry via MemoryService.recordLearning. Closes
485
+ // the prompt-driven "agents-forget-to-record" gap. See
486
+ // `auto-learning.subscriber.ts` for category mapping + idempotency
487
+ // contract (V1) and the V7/V9 self-checks in the co-located test.
488
+ this.autoLearningSubscriber = AutoLearningSubscriber.boot(this.eventBusService);
489
+ this.autoLearningSubscriber.start();
490
+ // DF-1 #438: symmetric notification subscriber. Same architectural
491
+ // pattern as AutoLearningSubscriber — listens to terminal lifecycle
492
+ // events (`task:verified`, `mission:replanned`) and enqueues a
493
+ // `[MILESTONE]` envelope into orc's chat queue. The QW-3 row in
494
+ // `config/roles/orchestrator/prompt.md` (#436) handles the
495
+ // always-forward-to-owner rule on the orc side; this subscriber
496
+ // closes the gap where an agent ships work but forgets to call
497
+ // `report-status --status milestone` (the agent-side QW-1 path).
498
+ this.milestoneNotificationSubscriber = new MilestoneNotificationSubscriber({
499
+ eventBus: this.eventBusService,
500
+ messageQueueService: this.messageQueueService,
501
+ });
502
+ this.milestoneNotificationSubscriber.start();
503
+ // INBOUND-1 + Pipeline-#4 follow-up: wire RequestService → bus, then
504
+ // boot both v3 subscribers (SLA tracker + auto-decompose). Order
505
+ // matters within the block: setRequestServiceEventBus must run
506
+ // BEFORE any code path can call RequestService.create() — the slack
507
+ // listener at line ~370 is the first hot caller, but the slack
508
+ // service hasn't been initialised yet at this point in boot, so
509
+ // we're safe.
510
+ //
511
+ // Failure-isolated (issue #465): the entire v3 subscriber boot is
512
+ // wrapped in try/catch so a wiring failure logs + continues rather
513
+ // than crashing the whole backend. Neither subscriber is essential
514
+ // to API liveness — degrading them is preferable to losing the
515
+ // process. A single catch block treats both as a unit because the
516
+ // failure mode is "wiring is broken, fix the deploy" not
517
+ // "intermittently flaky"; partial recovery would be unnecessary
518
+ // complexity for v1. B0 broadcast (line ~2336) and TriggerEngine
519
+ // boot (line ~1464) already have equivalent isolation; this brings
520
+ // the v3 subscriber block in line with that pattern.
521
+ try {
522
+ setRequestServiceEventBus(this.eventBusService);
523
+ this.requestSlaSubscriber = RequestSlaSubscriber.boot(this.eventBusService, RequestService.getInstance(), TaskPoolService.getInstance(), async ({ channelId, threadTs, messageText }) => {
524
+ // Production wiring of the 10-min escalation hook: nudge the user
525
+ // in the same Slack thread so they're never blind to the miss.
526
+ const slack = getSlackService();
527
+ await slack.sendMessage({
528
+ channelId,
529
+ threadTs,
530
+ text: messageText,
531
+ });
532
+ });
533
+ this.requestSlaSubscriber.start();
534
+ setRequestSlaSubscriber(this.requestSlaSubscriber);
535
+ // Pipeline-#4 follow-up: auto-decompose actionable L2 Requests on
536
+ // request:created. Sequenced AFTER the SLA subscriber so the
537
+ // respond_to_user WI seeding still runs first when both fire on
538
+ // the same event (deterministic listener-attach order; both run
539
+ // via the same in-process bus). Side note: order is semantically
540
+ // irrelevant — the linkWorkItem path keys on workitem:queued, not
541
+ // on relative listener position — but predictable startup ordering
542
+ // helps debug.
543
+ this.requestDecomposeSubscriber = RequestDecomposeSubscriber.boot(this.eventBusService, RequestService.getInstance(), TaskPoolService.getInstance());
544
+ this.requestDecomposeSubscriber.start();
545
+ setRequestDecomposeSubscriber(this.requestDecomposeSubscriber);
546
+ // Status-update subscriber: posts progress on Slack-originated
547
+ // Requests as their child WIs reach milestones, plus a heartbeat
548
+ // while work is still in flight. Closes the "long silence after
549
+ // orc's first ack" UX gap. Idempotent — duplicate boots are no-ops.
550
+ this.requestStatusUpdateSubscriber = new RequestStatusUpdateSubscriber({
551
+ eventBus: this.eventBusService,
552
+ requestService: RequestService.getInstance(),
553
+ taskPool: TaskPoolService.getInstance(),
554
+ slackPoster: async ({ channelId, text, threadTs }) => {
555
+ // Post via the in-process SlackService to avoid a self-HTTP
556
+ // hop. The /api/slack/send route's other side-effects (chat
557
+ // persistence, thread-status replied marker) don't apply
558
+ // to mid-thread heartbeat updates — those are only for
559
+ // the user's direct reply, not for orc's progress pings.
560
+ const slack = getSlackService();
561
+ if (!slack.isConnected())
562
+ return;
563
+ await slack.sendMessage({ channelId, text, threadTs });
564
+ },
565
+ heartbeatMinutes: 30,
566
+ });
567
+ this.requestStatusUpdateSubscriber.start();
568
+ // Cascade subscriber: keeps Request.status in sync with the
569
+ // aggregate state of its child WIs by reacting to live task
570
+ // lifecycle events. Closes the gap left by V3DataService's
571
+ // retired `v3:task_*` subscriptions (see 2026-05-09 dogfood
572
+ // note in request-cascade.subscriber.ts).
573
+ this.requestCascadeSubscriber = new RequestCascadeSubscriber({
574
+ eventBus: this.eventBusService,
575
+ requestService: RequestService.getInstance(),
576
+ taskPool: TaskPoolService.getInstance(),
577
+ notifier: this.eventBusService,
578
+ });
579
+ this.requestCascadeSubscriber.start();
580
+ }
581
+ catch (subscriberBootErr) {
582
+ // Degraded mode: SLA tracking + auto-decompose are off, but the
583
+ // API surface and rest of the backend continue to serve. Ops can
584
+ // grep for `v3 subscriber boot failed` in logs to triage.
585
+ this.logger.error('v3 subscriber boot failed — degrading SLA + auto-decompose paths, continuing backend startup', {
586
+ error: subscriberBootErr instanceof Error
587
+ ? subscriberBootErr.message
588
+ : String(subscriberBootErr),
589
+ });
590
+ // Best-effort cleanup of any partial wiring so a later restart
591
+ // doesn't see stale singletons. The setters are idempotent.
592
+ setRequestSlaSubscriber(null);
593
+ setRequestDecomposeSubscriber(null);
594
+ setRequestServiceEventBus(null);
595
+ this.requestSlaSubscriber = null;
596
+ this.requestDecomposeSubscriber = null;
597
+ if (this.requestStatusUpdateSubscriber) {
598
+ try {
599
+ this.requestStatusUpdateSubscriber.stop();
600
+ }
601
+ catch { /* best-effort */ }
602
+ this.requestStatusUpdateSubscriber = null;
603
+ }
604
+ if (this.requestCascadeSubscriber) {
605
+ try {
606
+ this.requestCascadeSubscriber.stop();
607
+ }
608
+ catch { /* best-effort */ }
609
+ this.requestCascadeSubscriber = null;
610
+ }
611
+ }
612
+ // Initialize Slack thread store for persistent thread conversations
613
+ const slackThreadStore = new SlackThreadStoreService(this.config.crewlyHome);
614
+ setSlackThreadStore(slackThreadStore);
615
+ this.eventBusService.setSlackThreadStore(slackThreadStore);
616
+ // Initialize Google Chat thread store for persistent thread conversations
617
+ const gchatThreadStore = new GoogleChatThreadStoreService(this.config.crewlyHome);
618
+ setGchatThreadStore(gchatThreadStore);
619
+ // Initialize Slack image service for downloading images from Slack messages
620
+ const slackImageService = new SlackImageService(this.config.crewlyHome);
621
+ setSlackImageService(slackImageService);
622
+ // Wire agent:idle events to thread status queue for delegation completion
623
+ this.eventBusService.on('eventPublished', (event) => {
624
+ if (event.type === 'agent:idle' && event.sessionName) {
625
+ try {
626
+ const waitingThreads = this.threadStatusQueueService.getByStatus('replied_waiting_actions');
627
+ for (const entry of waitingThreads) {
628
+ if (entry.delegatedAgents?.includes(event.sessionName)) {
629
+ this.threadStatusQueueService.markDelegationsComplete(entry.threadKey);
630
+ }
631
+ }
632
+ }
633
+ catch (err) {
634
+ this.logger.warn('Failed to check thread delegation completion on agent:idle', {
635
+ sessionName: event.sessionName,
636
+ error: err instanceof Error ? err.message : String(err),
637
+ });
638
+ }
639
+ // V3: Auto-close open Requests when the orchestrator goes idle
640
+ // Handles direct responses (no WorkItem delegation)
641
+ if (event.sessionName === ORCHESTRATOR_SESSION_NAME) {
642
+ setImmediate(() => this.autoCloseOpenRequests());
643
+ }
644
+ }
645
+ });
646
+ // Shared LiveReconcilerDataProvider instance used by both the
647
+ // Reconciler service and the TeamHealthWatchdog data provider.
648
+ // Sharing is required so the memory-pressure broadcast state
649
+ // (`consecutivePressureSkips` / `lastPressureNotifiedAt`) is
650
+ // counted ONCE per sustained pressure episode. Two separate
651
+ // instances would each cross the 5-skip threshold around the same
652
+ // time and publish two `system:memory_pressure` events with
653
+ // distinct `event.id` values (no debounce match), so orc would
654
+ // receive duplicates. See follow-up #5 from PR #543 review.
655
+ const liveDataProvider = new LiveReconcilerDataProvider();
656
+ liveDataProvider.setEventBus(this.eventBusService);
657
+ // Wire AgentRegistrationService so the memory-pressure eviction
658
+ // path can terminate idle agents to free wake slots (issue surfaced
659
+ // 2026-05-16: queued WIs for inactive Atlas could not get woken
660
+ // because the floor was held by idle product/marketing agents).
661
+ liveDataProvider.setAgentRegistrationService(this.apiController.agentRegistrationService);
662
+ // Initialize Reconciler Service (V2 — system truth recomputation)
663
+ {
664
+ const reconcilerLogger = LoggerService.getInstance().createComponentLogger('ReconcilerInit');
665
+ // Live data provider — connects Reconciler to Task Pool, Claim Service,
666
+ // Storage Service, and Agent Suspend for real reconciliation including
667
+ // Hybrid Wake (auto-rehydrating suspended agents when tasks go unclaimed).
668
+ this.reconcilerService = new ReconcilerService(liveDataProvider);
669
+ setReconcilerService(this.reconcilerService);
670
+ // Subscribe EventBus events for targeted reconciliation
671
+ if (this.reconcilerService) {
672
+ const reconciler = this.reconcilerService;
673
+ // 2026-05-15 Steve dogfood: the prior `subscribe({ subscriberSession:
674
+ // '__reconciler__' })` loop here was redundant AND wrong. The
675
+ // subscribe path routes critical events through
676
+ // `MessageQueueService.enqueue` keyed by `targetSession`, which
677
+ // then fails noisily because `__reconciler__` is not a PTY
678
+ // session ("Session '__reconciler__' does not exist", every
679
+ // reconciler tick). The in-process `event_published` listener
680
+ // below already drives the reconciler — no second wiring needed.
681
+ // Removed the subscribe-block; if a future change needs persistent
682
+ // metadata for the reconciler subscription, attach it as a real
683
+ // in-process subscriber via `onInProcess` rather than the
684
+ // session-targeted `subscribe` API.
685
+ // Listen for all published events and trigger targeted reconciliation
686
+ this.eventBusService.on('event_published', (payload) => {
687
+ const targetedEventTypes = ['task:completed', 'task:failed', 'agent:idle', 'agent:inactive'];
688
+ if (targetedEventTypes.includes(payload.eventType)) {
689
+ reconciler.runFast().catch((err) => {
690
+ reconcilerLogger.warn('Event-triggered fast reconcile failed', {
691
+ eventType: payload.eventType,
692
+ error: err instanceof Error ? err.message : String(err),
693
+ });
694
+ });
695
+ }
696
+ });
697
+ }
698
+ reconcilerLogger.info('ReconcilerService initialized and wired to EventBus');
699
+ }
700
+ // Initialize Team-Health-Watchdog (THW) — Layer 4 liveness aggregator
701
+ // Lazy singleton wiring per Sam's etiquette nudge: no module-load
702
+ // side effects; controller and /api/health resolve via accessor.
703
+ {
704
+ const thwLogger = LoggerService.getInstance().createComponentLogger('TeamHealthInit');
705
+ try {
706
+ const config = loadTeamHealthConfig({
707
+ warn: (msg, meta) => thwLogger.warn(msg, meta ?? {}),
708
+ info: (msg, meta) => thwLogger.info(msg, meta ?? {}),
709
+ });
710
+ if (!config.enabled) {
711
+ thwLogger.info('TeamHealthWatchdog disabled by config; skipping init.');
712
+ }
713
+ else if (!this.reconcilerService) {
714
+ thwLogger.warn('Reconciler not available; skipping TeamHealthWatchdog init.');
715
+ }
716
+ else {
717
+ // Reuse the shared LiveReconcilerDataProvider declared
718
+ // above (follow-up #5 from PR #543 review) — instantiating
719
+ // a second copy would double-broadcast memory-pressure.
720
+ const dataProvider = new LiveTeamHealthDataProvider({
721
+ reconcilerProvider: liveDataProvider,
722
+ getTeams: async () => StorageService.getInstance().getTeams(),
723
+ bootedAt: new Date(),
724
+ });
725
+ // Phase 0 alert sink: log-only. Slack delivery wires up
726
+ // in Phase 1 (per §G phasing). Shadow-mode is the
727
+ // default config.json setting, so this sink is mostly
728
+ // invoked for the recovery announcement path.
729
+ const alertSink = {
730
+ deliver: async (decision) => {
731
+ thwLogger.info('THW alert (Phase 0 log-only sink)', {
732
+ teamId: decision.detection.teamId,
733
+ verdict: decision.effectiveVerdict,
734
+ channel: decision.channel,
735
+ message: decision.message,
736
+ });
737
+ },
738
+ };
739
+ this.teamHealthWatchdog = new TeamHealthWatchdogService({
740
+ config,
741
+ dataProvider,
742
+ alertSink,
743
+ bootedAt: new Date(),
744
+ logger: {
745
+ info: (msg, meta) => thwLogger.info(msg, meta ?? {}),
746
+ warn: (msg, meta) => thwLogger.warn(msg, meta ?? {}),
747
+ error: (msg, meta) => thwLogger.error(msg, meta ?? {}),
748
+ },
749
+ });
750
+ setTeamHealthWatchdogSingleton(this.teamHealthWatchdog);
751
+ this.teamHealthWatchdog.start();
752
+ thwLogger.info('TeamHealthWatchdog initialized', {
753
+ shadowMode: config.shadowMode,
754
+ sweepIntervalMs: config.sweepIntervalMs,
755
+ });
756
+ }
757
+ }
758
+ catch (err) {
759
+ thwLogger.error('Failed to initialize TeamHealthWatchdog (continuing without it)', {
760
+ error: err instanceof Error ? err.message : String(err),
761
+ });
762
+ }
763
+ }
764
+ // Initialize Fission Guard Service
765
+ {
766
+ const fissionLogger = LoggerService.getInstance().createComponentLogger('FissionInit');
767
+ try {
768
+ const taskPool = TaskPoolService.getInstance();
769
+ // FissionDataProvider backed by TaskPoolService storage
770
+ const fissionDataProvider = {
771
+ async getWorkItemById(id) {
772
+ const items = await taskPool.getAllItems();
773
+ return items.find((i) => i.id === id) ?? null;
774
+ },
775
+ async countMissionWorkItems(missionId) {
776
+ const items = await taskPool.getAllItems();
777
+ return items.filter((i) => i.missionId === missionId).length;
778
+ },
779
+ async countChildWorkItems(parentWorkItemId) {
780
+ const items = await taskPool.getAllItems();
781
+ return items.filter((i) => i.parentWorkItemId === parentWorkItemId).length;
782
+ },
783
+ };
784
+ const fissionService = FissionGuardService.init(fissionDataProvider);
785
+ setFissionGuardService(fissionService);
786
+ fissionLogger.info('FissionGuardService initialized');
787
+ }
788
+ catch (fissionErr) {
789
+ fissionLogger.warn('FissionGuardService initialization failed (non-fatal)', {
790
+ error: fissionErr instanceof Error ? fissionErr.message : String(fissionErr),
791
+ });
792
+ }
793
+ }
794
+ // Broadcast queue events via Socket.IO
795
+ this.messageQueueService.on('enqueued', (msg) => {
796
+ this.io.emit(MESSAGE_QUEUE_CONSTANTS.SOCKET_EVENTS.MESSAGE_ENQUEUED, msg);
797
+ });
798
+ this.messageQueueService.on('processing', (msg) => {
799
+ this.io.emit(MESSAGE_QUEUE_CONSTANTS.SOCKET_EVENTS.MESSAGE_PROCESSING, msg);
800
+ });
801
+ this.messageQueueService.on('completed', (msg) => {
802
+ this.io.emit(MESSAGE_QUEUE_CONSTANTS.SOCKET_EVENTS.MESSAGE_COMPLETED, msg);
803
+ });
804
+ this.messageQueueService.on('failed', (msg) => {
805
+ this.io.emit(MESSAGE_QUEUE_CONSTANTS.SOCKET_EVENTS.MESSAGE_FAILED, msg);
806
+ });
807
+ this.messageQueueService.on('statusUpdate', (status) => {
808
+ this.io.emit(MESSAGE_QUEUE_CONSTANTS.SOCKET_EVENTS.STATUS_UPDATE, status);
809
+ });
810
+ }
811
+ configureMiddleware() {
812
+ // Security middleware
813
+ this.app.use(helmet({
814
+ contentSecurityPolicy: {
815
+ directives: {
816
+ defaultSrc: ["'self'"],
817
+ styleSrc: ["'self'", "'unsafe-inline'"],
818
+ scriptSrc: ["'self'", "'unsafe-eval'"],
819
+ imgSrc: ["'self'", 'data:', 'https:', 'blob:'],
820
+ connectSrc: ["'self'", 'ws:', 'wss:', 'blob:'],
821
+ // Disable upgrade-insecure-requests for HTTP-only deployments (ESTestNode)
822
+ // Without this, browsers on HTTP upgrade all asset requests to HTTPS → ERR_SSL_PROTOCOL_ERROR
823
+ upgradeInsecureRequests: null,
824
+ },
825
+ },
826
+ }));
827
+ // CORS — allow Cloud Console frontend and localhost OSS instances
828
+ const CORS_ALLOWED_ORIGINS = process.env['CORS_ALLOWED_ORIGINS']
829
+ ? process.env['CORS_ALLOWED_ORIGINS'].split(',')
830
+ : ['https://crewlyai.com', 'https://www.crewlyai.com', 'http://localhost:8787', 'http://localhost:3000'];
831
+ this.app.use(cors({
832
+ origin: process.env.NODE_ENV === 'production'
833
+ ? CORS_ALLOWED_ORIGINS
834
+ : '*',
835
+ credentials: true,
836
+ }));
837
+ // Logging
838
+ this.app.use(morgan(process.env.NODE_ENV === 'production' ? 'combined' : 'dev'));
839
+ // Body parsing — `verify` captures the raw bytes so the error handler
840
+ // below can log the exact payload when JSON parsing fails. Without this
841
+ // we only see the position-of-failure, not the bytes.
842
+ this.app.use(express.json({
843
+ limit: '10mb',
844
+ verify: (req, _res, buf) => {
845
+ req.rawBody = buf.toString('utf8');
846
+ },
847
+ }));
848
+ this.app.use(express.urlencoded({ extended: true, limit: '10mb' }));
849
+ // Note: Static files are configured in configureRoutes() after API routes
850
+ }
851
+ configureRoutes() {
852
+ // Agent heartbeat middleware - any API call with X-Agent-Session header updates heartbeat
853
+ this.app.use('/api', agentHeartbeatMiddleware);
854
+ // API routes
855
+ this.app.use('/api', createApiRoutes(this.apiController));
856
+ // Health check (enhanced with mode and agent info)
857
+ this.app.get('/health', (req, res) => {
858
+ const versionService = VersionCheckService.getInstance();
859
+ const cachedCheck = versionService.getCachedCheckResult();
860
+ // Count active agents from the session backend.
861
+ // listSessions() returns names of all active sessions, so
862
+ // total and active counts are equal (only live sessions are listed).
863
+ let agentCount = 0;
864
+ try {
865
+ const sessionBackend = getSessionBackendSync();
866
+ if (sessionBackend) {
867
+ agentCount = sessionBackend.listSessions().length;
868
+ }
869
+ }
870
+ catch {
871
+ // Session backend may not be initialized yet
872
+ }
873
+ // #199: Safely resolve version — findPackageRoot may fail from global install paths
874
+ let version = cachedCheck?.currentVersion ?? null;
875
+ if (!version) {
876
+ try {
877
+ version = versionService.getLocalVersion();
878
+ }
879
+ catch {
880
+ version = process.env.npm_package_version || 'unknown';
881
+ }
882
+ }
883
+ // THW self-instrumentation (§F.3): surface last-sweep age + degraded
884
+ // flag so the watchdog-watchdog (§E.8) bubbles up here. Fail-soft
885
+ // per Sam's etiquette nudge — when the singleton isn't ready, return
886
+ // status:"warming" rather than 5xx.
887
+ const watchdog = getTeamHealthWatchdogSingleton();
888
+ const teamHealthBlock = watchdog
889
+ ? {
890
+ status: watchdog.isDegraded() ? 'degraded' : (watchdog.isActive() ? 'ok' : 'inactive'),
891
+ last_sweep_age_ms: watchdog.getLastSweepAgeMs(),
892
+ shadowMode: watchdog.getLastSweep()?.shadowMode ?? null,
893
+ }
894
+ : { status: 'warming', last_sweep_age_ms: -1, shadowMode: null };
895
+ res.json({
896
+ status: 'healthy',
897
+ timestamp: new Date().toISOString(),
898
+ uptime: process.uptime(),
899
+ version,
900
+ latestVersion: cachedCheck?.latestVersion ?? null,
901
+ updateAvailable: cachedCheck?.updateAvailable ?? false,
902
+ mode: this.config.headless ? 'headless' : 'standard',
903
+ agents: {
904
+ active: agentCount,
905
+ total: agentCount,
906
+ },
907
+ team_health: teamHealthBlock,
908
+ });
909
+ });
910
+ // H5 quick entry static page (served regardless of headless mode)
911
+ {
912
+ const projectRoot = findPackageRoot(__dirname);
913
+ const h5StaticPath = path.join(projectRoot, 'backend/src/static/h5');
914
+ this.app.use('/h5', express.static(h5StaticPath));
915
+ }
916
+ // Static files for frontend (skip in headless mode)
917
+ if (!this.config.headless) {
918
+ // Use findPackageRoot() so this works both in dev mode (backend/src/)
919
+ // and in compiled/npm-installed mode (dist/backend/backend/src/)
920
+ const projectRoot = findPackageRoot(__dirname);
921
+ const frontendPath = path.join(projectRoot, 'frontend/dist');
922
+ this.app.use(express.static(frontendPath));
923
+ // Serve frontend for all other routes (SPA)
924
+ // Skip /api/ and /health paths so addon-registered API routes are reachable
925
+ this.app.get('*', (req, res, next) => {
926
+ if (req.path.startsWith('/api/') || req.path === '/health') {
927
+ return next();
928
+ }
929
+ const frontendIndexPath = path.join(projectRoot, 'frontend/dist/index.html');
930
+ res.sendFile(frontendIndexPath);
931
+ });
932
+ }
933
+ else {
934
+ this.logger.info('Headless mode: frontend serving disabled (API-only)');
935
+ }
936
+ // Error handling middleware
937
+ this.app.use((err, req, res, next) => {
938
+ const rawBody = req.rawBody;
939
+ this.logger.error('Request error', {
940
+ error: err.message,
941
+ stack: err.stack,
942
+ url: `${req.method} ${req.originalUrl}`,
943
+ contentType: req.headers['content-type'],
944
+ contentLength: req.headers['content-length'],
945
+ rawBodyLength: rawBody?.length,
946
+ rawBody: rawBody ? JSON.stringify(rawBody) : undefined,
947
+ });
948
+ const status = err.statusCode
949
+ ?? err.status
950
+ ?? 500;
951
+ res.status(status).json({
952
+ success: false,
953
+ error: process.env.NODE_ENV === 'production'
954
+ ? 'Internal server error'
955
+ : err.message,
956
+ });
957
+ });
958
+ }
959
+ configureWebSocket() {
960
+ this.io.on('connection', (socket) => {
961
+ this.logger.info('Client connected', { socketId: socket.id });
962
+ socket.on('disconnect', () => {
963
+ this.logger.info('Client disconnected', { socketId: socket.id });
964
+ });
965
+ });
966
+ // Connect terminal output to WebSocket
967
+ this.tmuxService.on('output', (output) => {
968
+ this.io.emit('terminal_output', output);
969
+ });
970
+ // Forward scheduler events
971
+ this.schedulerService.on('check_executed', (data) => {
972
+ this.io.emit('check_executed', data);
973
+ });
974
+ this.schedulerService.on('check_scheduled', (data) => {
975
+ this.io.emit('check_scheduled', data);
976
+ });
977
+ }
978
+ async start() {
979
+ try {
980
+ // Validate environment configuration (fail fast with clear errors)
981
+ const { validateEnvConfig, logEnvValidation } = await import('./services/core/env.config.js');
982
+ const envValidation = validateEnvConfig();
983
+ logEnvValidation(envValidation);
984
+ if (!envValidation.valid) {
985
+ throw new Error('Environment configuration validation failed — see errors above');
986
+ }
987
+ // Initialize OpenTelemetry tracing (early, before other services)
988
+ const { TracingService } = await import('./services/core/tracing.service.js');
989
+ TracingService.getInstance().initialize();
990
+ // Expose queue instance for cross-machine message routing (used by MessageRouterService)
991
+ const { setMessageQueueInstance } = await import('./services/messaging/index.js');
992
+ setMessageQueueInstance(this.messageQueueService);
993
+ this.logger.info('Starting Crewly server...');
994
+ this.logger.info('Server startup info', {
995
+ pid: process.pid,
996
+ memoryUsageMB: Math.round(process.memoryUsage().heapUsed / 1024 / 1024),
997
+ targetPort: this.config.webPort,
998
+ headless: this.config.headless,
999
+ });
1000
+ // Truncate service.log on startup — it's a raw stdout pipe duplicate of the
1001
+ // daily crewly-YYYY-MM-DD.log files and grows unbounded otherwise.
1002
+ try {
1003
+ const serviceLogPath = path.join(this.config.crewlyHome, 'logs', 'service.log');
1004
+ const { stat, truncate } = await import('fs/promises');
1005
+ const logStat = await stat(serviceLogPath).catch(() => null);
1006
+ if (logStat && logStat.size > 10 * 1024 * 1024) { // truncate if > 10MB
1007
+ await truncate(serviceLogPath, 0);
1008
+ this.logger.info('Truncated service.log on startup', {
1009
+ previousSizeMB: Math.round(logStat.size / 1024 / 1024),
1010
+ });
1011
+ }
1012
+ }
1013
+ catch {
1014
+ // Non-critical
1015
+ }
1016
+ if (this.config.headless) {
1017
+ this.logger.info('Headless mode active: API-only, no frontend serving');
1018
+ }
1019
+ // Check for pending self-improvement (hot-reload recovery)
1020
+ await this.checkPendingSelfImprovement();
1021
+ // Check if port is already in use
1022
+ await this.checkPortAvailability();
1023
+ // Skip tmux initialization since we're using PTY session backend
1024
+ // Note: TmuxService is kept for backward compatibility but PTY is the active backend
1025
+ try {
1026
+ await this.tmuxService.initialize();
1027
+ }
1028
+ catch (error) {
1029
+ // Ignore tmux initialization errors - PTY backend is primary
1030
+ }
1031
+ // Reset orchestrator status to inactive on startup.
1032
+ // The persisted status file may still say "active" from the previous session,
1033
+ // but a fresh app start has no running agent. Without this reset, the UI
1034
+ // would show "Active" for a bare shell that has no Claude running inside it.
1035
+ try {
1036
+ await this.storageService.updateOrchestratorStatus(CREWLY_CONSTANTS.AGENT_STATUSES.INACTIVE);
1037
+ this.logger.info('Reset orchestrator status to inactive on startup');
1038
+ }
1039
+ catch (resetErr) {
1040
+ this.logger.warn('Failed to reset orchestrator status on startup', {
1041
+ error: resetErr instanceof Error ? resetErr.message : String(resetErr),
1042
+ });
1043
+ }
1044
+ // Initialize PTY session backend.
1045
+ // We load persisted session metadata (including Claude session IDs) so that
1046
+ // when agents are re-started, they can resume their previous conversations
1047
+ // using --resume. The actual PTY sessions are NOT restored here — they are
1048
+ // recreated when the user starts teams again.
1049
+ this.logger.info('Initializing PTY session backend...');
1050
+ await getSessionBackend();
1051
+ // Load persisted session metadata for resume-on-restart support
1052
+ try {
1053
+ const persistence = getSessionStatePersistence();
1054
+ const savedState = await persistence.loadState();
1055
+ if (savedState && savedState.sessions.length > 0) {
1056
+ for (const sessionInfo of savedState.sessions) {
1057
+ persistence.registerSession(sessionInfo.name, {
1058
+ cwd: sessionInfo.cwd,
1059
+ command: sessionInfo.command,
1060
+ args: sessionInfo.args,
1061
+ env: sessionInfo.env,
1062
+ }, sessionInfo.runtimeType, sessionInfo.role, sessionInfo.teamId);
1063
+ if (sessionInfo.claudeSessionId) {
1064
+ persistence.updateSessionId(sessionInfo.name, sessionInfo.claudeSessionId);
1065
+ }
1066
+ }
1067
+ this.logger.info('Loaded persisted session metadata for resume support', {
1068
+ count: savedState.sessions.length,
1069
+ sessionsWithResumeId: savedState.sessions.filter(s => s.claudeSessionId).length,
1070
+ });
1071
+ }
1072
+ }
1073
+ catch (loadError) {
1074
+ this.logger.debug('No persisted session state to load (first run or cleared)', {
1075
+ error: loadError instanceof Error ? loadError.message : String(loadError),
1076
+ });
1077
+ }
1078
+ // Initialize Redis cache (non-blocking — falls back to memory if Redis is unavailable)
1079
+ try {
1080
+ const redisConnected = await RedisCacheService.getInstance().connect();
1081
+ this.logger.info('Redis cache initialized', { connected: redisConnected, backend: redisConnected ? 'redis' : 'memory' });
1082
+ }
1083
+ catch (cacheErr) {
1084
+ this.logger.info('Redis cache not available, using in-memory fallback', {
1085
+ error: cacheErr instanceof Error ? cacheErr.message : String(cacheErr),
1086
+ });
1087
+ }
1088
+ // Start message scheduler
1089
+ this.logger.info('Starting message scheduler...');
1090
+ await this.messageSchedulerService.start();
1091
+ // Restore persisted scheduled checks (non-critical — don't block startup)
1092
+ try {
1093
+ this.logger.info('Restoring persisted scheduled checks...');
1094
+ const [recurringRestored, oneTimeRestored] = await Promise.all([
1095
+ this.schedulerService.restoreRecurringChecks(),
1096
+ this.schedulerService.restoreOneTimeChecks(),
1097
+ ]);
1098
+ if (recurringRestored > 0 || oneTimeRestored > 0) {
1099
+ this.logger.info('Restored scheduled checks', { recurringRestored, oneTimeRestored });
1100
+ }
1101
+ }
1102
+ catch (restoreError) {
1103
+ this.logger.warn('Failed to restore scheduled checks (non-critical)', {
1104
+ error: restoreError instanceof Error ? restoreError.message : String(restoreError),
1105
+ });
1106
+ }
1107
+ // Start activity monitoring
1108
+ this.logger.info('Starting activity monitoring...');
1109
+ this.activityMonitorService.startPolling();
1110
+ // Start idle detection for agent suspension
1111
+ this.logger.info('Starting idle detection service...');
1112
+ const idleDetection = IdleDetectionService.getInstance();
1113
+ idleDetection.setAgentRegistrationService(this.apiController.agentRegistrationService);
1114
+ idleDetection.start();
1115
+ // Wire OrchestratorRestartService with dependencies for auto-restart
1116
+ try {
1117
+ const sessionBackend = getSessionBackendSync();
1118
+ if (sessionBackend) {
1119
+ const restartService = OrchestratorRestartService.getInstance();
1120
+ restartService.setDependencies(this.apiController.agentRegistrationService, sessionBackend, this.io);
1121
+ this.logger.info('OrchestratorRestartService wired with dependencies');
1122
+ }
1123
+ }
1124
+ catch (error) {
1125
+ this.logger.warn('Failed to wire OrchestratorRestartService (non-critical)', {
1126
+ error: error instanceof Error ? error.message : String(error),
1127
+ });
1128
+ }
1129
+ // Wire orchestrator-setup service for SlackBridge auto-recovery (B0).
1130
+ // Without this, the bridge's auto-recovery path returns "deps not
1131
+ // initialized" and falls through to the offline branch.
1132
+ try {
1133
+ setOrchestratorSetupDependencies({
1134
+ agentRegistrationService: this.apiController.agentRegistrationService,
1135
+ storageService: this.storageService,
1136
+ });
1137
+ this.logger.info('OrchestratorSetupService wired with dependencies');
1138
+ }
1139
+ catch (error) {
1140
+ this.logger.warn('Failed to wire OrchestratorSetupService (non-critical)', {
1141
+ error: error instanceof Error ? error.message : String(error),
1142
+ });
1143
+ }
1144
+ // Wire and start OrchestratorHeartbeatMonitorService for auto-restart
1145
+ try {
1146
+ const orchHbSessionBackend = getSessionBackendSync();
1147
+ if (orchHbSessionBackend) {
1148
+ const orchHeartbeatMonitor = OrchestratorHeartbeatMonitorService.getInstance();
1149
+ orchHeartbeatMonitor.setDependencies(orchHbSessionBackend, () => this.messageQueueService.hasPending() || this.queueProcessorService.isProcessingMessage());
1150
+ orchHeartbeatMonitor.start();
1151
+ this.logger.info('OrchestratorHeartbeatMonitorService started');
1152
+ }
1153
+ }
1154
+ catch (error) {
1155
+ this.logger.warn('Failed to start OrchestratorHeartbeatMonitorService (non-critical)', {
1156
+ error: error instanceof Error ? error.message : String(error),
1157
+ });
1158
+ }
1159
+ // Wire AgentSuspendService with registration service for rehydration
1160
+ try {
1161
+ AgentSuspendService.getInstance().setDependencies(this.apiController.agentRegistrationService);
1162
+ this.logger.info('AgentSuspendService wired with dependencies');
1163
+ }
1164
+ catch (error) {
1165
+ this.logger.warn('Failed to wire AgentSuspendService (non-critical)', {
1166
+ error: error instanceof Error ? error.message : String(error),
1167
+ });
1168
+ }
1169
+ // Wire and start AgentHeartbeatMonitorService
1170
+ try {
1171
+ const agentHbSessionBackend = getSessionBackendSync();
1172
+ if (agentHbSessionBackend) {
1173
+ const agentHeartbeatMonitor = AgentHeartbeatMonitorService.getInstance();
1174
+ agentHeartbeatMonitor.setDependencies(agentHbSessionBackend, this.apiController.agentRegistrationService, this.storageService);
1175
+ agentHeartbeatMonitor.start();
1176
+ this.logger.info('AgentHeartbeatMonitorService started');
1177
+ }
1178
+ }
1179
+ catch (error) {
1180
+ this.logger.warn('Failed to start AgentHeartbeatMonitorService (non-critical)', {
1181
+ error: error instanceof Error ? error.message : String(error),
1182
+ });
1183
+ }
1184
+ // Wire and start ContextWindowMonitorService
1185
+ try {
1186
+ const ctxSessionBackend = getSessionBackendSync();
1187
+ if (ctxSessionBackend) {
1188
+ const contextWindowMonitor = ContextWindowMonitorService.getInstance();
1189
+ contextWindowMonitor.setDependencies(ctxSessionBackend, this.apiController.agentRegistrationService, this.storageService, this.eventBusService);
1190
+ contextWindowMonitor.start();
1191
+ this.logger.info('ContextWindowMonitorService started');
1192
+ }
1193
+ }
1194
+ catch (error) {
1195
+ this.logger.warn('Failed to start ContextWindowMonitorService (non-critical)', {
1196
+ error: error instanceof Error ? error.message : String(error),
1197
+ });
1198
+ }
1199
+ // Wire OAuthReloginMonitorService EventBus dependency
1200
+ try {
1201
+ OAuthReloginMonitorService.getInstance().setEventBusService(this.eventBusService);
1202
+ }
1203
+ catch (error) {
1204
+ this.logger.warn('Failed to wire OAuthReloginMonitorService EventBus (non-critical)', {
1205
+ error: error instanceof Error ? error.message : String(error),
1206
+ });
1207
+ }
1208
+ // Wire RuntimeExitMonitorService dependencies for task-aware restart
1209
+ try {
1210
+ const runtimeExitMonitor = RuntimeExitMonitorService.getInstance();
1211
+ runtimeExitMonitor.setAgentRegistrationService(this.apiController.agentRegistrationService);
1212
+ runtimeExitMonitor.setEventBusService(this.eventBusService);
1213
+ }
1214
+ catch (error) {
1215
+ this.logger.warn('Failed to wire RuntimeExitMonitorService dependencies (non-critical)', {
1216
+ error: error instanceof Error ? error.message : String(error),
1217
+ });
1218
+ }
1219
+ // Start Crewly in Chrome WebSocket bridge
1220
+ try {
1221
+ const { BrowserBridgeService } = await import('./services/browser/browser-bridge.service.js');
1222
+ const browserBridge = BrowserBridgeService.getInstance();
1223
+ browserBridge.attach(this.httpServer);
1224
+ this.logger.info('Crewly in Chrome WebSocket bridge started');
1225
+ }
1226
+ catch (error) {
1227
+ this.logger.warn('Failed to start Crewly in Chrome bridge (non-critical)', {
1228
+ error: error instanceof Error ? error.message : String(error),
1229
+ });
1230
+ }
1231
+ // Start chat-v2 WebSocket gateway + dispatcher (Phase 1 Chat MVP).
1232
+ // The gateway fans `message`/`presence` frames to subscribers of
1233
+ // `/ws/chat?channelId=...`. The dispatcher pushes user-origin
1234
+ // messages into the bound agent session so it can reply via the
1235
+ // `reply-channel` skill. See chat-v2.gateway.ts for the contract.
1236
+ try {
1237
+ const [{ ChatV2Gateway, devAnonymousTokenVerifier }, { ChatV2DispatcherService }, { ChatV2MentionResolver }, { getChatV2Service }, { setChatV2RealtimeDeps }, { verifyHs256Token },] = await Promise.all([
1238
+ import('./websocket/chat-v2.gateway.js'),
1239
+ import('./services/chat-v2/chat-v2.dispatcher.service.js'),
1240
+ import('./services/chat-v2/chat-v2.mention-resolver.js'),
1241
+ import('./services/chat-v2/chat-v2.singleton.js'),
1242
+ import('./services/chat-v2/chat-v2.realtime-holder.js'),
1243
+ import('./middleware/require-auth.middleware.js'),
1244
+ ]);
1245
+ const chatService = getChatV2Service();
1246
+ const jwtSecret = process.env['CREWLY_JWT_SECRET'];
1247
+ const verifyToken = jwtSecret
1248
+ ? async (token) => {
1249
+ if (!token)
1250
+ return null;
1251
+ const payload = verifyHs256Token(token, jwtSecret);
1252
+ if (!payload?.sub)
1253
+ return null;
1254
+ return { userId: payload.sub };
1255
+ }
1256
+ : devAnonymousTokenVerifier;
1257
+ const chatGateway = new ChatV2Gateway({ service: chatService, verifyToken });
1258
+ chatGateway.attach(this.httpServer);
1259
+ // Phase C BE.3 — inject the mention resolver so type='channel'
1260
+ // messages fan out to @-mentioned recipients instead of
1261
+ // short-circuiting with strategy='skip' at the dispatcher.
1262
+ // Pattern matches LiveTeamHealthDataProvider wiring (~line 487):
1263
+ // `getTeams: async () => StorageService.getInstance().getTeams()`.
1264
+ const chatMentionResolver = new ChatV2MentionResolver({
1265
+ loadTeams: async () => StorageService.getInstance().getTeams(),
1266
+ });
1267
+ const chatDispatcher = new ChatV2DispatcherService({
1268
+ agentSink: this.apiController.agentRegistrationService,
1269
+ mentionResolver: chatMentionResolver,
1270
+ // Phase B-2 — huddle roster lookup. ChatV2Service owns
1271
+ // the chat_channel_members table; the dispatcher just
1272
+ // needs the list of session names for a given channel
1273
+ // to fan-out a user message to every huddle member.
1274
+ huddleMembersFor: (channelId) => chatService.queryHuddleMembersForDispatch(channelId),
1275
+ });
1276
+ this.chatV2Gateway = chatGateway;
1277
+ this.chatV2Dispatcher = chatDispatcher;
1278
+ // The chat-v2 router mounted earlier reads realtime deps from
1279
+ // this holder at request time, so it picks up broadcast +
1280
+ // dispatch without a re-mount.
1281
+ setChatV2RealtimeDeps({ gateway: chatGateway, dispatcher: chatDispatcher });
1282
+ this.logger.info('chat-v2 WebSocket gateway + dispatcher started', {
1283
+ path: '/ws/chat',
1284
+ authMode: jwtSecret ? 'jwt' : 'dev-anonymous',
1285
+ });
1286
+ // LLM-wiki Phase 1 (redesign 2026-05-22): the prior auto-write
1287
+ // subscriber was REMOVED. Steve's direction: agents decide what
1288
+ // is wiki-worthy from inside the conversation and call the
1289
+ // `wiki-queue-add` skill explicitly. No keyword routing, no
1290
+ // blanket "every chat → log.md." See WikiQueueService for the
1291
+ // queue + the orchestrator system prompt for the agent rule.
1292
+ // Cloud Portal relay bridge — gives the Crewly Portal at
1293
+ // crewlyai.com the same /agents experience by tunnelling chat-v2
1294
+ // RPC calls through the Cloud relay queue + forwarding gateway
1295
+ // broadcasts as `chat_event` messages. Only wired when Cloud Sync
1296
+ // is running (BrowserRelayAdapter pattern).
1297
+ try {
1298
+ const { ChatV2RelayAdapter } = await import('./services/chat-v2/chat-v2.relay-adapter.service.js');
1299
+ const { CloudSyncService } = await import('./services/cloud/cloud-sync.service.js');
1300
+ const { createOssAgentDirectoryProvider, createOssAgentPresenceProvider, } = await import('./services/chat-v2/chat-v2.providers.js');
1301
+ const sync = CloudSyncService.getInstance();
1302
+ if (sync) {
1303
+ const chatRelayAdapter = new ChatV2RelayAdapter({
1304
+ service: chatService,
1305
+ gateway: chatGateway,
1306
+ cloudSync: sync,
1307
+ // Wire the dispatcher so Portal-sent user messages also fire the
1308
+ // agent-side prompt (parity with the HTTP controller path).
1309
+ // Without this, Portal user-messages persist but the bound agent
1310
+ // never receives the `[CHAT:<id>]` prompt — orc/etc. stay silent.
1311
+ dispatcher: chatDispatcher,
1312
+ directory: createOssAgentDirectoryProvider(this.storageService),
1313
+ presence: createOssAgentPresenceProvider(this.storageService),
1314
+ });
1315
+ chatRelayAdapter.start();
1316
+ this.logger.info('ChatV2RelayAdapter started — Cloud Portal can now drive chat-v2 via relay');
1317
+ }
1318
+ }
1319
+ catch (err) {
1320
+ // Adapter wiring failure is non-fatal — local OSS UI still works.
1321
+ this.logger.warn('ChatV2RelayAdapter wiring skipped', {
1322
+ error: err instanceof Error ? err.message : String(err),
1323
+ });
1324
+ }
1325
+ // Onboarding v3 (B1) — wire the cold-start detector with the
1326
+ // chat-v2 service we just stood up. The orc bootstrap path
1327
+ // (CrewlyAgentExternalRuntimeService.detectOnboardingMode) probes this
1328
+ // singleton; null means "skip the cold-start probe", so this
1329
+ // wiring is what flips onboarding mode on for the demo path.
1330
+ try {
1331
+ const { OnboardingBootstrapService, setOnboardingBootstrapService } = await import('./services/orchestrator/onboarding-bootstrap.service.js');
1332
+ setOnboardingBootstrapService(new OnboardingBootstrapService({
1333
+ storage: this.storageService,
1334
+ chat: { countAllMessages: () => chatService.countAllMessages() },
1335
+ }));
1336
+ this.logger.info('OnboardingBootstrapService wired with storage + chat probes');
1337
+ }
1338
+ catch (wireErr) {
1339
+ this.logger.warn('Failed to wire OnboardingBootstrapService (non-critical)', {
1340
+ error: wireErr instanceof Error ? wireErr.message : String(wireErr),
1341
+ });
1342
+ }
1343
+ }
1344
+ catch (error) {
1345
+ // F-CYCLE7-1: a native-binding failure (e.g. better-sqlite3 built
1346
+ // for the wrong arch) MUST crash the boot rather than be downgraded
1347
+ // to a JSON-file fallback. The audit on 2026-05-07 caught this
1348
+ // exact path: chat.db went stale at 11:17Z because dlopen errors
1349
+ // were swallowed here as "non-critical", so operators had no signal
1350
+ // to run `npm rebuild better-sqlite3 --build-from-source`.
1351
+ //
1352
+ // `isNativeBindingFatalError` matches structurally (not just via
1353
+ // instanceof) so realm-boundary cases — same module loaded via
1354
+ // two require paths — still trip the rethrow.
1355
+ if (isNativeBindingFatalError(error)) {
1356
+ this.logger.error('FATAL native binding failed at chat-v2 boot — refusing to downgrade to JSON fallback. Run the printed remediation and restart.', { error: error.message });
1357
+ throw error;
1358
+ }
1359
+ this.logger.warn('Failed to start chat-v2 WS gateway (non-critical)', {
1360
+ error: error instanceof Error ? error.message : String(error),
1361
+ });
1362
+ }
1363
+ // Connect BrowserProxyService to Cloud Relay (lazy — does not block startup)
1364
+ try {
1365
+ const { BrowserProxyService } = await import('./services/browser/browser-proxy.service.js');
1366
+ const { CloudClientService } = await import('./services/cloud/cloud-client.service.js');
1367
+ const cloudClient = CloudClientService.getInstance();
1368
+ const browserProxy = BrowserProxyService.getInstance();
1369
+ // Wire up token resolver so reconnects always use the freshest JWT
1370
+ // Reconnects must use the freshest RELAY token (NOT the access token,
1371
+ // per the RELAY-TOKEN-TYPE invariant). The relay only accepts a relay-
1372
+ // signed access JWT; the access token churns the socket.
1373
+ browserProxy.setTokenResolver(() => cloudClient.getRelayToken());
1374
+ // Subscribe to RELAY-token refresh events (distinct channel from the
1375
+ // access-token refresh) so the proxy re-registers in place with the
1376
+ // fresh relay token before its exp.
1377
+ cloudClient.onRelayTokenRefresh((newRelayToken) => {
1378
+ browserProxy.updateToken(newRelayToken);
1379
+ });
1380
+ const relayToken = cloudClient.getRelayToken();
1381
+ if (relayToken) {
1382
+ browserProxy.connect(relayToken);
1383
+ this.logger.info('BrowserProxyService connecting to Cloud Relay');
1384
+ }
1385
+ else {
1386
+ // No relay token yet (connectLocal mints it asynchronously). Defer
1387
+ // connect to the onRelayTokenRefresh callback above rather than
1388
+ // connecting with the wrong (access) token.
1389
+ this.logger.debug('BrowserProxyService deferred — no relay token yet, will connect on relay-token refresh');
1390
+ }
1391
+ }
1392
+ catch (error) {
1393
+ this.logger.warn('Failed to initialize BrowserProxyService (non-critical)', {
1394
+ error: error instanceof Error ? error.message : String(error),
1395
+ });
1396
+ }
1397
+ // Start team activity WebSocket service
1398
+ this.logger.info('Starting team activity WebSocket service...');
1399
+ this.teamActivityWebSocketService.start();
1400
+ // Start teams.json file watcher for real-time updates
1401
+ this.logger.info('Starting teams.json file watcher...');
1402
+ this.teamsJsonWatcherService.start();
1403
+ this.logger.info('Teams.json file watcher started for real-time updates');
1404
+ // Generate orchestrator skills catalog
1405
+ try {
1406
+ const skillCatalogProjectRoot = findPackageRoot(__dirname);
1407
+ const catalogService = SkillCatalogService.getInstance(skillCatalogProjectRoot);
1408
+ const catalogResult = await catalogService.generateCatalog();
1409
+ this.logger.info('Orchestrator skills catalog generated', {
1410
+ catalogPath: catalogResult.catalogPath,
1411
+ skillCount: catalogResult.skillCount,
1412
+ });
1413
+ const agentCatalogResult = await catalogService.generateAgentCatalog();
1414
+ this.logger.info('Agent skills catalog generated', {
1415
+ catalogPath: agentCatalogResult.catalogPath,
1416
+ skillCount: agentCatalogResult.skillCount,
1417
+ });
1418
+ }
1419
+ catch (error) {
1420
+ this.logger.warn('Failed to generate skills catalog (non-critical)', {
1421
+ error: error instanceof Error ? error.message : String(error),
1422
+ });
1423
+ }
1424
+ // Restore persisted message queue state (pending messages survive restarts)
1425
+ this.logger.info('Loading persisted message queue state...');
1426
+ try {
1427
+ await this.messageQueueService.loadPersistedState();
1428
+ const queueStatus = this.messageQueueService.getStatus();
1429
+ if (queueStatus.pendingCount > 0) {
1430
+ this.logger.info('Restored pending messages from previous session', {
1431
+ pendingCount: queueStatus.pendingCount,
1432
+ });
1433
+ }
1434
+ }
1435
+ catch (error) {
1436
+ this.logger.warn('Failed to load persisted queue state', {
1437
+ error: error instanceof Error ? error.message : String(error),
1438
+ });
1439
+ }
1440
+ // Load thread status queue from disk so replay can check terminal statuses
1441
+ try {
1442
+ await this.threadStatusQueueService.loadPersistedState();
1443
+ }
1444
+ catch (err) {
1445
+ this.logger.warn('Failed to load thread status queue state (non-critical)', {
1446
+ error: err instanceof Error ? err.message : String(err),
1447
+ });
1448
+ }
1449
+ // Backfill: mark Slack threads for done Requests as terminal so the
1450
+ // resume notification won't re-send already-answered conversations.
1451
+ try {
1452
+ const { RequestService } = await import('./services/v3/request.service.js');
1453
+ const { extractSlackChannelId, extractSlackThreadTs } = await import('./services/v3/request-sla.subscriber.js');
1454
+ const reqSvc = RequestService.getInstance();
1455
+ const allReqs = await reqSvc.listAll();
1456
+ let backfilled = 0;
1457
+ for (const req of allReqs) {
1458
+ if (req.status !== 'done')
1459
+ continue;
1460
+ const scid = req.sourceConversationItemId || '';
1461
+ // `extractSlack*` strips the optional `-msg-{ts}` thread-reply
1462
+ // suffix before parsing, so both top-level and in-thread
1463
+ // Requests resolve to the canonical `{channelId}:{threadRoot}`.
1464
+ // Previously a local regex was used here and its greedy `.+`
1465
+ // swallowed the suffix, producing a malformed threadKey that
1466
+ // missed the dedup check and bloated the persistence file.
1467
+ const channelId = extractSlackChannelId(scid);
1468
+ const threadTs = extractSlackThreadTs(scid);
1469
+ if (!channelId || !threadTs)
1470
+ continue;
1471
+ const threadKey = `${channelId}:${threadTs}`;
1472
+ if (this.threadStatusQueueService.get(threadKey))
1473
+ continue;
1474
+ this.threadStatusQueueService.trackInbound({
1475
+ threadKey,
1476
+ conversationId: scid,
1477
+ source: 'slack',
1478
+ messagePreview: req.title.slice(0, 200),
1479
+ });
1480
+ this.threadStatusQueueService.markReplied(threadKey, 'replied_completed');
1481
+ backfilled++;
1482
+ }
1483
+ if (backfilled > 0) {
1484
+ this.logger.info('Backfilled thread status for done Requests', { count: backfilled });
1485
+ }
1486
+ }
1487
+ catch (err) {
1488
+ this.logger.warn('Thread status backfill failed (non-critical)', {
1489
+ error: err instanceof Error ? err.message : String(err),
1490
+ });
1491
+ }
1492
+ // #247: Replay pending messages that arrived while the orchestrator was offline.
1493
+ // This must happen after loadPersistedState() (so we know what's already queued)
1494
+ // but before the queue processor starts (so replayed messages are ready for delivery).
1495
+ try {
1496
+ const { MessageReplayService } = await import('./services/messaging/message-replay.service.js');
1497
+ const replayService = new MessageReplayService(this.messageQueueService, this.config.crewlyHome);
1498
+ const replayResult = await replayService.replayPendingMessages();
1499
+ if (replayResult.replayedCount > 0) {
1500
+ this.logger.info('Replayed pending messages from offline period (#247)', {
1501
+ replayed: replayResult.replayedCount,
1502
+ found: replayResult.foundCount,
1503
+ skipped: replayResult.skippedDuplicate,
1504
+ offlineSince: replayResult.offlineSince,
1505
+ offlineDurationMs: replayResult.offlineDurationMs,
1506
+ });
1507
+ }
1508
+ }
1509
+ catch (replayErr) {
1510
+ this.logger.warn('Failed to replay pending messages (non-critical)', {
1511
+ error: replayErr instanceof Error ? replayErr.message : String(replayErr),
1512
+ });
1513
+ }
1514
+ // Start message queue processor
1515
+ this.logger.info('Starting message queue processor...');
1516
+ this.queueProcessorService.start();
1517
+ // Thread Status Queue: load persisted state and recover pending threads
1518
+ try {
1519
+ const recoveryResult = await this.threadStatusQueueService.recoverPendingThreads(this.messageQueueService, {
1520
+ agentStatusChecker: {
1521
+ getAgentWorkingStatus: async (sessionName) => {
1522
+ const status = await this.activityMonitorService.getWorkingStatusForSession(sessionName);
1523
+ if (status === null)
1524
+ return 'unknown';
1525
+ return status;
1526
+ },
1527
+ },
1528
+ });
1529
+ if (recoveryResult.reEnqueued > 0 || recoveryResult.followUpRestored > 0 || recoveryResult.delegationsCompleted > 0) {
1530
+ this.logger.info('Thread status queue recovery complete', recoveryResult);
1531
+ }
1532
+ if (recoveryResult.expired > 0 || recoveryResult.cleaned > 0) {
1533
+ this.logger.info('Thread status queue maintenance', {
1534
+ expired: recoveryResult.expired,
1535
+ cleaned: recoveryResult.cleaned,
1536
+ });
1537
+ }
1538
+ }
1539
+ catch (err) {
1540
+ this.logger.warn('Thread status queue recovery failed (non-critical)', {
1541
+ error: err instanceof Error ? err.message : String(err),
1542
+ });
1543
+ }
1544
+ // #286: Start cron task service with agent status/start callbacks
1545
+ try {
1546
+ const cronTaskService = CronTaskService.getInstance();
1547
+ const storageRef = this.storageService;
1548
+ const registrationRef = this.apiController.agentRegistrationService;
1549
+ cronTaskService.setExecutionCallback(async (task) => {
1550
+ this.logger.info('Executing cron task', { id: task.id, target: task.targetAgent });
1551
+ await registrationRef.sendMessageToAgent(task.targetAgent, `[CRON_TASK:${task.id}] ${task.taskDescription}`);
1552
+ });
1553
+ // Issue #307: cron tasks created with `targetTeamId` set to a
1554
+ // name slug (e.g. "stock-ops-team") instead of the UUID would
1555
+ // silently 404 on every fire — `teams.find(t => t.id === teamId)`
1556
+ // returned undefined and both callbacks returned `false` with
1557
+ // no log surface. `resolveTeamByIdOrSlug` (imported statically
1558
+ // at the top of the file) tries UUID first, then falls back
1559
+ // to a slug match against `name`. Misses now surface a distinct
1560
+ // warn-log with the available slugs so the cause is visible
1561
+ // instead of hiding behind the generic "agent offline" warn
1562
+ // from cron-task.service.
1563
+ cronTaskService.setAgentStatusCallback(async (sessionName, teamId) => {
1564
+ // Handle orchestrator separately — it's not in regular teams
1565
+ if (sessionName === CREWLY_CONSTANTS.SESSIONS.ORCHESTRATOR_NAME || teamId === 'orchestrator') {
1566
+ const orchStatus = await storageRef.getOrchestratorStatus();
1567
+ return orchStatus?.agentStatus === 'active' || orchStatus?.agentStatus === 'started';
1568
+ }
1569
+ const teams = await storageRef.getTeams();
1570
+ const team = resolveTeamByIdOrSlug(teams, teamId);
1571
+ if (!team) {
1572
+ this.logger.warn('CronTask: targetTeamId resolves to no team', {
1573
+ sessionName,
1574
+ targetTeamId: teamId,
1575
+ availableSlugs: teams.slice(0, 10).map((t) => slugifyTeamName(t.name)),
1576
+ hint: 'Set targetTeamId to either the team UUID or one of availableSlugs (lowercase, spaces→-)',
1577
+ });
1578
+ return false;
1579
+ }
1580
+ const member = team.members.find((m) => m.sessionName === sessionName);
1581
+ if (!member)
1582
+ return false;
1583
+ // #286 Root Cause C: treat both 'active' and 'started' as online
1584
+ return member.agentStatus === 'active' || member.agentStatus === 'started';
1585
+ });
1586
+ cronTaskService.setAgentStartCallback(async (sessionName, teamId) => {
1587
+ try {
1588
+ const teams = await storageRef.getTeams();
1589
+ const team = resolveTeamByIdOrSlug(teams, teamId);
1590
+ if (!team) {
1591
+ this.logger.warn('CronTask auto-start: targetTeamId resolves to no team', {
1592
+ sessionName,
1593
+ targetTeamId: teamId,
1594
+ availableSlugs: teams.slice(0, 10).map((t) => slugifyTeamName(t.name)),
1595
+ hint: 'Set targetTeamId to either the team UUID or one of availableSlugs (lowercase, spaces→-)',
1596
+ });
1597
+ return false;
1598
+ }
1599
+ const member = team.members.find((m) => m.sessionName === sessionName);
1600
+ if (!member)
1601
+ return false;
1602
+ await registrationRef.createAgentSession({
1603
+ sessionName: member.sessionName,
1604
+ role: member.role,
1605
+ // Use the resolved team's UUID — not the user-supplied identifier
1606
+ // — so downstream agent-registration always sees the canonical id.
1607
+ teamId: team.id,
1608
+ memberId: member.id,
1609
+ });
1610
+ return true;
1611
+ }
1612
+ catch {
1613
+ return false;
1614
+ }
1615
+ });
1616
+ // Self-heal stale nextRunAt values from pre-timezone-fix versions
1617
+ await cronTaskService.recalculateAllNextRunTimes();
1618
+ cronTaskService.start();
1619
+ this.logger.info('CronTaskService started');
1620
+ }
1621
+ catch (cronErr) {
1622
+ this.logger.warn('CronTaskService initialization failed (non-critical)', {
1623
+ error: cronErr instanceof Error ? cronErr.message : String(cronErr),
1624
+ });
1625
+ }
1626
+ // Start TriggerEngine (V3 unified trigger system) and wire action handler
1627
+ try {
1628
+ const { TriggerEngine } = await import('./services/v3/trigger-engine.service.js');
1629
+ const { TaskProjectionService } = await import('./services/v3/task-projection.service.js');
1630
+ const triggerEngine = TriggerEngine.getInstance();
1631
+ const taskProjection = TaskProjectionService.getInstance();
1632
+ // Load TaskProjection state from disk
1633
+ await taskProjection.load();
1634
+ // Wire EventBus so signal triggers can subscribe to events
1635
+ triggerEngine.setEventBus(this.eventBusService);
1636
+ // Wire action handler — executes the effect when a trigger fires
1637
+ triggerEngine.setActionHandler(async (trigger, action) => {
1638
+ const triggerId = trigger.id;
1639
+ const logger = this.logger;
1640
+ // 1. sendMessage — enqueue a message to the orchestrator session
1641
+ if (action.sendMessage) {
1642
+ const { target, message } = action.sendMessage;
1643
+ try {
1644
+ // Format with trigger context so Agent knows why it was woken
1645
+ const formattedContent = `[SYSTEM ALERT] Trigger '${triggerId}' says: ${message}`;
1646
+ this.messageQueueService.enqueue({
1647
+ content: formattedContent,
1648
+ conversationId: target || 'system',
1649
+ source: 'system_event',
1650
+ });
1651
+ logger.info('TriggerEngine: sendMessage enqueued', { triggerId, target });
1652
+ }
1653
+ catch (err) {
1654
+ logger.warn('TriggerEngine: sendMessage failed', {
1655
+ triggerId,
1656
+ error: err instanceof Error ? err.message : String(err),
1657
+ });
1658
+ }
1659
+ }
1660
+ // 2. createWorkItem — push a new WorkItem into the task pool
1661
+ if (action.createWorkItem) {
1662
+ try {
1663
+ const { TaskPoolService } = await import('./services/task-pool/task-pool.service.js');
1664
+ const { createWorkItem } = await import('./types/v2/work-item.types.js');
1665
+ const template = action.createWorkItem;
1666
+ const workItem = createWorkItem({
1667
+ title: template.title || `Triggered task (${trigger.id})`,
1668
+ description: template.description || `Auto-created by trigger ${trigger.id}`,
1669
+ type: template.type ?? 'delegate',
1670
+ owner: template.owner ?? 'orchestrator',
1671
+ target: template.target,
1672
+ triggerId,
1673
+ requestId: template.requestId,
1674
+ });
1675
+ await TaskPoolService.getInstance().addToPool(workItem);
1676
+ // Project as a trigger_action TaskRecord
1677
+ await taskProjection.createRecord({
1678
+ title: workItem.title,
1679
+ type: 'trigger_action',
1680
+ ownerAgent: 'system',
1681
+ triggerId,
1682
+ workItemId: workItem.id,
1683
+ });
1684
+ logger.info('TriggerEngine: WorkItem enqueued', { triggerId, workItemId: workItem.id });
1685
+ }
1686
+ catch (err) {
1687
+ logger.warn('TriggerEngine: createWorkItem failed', {
1688
+ triggerId,
1689
+ error: err instanceof Error ? err.message : String(err),
1690
+ });
1691
+ }
1692
+ }
1693
+ // 3. wakeWorkItemId — re-queue a suspended/blocked WorkItem with context note
1694
+ if (action.wakeWorkItemId) {
1695
+ try {
1696
+ const { TaskPoolService } = await import('./services/task-pool/task-pool.service.js');
1697
+ const taskPool = TaskPoolService.getInstance();
1698
+ // Append system note to description so Agent knows why it was woken
1699
+ const wakeNote = `\n\n[SYSTEM NOTE] Woken automatically by Trigger '${triggerId}' at ${new Date().toISOString()}.`;
1700
+ await taskPool.updateItemStatus(action.wakeWorkItemId, 'queued');
1701
+ // Append wake reason to WorkItem description via storage
1702
+ try {
1703
+ const item = (await taskPool.getAllItems()).find(wi => wi.id === action.wakeWorkItemId);
1704
+ if (item) {
1705
+ await taskPool.updateTokenUsage(action.wakeWorkItemId, item.inputTokens, item.outputTokens, item.cost);
1706
+ // We use the storage directly through the service — patch description via a minimal re-add isn't feasible,
1707
+ // so we write the note to the task projection instead:
1708
+ await taskProjection.createRecord({
1709
+ title: `[TRIGGER WAKE] ${item.title}`,
1710
+ type: 'trigger_action',
1711
+ ownerAgent: 'system',
1712
+ triggerId,
1713
+ workItemId: item.id,
1714
+ requestId: item.requestId,
1715
+ });
1716
+ }
1717
+ }
1718
+ catch { /* non-critical */ }
1719
+ logger.info('TriggerEngine: WorkItem woken', { triggerId, workItemId: action.wakeWorkItemId, note: wakeNote });
1720
+ }
1721
+ catch (err) {
1722
+ logger.warn('TriggerEngine: wakeWorkItemId failed', {
1723
+ triggerId,
1724
+ workItemId: action.wakeWorkItemId,
1725
+ error: err instanceof Error ? err.message : String(err),
1726
+ });
1727
+ }
1728
+ }
1729
+ // 4. runReconciler — trigger a targeted reconciliation cycle
1730
+ if (action.runReconciler && this.reconcilerService) {
1731
+ try {
1732
+ await this.reconcilerService.runFull();
1733
+ logger.info('TriggerEngine: Reconciler run triggered', { triggerId });
1734
+ }
1735
+ catch (err) {
1736
+ logger.warn('TriggerEngine: runReconciler failed', {
1737
+ triggerId,
1738
+ error: err instanceof Error ? err.message : String(err),
1739
+ });
1740
+ }
1741
+ }
1742
+ });
1743
+ await triggerEngine.start();
1744
+ this.logger.info('TriggerEngine started with action handler wired');
1745
+ // Wire team-scoped triggers: reconcile declarative Team.triggers spec
1746
+ // against the running engine on every team save, and cancel all of a
1747
+ // team's triggers when it's deleted. Listener is fire-and-forget.
1748
+ try {
1749
+ const { TeamTriggerReconciler } = await import('./services/v3/team-trigger-reconciler.service.js');
1750
+ const reconciler = new TeamTriggerReconciler(triggerEngine);
1751
+ // Initial converge: reconcile every team that already exists on disk.
1752
+ const existingTeams = await this.storageService.getTeams();
1753
+ for (const team of existingTeams) {
1754
+ try {
1755
+ await reconciler.reconcile(team);
1756
+ }
1757
+ catch (recErr) {
1758
+ this.logger.warn('Initial team-trigger reconcile failed', {
1759
+ teamId: team.id,
1760
+ error: recErr instanceof Error ? recErr.message : String(recErr),
1761
+ });
1762
+ }
1763
+ }
1764
+ // Ongoing: subscribe to storage events.
1765
+ this.storageService.onStorageEvent(async (event) => {
1766
+ if (event.kind === 'team-saved') {
1767
+ await reconciler.reconcile(event.team);
1768
+ }
1769
+ else if (event.kind === 'team-deleted') {
1770
+ await reconciler.unregisterAll(event.teamId);
1771
+ }
1772
+ });
1773
+ this.logger.info('TeamTriggerReconciler subscribed to storage events', {
1774
+ initialTeams: existingTeams.length,
1775
+ });
1776
+ }
1777
+ catch (recErr) {
1778
+ this.logger.warn('TeamTriggerReconciler wiring failed (non-critical)', {
1779
+ error: recErr instanceof Error ? recErr.message : String(recErr),
1780
+ });
1781
+ }
1782
+ }
1783
+ catch (triggerErr) {
1784
+ this.logger.warn('TriggerEngine initialization failed (non-critical)', {
1785
+ error: triggerErr instanceof Error ? triggerErr.message : String(triggerErr),
1786
+ });
1787
+ }
1788
+ // Start V3DataService — listens for v3:task_delegated / v3:task_completed events
1789
+ // and links WorkItems to their parent Requests via requestService.linkWorkItem().
1790
+ // Must be initialized after EventBusService is ready.
1791
+ try {
1792
+ const { V3DataService } = await import('./services/v3/v3-data.service.js');
1793
+ new V3DataService(this.eventBusService, process.cwd());
1794
+ this.logger.info('V3DataService started — WorkItem↔Request linking active');
1795
+ }
1796
+ catch (v3Err) {
1797
+ this.logger.warn('V3DataService initialization failed (non-critical)', {
1798
+ error: v3Err instanceof Error ? v3Err.message : String(v3Err),
1799
+ });
1800
+ }
1801
+ // Start WorkItemDispatchSubscriber FIRST — AgentAutoClaim's recovery
1802
+ // path delegates to its dispatchTo() for the "active target session"
1803
+ // branch, so the singleton must be reachable when recovery fires.
1804
+ try {
1805
+ const { WorkItemDispatchSubscriber } = await import('./services/v3/workitem-dispatch.subscriber.js');
1806
+ const dispatchSubscriber = WorkItemDispatchSubscriber.getInstance();
1807
+ dispatchSubscriber.initialize(this.eventBusService);
1808
+ dispatchSubscriber.start();
1809
+ this.logger.info('WorkItemDispatchSubscriber started — workitem:queued events push to target sessions');
1810
+ }
1811
+ catch (dispatchErr) {
1812
+ this.logger.warn('WorkItemDispatchSubscriber initialization failed (non-critical)', {
1813
+ error: dispatchErr instanceof Error ? dispatchErr.message : String(dispatchErr),
1814
+ });
1815
+ }
1816
+ // Start AgentAutoClaimService — auto-assign work to idle agents
1817
+ try {
1818
+ const { AgentAutoClaimService } = await import('./services/v3/agent-auto-claim.service.js');
1819
+ const autoClaimService = AgentAutoClaimService.getInstance();
1820
+ autoClaimService.initialize(this.eventBusService);
1821
+ await autoClaimService.start();
1822
+ this.logger.info('AgentAutoClaimService started — idle agents will auto-claim work');
1823
+ }
1824
+ catch (autoClaimErr) {
1825
+ this.logger.warn('AgentAutoClaimService initialization failed (non-critical)', {
1826
+ error: autoClaimErr instanceof Error ? autoClaimErr.message : String(autoClaimErr),
1827
+ });
1828
+ }
1829
+ // Start TLAutoVerifyService — auto-trigger TL verification on worker task completion
1830
+ try {
1831
+ const { TLAutoVerifyService } = await import('./services/v3/tl-auto-verify.service.js');
1832
+ const tlVerifyService = TLAutoVerifyService.getInstance();
1833
+ tlVerifyService.initialize(this.eventBusService);
1834
+ tlVerifyService.start();
1835
+ this.logger.info('TLAutoVerifyService started — worker completions trigger TL verification');
1836
+ }
1837
+ catch (tlVerifyErr) {
1838
+ this.logger.warn('TLAutoVerifyService initialization failed (non-critical)', {
1839
+ error: tlVerifyErr instanceof Error ? tlVerifyErr.message : String(tlVerifyErr),
1840
+ });
1841
+ }
1842
+ // Initialize MissionExecutorService — Mission lifecycle + decomposition processing
1843
+ try {
1844
+ const { MissionExecutorService } = await import('./services/v3/mission-executor.service.js');
1845
+ MissionExecutorService.getInstance();
1846
+ this.logger.info('MissionExecutorService initialized — Mission decomposition + progress tracking ready');
1847
+ }
1848
+ catch (missionErr) {
1849
+ this.logger.warn('MissionExecutorService initialization failed (non-critical)', {
1850
+ error: missionErr instanceof Error ? missionErr.message : String(missionErr),
1851
+ });
1852
+ }
1853
+ // Start marketplace auto-update (check registry every 6 hours)
1854
+ try {
1855
+ const { startAutoUpdate } = await import('./services/marketplace/marketplace-auto-update.service.js');
1856
+ startAutoUpdate();
1857
+ }
1858
+ catch (autoUpdateErr) {
1859
+ this.logger.warn('Marketplace auto-update startup failed (non-fatal)', {
1860
+ error: autoUpdateErr instanceof Error ? autoUpdateErr.message : String(autoUpdateErr),
1861
+ });
1862
+ }
1863
+ // Start Slack image cleanup (download temp files)
1864
+ try {
1865
+ const { getSlackImageService: getImgService } = await import('./services/slack/slack-image.service.js');
1866
+ const imgService = getImgService();
1867
+ await imgService.cleanupOnStartup();
1868
+ imgService.startCleanup();
1869
+ }
1870
+ catch (err) {
1871
+ this.logger.warn('Failed to initialize Slack image service', {
1872
+ error: err instanceof Error ? err.message : String(err),
1873
+ });
1874
+ }
1875
+ // Initialize Slack if configured
1876
+ await this.initializeSlackIfConfigured();
1877
+ // Initialize WhatsApp if configured
1878
+ await this.initializeWhatsAppIfConfigured();
1879
+ // Initialize Google Chat if saved credentials exist
1880
+ await this.initializeGoogleChatIfConfigured();
1881
+ // Initialize Telegram if configured
1882
+ await this.initializeTelegramIfConfigured();
1883
+ // Restore Cloud connection from persisted config (non-blocking)
1884
+ initializeCloudIfConfigured().catch((err) => {
1885
+ this.logger.warn('Cloud initialization failed (non-fatal)', {
1886
+ error: err instanceof Error ? err.message : String(err),
1887
+ });
1888
+ });
1889
+ // Start NOTIFY reconciliation service (retries failed Slack deliveries)
1890
+ this.notifyReconciliationService = new NotifyReconciliationService();
1891
+ this.notifyReconciliationService.start();
1892
+ // Start system resource alert monitoring (proactive disk/memory/CPU alerts)
1893
+ this.systemResourceAlertService.startMonitoring();
1894
+ // Fire-and-forget background version check (populates cache for /health)
1895
+ VersionCheckService.getInstance().checkForUpdate().catch(() => {
1896
+ // Silently ignore — version check is non-critical
1897
+ });
1898
+ // V3-only as of spec 2026-05-06-task-management-v1-deprecation.md.
1899
+ // The legacy `TaskTrackingService.startAutoSync()` is gone — V3
1900
+ // task-pool reconciler owns lifecycle cleanup now.
1901
+ // Initialize token usage tracking: load persisted data and start periodic flush
1902
+ try {
1903
+ const tokenUsageService = TokenUsageService.getInstance();
1904
+ await tokenUsageService.loadFromDisk();
1905
+ tokenUsageService.startPeriodicFlush();
1906
+ // Sync Claude Code session JSONL files → TokenUsageService
1907
+ // so the Usage dashboard has data from claude-code runtime agents
1908
+ const { syncSessionsToTokenUsageService } = await import('./services/monitoring/claude-session-tokens.service.js');
1909
+ const synced = await syncSessionsToTokenUsageService(this.config.crewlyHome, 7);
1910
+ this.logger.info('Token usage tracking initialized', { syncedClaudeSessions: synced });
1911
+ }
1912
+ catch (tokenErr) {
1913
+ this.logger.warn('Token usage initialization failed (non-fatal)', {
1914
+ error: tokenErr instanceof Error ? tokenErr.message : String(tokenErr),
1915
+ });
1916
+ }
1917
+ // Start Reconciler: run initial full reconcile and start loops
1918
+ if (this.reconcilerService) {
1919
+ try {
1920
+ this.logger.info('Running initial full reconciliation...');
1921
+ const initialResult = await this.reconcilerService.runFull();
1922
+ this.logger.info('Initial reconciliation complete', {
1923
+ durationMs: initialResult.durationMs,
1924
+ corrections: initialResult.corrections.length,
1925
+ errors: initialResult.errors.length,
1926
+ });
1927
+ this.reconcilerService.start();
1928
+ this.logger.info('Reconciler loops started (fast: 10s, full: 60s)');
1929
+ }
1930
+ catch (reconcilerErr) {
1931
+ this.logger.warn('Reconciler startup failed (non-fatal)', {
1932
+ error: reconcilerErr instanceof Error ? reconcilerErr.message : String(reconcilerErr),
1933
+ });
1934
+ }
1935
+ }
1936
+ // C1 — boot-time state invariant check (Persistence P0 spec).
1937
+ // Refuses to start serving traffic if the live teams directory
1938
+ // is empty but a healthy backup snapshot exists. Override via
1939
+ // CREWLY_FORCE_EMPTY_BOOT=1 for legitimate fresh-install / reset.
1940
+ try {
1941
+ await this.storageService.verifyStateInvariantOnBoot();
1942
+ }
1943
+ catch (invariantErr) {
1944
+ const { StateInvariantViolation } = await import('./services/core/state-invariant.types.js');
1945
+ if (invariantErr instanceof StateInvariantViolation) {
1946
+ this.logger.error('Boot aborted by state invariant check — refusing to serve traffic with wiped state', {
1947
+ currentTeamCount: invariantErr.currentTeamCount,
1948
+ backupTeamCount: invariantErr.backupTeamCount,
1949
+ backupTimestamp: invariantErr.backupTimestamp,
1950
+ message: invariantErr.message,
1951
+ });
1952
+ }
1953
+ throw invariantErr;
1954
+ }
1955
+ // Start HTTP server with enhanced error handling
1956
+ await this.startHttpServer();
1957
+ // Load addons from ~/.crewly/addons/ (Pro features, extensions, etc.)
1958
+ try {
1959
+ const addonLoader = AddonLoaderService.getInstance();
1960
+ const loadedAddons = await addonLoader.loadAddons(this.app, this.httpServer);
1961
+ if (loadedAddons.length > 0) {
1962
+ this.logger.info('Addons loaded successfully', { addons: loadedAddons });
1963
+ }
1964
+ }
1965
+ catch (addonErr) {
1966
+ this.logger.warn('Addon loading encountered an error (non-fatal)', {
1967
+ error: addonErr instanceof Error ? addonErr.message : String(addonErr),
1968
+ });
1969
+ }
1970
+ // Register cleanup handlers
1971
+ this.registerSignalHandlers();
1972
+ // Start health monitoring
1973
+ this.startHealthMonitoring();
1974
+ // Auto-start orchestrator if enabled in settings
1975
+ await this.autoStartOrchestratorIfEnabled();
1976
+ // Auto-restore agent sessions that were running before the last shutdown
1977
+ await this.autoRestoreAgentSessionsIfEnabled();
1978
+ // #166: Auto-recover in-progress tasks after restart.
1979
+ // #196: Skip tasks older than 1 hour to avoid re-sending stale work.
1980
+ // V3-only as of spec 2026-05-06-task-management-v1-deprecation.md —
1981
+ // reads WorkItems from TaskPoolService (replaces the prior
1982
+ // `TaskTrackingService.getAllInProgressTasks()` call).
1983
+ try {
1984
+ const TASK_RECOVERY_MAX_AGE_MS = 60 * 60 * 1000; // 1 hour
1985
+ const { TaskPoolService } = await import('./services/task-pool/task-pool.service.js');
1986
+ const allItems = await TaskPoolService.getInstance().getAllItems();
1987
+ const now = Date.now();
1988
+ const activeTasks = allItems.filter(wi => {
1989
+ if (wi.status !== 'queued' && wi.status !== 'accepted' && wi.status !== 'running')
1990
+ return false;
1991
+ if (!wi.target)
1992
+ return false;
1993
+ // Skip stale tasks — startedAt/createdAt older than threshold
1994
+ const taskTime = new Date(wi.startedAt || wi.createdAt || 0).getTime();
1995
+ if (now - taskTime > TASK_RECOVERY_MAX_AGE_MS) {
1996
+ this.logger.info('Skipping stale task recovery (older than 1 hour)', {
1997
+ workItemId: wi.id,
1998
+ taskName: wi.title,
1999
+ age: `${Math.round((now - taskTime) / 60000)} minutes`,
2000
+ });
2001
+ return false;
2002
+ }
2003
+ return true;
2004
+ });
2005
+ if (activeTasks.length > 0) {
2006
+ this.logger.info('Found in-progress WorkItems to recover after restart', {
2007
+ count: activeTasks.length,
2008
+ });
2009
+ for (const wi of activeTasks) {
2010
+ try {
2011
+ const recoveryMessage = `[SYSTEM — TASK RECOVERY] You were working on this task before the server restarted. Please continue:\n\nTask: ${wi.title}\nWorkItem: ${wi.id}\n\nFetch full brief: bash config/skills/agent/core/read-task/execute.sh '{"workItemId":"${wi.id}"}'\n\nPlease check the current state and continue working.`;
2012
+ await this.apiController.agentRegistrationService.sendMessageToAgent(wi.target, recoveryMessage, undefined);
2013
+ this.logger.info('Task recovery message sent', {
2014
+ workItemId: wi.id,
2015
+ sessionName: wi.target,
2016
+ taskName: wi.title,
2017
+ });
2018
+ }
2019
+ catch (err) {
2020
+ // Agent might not be online yet — DLQ in scheduler will handle it
2021
+ this.logger.warn('Task recovery delivery deferred (agent may not be online yet)', {
2022
+ workItemId: wi.id,
2023
+ sessionName: wi.target,
2024
+ error: err instanceof Error ? err.message : String(err),
2025
+ });
2026
+ }
2027
+ }
2028
+ }
2029
+ }
2030
+ catch (err) {
2031
+ this.logger.warn('Task auto-recovery failed (non-critical)', {
2032
+ error: err instanceof Error ? err.message : String(err),
2033
+ });
2034
+ }
2035
+ // Start log rotation service (non-critical — logs cleanup)
2036
+ try {
2037
+ const logRotation = LogRotationService.getInstance();
2038
+ const backend = getSessionBackendSync();
2039
+ const activeNames = backend ? backend.listSessions() : [];
2040
+ await logRotation.start(activeNames);
2041
+ this.logger.info('LogRotationService started');
2042
+ }
2043
+ catch (error) {
2044
+ this.logger.warn('Failed to start LogRotationService (non-critical)', {
2045
+ error: error instanceof Error ? error.message : String(error),
2046
+ });
2047
+ }
2048
+ // Start AuditorSchedulerService (non-critical — audit scheduling)
2049
+ // Priority: env var > settings.json > ENABLED_BY_DEFAULT constant
2050
+ const envValue = process.env[AUDITOR_CONSTANTS.ENV_VAR]?.toLowerCase();
2051
+ let auditorEnabled;
2052
+ if (envValue !== undefined) {
2053
+ // Env var explicitly set — use it
2054
+ auditorEnabled = envValue === 'true';
2055
+ }
2056
+ else {
2057
+ // Check persisted settings (settings.json)
2058
+ try {
2059
+ const settingsForAuditor = await getSettingsService().getSettings();
2060
+ auditorEnabled = settingsForAuditor.general.enableAuditor ?? AUDITOR_CONSTANTS.ENABLED_BY_DEFAULT;
2061
+ }
2062
+ catch {
2063
+ auditorEnabled = AUDITOR_CONSTANTS.ENABLED_BY_DEFAULT;
2064
+ }
2065
+ }
2066
+ if (auditorEnabled) {
2067
+ try {
2068
+ const auditorScheduler = AuditorSchedulerService.getInstance();
2069
+ auditorScheduler.setAgentRegistrationService(this.apiController.agentRegistrationService);
2070
+ auditorScheduler.setEventBusService(this.eventBusService);
2071
+ setAuditorSchedulerService(auditorScheduler);
2072
+ auditorScheduler.start();
2073
+ this.logger.info('AuditorSchedulerService started (Claude Code PTY mode)');
2074
+ }
2075
+ catch (error) {
2076
+ this.logger.warn('Failed to start AuditorSchedulerService (non-critical)', {
2077
+ error: error instanceof Error ? error.message : String(error),
2078
+ });
2079
+ }
2080
+ }
2081
+ else {
2082
+ this.logger.info('Auditor disabled (enable via Settings > General or CREWLY_ENABLE_AUDITOR=true)');
2083
+ }
2084
+ }
2085
+ catch (error) {
2086
+ this.logger.error('Failed to start server', { error: error instanceof Error ? error.message : String(error) });
2087
+ if (error instanceof Error && error.message.includes('EADDRINUSE')) {
2088
+ this.logger.error('Port already in use', { port: this.config.webPort });
2089
+ this.logger.info('Try killing existing processes or use a different port');
2090
+ await this.handlePortConflict();
2091
+ }
2092
+ throw error;
2093
+ }
2094
+ }
2095
+ /**
2096
+ * Initialize the Slack integration by awaiting the free function `initializeSlackIfConfigured` (a bare call inside a method does not resolve to the method itself) with this server's message queue service; on success, if `getSlackThreadStore()` returns a store, the lazily imported Slack orchestrator bridge receives that store and the thread status queue.
2097
+ * Does not rethrow: an attempted-but-failed result is logged at warn, an unattempted one at info, and anything thrown inside the try block is logged at error and dropped so the caller continues. NOTE(review): the original note says configuration comes from environment variables; that is decided inside the helper and is not visible here — confirm.
2098
+ */
2099
+ async initializeSlackIfConfigured() {
2100
+ try {
2101
+ this.logger.info('Checking Slack configuration...');
2102
+ const result = await initializeSlackIfConfigured({
2103
+ messageQueueService: this.messageQueueService,
2104
+ });
2105
+ if (result.success) {
2106
+ // When a Slack thread store is available, hand it and the thread status queue to the orchestrator bridge; the bridge module is imported only on this path
2107
+ const threadStore = getSlackThreadStore();
2108
+ if (threadStore) {
2109
+ const { getSlackOrchestratorBridge } = await import('./services/slack/slack-orchestrator-bridge.js');
2110
+ const bridge = getSlackOrchestratorBridge();
2111
+ bridge.setSlackThreadStore(threadStore);
2112
+ bridge.setThreadStatusQueue(this.threadStatusQueueService);
2113
+ }
2114
+ this.logger.info('Slack integration initialized successfully'); // logged whether or not a thread store was wired
2115
+ }
2116
+ else if (result.attempted) { // not successful, but the helper reports that it tried
2117
+ this.logger.warn('Slack initialization failed', { error: result.error });
2118
+ }
2119
+ else { // neither successful nor attempted
2120
+ this.logger.info('Slack not configured, skipping initialization');
2121
+ }
2122
+ }
2123
+ catch (error) {
2124
+ this.logger.error('Error initializing Slack integration', {
2125
+ error: error instanceof Error ? error.message : String(error),
2126
+ });
2127
+ // Deliberately not rethrown: a Slack failure must not fail startup
2128
+ }
2129
+ }
2130
+ /**
2131
+ * Initialize WhatsApp integration if environment variables are configured.
2132
+ * Gracefully handles missing configuration or connection failures.
2133
+ */
2134
+ async initializeWhatsAppIfConfigured() {
2135
+ try {
2136
+ this.logger.info('Checking WhatsApp configuration...');
2137
+ const result = await initializeWhatsAppIfConfigured({
2138
+ messageQueueService: this.messageQueueService,
2139
+ });
2140
+ if (result.success) {
2141
+ this.logger.info('WhatsApp integration initialized successfully');
2142
+ }
2143
+ else if (result.attempted) {
2144
+ this.logger.warn('WhatsApp initialization failed', { error: result.error });
2145
+ }
2146
+ else {
2147
+ this.logger.info('WhatsApp not configured, skipping initialization');
2148
+ }
2149
+ }
2150
+ catch (error) {
2151
+ this.logger.error('Error initializing WhatsApp integration', {
2152
+ error: error instanceof Error ? error.message : String(error),
2153
+ });
2154
+ // Don't fail startup if WhatsApp fails
2155
+ }
2156
+ }
2157
+ /**
2158
+ * Initialize Google Chat adapter from saved credentials if available.
2159
+ * Restarts the Pub/Sub pull loop automatically on backend restart.
2160
+ */
2161
+ async initializeGoogleChatIfConfigured() {
2162
+ try {
2163
+ this.logger.info('Checking Google Chat saved credentials...');
2164
+ const result = await initializeGoogleChatIfConfigured({
2165
+ messageQueueService: this.messageQueueService,
2166
+ });
2167
+ if (result.success) {
2168
+ this.logger.info('Google Chat auto-reconnect successful');
2169
+ }
2170
+ else if (result.attempted) {
2171
+ this.logger.warn('Google Chat auto-reconnect failed', { error: result.error });
2172
+ }
2173
+ else {
2174
+ this.logger.info('Google Chat not configured, skipping initialization');
2175
+ }
2176
+ }
2177
+ catch (error) {
2178
+ this.logger.error('Error initializing Google Chat integration', {
2179
+ error: error instanceof Error ? error.message : String(error),
2180
+ });
2181
+ // Don't fail startup if Google Chat fails
2182
+ }
2183
+ }
2184
+ /**
2185
+ * Initialize Telegram bot from environment variables or saved credentials.
2186
+ * Starts long-polling for incoming messages automatically on backend restart.
2187
+ */
2188
+ async initializeTelegramIfConfigured() {
2189
+ try {
2190
+ this.logger.info('Checking Telegram configuration...');
2191
+ const result = await initializeTelegramIfConfigured({
2192
+ messageQueueService: this.messageQueueService,
2193
+ });
2194
+ if (result.success) {
2195
+ this.logger.info('Telegram bot connected and polling started');
2196
+ }
2197
+ else if (result.attempted) {
2198
+ this.logger.warn('Telegram initialization failed', { error: result.error });
2199
+ }
2200
+ else {
2201
+ this.logger.info('Telegram not configured, skipping initialization');
2202
+ }
2203
+ }
2204
+ catch (error) {
2205
+ this.logger.error('Error initializing Telegram integration', {
2206
+ error: error instanceof Error ? error.message : String(error),
2207
+ });
2208
+ // Don't fail startup if Telegram fails
2209
+ }
2210
+ }
2211
+ /**
2212
+ * Auto-start the orchestrator if the autoStartOrchestrator setting is enabled.
2213
+ * Reads the setting from persistent storage and triggers orchestrator setup.
2214
+ * Failures are logged but do not prevent the server from starting.
2215
+ */
2216
+ async autoStartOrchestratorIfEnabled() {
2217
+ try {
2218
+ const settingsService = getSettingsService();
2219
+ const settings = await settingsService.getSettings();
2220
+ if (!settings.general.autoStartOrchestrator) {
2221
+ this.logger.info('Auto-start orchestrator is disabled, skipping');
2222
+ return;
2223
+ }
2224
+ this.logger.info('Auto-start orchestrator is enabled, starting orchestrator...');
2225
+ // Determine runtime type: env var OVERRIDES stored status > default (claude-code)
2226
+ // DEFAULT_RUNTIME env var is the authoritative config for Docker/headless deployments.
2227
+ let runtimeType = RUNTIME_TYPES.CLAUDE_CODE;
2228
+ // Step 1: Check stored orchestrator status (user changed via UI in previous session)
2229
+ try {
2230
+ const orchestratorStatus = await this.storageService.getOrchestratorStatus();
2231
+ if (orchestratorStatus?.runtimeType) {
2232
+ runtimeType = orchestratorStatus.runtimeType;
2233
+ }
2234
+ }
2235
+ catch {
2236
+ // Use default runtime type
2237
+ }
2238
+ // Step 2: DEFAULT_RUNTIME env var OVERRIDES stored status (product-level config)
2239
+ // This ensures Docker/headless deployments always use the configured runtime
2240
+ // regardless of what was stored from a previous (possibly different) deployment.
2241
+ const envRuntime = process.env.DEFAULT_RUNTIME;
2242
+ if (envRuntime && Object.values(RUNTIME_TYPES).includes(envRuntime)) {
2243
+ const previousRuntime = runtimeType;
2244
+ runtimeType = envRuntime;
2245
+ this.logger.info('DEFAULT_RUNTIME env overrides stored runtime', { runtimeType, previousRuntime });
2246
+ // #183: Persist the override so stored status stays in sync
2247
+ if (previousRuntime !== runtimeType) {
2248
+ try {
2249
+ await this.storageService.updateOrchestratorRuntimeType(runtimeType);
2250
+ this.logger.info('Synced orchestrator runtime to storage', { runtimeType });
2251
+ }
2252
+ catch {
2253
+ this.logger.warn('Failed to sync orchestrator runtime to storage');
2254
+ }
2255
+ }
2256
+ }
2257
+ // Create orchestrator agent session
2258
+ const result = await this.apiController.agentRegistrationService.createAgentSession({
2259
+ sessionName: ORCHESTRATOR_SESSION_NAME,
2260
+ role: ORCHESTRATOR_ROLE,
2261
+ projectPath: this.config.crewlyHome,
2262
+ windowName: ORCHESTRATOR_WINDOW_NAME,
2263
+ runtimeType,
2264
+ forceRecreate: true,
2265
+ });
2266
+ if (!result.success) {
2267
+ this.logger.warn('Auto-start orchestrator failed to create session', {
2268
+ error: result.error,
2269
+ });
2270
+ return;
2271
+ }
2272
+ // Initialize orchestrator memory
2273
+ try {
2274
+ const memoryService = MemoryService.getInstance();
2275
+ await memoryService.initializeForSession(ORCHESTRATOR_SESSION_NAME, ORCHESTRATOR_ROLE, this.config.crewlyHome);
2276
+ }
2277
+ catch (memoryError) {
2278
+ this.logger.warn('Failed to initialize orchestrator memory during auto-start', {
2279
+ error: memoryError instanceof Error ? memoryError.message : String(memoryError),
2280
+ });
2281
+ }
2282
+ // Start persistent chat monitoring
2283
+ if (this.terminalGateway) {
2284
+ this.terminalGateway.startOrchestratorChatMonitoring(ORCHESTRATOR_SESSION_NAME);
2285
+ }
2286
+ this.logger.info('Orchestrator auto-started successfully');
2287
+ }
2288
+ catch (error) {
2289
+ this.logger.error('Failed to auto-start orchestrator', {
2290
+ error: error instanceof Error ? error.message : String(error),
2291
+ });
2292
+ // Don't fail startup if auto-start fails
2293
+ }
2294
+ }
2295
+ /**
2296
+ * Auto-restore agent sessions that were running before the last shutdown.
2297
+ * Loads persisted session state and calls createAgentSession() for each
2298
+ * non-orchestrator session. Gated by the autoResumeOnRestart setting.
2299
+ * Runs after orchestrator auto-start so the orchestrator is available.
2300
+ */
2301
+ async autoRestoreAgentSessionsIfEnabled() {
2302
+ try {
2303
+ const settingsService = getSettingsService();
2304
+ const settings = await settingsService.getSettings();
2305
+ if (!settings.general.autoResumeOnRestart) {
2306
+ this.logger.info('Auto-resume on restart is disabled, skipping agent session restore');
2307
+ return;
2308
+ }
2309
+ const persistence = getSessionStatePersistence();
2310
+ const state = await persistence.loadState();
2311
+ if (!state || state.sessions.length === 0) {
2312
+ this.logger.debug('No persisted agent sessions to restore');
2313
+ return;
2314
+ }
2315
+ // Filter out orchestrator sessions (already auto-started separately)
2316
+ // and auditor sessions when auditor is disabled
2317
+ const isAuditorEnabled = process.env[AUDITOR_CONSTANTS.ENV_VAR]?.toLowerCase() === 'true'
2318
+ || (process.env[AUDITOR_CONSTANTS.ENV_VAR] === undefined && AUDITOR_CONSTANTS.ENABLED_BY_DEFAULT);
2319
+ const baselineSessions = state.sessions.filter((s) => {
2320
+ if (s.role === ORCHESTRATOR_ROLE)
2321
+ return false;
2322
+ if (!isAuditorEnabled && s.name === AUDITOR_SCHEDULER_CONSTANTS.AUDITOR_SESSION_NAME)
2323
+ return false;
2324
+ return true;
2325
+ });
2326
+ // 2026-05-17 — gate by task-pool work. Pre-fix the boot path
2327
+ // blindly resurrected every persisted session even when none had
2328
+ // pending work, defeating the wake-gate philosophy (PR #574/#585)
2329
+ // and bloating RAM until IdleDetection eventually drained them
2330
+ // back. Now: only restore a session if the pool has at least one
2331
+ // non-terminal WorkItem with `target === sessionName`. Idle
2332
+ // agents stay dead until orc dispatches new work, at which point
2333
+ // the dispatcher / wake path raises them on demand.
2334
+ //
2335
+ // Safety valve: if the pool lookup throws (e.g. SQLite not yet
2336
+ // open during early boot), preserve the legacy behaviour rather
2337
+ // than block all restores — better to over-restore than to
2338
+ // silently strand work.
2339
+ let agentSessions = baselineSessions;
2340
+ try {
2341
+ const pool = TaskPoolService.getInstance();
2342
+ const allItems = await pool.getAllItems();
2343
+ const targetedSessions = new Set();
2344
+ for (const wi of allItems) {
2345
+ if (wi.status === 'done' || wi.status === 'cancelled')
2346
+ continue;
2347
+ const t = wi.target;
2348
+ if (typeof t === 'string' && t.length > 0)
2349
+ targetedSessions.add(t);
2350
+ }
2351
+ const filtered = baselineSessions.filter((s) => targetedSessions.has(s.name));
2352
+ const skipped = baselineSessions
2353
+ .filter((s) => !targetedSessions.has(s.name))
2354
+ .map((s) => s.name);
2355
+ if (skipped.length > 0) {
2356
+ this.logger.info('Skipping auto-restore for sessions with no pending WorkItem (idle agents stay dead until dispatched work arrives)', {
2357
+ skippedCount: skipped.length,
2358
+ skipped: skipped.slice(0, 20),
2359
+ truncated: skipped.length > 20,
2360
+ });
2361
+ }
2362
+ agentSessions = filtered;
2363
+ }
2364
+ catch (poolErr) {
2365
+ this.logger.warn('Auto-restore could not query task pool; falling back to restoring every persisted session', { error: poolErr instanceof Error ? poolErr.message : String(poolErr) });
2366
+ }
2367
+ if (agentSessions.length === 0) {
2368
+ this.logger.info('No persisted agent sessions to restore (all idle, no pending WorkItems)');
2369
+ return;
2370
+ }
2371
+ this.logger.info('Auto-restoring agent sessions from persisted state', {
2372
+ count: agentSessions.length,
2373
+ sessions: agentSessions.map((s) => s.name),
2374
+ });
2375
+ let restored = 0;
2376
+ const failed = [];
2377
+ const RESTORE_DELAY_MS = 10_000; // 10 seconds between each session restore to avoid resource pressure
2378
+ for (let i = 0; i < agentSessions.length; i++) {
2379
+ const session = agentSessions[i];
2380
+ // Wait between session restores to avoid SIGTERM from resource pressure
2381
+ if (i > 0) {
2382
+ this.logger.info('Waiting before restoring next session to avoid resource pressure', {
2383
+ delayMs: RESTORE_DELAY_MS,
2384
+ nextSession: session.name,
2385
+ progress: `${i}/${agentSessions.length}`,
2386
+ });
2387
+ await new Promise((resolve) => setTimeout(resolve, RESTORE_DELAY_MS));
2388
+ }
2389
+ try {
2390
+ const result = await this.apiController.agentRegistrationService.createAgentSession({
2391
+ sessionName: session.name,
2392
+ role: session.role || 'developer',
2393
+ projectPath: session.cwd || process.cwd(),
2394
+ runtimeType: session.runtimeType,
2395
+ teamId: session.teamId,
2396
+ memberId: session.memberId,
2397
+ forceRecreate: true,
2398
+ });
2399
+ if (result.success) {
2400
+ restored++;
2401
+ this.logger.info('Restored agent session', {
2402
+ name: session.name,
2403
+ role: session.role,
2404
+ runtimeType: session.runtimeType,
2405
+ progress: `${restored}/${agentSessions.length}`,
2406
+ });
2407
+ }
2408
+ else {
2409
+ failed.push(session.name);
2410
+ this.logger.warn('Failed to restore agent session', {
2411
+ name: session.name,
2412
+ error: result.error,
2413
+ });
2414
+ }
2415
+ }
2416
+ catch (error) {
2417
+ failed.push(session.name);
2418
+ this.logger.error('Error restoring agent session', {
2419
+ name: session.name,
2420
+ error: error instanceof Error ? error.message : String(error),
2421
+ });
2422
+ }
2423
+ }
2424
+ this.logger.info('Agent session restore complete', {
2425
+ restored,
2426
+ total: agentSessions.length,
2427
+ failed: failed.length > 0 ? failed : undefined,
2428
+ });
2429
+ // Clear persisted state after restore attempt to avoid double-restore
2430
+ await persistence.clearState();
2431
+ }
2432
+ catch (error) {
2433
+ this.logger.error('Failed to auto-restore agent sessions', {
2434
+ error: error instanceof Error ? error.message : String(error),
2435
+ });
2436
+ // Don't fail startup if auto-restore fails
2437
+ }
2438
+ }
2439
+ /**
2440
+ * Check for and handle pending self-improvement from hot-reload.
2441
+ * This runs at startup to validate or rollback any changes made
2442
+ * before the process was restarted.
2443
+ */
2444
+ async checkPendingSelfImprovement() {
2445
+ try {
2446
+ const startupService = getImprovementStartupService();
2447
+ const result = await startupService.runStartupCheck();
2448
+ if (result.hadPendingImprovement) {
2449
+ this.logger.info('Handled pending self-improvement', {
2450
+ improvementId: result.improvementId,
2451
+ action: result.action,
2452
+ validationPassed: result.validationPassed,
2453
+ });
2454
+ if (result.action === 'rolled_back') {
2455
+ this.logger.warn('Self-improvement rollback performed', {
2456
+ error: result.error,
2457
+ });
2458
+ }
2459
+ }
2460
+ }
2461
+ catch (error) {
2462
+ this.logger.error('Error checking pending self-improvement', {
2463
+ error: error instanceof Error ? error.message : String(error),
2464
+ });
2465
+ // Continue startup even if self-improvement check fails
2466
+ }
2467
+ }
2468
+ async checkPortAvailability() {
2469
+ const { createServer } = await import('net');
2470
+ const testServer = createServer();
2471
+ return new Promise((resolve, reject) => {
2472
+ testServer.listen(this.config.webPort, () => {
2473
+ testServer.close(() => {
2474
+ this.logger.info('Port is available', { port: this.config.webPort });
2475
+ resolve();
2476
+ });
2477
+ });
2478
+ testServer.on('error', (error) => {
2479
+ if (error.code === 'EADDRINUSE') {
2480
+ reject(new Error(`Port ${this.config.webPort} is already in use`));
2481
+ }
2482
+ else {
2483
+ reject(error);
2484
+ }
2485
+ });
2486
+ });
2487
+ }
2488
+ async startHttpServer() {
2489
+ return new Promise((resolve, reject) => {
2490
+ const startTime = Date.now();
2491
+ this.httpServer.listen(this.config.webPort, () => {
2492
+ const duration = Date.now() - startTime;
2493
+ this.logger.info('Crewly server started', {
2494
+ port: this.config.webPort,
2495
+ durationMs: duration,
2496
+ dashboardUrl: `http://localhost:${this.config.webPort}`,
2497
+ websocketUrl: `ws://localhost:${this.config.webPort}`,
2498
+ home: this.config.crewlyHome
2499
+ });
2500
+ // B0 (interim) per `.crewly/specs/2026-05-05-trigger-persistence-bug.md`:
2501
+ // Broadcast `system:backend_restarted` exactly once per boot. The
2502
+ // trigger engine (`backend/src/services/v3/trigger-engine.service.ts`)
2503
+ // stores all `schedule-followup` / `watch-for-event` triggers in an
2504
+ // in-memory `Map<string, Trigger>` that is wiped on every restart.
2505
+ // Subscribers (e.g. self-watch-scribe, any TL using §3.0 universal
2506
+ // delegator-rule) listen for this event as a freshness signal and
2507
+ // re-arm their watchdogs. Re-arm latency drops from "manual cycle"
2508
+ // to "next event tick" — closes the wipe-coverage-gap to seconds.
2509
+ // B1 (full fix) is disk-backed declarative trigger config per the
2510
+ // spec Path A; B0 is the unblock-first interim until B1 lands.
2511
+ try {
2512
+ // AgentEvent shape (`backend/src/types/event-bus.types.ts:198`)
2513
+ // requires a fixed set of string fields. For system-scoped
2514
+ // events we use 'system' for member/session and leave team
2515
+ // fields empty — subscribers MUST gate on `type` rather than
2516
+ // team/member identity. Boot diagnostics (port, duration) are
2517
+ // already in the preceding `Crewly server started` log;
2518
+ // callers needing them can correlate by `timestamp`.
2519
+ this.eventBusService.publish({
2520
+ id: `system-backend-restarted-${Date.now()}`,
2521
+ type: 'system:backend_restarted',
2522
+ timestamp: new Date().toISOString(),
2523
+ teamId: '',
2524
+ teamName: '',
2525
+ memberId: '',
2526
+ memberName: 'system',
2527
+ sessionName: 'system',
2528
+ previousValue: 'stopped',
2529
+ newValue: 'started',
2530
+ changedField: 'agentStatus'
2531
+ });
2532
+ this.logger.info('Broadcast system:backend_restarted event', {
2533
+ port: this.config.webPort,
2534
+ bootDurationMs: duration
2535
+ });
2536
+ }
2537
+ catch (emitError) {
2538
+ // Failure isolation — never block boot on this telemetry.
2539
+ this.logger.warn('Failed to broadcast system:backend_restarted (non-fatal)', {
2540
+ error: emitError instanceof Error ? emitError.message : String(emitError)
2541
+ });
2542
+ }
2543
+ resolve();
2544
+ });
2545
+ this.httpServer.on('error', (error) => {
2546
+ this.logger.error('HTTP Server error', { error: error.message, code: error.code });
2547
+ if (error.code === 'EADDRINUSE') {
2548
+ this.logger.error('Port already in use by another process', { port: this.config.webPort });
2549
+ this.logger.info('Suggestion: Kill the existing process or change the port');
2550
+ }
2551
+ else if (error.code === 'EACCES') {
2552
+ this.logger.error('Permission denied for port', { port: this.config.webPort });
2553
+ this.logger.info('Suggestion: Try a port above 1024 or run with appropriate permissions');
2554
+ }
2555
+ reject(error);
2556
+ });
2557
+ });
2558
+ }
2559
+ async handlePortConflict() {
2560
+ this.logger.info('Attempting to identify conflicting process...');
2561
+ try {
2562
+ const { execSync } = await import('child_process');
2563
+ const result = execSync(`lsof -ti :${this.config.webPort}`, { encoding: 'utf8' }).trim();
2564
+ if (result) {
2565
+ this.logger.info('Process using port identified', { port: this.config.webPort, pid: result });
2566
+ this.logger.info('To kill it manually', { command: `kill -9 ${result}` });
2567
+ }
2568
+ }
2569
+ catch (error) {
2570
+ this.logger.info('Could not identify the conflicting process');
2571
+ }
2572
+ }
2573
+ sigintCount = 0;
2574
+ registerSignalHandlers() {
2575
+ this.logger.info('Registering signal handlers...');
2576
+ process.on('SIGTERM', () => {
2577
+ this.logger.info('Received SIGTERM signal');
2578
+ this.shutdown();
2579
+ });
2580
+ process.on('SIGINT', () => {
2581
+ this.sigintCount++;
2582
+ if (this.sigintCount === 1) {
2583
+ this.logger.info('Received SIGINT signal (Ctrl+C) - shutting down gracefully. Press Ctrl+C again to force exit.');
2584
+ this.shutdown();
2585
+ }
2586
+ else {
2587
+ this.logger.info('Received second SIGINT - forcing immediate exit');
2588
+ process.exit(1);
2589
+ }
2590
+ });
2591
+ process.on('uncaughtException', (error) => {
2592
+ this.logger.error('Uncaught exception', { error: error.message, stack: error.stack });
2593
+ this.logMemoryUsage();
2594
+ this.shutdown();
2595
+ });
2596
+ process.on('unhandledRejection', (reason, promise) => {
2597
+ const message = reason instanceof Error ? reason.message : String(reason);
2598
+ // Non-fatal rejections from third-party libraries (e.g., Slack Socket Mode
2599
+ // state machine errors) should be logged but not trigger a full shutdown.
2600
+ const nonFatalPatterns = [
2601
+ 'Unhandled event', // finity state machine (Slack Socket Mode)
2602
+ 'socket hang up', // transient network errors
2603
+ 'ECONNRESET', // connection reset by peer
2604
+ ];
2605
+ const isNonFatal = nonFatalPatterns.some(p => message.includes(p));
2606
+ if (isNonFatal) {
2607
+ this.logger.warn('Non-fatal unhandled rejection (suppressed shutdown)', {
2608
+ reason: message,
2609
+ });
2610
+ return;
2611
+ }
2612
+ this.logger.error('Unhandled rejection', {
2613
+ reason: message,
2614
+ stack: reason instanceof Error ? reason.stack : undefined
2615
+ });
2616
+ this.logMemoryUsage();
2617
+ this.shutdown();
2618
+ });
2619
+ }
2620
+ startHealthMonitoring() {
2621
+ this.logger.info('Starting health monitoring...');
2622
+ // Monitor memory usage every 30 seconds
2623
+ this.healthMonitoringInterval = setInterval(() => {
2624
+ this.logMemoryUsage();
2625
+ }, 30000);
2626
+ // V3: Periodic TTL-based auto-close for open Requests (every 2 min)
2627
+ // Catches direct orchestrator responses that finish within a single poll cycle
2628
+ // and never trigger the EventBus agent:idle event
2629
+ setInterval(() => this.autoCloseOpenRequests(), 2 * 60 * 1000);
2630
+ // V3: Mission OKR Reminders (every hour)
2631
+ // Scans active missions and sends Slack alerts for off-track KRs
2632
+ setInterval(async () => {
2633
+ try {
2634
+ const { MissionReminderService } = await import('./services/v3/mission-reminder.service.js');
2635
+ await MissionReminderService.getInstance().runSweep();
2636
+ }
2637
+ catch (err) {
2638
+ this.logger.warn('Mission OKR reminder sweep failed', { error: String(err) });
2639
+ }
2640
+ }, 60 * 60 * 1000);
2641
+ // Purge done Requests and WorkItems older than 24h (every hour)
2642
+ setInterval(() => this.purgeCompletedData(), 60 * 60 * 1000);
2643
+ // Run once at startup after a short delay
2644
+ setTimeout(() => this.purgeCompletedData(), 30 * 1000);
2645
+ setTimeout(async () => {
2646
+ try {
2647
+ const { MissionReminderService } = await import('./services/v3/mission-reminder.service.js');
2648
+ await MissionReminderService.getInstance().runSweep();
2649
+ }
2650
+ catch (err) {
2651
+ // Non-critical
2652
+ }
2653
+ }, 60 * 1000);
2654
+ }
2655
+ /**
2656
+ * Removes done/cancelled Requests older than 24h from disk,
2657
+ * and purges done/cancelled/failed WorkItems from the task pool.
2658
+ * Memory, knowledge, and learnings are never purged.
2659
+ */
2660
+ purgeCompletedData() {
2661
+ setImmediate(async () => {
2662
+ const RETENTION_MS = 24 * 60 * 60 * 1000;
2663
+ const cutoff = Date.now() - RETENTION_MS;
2664
+ // 1. Purge done Requests
2665
+ try {
2666
+ const { RequestService } = await import('./services/v3/request.service.js');
2667
+ const svc = RequestService.getInstance();
2668
+ const all = await svc.listAll();
2669
+ let purgedRequests = 0;
2670
+ for (const req of all) {
2671
+ if (req.status !== 'done' && req.status !== 'cancelled')
2672
+ continue;
2673
+ const completedAt = req.completedAt ? new Date(req.completedAt).getTime() : 0;
2674
+ const createdAt = new Date(req.createdAt).getTime();
2675
+ const age = completedAt || createdAt;
2676
+ if (age < cutoff) {
2677
+ await svc.delete(req.id);
2678
+ purgedRequests++;
2679
+ }
2680
+ }
2681
+ if (purgedRequests > 0) {
2682
+ this.logger.info('Purged old completed Requests', { count: purgedRequests });
2683
+ }
2684
+ }
2685
+ catch (err) {
2686
+ this.logger.warn('Request purge failed (non-critical)', {
2687
+ error: err instanceof Error ? err.message : String(err),
2688
+ });
2689
+ }
2690
+ // 2. Purge done/cancelled/failed WorkItems from pool
2691
+ try {
2692
+ const { TaskPoolService } = await import('./services/task-pool/task-pool.service.js');
2693
+ const pool = TaskPoolService.getInstance();
2694
+ const allItems = await pool.getAllItems();
2695
+ const terminalStatuses = new Set(['done', 'cancelled', 'failed']);
2696
+ let purgedItems = 0;
2697
+ for (const wi of allItems) {
2698
+ if (!terminalStatuses.has(wi.status))
2699
+ continue;
2700
+ const completedAt = wi.completedAt ? new Date(wi.completedAt).getTime() : 0;
2701
+ const createdAt = new Date(wi.createdAt).getTime();
2702
+ const age = completedAt || createdAt;
2703
+ if (age < cutoff) {
2704
+ await pool.removeItem(wi.id);
2705
+ purgedItems++;
2706
+ }
2707
+ }
2708
+ if (purgedItems > 0) {
2709
+ this.logger.info('Purged old completed WorkItems', { count: purgedItems });
2710
+ }
2711
+ }
2712
+ catch (err) {
2713
+ this.logger.warn('WorkItem purge failed (non-critical)', {
2714
+ error: err instanceof Error ? err.message : String(err),
2715
+ });
2716
+ }
2717
+ });
2718
+ }
2719
+ /**
2720
+ * Closes open Requests that were created within the last 10 minutes.
2721
+ * Used to handle direct orchestrator responses that don't go through WorkItems.
2722
+ * Also rolls up orchestrator token usage and sets ownerAgent for the Request.
2723
+ *
2724
+ * Token source varies by runtime:
2725
+ * - claude-code: reads session JSONL (TUI status bar not capturable from PTY)
2726
+ * - gemini-cli / codex-cli: reads from TokenUsageService (fed by PTY parser)
2727
+ * - crewly-agent: reads from TokenUsageService (fed by SDK)
2728
+ */
2729
+ autoCloseOpenRequests() {
2730
+ setImmediate(async () => {
2731
+ try {
2732
+ const { RequestService } = await import('./services/v3/request.service.js');
2733
+ const { TokenUsageService } = await import('./services/monitoring/token-usage.service.js');
2734
+ const { getTokensSince } = await import('./services/monitoring/claude-session-tokens.service.js');
2735
+ const { getSessionStatePersistence } = await import('./services/session/session-state-persistence.js');
2736
+ const svc = RequestService.getInstance();
2737
+ const tokenSvc = TokenUsageService.getInstance();
2738
+ const all = await svc.listAll();
2739
+ const cutoff = Date.now() - 10 * 60 * 1000; // 10 min window
2740
+ const minAgeMs = 3 * 60 * 1000; // Don't close requests younger than 3 min — gives orchestrator time to delegate
2741
+ // Resolve orchestrator runtime type once per cycle
2742
+ const persistence = getSessionStatePersistence();
2743
+ const orcMeta = persistence.getSessionMetadata(ORCHESTRATOR_SESSION_NAME);
2744
+ const orcRuntimeType = orcMeta?.runtimeType || 'claude-code';
2745
+ for (const req of all) {
2746
+ if (req.status !== 'open')
2747
+ continue;
2748
+ const reqAge = Date.now() - new Date(req.createdAt).getTime();
2749
+ if (reqAge > 10 * 60 * 1000)
2750
+ continue; // older than 10 min — skip
2751
+ if (reqAge < minAgeMs)
2752
+ continue; // too young — orchestrator may still be delegating
2753
+ const update = { status: 'done' };
2754
+ // Roll up orchestrator tokens only for direct responses (no WorkItem delegation)
2755
+ if (req.workItemIds.length === 0) {
2756
+ const since = new Date(req.createdAt);
2757
+ let inputTokens = 0;
2758
+ let outputTokens = 0;
2759
+ let cost = 0;
2760
+ if (orcRuntimeType === 'claude-code') {
2761
+ // Claude Code: read from session JSONL (ground truth from API)
2762
+ // Falls back to auto-detecting the latest session file if ID unknown.
2763
+ // Use current time as upper bound to avoid counting tokens from
2764
+ // subsequent requests in the same session.
2765
+ const sessionId = persistence.getSessionId(ORCHESTRATOR_SESSION_NAME) || null;
2766
+ const summary = await getTokensSince(this.config.crewlyHome, sessionId, since, new Date());
2767
+ if (summary && summary.turnCount > 0) {
2768
+ inputTokens = summary.inputTokens;
2769
+ outputTokens = summary.outputTokens;
2770
+ cost = summary.cost;
2771
+ }
2772
+ }
2773
+ else {
2774
+ // Gemini CLI / Codex CLI / crewly-agent: read from TokenUsageService
2775
+ // (fed by PTY terminal output parser or SDK)
2776
+ const usage = tokenSvc.getSessionUsageSince(ORCHESTRATOR_SESSION_NAME, since);
2777
+ inputTokens = usage.inputTokens;
2778
+ outputTokens = usage.outputTokens;
2779
+ cost = usage.cost;
2780
+ }
2781
+ if (inputTokens > 0 || outputTokens > 0) {
2782
+ update.totalInputTokens = (req.totalInputTokens || 0) + inputTokens;
2783
+ update.totalOutputTokens = (req.totalOutputTokens || 0) + outputTokens;
2784
+ update.totalCost = (req.totalCost || 0) + cost;
2785
+ }
2786
+ update.ownerAgent = ORCHESTRATOR_SESSION_NAME;
2787
+ }
2788
+ await svc.update(req.id, update);
2789
+ // Mark the corresponding Slack/chat thread as terminal so the
2790
+ // SessionHandoff resume notification won't re-send it after restart.
2791
+ if (req.sourceConversationItemId.startsWith('slack-')) {
2792
+ try {
2793
+ const { ThreadStatusQueueService } = await import('./services/messaging/thread-status-queue.service.js');
2794
+ const { extractSlackChannelId, extractSlackThreadTs } = await import('./services/v3/request-sla.subscriber.js');
2795
+ const tsq = ThreadStatusQueueService.getInstance();
2796
+ // Use the canonical parser (handles both `slack-{ch}-{ts}` and
2797
+ // the thread-reply `slack-{ch}-{root}-msg-{msgTs}` shapes).
2798
+ const channelId = extractSlackChannelId(req.sourceConversationItemId);
2799
+ const threadTs = extractSlackThreadTs(req.sourceConversationItemId);
2800
+ if (channelId && threadTs) {
2801
+ const threadKey = `${channelId}:${threadTs}`;
2802
+ // Create entry if not tracked, then mark terminal
2803
+ if (!tsq.get(threadKey)) {
2804
+ tsq.trackInbound({
2805
+ threadKey,
2806
+ conversationId: req.sourceConversationItemId,
2807
+ source: 'slack',
2808
+ messagePreview: req.title,
2809
+ });
2810
+ }
2811
+ tsq.markReplied(threadKey, 'replied_completed');
2812
+ }
2813
+ }
2814
+ catch {
2815
+ // Non-critical — thread status is best-effort
2816
+ }
2817
+ }
2818
+ this.logger.debug('V3 Request auto-closed', {
2819
+ requestId: req.id,
2820
+ ownerAgent: update.ownerAgent,
2821
+ inputTokens: update.totalInputTokens,
2822
+ outputTokens: update.totalOutputTokens,
2823
+ cost: update.totalCost,
2824
+ });
2825
+ }
2826
+ }
2827
+ catch (err) {
2828
+ this.logger.warn('V3 Request auto-close failed (non-critical)', {
2829
+ error: err instanceof Error ? err.message : String(err),
2830
+ });
2831
+ }
2832
+ });
2833
+ }
2834
+ logMemoryUsage() {
2835
+ const usage = process.memoryUsage();
2836
+ const heapUsed = Math.round(usage.heapUsed / 1024 / 1024);
2837
+ const heapTotal = Math.round(usage.heapTotal / 1024 / 1024);
2838
+ const external = Math.round(usage.external / 1024 / 1024);
2839
+ this.logger.debug('Memory usage', { heapUsedMB: heapUsed, heapTotalMB: heapTotal, externalMB: external });
2840
+ // Warn if memory usage is high
2841
+ if (heapUsed > 500) {
2842
+ this.logger.warn('High memory usage detected', { heapUsedMB: heapUsed });
2843
+ }
2844
+ }
2845
+ async shutdown() {
2846
+ // Prevent double shutdown
2847
+ if (this.isShuttingDown) {
2848
+ this.logger.info('Shutdown already in progress, skipping...');
2849
+ return;
2850
+ }
2851
+ this.isShuttingDown = true;
2852
+ this.logger.info('Shutting down Crewly server...');
2853
+ // Set a hard timeout to force exit if graceful shutdown takes too long.
2854
+ // Use SIGKILL on self as the ultimate fallback — this is uncatchable and
2855
+ // guarantees death even if native node-pty handles keep the event loop alive.
2856
+ const isDev = process.env.NODE_ENV !== 'production';
2857
+ const timeoutMs = isDev ? 5000 : 10000;
2858
+ const forceExitTimeout = setTimeout(() => {
2859
+ this.logger.warn('Graceful shutdown timed out, sending SIGKILL to self...');
2860
+ process.kill(process.pid, 'SIGKILL');
2861
+ }, timeoutMs);
2862
+ try {
2863
+ // Clear health monitoring interval first
2864
+ if (this.healthMonitoringInterval) {
2865
+ clearInterval(this.healthMonitoringInterval);
2866
+ this.healthMonitoringInterval = null;
2867
+ }
2868
+ // Unload addons (call their unregister hooks)
2869
+ try {
2870
+ await AddonLoaderService.getInstance().unloadAddons();
2871
+ }
2872
+ catch (addonErr) {
2873
+ this.logger.warn('Error unloading addons during shutdown', {
2874
+ error: addonErr instanceof Error ? addonErr.message : String(addonErr),
2875
+ });
2876
+ }
2877
+ // Generate session handoff summary before killing processes
2878
+ // This captures active thread state and agent status for restart recovery
2879
+ try {
2880
+ const { SessionHandoffService } = await import('./services/session/session-handoff.service.js');
2881
+ await SessionHandoffService.getInstance().generateSummary(this.storageService);
2882
+ }
2883
+ catch (error) {
2884
+ this.logger.warn('Failed to generate session handoff summary', {
2885
+ error: error instanceof Error ? error.message : String(error),
2886
+ });
2887
+ }
2888
+ // Disconnect Redis cache
2889
+ try {
2890
+ RedisCacheService.getInstance().disconnect();
2891
+ }
2892
+ catch {
2893
+ // Non-critical — ignore
2894
+ }
2895
+ // Save PTY session state and force-kill all child processes
2896
+ this.logger.info('Saving PTY session state and force-killing child processes...');
2897
+ try {
2898
+ const sessionBackend = getSessionBackendSync();
2899
+ if (sessionBackend) {
2900
+ // Save state for resume-on-restart
2901
+ const persistence = getSessionStatePersistence();
2902
+ const savedCount = await persistence.saveState(sessionBackend);
2903
+ if (savedCount > 0) {
2904
+ this.logger.info('Saved PTY sessions for later restoration', { count: savedCount });
2905
+ }
2906
+ // Collect PIDs before destroying for belt-and-suspenders cleanup
2907
+ let collectedPids = [];
2908
+ if (sessionBackend instanceof PtySessionBackend) {
2909
+ collectedPids = sessionBackend.getAllSessionPids();
2910
+ this.logger.info('Collected PTY PIDs for shutdown', { pids: collectedPids });
2911
+ // Use forceDestroyAll for SIGTERM → SIGKILL escalation
2912
+ await sessionBackend.forceDestroyAll();
2913
+ }
2914
+ else {
2915
+ await sessionBackend.destroy();
2916
+ }
2917
+ // Belt-and-suspenders: SIGKILL any remaining PIDs
2918
+ for (const pid of collectedPids) {
2919
+ try {
2920
+ process.kill(pid, 'SIGKILL');
2921
+ }
2922
+ catch {
2923
+ // ESRCH = already dead, which is expected
2924
+ }
2925
+ }
2926
+ }
2927
+ // Clear the factory singleton
2928
+ await destroySessionBackend();
2929
+ }
2930
+ catch (error) {
2931
+ this.logger.warn('Failed to save PTY session state', { error: error instanceof Error ? error.message : String(error) });
2932
+ }
2933
+ // Flush message queue to disk before stopping processor
2934
+ this.logger.info('Flushing message queue to disk...');
2935
+ try {
2936
+ await this.messageQueueService.flushPersist();
2937
+ }
2938
+ catch (error) {
2939
+ this.logger.warn('Failed to flush message queue', {
2940
+ error: error instanceof Error ? error.message : String(error),
2941
+ });
2942
+ }
2943
+ // Flush thread status queue to disk
2944
+ try {
2945
+ await this.threadStatusQueueService.persist();
2946
+ }
2947
+ catch (error) {
2948
+ this.logger.warn('Failed to flush thread status queue', {
2949
+ error: error instanceof Error ? error.message : String(error),
2950
+ });
2951
+ }
2952
+ // Flush task pool (WorkItems) to disk — prevents data loss on restart
2953
+ try {
2954
+ const { TaskPoolService } = await import('./services/task-pool/task-pool.service.js');
2955
+ const pool = TaskPoolService.getInstance();
2956
+ await pool.flush();
2957
+ this.logger.info('Task pool flushed to disk');
2958
+ }
2959
+ catch (error) {
2960
+ this.logger.warn('Failed to flush task pool', {
2961
+ error: error instanceof Error ? error.message : String(error),
2962
+ });
2963
+ }
2964
+ // Stop system resource alert monitoring
2965
+ if (this.systemResourceAlertService) {
2966
+ this.systemResourceAlertService.stopMonitoring();
2967
+ }
2968
+ // Stop Reconciler loops (V2)
2969
+ if (this.reconcilerService) {
2970
+ this.reconcilerService.stop();
2971
+ this.logger.info('Reconciler stopped');
2972
+ }
2973
+ // Stop Team-Health-Watchdog sweep loop (Layer 4)
2974
+ if (this.teamHealthWatchdog) {
2975
+ this.teamHealthWatchdog.stop();
2976
+ this.logger.info('TeamHealthWatchdog stopped');
2977
+ }
2978
+ // Stop NOTIFY reconciliation service
2979
+ if (this.notifyReconciliationService) {
2980
+ this.notifyReconciliationService.stop();
2981
+ }
2982
+ // Stop message queue processor
2983
+ this.queueProcessorService.stop();
2984
+ // Stop the EventToWorkItemBridge BEFORE cleaning the event bus so
2985
+ // in-flight handler dispatches drain against a still-live bus.
2986
+ if (this.eventToWorkItemBridge) {
2987
+ this.eventToWorkItemBridge.stop();
2988
+ this.eventToWorkItemBridge = null;
2989
+ }
2990
+ // LEARN-1: stop the AutoLearningSubscriber on the same window as the
2991
+ // bridge so its in-flight recordLearning calls drain before the bus
2992
+ // is cleaned.
2993
+ if (this.autoLearningSubscriber) {
2994
+ this.autoLearningSubscriber.stop();
2995
+ this.autoLearningSubscriber = null;
2996
+ }
2997
+ // DF-1 #438: same shutdown window as auto-learning above.
2998
+ if (this.milestoneNotificationSubscriber) {
2999
+ this.milestoneNotificationSubscriber.stop();
3000
+ this.milestoneNotificationSubscriber = null;
3001
+ }
3002
+ // INBOUND-1: stop the SLA subscriber and unset the module-level
3003
+ // references so a follow-up start() doesn't see stale singletons.
3004
+ if (this.requestSlaSubscriber) {
3005
+ this.requestSlaSubscriber.stop();
3006
+ this.requestSlaSubscriber = null;
3007
+ }
3008
+ setRequestSlaSubscriber(null);
3009
+ // Pipeline-#4 follow-up: stop the decompose subscriber and clear
3010
+ // its module-level reference on the same shutdown window as SLA.
3011
+ if (this.requestDecomposeSubscriber) {
3012
+ this.requestDecomposeSubscriber.stop();
3013
+ this.requestDecomposeSubscriber = null;
3014
+ }
3015
+ setRequestDecomposeSubscriber(null);
3016
+ setRequestServiceEventBus(null);
3017
+ // Clean up event bus service
3018
+ this.eventBusService.cleanup();
3019
+ // Clean up schedulers
3020
+ this.schedulerService.cleanup();
3021
+ this.messageSchedulerService.cleanup();
3022
+ // Stop activity monitoring
3023
+ this.activityMonitorService.stopPolling();
3024
+ // Stop idle detection
3025
+ IdleDetectionService.getInstance().stop();
3026
+ // Stop agent heartbeat monitor
3027
+ AgentHeartbeatMonitorService.getInstance().stop();
3028
+ // Stop context window monitor
3029
+ ContextWindowMonitorService.getInstance().stop();
3030
+ // Stop OAuth relogin monitor
3031
+ OAuthReloginMonitorService.getInstance().destroy();
3032
+ // Stop orchestrator heartbeat monitor
3033
+ OrchestratorHeartbeatMonitorService.getInstance().stop();
3034
+ // Stop Crewly in Chrome WebSocket bridge
3035
+ try {
3036
+ const { BrowserBridgeService } = await import('./services/browser/browser-bridge.service.js');
3037
+ BrowserBridgeService.getInstance().stop();
3038
+ }
3039
+ catch {
3040
+ // May not have been initialized
3041
+ }
3042
+ // Disconnect BrowserProxyService from Cloud Relay
3043
+ try {
3044
+ const { BrowserProxyService } = await import('./services/browser/browser-proxy.service.js');
3045
+ BrowserProxyService.getInstance().disconnect();
3046
+ }
3047
+ catch {
3048
+ // May not have been initialized
3049
+ }
3050
+ // Stop team activity WebSocket service
3051
+ this.teamActivityWebSocketService.stop();
3052
+ // Stop teams.json file watcher
3053
+ this.teamsJsonWatcherService.stop();
3054
+ // Stop log rotation service
3055
+ LogRotationService.getInstance().stop();
3056
+ // Stop auditor scheduler
3057
+ AuditorSchedulerService.getInstance().stop();
3058
+ // Flush and shutdown OpenTelemetry tracing
3059
+ try {
3060
+ const { TracingService: TracingSvc } = await import('./services/core/tracing.service.js');
3061
+ await TracingSvc.getInstance().shutdown();
3062
+ }
3063
+ catch {
3064
+ // Ignore if not initialized
3065
+ }
3066
+ // Clean up tmux service resources
3067
+ this.tmuxService.destroy();
3068
+ // Stop Slack image cleanup timer
3069
+ try {
3070
+ const { getSlackImageService: getImgSvc } = await import('./services/slack/slack-image.service.js');
3071
+ getImgSvc().stopCleanup();
3072
+ }
3073
+ catch {
3074
+ // Ignore if not initialized
3075
+ }
3076
+ // Shutdown Slack integration
3077
+ this.logger.info('Shutting down Slack integration...');
3078
+ await shutdownSlack();
3079
+ // Shutdown WhatsApp integration
3080
+ this.logger.info('Shutting down WhatsApp integration...');
3081
+ await shutdownWhatsApp();
3082
+ // Shutdown Telegram integration
3083
+ this.logger.info('Shutting down Telegram integration...');
3084
+ await shutdownTelegram();
3085
+ // Note: Cloud Task Processor has been migrated to services/tasks/
3086
+ // Kill all tmux sessions
3087
+ const sessions = await this.tmuxService.listSessions();
3088
+ for (const session of sessions) {
3089
+ if (session.sessionName.startsWith('crewly_')) {
3090
+ await this.tmuxService.killSession(session.sessionName);
3091
+ }
3092
+ }
3093
+ // Close all socket.io connections
3094
+ this.logger.info('Closing WebSocket connections...');
3095
+ this.io.close();
3096
+ // Close HTTP server with timeout
3097
+ this.logger.info('Closing HTTP server...');
3098
+ await new Promise((resolve) => {
3099
+ this.httpServer.close(() => {
3100
+ this.logger.info('Server shut down gracefully');
3101
+ resolve();
3102
+ });
3103
+ // If server doesn't close in 3 seconds, continue anyway
3104
+ setTimeout(resolve, 3000);
3105
+ });
3106
+ clearTimeout(forceExitTimeout);
3107
+ process.exit(0);
3108
+ }
3109
+ catch (error) {
3110
+ this.logger.error('Error during shutdown', { error: error instanceof Error ? error.message : String(error) });
3111
+ clearTimeout(forceExitTimeout);
3112
+ process.exit(1);
3113
+ }
3114
+ }
3115
+ getConfig() {
3116
+ return { ...this.config };
3117
+ }
3118
+ }
3119
/**
 * Reports whether a script path points at this module's entry file
 * (`index.ts` or `index.js`).
 *
 * Both `/` and `\` are accepted as the separator in front of the file name,
 * so Windows-style paths are recognised as well as POSIX ones.
 *
 * @param {string | undefined} scriptPath - Usually `process.argv[1]`.
 * @returns {boolean} `true` only for a string ending in `<sep>index.ts` or `<sep>index.js`.
 */
function isServerEntryScript(scriptPath) {
    return typeof scriptPath === 'string' && /[\\/]index\.(?:ts|js)$/.test(scriptPath);
}
// Start server if this file is run directly
const isMainModule = isServerEntryScript(process.argv[1]);
if (isMainModule) {
    const server = new CrewlyServer();
    const logger = LoggerService.getInstance().createComponentLogger('CrewlyServer');
    // A failed start is logged and turned into a non-zero exit code.
    server.start().catch((error) => {
        logger.error('Failed to start Crewly server', { error: error instanceof Error ? error.message : String(error) });
        process.exit(1);
    });
}
3129
+ export default CrewlyServer;
3130
+ //# sourceMappingURL=index.js.map