crewly 1.8.9 → 1.8.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/constants.d.ts.map +1 -0
- package/config/index.d.ts.map +1 -0
- package/config/roles/_common/memory-instructions.md +6 -5
- package/config/roles/_common/wiki-instructions.md +49 -0
- package/config/roles/architect/prompt.md +2 -2
- package/config/roles/backend-developer/prompt.md +2 -2
- package/config/roles/designer/prompt.md +2 -2
- package/config/roles/developer/prompt.md +2 -2
- package/config/roles/frontend-developer/prompt.md +2 -2
- package/config/roles/fullstack-dev/prompt.md +2 -2
- package/config/roles/generalist/prompt.md +2 -2
- package/config/roles/ops/prompt.md +2 -2
- package/config/roles/orchestrator/prompt.md +135 -11
- package/config/roles/product-manager/prompt.md +2 -2
- package/config/roles/qa/prompt.md +2 -2
- package/config/roles/qa-engineer/prompt.md +2 -2
- package/config/roles/researcher/prompt.md +15 -6
- package/config/roles/sales/prompt.md +2 -2
- package/config/roles/support/prompt.md +2 -2
- package/config/roles/team-leader/prompt.md +17 -2
- package/config/roles/tpm/prompt.md +2 -2
- package/config/roles/ux-designer/prompt.md +2 -2
- package/config/skills/orchestrator/wiki-cleanup/SKILL.md +89 -0
- package/config/skills/orchestrator/wiki-cleanup/execute.sh +139 -0
- package/config/skills/orchestrator/wiki-lint/SKILL.md +75 -0
- package/config/skills/orchestrator/wiki-lint/execute.sh +66 -0
- package/config/skills/orchestrator/wiki-migrate/SKILL.md +103 -0
- package/config/skills/orchestrator/wiki-migrate/execute.sh +82 -0
- package/config/skills/orchestrator/wiki-process-queue/SKILL.md +9 -1
- package/dist/backend/backend/src/controllers/task-management/task-management.controller.d.ts +169 -0
- package/dist/backend/backend/src/controllers/task-management/task-management.controller.d.ts.map +1 -0
- package/dist/backend/backend/src/controllers/task-management/task-management.controller.js +1779 -0
- package/dist/backend/backend/src/controllers/task-management/task-management.controller.js.map +1 -0
- package/dist/backend/backend/src/controllers/task-pool/task-pool.controller.d.ts +18 -0
- package/dist/backend/backend/src/controllers/task-pool/task-pool.controller.d.ts.map +1 -1
- package/dist/backend/backend/src/controllers/task-pool/task-pool.controller.js +63 -0
- package/dist/backend/backend/src/controllers/task-pool/task-pool.controller.js.map +1 -1
- package/dist/backend/backend/src/controllers/task-pool/task-pool.routes.d.ts.map +1 -1
- package/dist/backend/backend/src/controllers/task-pool/task-pool.routes.js +5 -1
- package/dist/backend/backend/src/controllers/task-pool/task-pool.routes.js.map +1 -1
- package/dist/backend/backend/src/controllers/wiki/wiki.controller.d.ts +109 -0
- package/dist/backend/backend/src/controllers/wiki/wiki.controller.d.ts.map +1 -1
- package/dist/backend/backend/src/controllers/wiki/wiki.controller.js +418 -4
- package/dist/backend/backend/src/controllers/wiki/wiki.controller.js.map +1 -1
- package/dist/backend/backend/src/controllers/wiki/wiki.routes.d.ts.map +1 -1
- package/dist/backend/backend/src/controllers/wiki/wiki.routes.js +11 -1
- package/dist/backend/backend/src/controllers/wiki/wiki.routes.js.map +1 -1
- package/dist/backend/backend/src/index.d.ts.map +1 -1
- package/dist/backend/backend/src/index.js +64 -0
- package/dist/backend/backend/src/index.js.map +1 -1
- package/dist/backend/backend/src/index.js.orc-bak-20260529 +3130 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/agent-runner.service.d.ts +513 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/agent-runner.service.d.ts.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/agent-runner.service.js +1568 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/agent-runner.service.js.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/agent-worker.d.ts +86 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/agent-worker.d.ts.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/agent-worker.js +147 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/agent-worker.js.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/api-client.d.ts +68 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/api-client.d.ts.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/api-client.js +131 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/api-client.js.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/audit-log.service.d.ts +130 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/audit-log.service.d.ts.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/audit-log.service.js +263 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/audit-log.service.js.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/audit-trail.service.d.ts +74 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/audit-trail.service.d.ts.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/audit-trail.service.js +140 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/audit-trail.service.js.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/auditor-tools.d.ts +29 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/auditor-tools.d.ts.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/auditor-tools.js +279 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/auditor-tools.js.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/crewly-agent-runtime.service.d.ts +340 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/crewly-agent-runtime.service.d.ts.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/crewly-agent-runtime.service.js +1176 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/crewly-agent-runtime.service.js.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/deepseek-sse-transform.d.ts +79 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/deepseek-sse-transform.d.ts.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/deepseek-sse-transform.js +145 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/deepseek-sse-transform.js.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/env-isolation.service.d.ts +79 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/env-isolation.service.d.ts.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/env-isolation.service.js +218 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/env-isolation.service.js.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/index.d.ts +16 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/index.d.ts.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/index.js +16 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/index.js.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/mcp-tool-bridge.d.ts +135 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/mcp-tool-bridge.d.ts.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/mcp-tool-bridge.js +185 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/mcp-tool-bridge.js.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/model-manager.d.ts +141 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/model-manager.d.ts.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/model-manager.js +310 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/model-manager.js.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/output-filter.service.d.ts +91 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/output-filter.service.d.ts.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/output-filter.service.js +143 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/output-filter.service.js.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/prompt-guard.service.d.ts +103 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/prompt-guard.service.d.ts.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/prompt-guard.service.js +256 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/prompt-guard.service.js.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/rate-limiter.d.ts +143 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/rate-limiter.d.ts.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/rate-limiter.js +264 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/rate-limiter.js.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/smoke-test.d.ts +13 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/smoke-test.d.ts.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/smoke-test.js +91 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/smoke-test.js.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/tool-registry.d.ts +135 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/tool-registry.d.ts.map +1 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/tool-registry.js +1937 -0
- package/dist/backend/backend/src/services/agent/crewly-agent/tool-registry.js.map +1 -0
- package/dist/backend/backend/src/services/ai/prompt-builder.service.js +1 -1
- package/dist/backend/backend/src/services/autonomous/auto-assign.service.d.ts +429 -0
- package/dist/backend/backend/src/services/autonomous/auto-assign.service.d.ts.map +1 -0
- package/dist/backend/backend/src/services/autonomous/auto-assign.service.js +852 -0
- package/dist/backend/backend/src/services/autonomous/auto-assign.service.js.map +1 -0
- package/dist/backend/backend/src/services/project/task-tracking.service.d.ts +171 -0
- package/dist/backend/backend/src/services/project/task-tracking.service.d.ts.map +1 -0
- package/dist/backend/backend/src/services/project/task-tracking.service.js +725 -0
- package/dist/backend/backend/src/services/project/task-tracking.service.js.map +1 -0
- package/dist/backend/backend/src/services/reconciler/reconciler-data-provider.d.ts.map +1 -1
- package/dist/backend/backend/src/services/reconciler/reconciler-data-provider.js +50 -0
- package/dist/backend/backend/src/services/reconciler/reconciler-data-provider.js.map +1 -1
- package/dist/backend/backend/src/services/task-pool/task-pool.service.d.ts +19 -0
- package/dist/backend/backend/src/services/task-pool/task-pool.service.d.ts.map +1 -1
- package/dist/backend/backend/src/services/task-pool/task-pool.service.js +45 -0
- package/dist/backend/backend/src/services/task-pool/task-pool.service.js.map +1 -1
- package/dist/backend/backend/src/services/v3/agent-auto-claim.service.d.ts.map +1 -1
- package/dist/backend/backend/src/services/v3/agent-auto-claim.service.js +34 -1
- package/dist/backend/backend/src/services/v3/agent-auto-claim.service.js.map +1 -1
- package/dist/backend/backend/src/services/v3/project-task-watcher.service.d.ts +118 -0
- package/dist/backend/backend/src/services/v3/project-task-watcher.service.d.ts.map +1 -0
- package/dist/backend/backend/src/services/v3/project-task-watcher.service.js +326 -0
- package/dist/backend/backend/src/services/v3/project-task-watcher.service.js.map +1 -0
- package/dist/backend/backend/src/services/wiki/wiki-backlinks.service.d.ts +72 -0
- package/dist/backend/backend/src/services/wiki/wiki-backlinks.service.d.ts.map +1 -0
- package/dist/backend/backend/src/services/wiki/wiki-backlinks.service.js +186 -0
- package/dist/backend/backend/src/services/wiki/wiki-backlinks.service.js.map +1 -0
- package/dist/backend/backend/src/services/wiki/wiki-bookkeep-trigger.service.d.ts +4 -1
- package/dist/backend/backend/src/services/wiki/wiki-bookkeep-trigger.service.d.ts.map +1 -1
- package/dist/backend/backend/src/services/wiki/wiki-bookkeep-trigger.service.js +24 -1
- package/dist/backend/backend/src/services/wiki/wiki-bookkeep-trigger.service.js.map +1 -1
- package/dist/backend/backend/src/services/wiki/wiki-chat-subscriber.service.d.ts +74 -0
- package/dist/backend/backend/src/services/wiki/wiki-chat-subscriber.service.d.ts.map +1 -0
- package/dist/backend/backend/src/services/wiki/wiki-chat-subscriber.service.js +154 -0
- package/dist/backend/backend/src/services/wiki/wiki-chat-subscriber.service.js.map +1 -0
- package/dist/backend/backend/src/services/wiki/wiki-cleanup.service.d.ts +160 -0
- package/dist/backend/backend/src/services/wiki/wiki-cleanup.service.d.ts.map +1 -0
- package/dist/backend/backend/src/services/wiki/wiki-cleanup.service.js +399 -0
- package/dist/backend/backend/src/services/wiki/wiki-cleanup.service.js.map +1 -0
- package/dist/backend/backend/src/services/wiki/wiki-lint.service.d.ts +182 -0
- package/dist/backend/backend/src/services/wiki/wiki-lint.service.d.ts.map +1 -0
- package/dist/backend/backend/src/services/wiki/wiki-lint.service.js +505 -0
- package/dist/backend/backend/src/services/wiki/wiki-lint.service.js.map +1 -0
- package/dist/backend/backend/src/services/wiki/wiki-migrate.service.d.ts +232 -0
- package/dist/backend/backend/src/services/wiki/wiki-migrate.service.d.ts.map +1 -0
- package/dist/backend/backend/src/services/wiki/wiki-migrate.service.js +1416 -0
- package/dist/backend/backend/src/services/wiki/wiki-migrate.service.js.map +1 -0
- package/dist/backend/backend/src/services/wiki/wiki-recent.service.d.ts +51 -0
- package/dist/backend/backend/src/services/wiki/wiki-recent.service.d.ts.map +1 -0
- package/dist/backend/backend/src/services/wiki/wiki-recent.service.js +102 -0
- package/dist/backend/backend/src/services/wiki/wiki-recent.service.js.map +1 -0
- package/dist/backend/backend/src/services/wiki/wiki-reflect-trigger.service.d.ts +84 -0
- package/dist/backend/backend/src/services/wiki/wiki-reflect-trigger.service.d.ts.map +1 -0
- package/dist/backend/backend/src/services/wiki/wiki-reflect-trigger.service.js +156 -0
- package/dist/backend/backend/src/services/wiki/wiki-reflect-trigger.service.js.map +1 -0
- package/dist/backend/backend/src/services/wiki/wiki-search.service.d.ts +90 -0
- package/dist/backend/backend/src/services/wiki/wiki-search.service.d.ts.map +1 -0
- package/dist/backend/backend/src/services/wiki/wiki-search.service.js +190 -0
- package/dist/backend/backend/src/services/wiki/wiki-search.service.js.map +1 -0
- package/dist/backend/backend/src/services/wiki/wiki-workitem-bridge.service.d.ts +164 -0
- package/dist/backend/backend/src/services/wiki/wiki-workitem-bridge.service.d.ts.map +1 -0
- package/dist/backend/backend/src/services/wiki/wiki-workitem-bridge.service.js +675 -0
- package/dist/backend/backend/src/services/wiki/wiki-workitem-bridge.service.js.map +1 -0
- package/dist/backend/backend/src/services/workflow/cron-task.service.d.ts.map +1 -1
- package/dist/backend/backend/src/services/workflow/cron-task.service.js +65 -0
- package/dist/backend/backend/src/services/workflow/cron-task.service.js.map +1 -1
- package/dist/backend/backend/src/types/auto-assign.types.d.ts +271 -0
- package/dist/backend/backend/src/types/auto-assign.types.d.ts.map +1 -0
- package/dist/backend/backend/src/types/auto-assign.types.js +136 -0
- package/dist/backend/backend/src/types/auto-assign.types.js.map +1 -0
- package/dist/backend/backend/src/types/cron-task.types.d.ts +16 -1
- package/dist/backend/backend/src/types/cron-task.types.d.ts.map +1 -1
- package/dist/backend/backend/src/utils/esm-require.utils.d.ts +111 -0
- package/dist/backend/backend/src/utils/esm-require.utils.d.ts.map +1 -0
- package/dist/backend/backend/src/utils/esm-require.utils.js +124 -0
- package/dist/backend/backend/src/utils/esm-require.utils.js.map +1 -0
- package/dist/cli/backend/src/services/ai/prompt-modules/prompt-module.interface.d.ts +220 -0
- package/dist/cli/backend/src/services/ai/prompt-modules/prompt-module.interface.d.ts.map +1 -0
- package/dist/cli/backend/src/services/ai/prompt-modules/prompt-module.interface.js +37 -0
- package/dist/cli/backend/src/services/ai/prompt-modules/prompt-module.interface.js.map +1 -0
- package/dist/cli/backend/src/services/knowledge/fts5-search-strategy.d.ts +56 -0
- package/dist/cli/backend/src/services/knowledge/fts5-search-strategy.d.ts.map +1 -0
- package/dist/cli/backend/src/services/knowledge/fts5-search-strategy.js +91 -0
- package/dist/cli/backend/src/services/knowledge/fts5-search-strategy.js.map +1 -0
- package/dist/cli/backend/src/services/knowledge/learnings-index.service.d.ts +159 -0
- package/dist/cli/backend/src/services/knowledge/learnings-index.service.d.ts.map +1 -0
- package/dist/cli/backend/src/services/knowledge/learnings-index.service.js +304 -0
- package/dist/cli/backend/src/services/knowledge/learnings-index.service.js.map +1 -0
- package/dist/cli/backend/src/services/knowledge/wiki-compiler.service.d.ts +115 -0
- package/dist/cli/backend/src/services/knowledge/wiki-compiler.service.d.ts.map +1 -0
- package/dist/cli/backend/src/services/knowledge/wiki-compiler.service.js +215 -0
- package/dist/cli/backend/src/services/knowledge/wiki-compiler.service.js.map +1 -0
- package/dist/cli/backend/src/services/memory/embedding-provider.d.ts +78 -0
- package/dist/cli/backend/src/services/memory/embedding-provider.d.ts.map +1 -0
- package/dist/cli/backend/src/services/memory/embedding-provider.js +179 -0
- package/dist/cli/backend/src/services/memory/embedding-provider.js.map +1 -0
- package/dist/cli/backend/src/services/memory/vector-store.service.d.ts +331 -0
- package/dist/cli/backend/src/services/memory/vector-store.service.d.ts.map +1 -0
- package/dist/cli/backend/src/services/memory/vector-store.service.js +814 -0
- package/dist/cli/backend/src/services/memory/vector-store.service.js.map +1 -0
- package/dist/cli/backend/src/services/project/task-tracking.service.d.ts +171 -0
- package/dist/cli/backend/src/services/project/task-tracking.service.d.ts.map +1 -0
- package/dist/cli/backend/src/services/project/task-tracking.service.js +725 -0
- package/dist/cli/backend/src/services/project/task-tracking.service.js.map +1 -0
- package/dist/cli/backend/src/services/task-pool/task-pool.service.d.ts +19 -0
- package/dist/cli/backend/src/services/task-pool/task-pool.service.d.ts.map +1 -1
- package/dist/cli/backend/src/services/task-pool/task-pool.service.js +45 -0
- package/dist/cli/backend/src/services/task-pool/task-pool.service.js.map +1 -1
- package/dist/cli/backend/src/types/auto-assign.types.d.ts +271 -0
- package/dist/cli/backend/src/types/auto-assign.types.d.ts.map +1 -0
- package/dist/cli/backend/src/types/auto-assign.types.js +136 -0
- package/dist/cli/backend/src/types/auto-assign.types.js.map +1 -0
- package/dist/cli/cli/src/index.js +0 -0
- package/frontend/dist/assets/{index-db3f5041.css → index-068bb4f6.css} +10 -1
- package/frontend/dist/assets/index-c24ceb15.js +4960 -0
- package/frontend/dist/index.html +2 -2
- package/package.json +1 -1
- package/config/skills/agent/core/query-knowledge/SKILL.md +0 -87
- package/config/skills/agent/core/query-knowledge/execute.sh +0 -30
- package/config/skills/orchestrator/query-knowledge/SKILL.md +0 -75
- package/config/skills/orchestrator/query-knowledge/execute.sh +0 -30
- package/frontend/dist/assets/index-cc115bb4.js +0 -4926
|
@@ -0,0 +1,3130 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
// Load environment variables from .env file BEFORE any other imports
|
|
3
|
+
// This ensures env vars are available when services initialize
|
|
4
|
+
import dotenv from 'dotenv';
|
|
5
|
+
import path from 'path';
|
|
6
|
+
// Load .env from project root
|
|
7
|
+
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
|
8
|
+
import express from 'express';
|
|
9
|
+
import { createServer } from 'http';
|
|
10
|
+
import { Server as SocketIOServer } from 'socket.io';
|
|
11
|
+
import cors from 'cors';
|
|
12
|
+
import helmet from 'helmet';
|
|
13
|
+
import morgan from 'morgan';
|
|
14
|
+
import os from 'os';
|
|
15
|
+
import { fileURLToPath } from 'url';
|
|
16
|
+
import { StorageService, TmuxService, SchedulerService, MessageSchedulerService, ActivityMonitorService, TeamActivityWebSocketService, TeamsJsonWatcherService, } from './services/index.js';
|
|
17
|
+
import { getSessionBackend, getSessionBackendSync, getSessionStatePersistence, destroySessionBackend, PtySessionBackend, } from './services/session/index.js';
|
|
18
|
+
import { ApiController } from './controllers/api.controller.js';
|
|
19
|
+
import { createApiRoutes } from './routes/api.routes.js';
|
|
20
|
+
import { TerminalGateway, setTerminalGateway } from './websocket/terminal.gateway.js';
|
|
21
|
+
import { initializeChatGateway } from './websocket/chat.gateway.js';
|
|
22
|
+
import { LoggerService } from './services/core/logger.service.js';
|
|
23
|
+
import { CREWLY_CONSTANTS, ORCHESTRATOR_SESSION_NAME, ORCHESTRATOR_ROLE, ORCHESTRATOR_WINDOW_NAME, MESSAGE_QUEUE_CONSTANTS, RUNTIME_TYPES, AUDITOR_CONSTANTS, AUDITOR_SCHEDULER_CONSTANTS, } from './constants.js';
|
|
24
|
+
import { getSettingsService } from './services/settings/index.js';
|
|
25
|
+
import { MemoryService } from './services/memory/memory.service.js';
|
|
26
|
+
import { getImprovementStartupService } from './services/orchestrator/improvement-startup.service.js';
|
|
27
|
+
import { initializeSlackIfConfigured, shutdownSlack } from './services/slack/index.js';
|
|
28
|
+
import { resolveTeamByIdOrSlug, slugifyTeamName } from './services/workflow/team-identifier-resolver.js';
|
|
29
|
+
import { initializeWhatsAppIfConfigured, shutdownWhatsApp } from './services/whatsapp/index.js';
|
|
30
|
+
import { initializeGoogleChatIfConfigured } from './services/messaging/google-chat-initializer.js';
|
|
31
|
+
import { initializeTelegramIfConfigured, shutdownTelegram } from './services/telegram/index.js';
|
|
32
|
+
import { initializeCloudIfConfigured } from './services/cloud/cloud-initializer.js';
|
|
33
|
+
import { MessageQueueService, QueueProcessorService, ResponseRouterService } from './services/messaging/index.js';
|
|
34
|
+
import { ThreadStatusQueueService } from './services/messaging/thread-status-queue.service.js';
|
|
35
|
+
import { EventBusService } from './services/event-bus/index.js';
|
|
36
|
+
import { EventToWorkItemBridge } from './services/event-bus/event-to-workitem-bridge.service.js';
|
|
37
|
+
import { AutoLearningSubscriber } from './services/memory/auto-learning.subscriber.js';
|
|
38
|
+
import { MilestoneNotificationSubscriber } from './services/notification/milestone-notification.subscriber.js';
|
|
39
|
+
import { RequestSlaSubscriber, setRequestSlaSubscriber, } from './services/v3/request-sla.subscriber.js';
|
|
40
|
+
import { RequestDecomposeSubscriber, setRequestDecomposeSubscriber, } from './services/v3/request-decompose.subscriber.js';
|
|
41
|
+
import { RequestStatusUpdateSubscriber } from './services/v3/request-status-update.subscriber.js';
|
|
42
|
+
import { RequestCascadeSubscriber } from './services/v3/request-cascade.subscriber.js';
|
|
43
|
+
import { setRequestServiceEventBus, RequestService } from './services/v3/request.service.js';
|
|
44
|
+
import { getSlackService } from './services/slack/slack.service.js';
|
|
45
|
+
import { SlackThreadStoreService, setSlackThreadStore, getSlackThreadStore } from './services/slack/slack-thread-store.service.js';
|
|
46
|
+
import { GoogleChatThreadStoreService, setGchatThreadStore } from './services/messaging/gchat-thread-store.service.js';
|
|
47
|
+
import { SlackImageService, setSlackImageService } from './services/slack/slack-image.service.js';
|
|
48
|
+
import { NotifyReconciliationService } from './services/slack/notify-reconciliation.service.js';
|
|
49
|
+
import { setEventBusService as setEventBusControllerService } from './controllers/event-bus/event-bus.controller.js';
|
|
50
|
+
import { setTeamControllerEventBusService } from './controllers/team/team.controller.js';
|
|
51
|
+
import { SkillCatalogService } from './services/skill/skill-catalog.service.js';
|
|
52
|
+
import { setMessageQueueService as setChatMessageQueueService, setThreadStatusQueueService as setChatThreadStatusQueueService } from './controllers/chat/chat.controller.js';
|
|
53
|
+
import { setMessageQueueService as setMessagingControllerQueueService } from './controllers/messaging/messaging.controller.js';
|
|
54
|
+
import { SystemResourceAlertService } from './services/monitoring/system-resource-alert.service.js';
|
|
55
|
+
import { TokenUsageService } from './services/monitoring/token-usage.service.js';
|
|
56
|
+
import { agentHeartbeatMiddleware } from './middleware/agent-heartbeat.middleware.js';
|
|
57
|
+
import { RedisCacheService } from './services/cache/redis-cache.service.js';
|
|
58
|
+
import { OrchestratorRestartService } from './services/orchestrator/orchestrator-restart.service.js';
|
|
59
|
+
import { setOrchestratorSetupDependencies } from './services/orchestrator/orchestrator-setup.service.js';
|
|
60
|
+
import { IdleDetectionService } from './services/agent/idle-detection.service.js';
|
|
61
|
+
import { AgentSuspendService } from './services/agent/agent-suspend.service.js';
|
|
62
|
+
import { AgentHeartbeatMonitorService } from './services/agent/agent-heartbeat-monitor.service.js';
|
|
63
|
+
import { OrchestratorHeartbeatMonitorService } from './services/orchestrator/orchestrator-heartbeat-monitor.service.js';
|
|
64
|
+
import { RuntimeExitMonitorService } from './services/agent/runtime-exit-monitor.service.js';
|
|
65
|
+
import { ContextWindowMonitorService } from './services/agent/context-window-monitor.service.js';
|
|
66
|
+
import { OAuthReloginMonitorService } from './services/agent/oauth-relogin-monitor.service.js';
|
|
67
|
+
import { findPackageRoot } from './utils/package-root.js';
|
|
68
|
+
import { isNativeBindingFatalError } from './utils/native-binding.utils.js';
|
|
69
|
+
import { VersionCheckService } from './services/system/version-check.service.js';
|
|
70
|
+
import { LogRotationService } from './services/session/log-rotation.service.js';
|
|
71
|
+
import { AuditorSchedulerService } from './services/agent/auditor-scheduler.service.js';
|
|
72
|
+
import { setAuditorSchedulerService } from './controllers/auditor/auditor.controller.js';
|
|
73
|
+
import { AddonLoaderService } from './services/addon/addon-loader.service.js';
|
|
74
|
+
import { CronTaskService } from './services/workflow/cron-task.service.js';
|
|
75
|
+
import { ReconcilerService } from './services/reconciler/reconciler.service.js';
|
|
76
|
+
import { LiveReconcilerDataProvider } from './services/reconciler/reconciler-data-provider.js';
|
|
77
|
+
import { setReconcilerService } from './controllers/reconciler/reconciler.controller.js';
|
|
78
|
+
import { FissionGuardService } from './services/fission/fission-guard.service.js';
|
|
79
|
+
import { setFissionGuardService } from './controllers/fission/fission.controller.js';
|
|
80
|
+
import { TaskPoolService } from './services/task-pool/task-pool.service.js';
|
|
81
|
+
import { ProjectMemoryService } from './services/memory/project-memory.service.js';
|
|
82
|
+
import { TaskHistorySubscriber } from './services/memory/task-history.subscriber.js';
|
|
83
|
+
import { TeamHealthWatchdogService, LiveTeamHealthDataProvider, loadTeamHealthConfig, setTeamHealthWatchdogSingleton, getTeamHealthWatchdogSingleton, } from './services/team-health/index.js';
|
|
84
|
+
// ES modules do not expose the CommonJS __filename/__dirname globals, so
// rebuild both from this module's own URL (import.meta.url).
|
|
85
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
86
|
+
const __dirname = path.dirname(__filename);
|
|
87
|
+
/**
 * Safely parses a base-10 integer from a configuration value, falling back to
 * a default when the value is missing or not a usable number.
 *
 * Behavior:
 * - `undefined`, `null` and `''` are treated as "not configured" and return
 *   `defaultValue` without logging.
 * - Values that do not parse to a finite integer log a warning and return
 *   `defaultValue`.
 * - Values that parse but contain extra characters (e.g. `"3000abc"`) log a
 *   warning and return the parsed prefix.
 *
 * @param value - The value to parse (normally a string from `process.env`), or undefined
 * @param defaultValue - The value to return if `value` is missing or invalid
 * @param envVarName - Optional name of the environment variable, used only for logging
 * @returns The parsed integer, or `defaultValue` if parsing fails
 */
function parseIntWithFallback(value, defaultValue, envVarName) {
    if (value === undefined || value === null || value === '') {
        return defaultValue;
    }
    // Coerce once so a non-string input (e.g. a number) cannot throw on .trim().
    const raw = String(value);
    const parsed = Number.parseInt(raw, 10);
    // The logger is only needed on the warning paths, so create it lazily.
    const getLogger = () => LoggerService.getInstance().createComponentLogger('ConfigParser');
    // Number.isFinite rejects NaN (unparseable input) as well as +/-Infinity
    // (digit strings too large to represent).
    if (!Number.isFinite(parsed)) {
        getLogger().warn('Invalid numeric environment variable value, using default', {
            envVar: envVarName,
            value,
            defaultValue,
        });
        return defaultValue;
    }
    // parseInt silently ignores trailing non-numeric characters ("3000abc" -> 3000);
    // surface that to the operator but still use the parsed prefix.
    if (String(parsed) !== raw.trim()) {
        getLogger().warn('Environment variable contains non-numeric characters, using parsed value', {
            envVar: envVarName,
            originalValue: value,
            parsedValue: parsed,
        });
    }
    return parsed;
}
|
|
122
|
+
export class CrewlyServer {
|
|
123
|
+
app;
|
|
124
|
+
httpServer;
|
|
125
|
+
io;
|
|
126
|
+
config;
|
|
127
|
+
logger = LoggerService.getInstance().createComponentLogger('CrewlyServer');
|
|
128
|
+
storageService;
|
|
129
|
+
tmuxService;
|
|
130
|
+
schedulerService;
|
|
131
|
+
messageSchedulerService;
|
|
132
|
+
activityMonitorService;
|
|
133
|
+
teamActivityWebSocketService;
|
|
134
|
+
teamsJsonWatcherService;
|
|
135
|
+
apiController;
|
|
136
|
+
terminalGateway;
|
|
137
|
+
messageQueueService;
|
|
138
|
+
queueProcessorService;
|
|
139
|
+
threadStatusQueueService;
|
|
140
|
+
eventBusService;
|
|
141
|
+
/** BRIDGE-1: subscribes to autonomy events and creates WorkItems. */
|
|
142
|
+
eventToWorkItemBridge = null;
|
|
143
|
+
/** LEARN-1: subscribes to terminal task / mission:replanned events and auto-records learnings. */
|
|
144
|
+
autoLearningSubscriber = null;
|
|
145
|
+
// DF-1 #438 — symmetric to AutoLearningSubscriber; surfaces milestones
|
|
146
|
+
// to orc's chat queue.
|
|
147
|
+
milestoneNotificationSubscriber = null;
|
|
148
|
+
/** INBOUND-1: subscribes to request:created and tracks 5/10 min SLA on respond_to_user WIs. */
|
|
149
|
+
requestSlaSubscriber = null;
|
|
150
|
+
/** Pipeline-#4 follow-up: subscribes to request:created and auto-decomposes actionable L2 Requests via plan() → addToPool. */
|
|
151
|
+
requestDecomposeSubscriber = null;
|
|
152
|
+
requestStatusUpdateSubscriber = null;
|
|
153
|
+
requestCascadeSubscriber = null;
|
|
154
|
+
notifyReconciliationService;
|
|
155
|
+
systemResourceAlertService;
|
|
156
|
+
reconcilerService = null;
|
|
157
|
+
teamHealthWatchdog = null;
|
|
158
|
+
// Chat MVP Phase 1 — initialized lazily in `start()` after the HTTP
|
|
159
|
+
// server is created. Kept as fields so the shutdown path can close
|
|
160
|
+
// them cleanly and tests can reach in with a reference.
|
|
161
|
+
chatV2Gateway = null;
|
|
162
|
+
chatV2Dispatcher = null;
|
|
163
|
+
// Shutdown state
|
|
164
|
+
isShuttingDown = false;
|
|
165
|
+
healthMonitoringInterval = null;
|
|
166
|
+
constructor(config) {
|
|
167
|
+
// Resolve ~ to actual home directory
|
|
168
|
+
const resolveHomePath = (inputPath) => {
|
|
169
|
+
if (inputPath.startsWith('~/')) {
|
|
170
|
+
return path.join(os.homedir(), inputPath.slice(2));
|
|
171
|
+
}
|
|
172
|
+
if (inputPath === '~') {
|
|
173
|
+
return os.homedir();
|
|
174
|
+
}
|
|
175
|
+
return inputPath;
|
|
176
|
+
};
|
|
177
|
+
const defaultAgentmuxHome = config?.crewlyHome || process.env.CREWLY_HOME || '~/.crewly';
|
|
178
|
+
this.config = {
|
|
179
|
+
webPort: config?.webPort || parseIntWithFallback(process.env.WEB_PORT, 8787, 'WEB_PORT'),
|
|
180
|
+
crewlyHome: resolveHomePath(defaultAgentmuxHome),
|
|
181
|
+
defaultCheckInterval: config?.defaultCheckInterval ||
|
|
182
|
+
parseIntWithFallback(process.env.DEFAULT_CHECK_INTERVAL, 30, 'DEFAULT_CHECK_INTERVAL'),
|
|
183
|
+
autoCommitInterval: config?.autoCommitInterval || parseIntWithFallback(process.env.AUTO_COMMIT_INTERVAL, 30, 'AUTO_COMMIT_INTERVAL'),
|
|
184
|
+
headless: config?.headless ?? process.env.CREWLY_HEADLESS === 'true',
|
|
185
|
+
};
|
|
186
|
+
this.app = express();
|
|
187
|
+
this.httpServer = createServer(this.app);
|
|
188
|
+
this.io = new SocketIOServer(this.httpServer, {
|
|
189
|
+
cors: {
|
|
190
|
+
origin: process.env.NODE_ENV === 'production'
|
|
191
|
+
? ['https://crewlyai.com', 'https://www.crewlyai.com']
|
|
192
|
+
: '*',
|
|
193
|
+
methods: ['GET', 'POST'],
|
|
194
|
+
},
|
|
195
|
+
// Configure ping/pong to keep connections alive
|
|
196
|
+
pingInterval: 10000, // Send ping every 10 seconds
|
|
197
|
+
pingTimeout: 5000, // Wait 5 seconds for pong response
|
|
198
|
+
// Prefer WebSocket transport for lower latency
|
|
199
|
+
transports: ['websocket', 'polling'],
|
|
200
|
+
// Allow transport upgrade from polling to websocket
|
|
201
|
+
allowUpgrades: true,
|
|
202
|
+
// Increase buffer size for large terminal output
|
|
203
|
+
maxHttpBufferSize: 5 * 1024 * 1024, // 5MB
|
|
204
|
+
perMessageDeflate: false,
|
|
205
|
+
// CRITICAL: Prevent Engine.IO from destroying non-matching upgrade requests.
|
|
206
|
+
// Crewly in Chrome (BrowserBridgeService) shares this httpServer and handles /ws/browser upgrades.
|
|
207
|
+
// Without this, Engine.IO sets a 1-second timer to socket.end() any upgrade
|
|
208
|
+
// that doesn't match /socket.io/ — killing Crewly in Chrome connections before
|
|
209
|
+
// any data is exchanged (manifests as "Invalid frame header" errors).
|
|
210
|
+
destroyUpgrade: false,
|
|
211
|
+
});
|
|
212
|
+
this.initializeServices();
|
|
213
|
+
this.configureMiddleware();
|
|
214
|
+
this.configureRoutes();
|
|
215
|
+
this.configureWebSocket();
|
|
216
|
+
}
|
|
217
|
+
initializeServices() {
|
|
218
|
+
this.storageService = StorageService.getInstance(this.config.crewlyHome);
|
|
219
|
+
this.tmuxService = new TmuxService();
|
|
220
|
+
this.schedulerService = new SchedulerService(this.storageService);
|
|
221
|
+
this.messageSchedulerService = new MessageSchedulerService(this.tmuxService, this.storageService);
|
|
222
|
+
this.activityMonitorService = ActivityMonitorService.getInstance();
|
|
223
|
+
// V3-only as of spec 2026-05-06-task-management-v1-deprecation.md.
|
|
224
|
+
// TaskTrackingService is deleted; in-progress task data and lifecycle
|
|
225
|
+
// events come from TaskPoolService + EventBusService respectively.
|
|
226
|
+
this.teamActivityWebSocketService = new TeamActivityWebSocketService(this.storageService, this.tmuxService);
|
|
227
|
+
this.teamsJsonWatcherService = new TeamsJsonWatcherService();
|
|
228
|
+
this.apiController = new ApiController(this.storageService, this.tmuxService, this.schedulerService, this.messageSchedulerService);
|
|
229
|
+
// Wire up reliable delivery: both schedulers use AgentRegistrationService
|
|
230
|
+
// for retry + progressive verification + background stuck-detection
|
|
231
|
+
this.messageSchedulerService.setAgentRegistrationService(this.apiController.agentRegistrationService);
|
|
232
|
+
this.schedulerService.setAgentRegistrationService(this.apiController.agentRegistrationService);
|
|
233
|
+
// Initialize message queue services (with disk persistence)
|
|
234
|
+
// NOTE: Must be created before services that depend on them (scheduler, thread status queue)
|
|
235
|
+
this.messageQueueService = new MessageQueueService(this.config.crewlyHome);
|
|
236
|
+
const responseRouter = new ResponseRouterService();
|
|
237
|
+
this.queueProcessorService = new QueueProcessorService(this.messageQueueService, responseRouter, this.apiController.agentRegistrationService);
|
|
238
|
+
// Initialize event bus service for agent lifecycle pub/sub
|
|
239
|
+
// NOTE: Must be created before services that depend on it (agent registration, thread status queue)
|
|
240
|
+
this.eventBusService = new EventBusService();
|
|
241
|
+
this.eventBusService.setMessageQueueService(this.messageQueueService);
|
|
242
|
+
// Now wire services that depend on messageQueueService and eventBusService
|
|
243
|
+
this.schedulerService.setMessageQueueService(this.messageQueueService);
|
|
244
|
+
this.schedulerService.setActivityMonitor(this.activityMonitorService);
|
|
245
|
+
// #167: Wire scheduler into agent registration for DLQ drain on activation
|
|
246
|
+
this.apiController.agentRegistrationService.setSchedulerService(this.schedulerService);
|
|
247
|
+
// Architecture Upgrade Phase 6: Wire EventBusService for standing task subscriptions
|
|
248
|
+
this.apiController.agentRegistrationService.setEventBusService(this.eventBusService);
|
|
249
|
+
this.terminalGateway = new TerminalGateway(this.io);
|
|
250
|
+
// Set terminal gateway singleton for chat integration
|
|
251
|
+
setTerminalGateway(this.terminalGateway);
|
|
252
|
+
// Initialize ChatGateway for chat message forwarding
|
|
253
|
+
// This sets up the event listeners that forward chat messages to WebSocket clients
|
|
254
|
+
initializeChatGateway(this.io).catch((error) => {
|
|
255
|
+
this.logger.error('Failed to initialize ChatGateway', {
|
|
256
|
+
error: error instanceof Error ? error.message : String(error),
|
|
257
|
+
});
|
|
258
|
+
});
|
|
259
|
+
// Connect WebSocket service to terminal gateway for broadcasting
|
|
260
|
+
this.teamActivityWebSocketService.setTerminalGateway(this.terminalGateway);
|
|
261
|
+
// Connect teams.json watcher to team activity service for real-time updates
|
|
262
|
+
this.teamsJsonWatcherService.setTeamActivityService(this.teamActivityWebSocketService);
|
|
263
|
+
// Initialize thread status queue for tracking inbound message lifecycle
|
|
264
|
+
this.threadStatusQueueService = new ThreadStatusQueueService(this.config.crewlyHome);
|
|
265
|
+
responseRouter.setThreadStatusQueue(this.threadStatusQueueService);
|
|
266
|
+
this.queueProcessorService.setThreadStatusQueue(this.threadStatusQueueService);
|
|
267
|
+
// INBOUND-1.f1: Wire EventBus into the TaskPool singleton so addToPool
|
|
268
|
+
// can publish `workitem:queued` events. Must run before any code path
|
|
269
|
+
// triggers addToPool — the slack listener / TaskPool router below both
|
|
270
|
+
// depend on this for the auto-close path b chain. Idempotent.
|
|
271
|
+
TaskPoolService.getInstance().setEventBusService(this.eventBusService);
|
|
272
|
+
// Memory: TaskHistorySubscriber listens on the bus for
|
|
273
|
+
// task:done_by_worker / task:rejected / task:cancelled and writes
|
|
274
|
+
// the resulting TaskHistoryEntry into ProjectMemory. This is the
|
|
275
|
+
// load-bearing write path behind "who in my team has done X?" —
|
|
276
|
+
// the orchestrator queries via recall(capability:...). Must run
|
|
277
|
+
// AFTER TaskPoolService is wired to the bus (above) so the events
|
|
278
|
+
// it publishes have a subscriber to consume them.
|
|
279
|
+
const taskHistorySubscriber = new TaskHistorySubscriber({
|
|
280
|
+
eventBus: this.eventBusService,
|
|
281
|
+
projectMemoryService: ProjectMemoryService.getInstance(),
|
|
282
|
+
taskPoolService: TaskPoolService.getInstance(),
|
|
283
|
+
});
|
|
284
|
+
taskHistorySubscriber.start();
|
|
285
|
+
// P1 Bug B (Pool umbrella WI 72ca743a): Wire RequestService into the
|
|
286
|
+
// TaskPool singleton so addToPool intrinsically links new WIs into
|
|
287
|
+
// their parent Request.workItemIds[] — independent of the
|
|
288
|
+
// subscriber-driven path (request-sla.subscriber, V3DataService).
|
|
289
|
+
// Pre-fix, manual / programmatic / cron callers that bypassed the
|
|
290
|
+
// event chain left Requests with empty workItemIds[]. The linker is
|
|
291
|
+
// idempotent (request.service.ts:328 short-circuits on duplicate id)
|
|
292
|
+
// so subscriber-driven linking stays as belt-and-suspenders.
|
|
293
|
+
TaskPoolService.getInstance().setRequestService(RequestService.getInstance());
|
|
294
|
+
// P1 Bug C (Pool umbrella WI 72ca743a, sub-WI Bug C): Wire the inverse
|
|
295
|
+
// dependency — RequestService → TaskPool — so RequestService.update
|
|
296
|
+
// can refuse `Request → done` when any child WorkItem is still in a
|
|
297
|
+
// non-terminal state. Bug B (above) makes Request.workItemIds[]
|
|
298
|
+
// authoritative on every addToPool; Bug C makes the closure honor
|
|
299
|
+
// that data. The setter is duck-typed on IWorkItemQueryable so
|
|
300
|
+
// neither side needs a static import of the other.
|
|
301
|
+
RequestService.getInstance().setTaskPoolService(TaskPoolService.getInstance());
|
|
302
|
+
// Atlas 2026-05-23 fix: wire the agent-liveness gate so claimFromPool /
|
|
303
|
+
// claimSpecificItem refuse to put a WI into `running` when the requesting
|
|
304
|
+
// agent's session is dead. delegate-task's "self-heal fix #1" used to
|
|
305
|
+
// pre-claim WIs for inactive targets, which short-circuited the
|
|
306
|
+
// reconciler's wake-rule and left WIs blocked indefinitely. With this
|
|
307
|
+
// probe wired, rejected pre-claims leave the WI in `queued` so the
|
|
308
|
+
// reconciler can fire detectUnclaimedTasks → start-agent → the agent
|
|
309
|
+
// auto-claims when it boots. The probe is the same lightweight check
|
|
310
|
+
// (PTY session exists + child process alive) used by chat-v2 and slack.
|
|
311
|
+
// Wrapped in async-IIFE because initializeServices() is sync.
|
|
312
|
+
void (async () => {
|
|
313
|
+
const { isAgentActive } = await import('./services/orchestrator/orchestrator-status.service.js');
|
|
314
|
+
TaskPoolService.getInstance().setIsAgentActive(isAgentActive);
|
|
315
|
+
})();
|
|
316
|
+
// Wire Task Pool router so [TASK]-prefixed messages route through the pool
|
|
317
|
+
this.queueProcessorService.setTaskPoolRouter(async (messageContent, targetSession) => {
|
|
318
|
+
const { createWorkItem } = await import('./types/v2/work-item.types.js');
|
|
319
|
+
const taskPool = TaskPoolService.getInstance();
|
|
320
|
+
const workItem = createWorkItem({
|
|
321
|
+
type: 'delegate',
|
|
322
|
+
owner: 'orchestrator',
|
|
323
|
+
target: targetSession,
|
|
324
|
+
title: messageContent.slice(0, 100),
|
|
325
|
+
description: messageContent,
|
|
326
|
+
});
|
|
327
|
+
await taskPool.addToPool(workItem);
|
|
328
|
+
const claimed = await taskPool.claimFromPool(targetSession);
|
|
329
|
+
return claimed !== null;
|
|
330
|
+
});
|
|
331
|
+
// Wire thread status queue with scheduler and event bus for follow-up tracking
|
|
332
|
+
this.threadStatusQueueService.setSchedulerService(this.schedulerService);
|
|
333
|
+
this.threadStatusQueueService.setEventBusService(this.eventBusService);
|
|
334
|
+
// Wire queue service into controllers
|
|
335
|
+
setChatMessageQueueService(this.messageQueueService);
|
|
336
|
+
setChatThreadStatusQueueService(this.threadStatusQueueService);
|
|
337
|
+
setMessagingControllerQueueService(this.messageQueueService);
|
|
338
|
+
// LLM-wiki bookkeep trigger (Steve 2026-05-22 design point #5):
|
|
339
|
+
// every 30 minutes (configurable via CREWLY_WIKI_BOOKKEEP_INTERVAL_MS),
|
|
340
|
+
// scan every known vault. When recentMd ≥ threshold OR there are
|
|
341
|
+
// duplicate clusters, enqueue a [BOOKKEEP] message to ORC so it can
|
|
342
|
+
// run wiki-bookkeep + decide which pages to consolidate.
|
|
343
|
+
void (async () => {
|
|
344
|
+
try {
|
|
345
|
+
const { WikiBookkeepTriggerService } = await import('./services/wiki/wiki-bookkeep-trigger.service.js');
|
|
346
|
+
const intervalMs = Number(process.env['CREWLY_WIKI_BOOKKEEP_INTERVAL_MS'] ?? 30 * 60 * 1000);
|
|
347
|
+
const debounceMs = Number(process.env['CREWLY_WIKI_BOOKKEEP_DEBOUNCE_MS'] ?? 6 * 3600 * 1000);
|
|
348
|
+
const trigger = new WikiBookkeepTriggerService({
|
|
349
|
+
intervalMs,
|
|
350
|
+
debounceMs,
|
|
351
|
+
fireFn: async (vaultPath, report) => {
|
|
352
|
+
if (!this.messageQueueService)
|
|
353
|
+
return;
|
|
354
|
+
const summary = `[BOOKKEEP] vault=${vaultPath} | ${report.recentMdCount} new md(s) in last ${report.windowDays}d (threshold ${report.threshold}) | duplicates=${report.duplicateCandidates.length} | pending-queue=${report.queue.pending}. Run wiki-bookkeep to drain.`;
|
|
355
|
+
this.messageQueueService.enqueue({
|
|
356
|
+
content: summary,
|
|
357
|
+
conversationId: 'system:wiki-bookkeep',
|
|
358
|
+
source: 'system_event',
|
|
359
|
+
});
|
|
360
|
+
},
|
|
361
|
+
});
|
|
362
|
+
WikiBookkeepTriggerService.setInstance(trigger);
|
|
363
|
+
trigger.start();
|
|
364
|
+
}
|
|
365
|
+
catch (bookkeepErr) {
|
|
366
|
+
this.logger.warn('Wiki bookkeep trigger failed to start (non-fatal)', {
|
|
367
|
+
error: bookkeepErr.message,
|
|
368
|
+
});
|
|
369
|
+
}
|
|
370
|
+
// LLM-wiki reflect trigger (2026-05-24): if a vault has had zero
|
|
371
|
+
// wiki-queue-add fires in the last `quietWindowMs`, ping ORC with
|
|
372
|
+
// a `[REFLECT-WIKI]` message so it sweeps recent conversation
|
|
373
|
+
// for worth-saving content. Solves the "the queue is never used"
|
|
374
|
+
// problem found during the 2026-05-24 audit.
|
|
375
|
+
try {
|
|
376
|
+
const { WikiReflectTriggerService } = await import('./services/wiki/wiki-reflect-trigger.service.js');
|
|
377
|
+
const reflectInterval = Number(process.env['CREWLY_WIKI_REFLECT_INTERVAL_MS'] ?? 60 * 60 * 1000);
|
|
378
|
+
const reflectQuiet = Number(process.env['CREWLY_WIKI_REFLECT_QUIET_WINDOW_MS'] ?? 4 * 60 * 60 * 1000);
|
|
379
|
+
const reflectDebounce = Number(process.env['CREWLY_WIKI_REFLECT_DEBOUNCE_MS'] ?? 4 * 60 * 60 * 1000);
|
|
380
|
+
const reflectTrigger = new WikiReflectTriggerService({
|
|
381
|
+
intervalMs: reflectInterval,
|
|
382
|
+
quietWindowMs: reflectQuiet,
|
|
383
|
+
debounceMs: reflectDebounce,
|
|
384
|
+
fireFn: async (meta) => {
|
|
385
|
+
if (!this.messageQueueService)
|
|
386
|
+
return;
|
|
387
|
+
const lastAddText = meta.msSinceLastQueueAdd === Number.POSITIVE_INFINITY
|
|
388
|
+
? 'never'
|
|
389
|
+
: `${Math.floor(meta.msSinceLastQueueAdd / (60 * 60 * 1000))}h ago`;
|
|
390
|
+
const summary = `[REFLECT-WIKI] vault=${meta.vaultPath} | last wiki-queue-add: ${lastAddText} | total queue items: ${meta.totalQueueItems}. Sweep the recent conversation for worth-saving content (decisions, customer facts, learnings) and call wiki-queue-add for each, OR reply "nothing this period" if there genuinely is nothing.`;
|
|
391
|
+
this.messageQueueService.enqueue({
|
|
392
|
+
content: summary,
|
|
393
|
+
conversationId: 'system:wiki-reflect',
|
|
394
|
+
source: 'system_event',
|
|
395
|
+
});
|
|
396
|
+
},
|
|
397
|
+
});
|
|
398
|
+
WikiReflectTriggerService.setInstance(reflectTrigger);
|
|
399
|
+
reflectTrigger.start();
|
|
400
|
+
}
|
|
401
|
+
catch (reflectErr) {
|
|
402
|
+
this.logger.warn('Wiki reflect trigger failed to start (non-fatal)', {
|
|
403
|
+
error: reflectErr.message,
|
|
404
|
+
});
|
|
405
|
+
}
|
|
406
|
+
// LLM-wiki → WorkItem bridge (2026-05-27): pending wiki queue
|
|
407
|
+
// items + legacy migration candidates become claimable
|
|
408
|
+
// WorkItems in the V3 pool. Replaces the bookkeep/reflect
|
|
409
|
+
// "shouldFire ≥ threshold" model — even 1 pending item now
|
|
410
|
+
// surfaces in /work-items and idle agents drain it through the
|
|
411
|
+
// standard claim loop. PR-1: target=crewly-orc for both kinds
|
|
412
|
+
// because the wiki-process-queue / wiki-migrate skills live
|
|
413
|
+
// under `config/skills/orchestrator/` today.
|
|
414
|
+
try {
|
|
415
|
+
const { WikiWorkItemBridgeService } = await import('./services/wiki/wiki-workitem-bridge.service.js');
|
|
416
|
+
const intervalMs = Number(process.env['CREWLY_WIKI_BRIDGE_INTERVAL_MS'] ?? 10 * 60 * 1000);
|
|
417
|
+
const targetAgent = process.env['CREWLY_WIKI_BRIDGE_TARGET'] ?? 'crewly-orc';
|
|
418
|
+
const maxCreatesPerTick = Number(process.env['CREWLY_WIKI_BRIDGE_MAX_PER_TICK'] ?? 2);
|
|
419
|
+
const cooldownMs = Number(process.env['CREWLY_WIKI_BRIDGE_COOLDOWN_MS'] ?? 30 * 60 * 1000);
|
|
420
|
+
const bridge = new WikiWorkItemBridgeService({
|
|
421
|
+
intervalMs,
|
|
422
|
+
targetAgent,
|
|
423
|
+
maxCreatesPerTick,
|
|
424
|
+
cooldownMs,
|
|
425
|
+
});
|
|
426
|
+
WikiWorkItemBridgeService.setInstance(bridge);
|
|
427
|
+
bridge.start();
|
|
428
|
+
}
|
|
429
|
+
catch (bridgeErr) {
|
|
430
|
+
this.logger.warn('Wiki work-item bridge failed to start (non-fatal)', {
|
|
431
|
+
error: bridgeErr.message,
|
|
432
|
+
});
|
|
433
|
+
}
|
|
434
|
+
// ORC delivery enforcer (2026-05-23 incident fix): watches for
|
|
435
|
+
// agent `[DONE]` posts to slack threads that ORC hasn't yet
|
|
436
|
+
// forwarded via reply-slack. Fires `[DELIVER_REQUIRED]` nudges
|
|
437
|
+
// at 3 / 10 / 30 min after the agent finished, until ORC
|
|
438
|
+
// actually delivers OR the budget is exhausted.
|
|
439
|
+
try {
|
|
440
|
+
const { OrcDeliveryEnforcerService } = await import('./services/orc/orc-delivery-enforcer.service.js');
|
|
441
|
+
const enforcer = new OrcDeliveryEnforcerService({
|
|
442
|
+
reminderSink: ({ conversationId, text }) => {
|
|
443
|
+
if (!this.messageQueueService)
|
|
444
|
+
return;
|
|
445
|
+
this.messageQueueService.enqueue({
|
|
446
|
+
content: text,
|
|
447
|
+
conversationId,
|
|
448
|
+
source: 'system_event',
|
|
449
|
+
});
|
|
450
|
+
},
|
|
451
|
+
});
|
|
452
|
+
OrcDeliveryEnforcerService.setInstance(enforcer);
|
|
453
|
+
enforcer.start();
|
|
454
|
+
}
|
|
455
|
+
catch (enforcerErr) {
|
|
456
|
+
this.logger.warn('OrcDeliveryEnforcer failed to start (non-fatal)', {
|
|
457
|
+
error: enforcerErr.message,
|
|
458
|
+
});
|
|
459
|
+
}
|
|
460
|
+
})();
|
|
461
|
+
// Initialize system resource alert service for proactive monitoring
|
|
462
|
+
this.systemResourceAlertService = new SystemResourceAlertService();
|
|
463
|
+
this.teamsJsonWatcherService.setEventBusService(this.eventBusService);
|
|
464
|
+
this.activityMonitorService.setEventBusService(this.eventBusService);
|
|
465
|
+
setEventBusControllerService(this.eventBusService);
|
|
466
|
+
setTeamControllerEventBusService(this.eventBusService);
|
|
467
|
+
// Wire team-activity-websocket to EventBus so it reacts to V3
|
|
468
|
+
// WorkItem lifecycle events (replaces the legacy
|
|
469
|
+
// TaskTrackingService.on('task_workflow_event') bridge that was
|
|
470
|
+
// deleted with the v1 task-management subsystem).
|
|
471
|
+
this.teamActivityWebSocketService.setEventBus(this.eventBusService);
|
|
472
|
+
// V3-only autonomy: AgentAutoClaimService (started later in boot)
|
|
473
|
+
// is the single autonomy loop. The legacy AutoAssignService has
|
|
474
|
+
// been retired — see spec 2026-05-06-task-management-v1-deprecation.md.
|
|
475
|
+
// BRIDGE-1: subscribe to autonomy events (task:done_by_worker,
|
|
476
|
+
// task:rejected, task:blocked, team:all_tasks_done, mission:*) and
|
|
477
|
+
// create the appropriate WorkItem(s) — verification WI for TL on
|
|
478
|
+
// done_by_worker, retry WI / escalation WI on rejected, review WI on
|
|
479
|
+
// blocked / mission events. See `event-to-workitem-bridge.service.ts`
|
|
480
|
+
// for idempotency contract + retry cap + cron-recursion guard.
|
|
481
|
+
this.eventToWorkItemBridge = EventToWorkItemBridge.boot(this.eventBusService);
|
|
482
|
+
this.eventToWorkItemBridge.start();
|
|
483
|
+
// LEARN-1: subscribe to terminal task / mission:replanned events and
|
|
484
|
+
// auto-record a learning entry via MemoryService.recordLearning. Closes
|
|
485
|
+
// the prompt-driven "agents-forget-to-record" gap. See
|
|
486
|
+
// `auto-learning.subscriber.ts` for category mapping + idempotency
|
|
487
|
+
// contract (V1) and the V7/V9 self-checks in the co-located test.
|
|
488
|
+
this.autoLearningSubscriber = AutoLearningSubscriber.boot(this.eventBusService);
|
|
489
|
+
this.autoLearningSubscriber.start();
|
|
490
|
+
// DF-1 #438: symmetric notification subscriber. Same architectural
|
|
491
|
+
// pattern as AutoLearningSubscriber — listens to terminal lifecycle
|
|
492
|
+
// events (`task:verified`, `mission:replanned`) and enqueues a
|
|
493
|
+
// `[MILESTONE]` envelope into orc's chat queue. The QW-3 row in
|
|
494
|
+
// `config/roles/orchestrator/prompt.md` (#436) handles the
|
|
495
|
+
// always-forward-to-owner rule on the orc side; this subscriber
|
|
496
|
+
// closes the gap where an agent ships work but forgets to call
|
|
497
|
+
// `report-status --status milestone` (the agent-side QW-1 path).
|
|
498
|
+
this.milestoneNotificationSubscriber = new MilestoneNotificationSubscriber({
|
|
499
|
+
eventBus: this.eventBusService,
|
|
500
|
+
messageQueueService: this.messageQueueService,
|
|
501
|
+
});
|
|
502
|
+
this.milestoneNotificationSubscriber.start();
|
|
503
|
+
// INBOUND-1 + Pipeline-#4 follow-up: wire RequestService → bus, then
|
|
504
|
+
// boot both v3 subscribers (SLA tracker + auto-decompose). Order
|
|
505
|
+
// matters within the block: setRequestServiceEventBus must run
|
|
506
|
+
// BEFORE any code path can call RequestService.create() — the slack
|
|
507
|
+
// listener at line ~370 is the first hot caller, but the slack
|
|
508
|
+
// service hasn't been initialised yet at this point in boot, so
|
|
509
|
+
// we're safe.
|
|
510
|
+
//
|
|
511
|
+
// Failure-isolated (issue #465): the entire v3 subscriber boot is
|
|
512
|
+
// wrapped in try/catch so a wiring failure logs + continues rather
|
|
513
|
+
// than crashing the whole backend. Neither subscriber is essential
|
|
514
|
+
// to API liveness — degrading them is preferable to losing the
|
|
515
|
+
// process. A single catch block treats both as a unit because the
|
|
516
|
+
// failure mode is "wiring is broken, fix the deploy" not
|
|
517
|
+
// "intermittently flaky"; partial recovery would be unnecessary
|
|
518
|
+
// complexity for v1. B0 broadcast (line ~2336) and TriggerEngine
|
|
519
|
+
// boot (line ~1464) already have equivalent isolation; this brings
|
|
520
|
+
// the v3 subscriber block in line with that pattern.
|
|
521
|
+
try {
|
|
522
|
+
setRequestServiceEventBus(this.eventBusService);
|
|
523
|
+
this.requestSlaSubscriber = RequestSlaSubscriber.boot(this.eventBusService, RequestService.getInstance(), TaskPoolService.getInstance(), async ({ channelId, threadTs, messageText }) => {
|
|
524
|
+
// Production wiring of the 10-min escalation hook: nudge the user
|
|
525
|
+
// in the same Slack thread so they're never blind to the miss.
|
|
526
|
+
const slack = getSlackService();
|
|
527
|
+
await slack.sendMessage({
|
|
528
|
+
channelId,
|
|
529
|
+
threadTs,
|
|
530
|
+
text: messageText,
|
|
531
|
+
});
|
|
532
|
+
});
|
|
533
|
+
this.requestSlaSubscriber.start();
|
|
534
|
+
setRequestSlaSubscriber(this.requestSlaSubscriber);
|
|
535
|
+
// Pipeline-#4 follow-up: auto-decompose actionable L2 Requests on
|
|
536
|
+
// request:created. Sequenced AFTER the SLA subscriber so the
|
|
537
|
+
// respond_to_user WI seeding still runs first when both fire on
|
|
538
|
+
// the same event (deterministic listener-attach order; both run
|
|
539
|
+
// via the same in-process bus). Side note: order is semantically
|
|
540
|
+
// irrelevant — the linkWorkItem path keys on workitem:queued, not
|
|
541
|
+
// on relative listener position — but predictable startup ordering
|
|
542
|
+
// helps debug.
|
|
543
|
+
this.requestDecomposeSubscriber = RequestDecomposeSubscriber.boot(this.eventBusService, RequestService.getInstance(), TaskPoolService.getInstance());
|
|
544
|
+
this.requestDecomposeSubscriber.start();
|
|
545
|
+
setRequestDecomposeSubscriber(this.requestDecomposeSubscriber);
|
|
546
|
+
// Status-update subscriber: posts progress on Slack-originated
|
|
547
|
+
// Requests as their child WIs reach milestones, plus a heartbeat
|
|
548
|
+
// while work is still in flight. Closes the "long silence after
|
|
549
|
+
// orc's first ack" UX gap. Idempotent — duplicate boots are no-ops.
|
|
550
|
+
this.requestStatusUpdateSubscriber = new RequestStatusUpdateSubscriber({
|
|
551
|
+
eventBus: this.eventBusService,
|
|
552
|
+
requestService: RequestService.getInstance(),
|
|
553
|
+
taskPool: TaskPoolService.getInstance(),
|
|
554
|
+
slackPoster: async ({ channelId, text, threadTs }) => {
|
|
555
|
+
// Post via the in-process SlackService to avoid a self-HTTP
|
|
556
|
+
// hop. The /api/slack/send route's other side-effects (chat
|
|
557
|
+
// persistence, thread-status replied marker) don't apply
|
|
558
|
+
// to mid-thread heartbeat updates — those are only for
|
|
559
|
+
// the user's direct reply, not for orc's progress pings.
|
|
560
|
+
const slack = getSlackService();
|
|
561
|
+
if (!slack.isConnected())
|
|
562
|
+
return;
|
|
563
|
+
await slack.sendMessage({ channelId, text, threadTs });
|
|
564
|
+
},
|
|
565
|
+
heartbeatMinutes: 30,
|
|
566
|
+
});
|
|
567
|
+
this.requestStatusUpdateSubscriber.start();
|
|
568
|
+
// Cascade subscriber: keeps Request.status in sync with the
|
|
569
|
+
// aggregate state of its child WIs by reacting to live task
|
|
570
|
+
// lifecycle events. Closes the gap left by V3DataService's
|
|
571
|
+
// retired `v3:task_*` subscriptions (see 2026-05-09 dogfood
|
|
572
|
+
// note in request-cascade.subscriber.ts).
|
|
573
|
+
this.requestCascadeSubscriber = new RequestCascadeSubscriber({
|
|
574
|
+
eventBus: this.eventBusService,
|
|
575
|
+
requestService: RequestService.getInstance(),
|
|
576
|
+
taskPool: TaskPoolService.getInstance(),
|
|
577
|
+
notifier: this.eventBusService,
|
|
578
|
+
});
|
|
579
|
+
this.requestCascadeSubscriber.start();
|
|
580
|
+
}
|
|
581
|
+
catch (subscriberBootErr) {
|
|
582
|
+
// Degraded mode: SLA tracking + auto-decompose are off, but the
|
|
583
|
+
// API surface and rest of the backend continue to serve. Ops can
|
|
584
|
+
// grep for `v3 subscriber boot failed` in logs to triage.
|
|
585
|
+
this.logger.error('v3 subscriber boot failed — degrading SLA + auto-decompose paths, continuing backend startup', {
|
|
586
|
+
error: subscriberBootErr instanceof Error
|
|
587
|
+
? subscriberBootErr.message
|
|
588
|
+
: String(subscriberBootErr),
|
|
589
|
+
});
|
|
590
|
+
// Best-effort cleanup of any partial wiring so a later restart
|
|
591
|
+
// doesn't see stale singletons. The setters are idempotent.
|
|
592
|
+
setRequestSlaSubscriber(null);
|
|
593
|
+
setRequestDecomposeSubscriber(null);
|
|
594
|
+
setRequestServiceEventBus(null);
|
|
595
|
+
this.requestSlaSubscriber = null;
|
|
596
|
+
this.requestDecomposeSubscriber = null;
|
|
597
|
+
if (this.requestStatusUpdateSubscriber) {
|
|
598
|
+
try {
|
|
599
|
+
this.requestStatusUpdateSubscriber.stop();
|
|
600
|
+
}
|
|
601
|
+
catch { /* best-effort */ }
|
|
602
|
+
this.requestStatusUpdateSubscriber = null;
|
|
603
|
+
}
|
|
604
|
+
if (this.requestCascadeSubscriber) {
|
|
605
|
+
try {
|
|
606
|
+
this.requestCascadeSubscriber.stop();
|
|
607
|
+
}
|
|
608
|
+
catch { /* best-effort */ }
|
|
609
|
+
this.requestCascadeSubscriber = null;
|
|
610
|
+
}
|
|
611
|
+
}
|
|
612
|
+
// Initialize Slack thread store for persistent thread conversations
|
|
613
|
+
const slackThreadStore = new SlackThreadStoreService(this.config.crewlyHome);
|
|
614
|
+
setSlackThreadStore(slackThreadStore);
|
|
615
|
+
this.eventBusService.setSlackThreadStore(slackThreadStore);
|
|
616
|
+
// Initialize Google Chat thread store for persistent thread conversations
|
|
617
|
+
const gchatThreadStore = new GoogleChatThreadStoreService(this.config.crewlyHome);
|
|
618
|
+
setGchatThreadStore(gchatThreadStore);
|
|
619
|
+
// Initialize Slack image service for downloading images from Slack messages
|
|
620
|
+
const slackImageService = new SlackImageService(this.config.crewlyHome);
|
|
621
|
+
setSlackImageService(slackImageService);
|
|
622
|
+
// Wire agent:idle events to thread status queue for delegation completion
|
|
623
|
+
this.eventBusService.on('eventPublished', (event) => {
|
|
624
|
+
if (event.type === 'agent:idle' && event.sessionName) {
|
|
625
|
+
try {
|
|
626
|
+
const waitingThreads = this.threadStatusQueueService.getByStatus('replied_waiting_actions');
|
|
627
|
+
for (const entry of waitingThreads) {
|
|
628
|
+
if (entry.delegatedAgents?.includes(event.sessionName)) {
|
|
629
|
+
this.threadStatusQueueService.markDelegationsComplete(entry.threadKey);
|
|
630
|
+
}
|
|
631
|
+
}
|
|
632
|
+
}
|
|
633
|
+
catch (err) {
|
|
634
|
+
this.logger.warn('Failed to check thread delegation completion on agent:idle', {
|
|
635
|
+
sessionName: event.sessionName,
|
|
636
|
+
error: err instanceof Error ? err.message : String(err),
|
|
637
|
+
});
|
|
638
|
+
}
|
|
639
|
+
// V3: Auto-close open Requests when the orchestrator goes idle
|
|
640
|
+
// Handles direct responses (no WorkItem delegation)
|
|
641
|
+
if (event.sessionName === ORCHESTRATOR_SESSION_NAME) {
|
|
642
|
+
setImmediate(() => this.autoCloseOpenRequests());
|
|
643
|
+
}
|
|
644
|
+
}
|
|
645
|
+
});
|
|
646
|
+
// Shared LiveReconcilerDataProvider instance used by both the
|
|
647
|
+
// Reconciler service and the TeamHealthWatchdog data provider.
|
|
648
|
+
// Sharing is required so the memory-pressure broadcast state
|
|
649
|
+
// (`consecutivePressureSkips` / `lastPressureNotifiedAt`) is
|
|
650
|
+
// counted ONCE per sustained pressure episode. Two separate
|
|
651
|
+
// instances would each cross the 5-skip threshold around the same
|
|
652
|
+
// time and publish two `system:memory_pressure` events with
|
|
653
|
+
// distinct `event.id` values (no debounce match), so orc would
|
|
654
|
+
// receive duplicates. See follow-up #5 from PR #543 review.
|
|
655
|
+
const liveDataProvider = new LiveReconcilerDataProvider();
|
|
656
|
+
liveDataProvider.setEventBus(this.eventBusService);
|
|
657
|
+
// Wire AgentRegistrationService so the memory-pressure eviction
|
|
658
|
+
// path can terminate idle agents to free wake slots (issue surfaced
|
|
659
|
+
// 2026-05-16: queued WIs for inactive Atlas could not get woken
|
|
660
|
+
// because the floor was held by idle product/marketing agents).
|
|
661
|
+
liveDataProvider.setAgentRegistrationService(this.apiController.agentRegistrationService);
|
|
662
|
+
// Initialize Reconciler Service (V2 — system truth recomputation)
|
|
663
|
+
{
|
|
664
|
+
const reconcilerLogger = LoggerService.getInstance().createComponentLogger('ReconcilerInit');
|
|
665
|
+
// Live data provider — connects Reconciler to Task Pool, Claim Service,
|
|
666
|
+
// Storage Service, and Agent Suspend for real reconciliation including
|
|
667
|
+
// Hybrid Wake (auto-rehydrating suspended agents when tasks go unclaimed).
|
|
668
|
+
this.reconcilerService = new ReconcilerService(liveDataProvider);
|
|
669
|
+
setReconcilerService(this.reconcilerService);
|
|
670
|
+
// Subscribe EventBus events for targeted reconciliation
|
|
671
|
+
if (this.reconcilerService) {
|
|
672
|
+
const reconciler = this.reconcilerService;
|
|
673
|
+
// 2026-05-15 Steve dogfood: the prior `subscribe({ subscriberSession:
|
|
674
|
+
// '__reconciler__' })` loop here was redundant AND wrong. The
|
|
675
|
+
// subscribe path routes critical events through
|
|
676
|
+
// `MessageQueueService.enqueue` keyed by `targetSession`, which
|
|
677
|
+
// then fails noisily because `__reconciler__` is not a PTY
|
|
678
|
+
// session ("Session '__reconciler__' does not exist", every
|
|
679
|
+
// reconciler tick). The in-process `event_published` listener
|
|
680
|
+
// below already drives the reconciler — no second wiring needed.
|
|
681
|
+
// Removed the subscribe-block; if a future change needs persistent
|
|
682
|
+
// metadata for the reconciler subscription, attach it as a real
|
|
683
|
+
// in-process subscriber via `onInProcess` rather than the
|
|
684
|
+
// session-targeted `subscribe` API.
|
|
685
|
+
// Listen for all published events and trigger targeted reconciliation
|
|
686
|
+
this.eventBusService.on('event_published', (payload) => {
|
|
687
|
+
const targetedEventTypes = ['task:completed', 'task:failed', 'agent:idle', 'agent:inactive'];
|
|
688
|
+
if (targetedEventTypes.includes(payload.eventType)) {
|
|
689
|
+
reconciler.runFast().catch((err) => {
|
|
690
|
+
reconcilerLogger.warn('Event-triggered fast reconcile failed', {
|
|
691
|
+
eventType: payload.eventType,
|
|
692
|
+
error: err instanceof Error ? err.message : String(err),
|
|
693
|
+
});
|
|
694
|
+
});
|
|
695
|
+
}
|
|
696
|
+
});
|
|
697
|
+
}
|
|
698
|
+
reconcilerLogger.info('ReconcilerService initialized and wired to EventBus');
|
|
699
|
+
}
|
|
700
|
+
// Initialize Team-Health-Watchdog (THW) — Layer 4 liveness aggregator
|
|
701
|
+
// Lazy singleton wiring per Sam's etiquette nudge: no module-load
|
|
702
|
+
// side effects; controller and /api/health resolve via accessor.
|
|
703
|
+
{
|
|
704
|
+
const thwLogger = LoggerService.getInstance().createComponentLogger('TeamHealthInit');
|
|
705
|
+
try {
|
|
706
|
+
const config = loadTeamHealthConfig({
|
|
707
|
+
warn: (msg, meta) => thwLogger.warn(msg, meta ?? {}),
|
|
708
|
+
info: (msg, meta) => thwLogger.info(msg, meta ?? {}),
|
|
709
|
+
});
|
|
710
|
+
if (!config.enabled) {
|
|
711
|
+
thwLogger.info('TeamHealthWatchdog disabled by config; skipping init.');
|
|
712
|
+
}
|
|
713
|
+
else if (!this.reconcilerService) {
|
|
714
|
+
thwLogger.warn('Reconciler not available; skipping TeamHealthWatchdog init.');
|
|
715
|
+
}
|
|
716
|
+
else {
|
|
717
|
+
// Reuse the shared LiveReconcilerDataProvider declared
|
|
718
|
+
// above (follow-up #5 from PR #543 review) — instantiating
|
|
719
|
+
// a second copy would double-broadcast memory-pressure.
|
|
720
|
+
const dataProvider = new LiveTeamHealthDataProvider({
|
|
721
|
+
reconcilerProvider: liveDataProvider,
|
|
722
|
+
getTeams: async () => StorageService.getInstance().getTeams(),
|
|
723
|
+
bootedAt: new Date(),
|
|
724
|
+
});
|
|
725
|
+
// Phase 0 alert sink: log-only. Slack delivery wires up
|
|
726
|
+
// in Phase 1 (per §G phasing). Shadow-mode is the
|
|
727
|
+
// default config.json setting, so this sink is mostly
|
|
728
|
+
// invoked for the recovery announcement path.
|
|
729
|
+
const alertSink = {
|
|
730
|
+
deliver: async (decision) => {
|
|
731
|
+
thwLogger.info('THW alert (Phase 0 log-only sink)', {
|
|
732
|
+
teamId: decision.detection.teamId,
|
|
733
|
+
verdict: decision.effectiveVerdict,
|
|
734
|
+
channel: decision.channel,
|
|
735
|
+
message: decision.message,
|
|
736
|
+
});
|
|
737
|
+
},
|
|
738
|
+
};
|
|
739
|
+
this.teamHealthWatchdog = new TeamHealthWatchdogService({
|
|
740
|
+
config,
|
|
741
|
+
dataProvider,
|
|
742
|
+
alertSink,
|
|
743
|
+
bootedAt: new Date(),
|
|
744
|
+
logger: {
|
|
745
|
+
info: (msg, meta) => thwLogger.info(msg, meta ?? {}),
|
|
746
|
+
warn: (msg, meta) => thwLogger.warn(msg, meta ?? {}),
|
|
747
|
+
error: (msg, meta) => thwLogger.error(msg, meta ?? {}),
|
|
748
|
+
},
|
|
749
|
+
});
|
|
750
|
+
setTeamHealthWatchdogSingleton(this.teamHealthWatchdog);
|
|
751
|
+
this.teamHealthWatchdog.start();
|
|
752
|
+
thwLogger.info('TeamHealthWatchdog initialized', {
|
|
753
|
+
shadowMode: config.shadowMode,
|
|
754
|
+
sweepIntervalMs: config.sweepIntervalMs,
|
|
755
|
+
});
|
|
756
|
+
}
|
|
757
|
+
}
|
|
758
|
+
catch (err) {
|
|
759
|
+
thwLogger.error('Failed to initialize TeamHealthWatchdog (continuing without it)', {
|
|
760
|
+
error: err instanceof Error ? err.message : String(err),
|
|
761
|
+
});
|
|
762
|
+
}
|
|
763
|
+
}
|
|
764
|
+
// Initialize Fission Guard Service
|
|
765
|
+
{
|
|
766
|
+
const fissionLogger = LoggerService.getInstance().createComponentLogger('FissionInit');
|
|
767
|
+
try {
|
|
768
|
+
const taskPool = TaskPoolService.getInstance();
|
|
769
|
+
// FissionDataProvider backed by TaskPoolService storage
|
|
770
|
+
const fissionDataProvider = {
|
|
771
|
+
async getWorkItemById(id) {
|
|
772
|
+
const items = await taskPool.getAllItems();
|
|
773
|
+
return items.find((i) => i.id === id) ?? null;
|
|
774
|
+
},
|
|
775
|
+
async countMissionWorkItems(missionId) {
|
|
776
|
+
const items = await taskPool.getAllItems();
|
|
777
|
+
return items.filter((i) => i.missionId === missionId).length;
|
|
778
|
+
},
|
|
779
|
+
async countChildWorkItems(parentWorkItemId) {
|
|
780
|
+
const items = await taskPool.getAllItems();
|
|
781
|
+
return items.filter((i) => i.parentWorkItemId === parentWorkItemId).length;
|
|
782
|
+
},
|
|
783
|
+
};
|
|
784
|
+
const fissionService = FissionGuardService.init(fissionDataProvider);
|
|
785
|
+
setFissionGuardService(fissionService);
|
|
786
|
+
fissionLogger.info('FissionGuardService initialized');
|
|
787
|
+
}
|
|
788
|
+
catch (fissionErr) {
|
|
789
|
+
fissionLogger.warn('FissionGuardService initialization failed (non-fatal)', {
|
|
790
|
+
error: fissionErr instanceof Error ? fissionErr.message : String(fissionErr),
|
|
791
|
+
});
|
|
792
|
+
}
|
|
793
|
+
}
|
|
794
|
+
// Broadcast queue events via Socket.IO
|
|
795
|
+
this.messageQueueService.on('enqueued', (msg) => {
|
|
796
|
+
this.io.emit(MESSAGE_QUEUE_CONSTANTS.SOCKET_EVENTS.MESSAGE_ENQUEUED, msg);
|
|
797
|
+
});
|
|
798
|
+
this.messageQueueService.on('processing', (msg) => {
|
|
799
|
+
this.io.emit(MESSAGE_QUEUE_CONSTANTS.SOCKET_EVENTS.MESSAGE_PROCESSING, msg);
|
|
800
|
+
});
|
|
801
|
+
this.messageQueueService.on('completed', (msg) => {
|
|
802
|
+
this.io.emit(MESSAGE_QUEUE_CONSTANTS.SOCKET_EVENTS.MESSAGE_COMPLETED, msg);
|
|
803
|
+
});
|
|
804
|
+
this.messageQueueService.on('failed', (msg) => {
|
|
805
|
+
this.io.emit(MESSAGE_QUEUE_CONSTANTS.SOCKET_EVENTS.MESSAGE_FAILED, msg);
|
|
806
|
+
});
|
|
807
|
+
this.messageQueueService.on('statusUpdate', (status) => {
|
|
808
|
+
this.io.emit(MESSAGE_QUEUE_CONSTANTS.SOCKET_EVENTS.STATUS_UPDATE, status);
|
|
809
|
+
});
|
|
810
|
+
}
|
|
811
|
+
configureMiddleware() {
|
|
812
|
+
// Security middleware
|
|
813
|
+
this.app.use(helmet({
|
|
814
|
+
contentSecurityPolicy: {
|
|
815
|
+
directives: {
|
|
816
|
+
defaultSrc: ["'self'"],
|
|
817
|
+
styleSrc: ["'self'", "'unsafe-inline'"],
|
|
818
|
+
scriptSrc: ["'self'", "'unsafe-eval'"],
|
|
819
|
+
imgSrc: ["'self'", 'data:', 'https:', 'blob:'],
|
|
820
|
+
connectSrc: ["'self'", 'ws:', 'wss:', 'blob:'],
|
|
821
|
+
// Disable upgrade-insecure-requests for HTTP-only deployments (ESTestNode)
|
|
822
|
+
// Without this, browsers on HTTP upgrade all asset requests to HTTPS → ERR_SSL_PROTOCOL_ERROR
|
|
823
|
+
upgradeInsecureRequests: null,
|
|
824
|
+
},
|
|
825
|
+
},
|
|
826
|
+
}));
|
|
827
|
+
// CORS — allow Cloud Console frontend and localhost OSS instances
|
|
828
|
+
const CORS_ALLOWED_ORIGINS = process.env['CORS_ALLOWED_ORIGINS']
|
|
829
|
+
? process.env['CORS_ALLOWED_ORIGINS'].split(',')
|
|
830
|
+
: ['https://crewlyai.com', 'https://www.crewlyai.com', 'http://localhost:8787', 'http://localhost:3000'];
|
|
831
|
+
this.app.use(cors({
|
|
832
|
+
origin: process.env.NODE_ENV === 'production'
|
|
833
|
+
? CORS_ALLOWED_ORIGINS
|
|
834
|
+
: '*',
|
|
835
|
+
credentials: true,
|
|
836
|
+
}));
|
|
837
|
+
// Logging
|
|
838
|
+
this.app.use(morgan(process.env.NODE_ENV === 'production' ? 'combined' : 'dev'));
|
|
839
|
+
// Body parsing — `verify` captures the raw bytes so the error handler
|
|
840
|
+
// below can log the exact payload when JSON parsing fails. Without this
|
|
841
|
+
// we only see the position-of-failure, not the bytes.
|
|
842
|
+
this.app.use(express.json({
|
|
843
|
+
limit: '10mb',
|
|
844
|
+
verify: (req, _res, buf) => {
|
|
845
|
+
req.rawBody = buf.toString('utf8');
|
|
846
|
+
},
|
|
847
|
+
}));
|
|
848
|
+
this.app.use(express.urlencoded({ extended: true, limit: '10mb' }));
|
|
849
|
+
// Note: Static files are configured in configureRoutes() after API routes
|
|
850
|
+
}
|
|
851
|
+
configureRoutes() {
|
|
852
|
+
// Agent heartbeat middleware - any API call with X-Agent-Session header updates heartbeat
|
|
853
|
+
this.app.use('/api', agentHeartbeatMiddleware);
|
|
854
|
+
// API routes
|
|
855
|
+
this.app.use('/api', createApiRoutes(this.apiController));
|
|
856
|
+
// Health check (enhanced with mode and agent info)
|
|
857
|
+
this.app.get('/health', (req, res) => {
|
|
858
|
+
const versionService = VersionCheckService.getInstance();
|
|
859
|
+
const cachedCheck = versionService.getCachedCheckResult();
|
|
860
|
+
// Count active agents from the session backend.
|
|
861
|
+
// listSessions() returns names of all active sessions, so
|
|
862
|
+
// total and active counts are equal (only live sessions are listed).
|
|
863
|
+
let agentCount = 0;
|
|
864
|
+
try {
|
|
865
|
+
const sessionBackend = getSessionBackendSync();
|
|
866
|
+
if (sessionBackend) {
|
|
867
|
+
agentCount = sessionBackend.listSessions().length;
|
|
868
|
+
}
|
|
869
|
+
}
|
|
870
|
+
catch {
|
|
871
|
+
// Session backend may not be initialized yet
|
|
872
|
+
}
|
|
873
|
+
// #199: Safely resolve version — findPackageRoot may fail from global install paths
|
|
874
|
+
let version = cachedCheck?.currentVersion ?? null;
|
|
875
|
+
if (!version) {
|
|
876
|
+
try {
|
|
877
|
+
version = versionService.getLocalVersion();
|
|
878
|
+
}
|
|
879
|
+
catch {
|
|
880
|
+
version = process.env.npm_package_version || 'unknown';
|
|
881
|
+
}
|
|
882
|
+
}
|
|
883
|
+
// THW self-instrumentation (§F.3): surface last-sweep age + degraded
|
|
884
|
+
// flag so the watchdog-watchdog (§E.8) bubbles up here. Fail-soft
|
|
885
|
+
// per Sam's etiquette nudge — when the singleton isn't ready, return
|
|
886
|
+
// status:"warming" rather than 5xx.
|
|
887
|
+
const watchdog = getTeamHealthWatchdogSingleton();
|
|
888
|
+
const teamHealthBlock = watchdog
|
|
889
|
+
? {
|
|
890
|
+
status: watchdog.isDegraded() ? 'degraded' : (watchdog.isActive() ? 'ok' : 'inactive'),
|
|
891
|
+
last_sweep_age_ms: watchdog.getLastSweepAgeMs(),
|
|
892
|
+
shadowMode: watchdog.getLastSweep()?.shadowMode ?? null,
|
|
893
|
+
}
|
|
894
|
+
: { status: 'warming', last_sweep_age_ms: -1, shadowMode: null };
|
|
895
|
+
res.json({
|
|
896
|
+
status: 'healthy',
|
|
897
|
+
timestamp: new Date().toISOString(),
|
|
898
|
+
uptime: process.uptime(),
|
|
899
|
+
version,
|
|
900
|
+
latestVersion: cachedCheck?.latestVersion ?? null,
|
|
901
|
+
updateAvailable: cachedCheck?.updateAvailable ?? false,
|
|
902
|
+
mode: this.config.headless ? 'headless' : 'standard',
|
|
903
|
+
agents: {
|
|
904
|
+
active: agentCount,
|
|
905
|
+
total: agentCount,
|
|
906
|
+
},
|
|
907
|
+
team_health: teamHealthBlock,
|
|
908
|
+
});
|
|
909
|
+
});
|
|
910
|
+
// H5 quick entry static page (served regardless of headless mode)
|
|
911
|
+
{
|
|
912
|
+
const projectRoot = findPackageRoot(__dirname);
|
|
913
|
+
const h5StaticPath = path.join(projectRoot, 'backend/src/static/h5');
|
|
914
|
+
this.app.use('/h5', express.static(h5StaticPath));
|
|
915
|
+
}
|
|
916
|
+
// Static files for frontend (skip in headless mode)
|
|
917
|
+
if (!this.config.headless) {
|
|
918
|
+
// Use findPackageRoot() so this works both in dev mode (backend/src/)
|
|
919
|
+
// and in compiled/npm-installed mode (dist/backend/backend/src/)
|
|
920
|
+
const projectRoot = findPackageRoot(__dirname);
|
|
921
|
+
const frontendPath = path.join(projectRoot, 'frontend/dist');
|
|
922
|
+
this.app.use(express.static(frontendPath));
|
|
923
|
+
// Serve frontend for all other routes (SPA)
|
|
924
|
+
// Skip /api/ and /health paths so addon-registered API routes are reachable
|
|
925
|
+
this.app.get('*', (req, res, next) => {
|
|
926
|
+
if (req.path.startsWith('/api/') || req.path === '/health') {
|
|
927
|
+
return next();
|
|
928
|
+
}
|
|
929
|
+
const frontendIndexPath = path.join(projectRoot, 'frontend/dist/index.html');
|
|
930
|
+
res.sendFile(frontendIndexPath);
|
|
931
|
+
});
|
|
932
|
+
}
|
|
933
|
+
else {
|
|
934
|
+
this.logger.info('Headless mode: frontend serving disabled (API-only)');
|
|
935
|
+
}
|
|
936
|
+
// Error handling middleware
|
|
937
|
+
this.app.use((err, req, res, next) => {
|
|
938
|
+
const rawBody = req.rawBody;
|
|
939
|
+
this.logger.error('Request error', {
|
|
940
|
+
error: err.message,
|
|
941
|
+
stack: err.stack,
|
|
942
|
+
url: `${req.method} ${req.originalUrl}`,
|
|
943
|
+
contentType: req.headers['content-type'],
|
|
944
|
+
contentLength: req.headers['content-length'],
|
|
945
|
+
rawBodyLength: rawBody?.length,
|
|
946
|
+
rawBody: rawBody ? JSON.stringify(rawBody) : undefined,
|
|
947
|
+
});
|
|
948
|
+
const status = err.statusCode
|
|
949
|
+
?? err.status
|
|
950
|
+
?? 500;
|
|
951
|
+
res.status(status).json({
|
|
952
|
+
success: false,
|
|
953
|
+
error: process.env.NODE_ENV === 'production'
|
|
954
|
+
? 'Internal server error'
|
|
955
|
+
: err.message,
|
|
956
|
+
});
|
|
957
|
+
});
|
|
958
|
+
}
|
|
959
|
+
configureWebSocket() {
|
|
960
|
+
this.io.on('connection', (socket) => {
|
|
961
|
+
this.logger.info('Client connected', { socketId: socket.id });
|
|
962
|
+
socket.on('disconnect', () => {
|
|
963
|
+
this.logger.info('Client disconnected', { socketId: socket.id });
|
|
964
|
+
});
|
|
965
|
+
});
|
|
966
|
+
// Connect terminal output to WebSocket
|
|
967
|
+
this.tmuxService.on('output', (output) => {
|
|
968
|
+
this.io.emit('terminal_output', output);
|
|
969
|
+
});
|
|
970
|
+
// Forward scheduler events
|
|
971
|
+
this.schedulerService.on('check_executed', (data) => {
|
|
972
|
+
this.io.emit('check_executed', data);
|
|
973
|
+
});
|
|
974
|
+
this.schedulerService.on('check_scheduled', (data) => {
|
|
975
|
+
this.io.emit('check_scheduled', data);
|
|
976
|
+
});
|
|
977
|
+
}
|
|
978
|
+
async start() {
|
|
979
|
+
try {
|
|
980
|
+
// Validate environment configuration (fail fast with clear errors)
|
|
981
|
+
const { validateEnvConfig, logEnvValidation } = await import('./services/core/env.config.js');
|
|
982
|
+
const envValidation = validateEnvConfig();
|
|
983
|
+
logEnvValidation(envValidation);
|
|
984
|
+
if (!envValidation.valid) {
|
|
985
|
+
throw new Error('Environment configuration validation failed — see errors above');
|
|
986
|
+
}
|
|
987
|
+
// Initialize OpenTelemetry tracing (early, before other services)
|
|
988
|
+
const { TracingService } = await import('./services/core/tracing.service.js');
|
|
989
|
+
TracingService.getInstance().initialize();
|
|
990
|
+
// Expose queue instance for cross-machine message routing (used by MessageRouterService)
|
|
991
|
+
const { setMessageQueueInstance } = await import('./services/messaging/index.js');
|
|
992
|
+
setMessageQueueInstance(this.messageQueueService);
|
|
993
|
+
this.logger.info('Starting Crewly server...');
|
|
994
|
+
this.logger.info('Server startup info', {
|
|
995
|
+
pid: process.pid,
|
|
996
|
+
memoryUsageMB: Math.round(process.memoryUsage().heapUsed / 1024 / 1024),
|
|
997
|
+
targetPort: this.config.webPort,
|
|
998
|
+
headless: this.config.headless,
|
|
999
|
+
});
|
|
1000
|
+
// Truncate service.log on startup — it's a raw stdout pipe duplicate of the
|
|
1001
|
+
// daily crewly-YYYY-MM-DD.log files and grows unbounded otherwise.
|
|
1002
|
+
try {
|
|
1003
|
+
const serviceLogPath = path.join(this.config.crewlyHome, 'logs', 'service.log');
|
|
1004
|
+
const { stat, truncate } = await import('fs/promises');
|
|
1005
|
+
const logStat = await stat(serviceLogPath).catch(() => null);
|
|
1006
|
+
if (logStat && logStat.size > 10 * 1024 * 1024) { // truncate if > 10MB
|
|
1007
|
+
await truncate(serviceLogPath, 0);
|
|
1008
|
+
this.logger.info('Truncated service.log on startup', {
|
|
1009
|
+
previousSizeMB: Math.round(logStat.size / 1024 / 1024),
|
|
1010
|
+
});
|
|
1011
|
+
}
|
|
1012
|
+
}
|
|
1013
|
+
catch {
|
|
1014
|
+
// Non-critical
|
|
1015
|
+
}
|
|
1016
|
+
if (this.config.headless) {
|
|
1017
|
+
this.logger.info('Headless mode active: API-only, no frontend serving');
|
|
1018
|
+
}
|
|
1019
|
+
// Check for pending self-improvement (hot-reload recovery)
|
|
1020
|
+
await this.checkPendingSelfImprovement();
|
|
1021
|
+
// Check if port is already in use
|
|
1022
|
+
await this.checkPortAvailability();
|
|
1023
|
+
// Skip tmux initialization since we're using PTY session backend
|
|
1024
|
+
// Note: TmuxService is kept for backward compatibility but PTY is the active backend
|
|
1025
|
+
try {
|
|
1026
|
+
await this.tmuxService.initialize();
|
|
1027
|
+
}
|
|
1028
|
+
catch (error) {
|
|
1029
|
+
// Ignore tmux initialization errors - PTY backend is primary
|
|
1030
|
+
}
|
|
1031
|
+
// Reset orchestrator status to inactive on startup.
|
|
1032
|
+
// The persisted status file may still say "active" from the previous session,
|
|
1033
|
+
// but a fresh app start has no running agent. Without this reset, the UI
|
|
1034
|
+
// would show "Active" for a bare shell that has no Claude running inside it.
|
|
1035
|
+
try {
|
|
1036
|
+
await this.storageService.updateOrchestratorStatus(CREWLY_CONSTANTS.AGENT_STATUSES.INACTIVE);
|
|
1037
|
+
this.logger.info('Reset orchestrator status to inactive on startup');
|
|
1038
|
+
}
|
|
1039
|
+
catch (resetErr) {
|
|
1040
|
+
this.logger.warn('Failed to reset orchestrator status on startup', {
|
|
1041
|
+
error: resetErr instanceof Error ? resetErr.message : String(resetErr),
|
|
1042
|
+
});
|
|
1043
|
+
}
|
|
1044
|
+
// Initialize PTY session backend.
|
|
1045
|
+
// We load persisted session metadata (including Claude session IDs) so that
|
|
1046
|
+
// when agents are re-started, they can resume their previous conversations
|
|
1047
|
+
// using --resume. The actual PTY sessions are NOT restored here — they are
|
|
1048
|
+
// recreated when the user starts teams again.
|
|
1049
|
+
this.logger.info('Initializing PTY session backend...');
|
|
1050
|
+
await getSessionBackend();
|
|
1051
|
+
// Load persisted session metadata for resume-on-restart support
|
|
1052
|
+
try {
|
|
1053
|
+
const persistence = getSessionStatePersistence();
|
|
1054
|
+
const savedState = await persistence.loadState();
|
|
1055
|
+
if (savedState && savedState.sessions.length > 0) {
|
|
1056
|
+
for (const sessionInfo of savedState.sessions) {
|
|
1057
|
+
persistence.registerSession(sessionInfo.name, {
|
|
1058
|
+
cwd: sessionInfo.cwd,
|
|
1059
|
+
command: sessionInfo.command,
|
|
1060
|
+
args: sessionInfo.args,
|
|
1061
|
+
env: sessionInfo.env,
|
|
1062
|
+
}, sessionInfo.runtimeType, sessionInfo.role, sessionInfo.teamId);
|
|
1063
|
+
if (sessionInfo.claudeSessionId) {
|
|
1064
|
+
persistence.updateSessionId(sessionInfo.name, sessionInfo.claudeSessionId);
|
|
1065
|
+
}
|
|
1066
|
+
}
|
|
1067
|
+
this.logger.info('Loaded persisted session metadata for resume support', {
|
|
1068
|
+
count: savedState.sessions.length,
|
|
1069
|
+
sessionsWithResumeId: savedState.sessions.filter(s => s.claudeSessionId).length,
|
|
1070
|
+
});
|
|
1071
|
+
}
|
|
1072
|
+
}
|
|
1073
|
+
catch (loadError) {
|
|
1074
|
+
this.logger.debug('No persisted session state to load (first run or cleared)', {
|
|
1075
|
+
error: loadError instanceof Error ? loadError.message : String(loadError),
|
|
1076
|
+
});
|
|
1077
|
+
}
|
|
1078
|
+
// Initialize Redis cache (non-blocking — falls back to memory if Redis is unavailable)
|
|
1079
|
+
try {
|
|
1080
|
+
const redisConnected = await RedisCacheService.getInstance().connect();
|
|
1081
|
+
this.logger.info('Redis cache initialized', { connected: redisConnected, backend: redisConnected ? 'redis' : 'memory' });
|
|
1082
|
+
}
|
|
1083
|
+
catch (cacheErr) {
|
|
1084
|
+
this.logger.info('Redis cache not available, using in-memory fallback', {
|
|
1085
|
+
error: cacheErr instanceof Error ? cacheErr.message : String(cacheErr),
|
|
1086
|
+
});
|
|
1087
|
+
}
|
|
1088
|
+
// Start message scheduler
|
|
1089
|
+
this.logger.info('Starting message scheduler...');
|
|
1090
|
+
await this.messageSchedulerService.start();
|
|
1091
|
+
// Restore persisted scheduled checks (non-critical — don't block startup)
|
|
1092
|
+
try {
|
|
1093
|
+
this.logger.info('Restoring persisted scheduled checks...');
|
|
1094
|
+
const [recurringRestored, oneTimeRestored] = await Promise.all([
|
|
1095
|
+
this.schedulerService.restoreRecurringChecks(),
|
|
1096
|
+
this.schedulerService.restoreOneTimeChecks(),
|
|
1097
|
+
]);
|
|
1098
|
+
if (recurringRestored > 0 || oneTimeRestored > 0) {
|
|
1099
|
+
this.logger.info('Restored scheduled checks', { recurringRestored, oneTimeRestored });
|
|
1100
|
+
}
|
|
1101
|
+
}
|
|
1102
|
+
catch (restoreError) {
|
|
1103
|
+
this.logger.warn('Failed to restore scheduled checks (non-critical)', {
|
|
1104
|
+
error: restoreError instanceof Error ? restoreError.message : String(restoreError),
|
|
1105
|
+
});
|
|
1106
|
+
}
|
|
1107
|
+
// Start activity monitoring
|
|
1108
|
+
this.logger.info('Starting activity monitoring...');
|
|
1109
|
+
this.activityMonitorService.startPolling();
|
|
1110
|
+
// Start idle detection for agent suspension
|
|
1111
|
+
this.logger.info('Starting idle detection service...');
|
|
1112
|
+
const idleDetection = IdleDetectionService.getInstance();
|
|
1113
|
+
idleDetection.setAgentRegistrationService(this.apiController.agentRegistrationService);
|
|
1114
|
+
idleDetection.start();
|
|
1115
|
+
// Wire OrchestratorRestartService with dependencies for auto-restart
|
|
1116
|
+
try {
|
|
1117
|
+
const sessionBackend = getSessionBackendSync();
|
|
1118
|
+
if (sessionBackend) {
|
|
1119
|
+
const restartService = OrchestratorRestartService.getInstance();
|
|
1120
|
+
restartService.setDependencies(this.apiController.agentRegistrationService, sessionBackend, this.io);
|
|
1121
|
+
this.logger.info('OrchestratorRestartService wired with dependencies');
|
|
1122
|
+
}
|
|
1123
|
+
}
|
|
1124
|
+
catch (error) {
|
|
1125
|
+
this.logger.warn('Failed to wire OrchestratorRestartService (non-critical)', {
|
|
1126
|
+
error: error instanceof Error ? error.message : String(error),
|
|
1127
|
+
});
|
|
1128
|
+
}
|
|
1129
|
+
// Wire orchestrator-setup service for SlackBridge auto-recovery (B0).
|
|
1130
|
+
// Without this, the bridge's auto-recovery path returns "deps not
|
|
1131
|
+
// initialized" and falls through to the offline branch.
|
|
1132
|
+
try {
|
|
1133
|
+
setOrchestratorSetupDependencies({
|
|
1134
|
+
agentRegistrationService: this.apiController.agentRegistrationService,
|
|
1135
|
+
storageService: this.storageService,
|
|
1136
|
+
});
|
|
1137
|
+
this.logger.info('OrchestratorSetupService wired with dependencies');
|
|
1138
|
+
}
|
|
1139
|
+
catch (error) {
|
|
1140
|
+
this.logger.warn('Failed to wire OrchestratorSetupService (non-critical)', {
|
|
1141
|
+
error: error instanceof Error ? error.message : String(error),
|
|
1142
|
+
});
|
|
1143
|
+
}
|
|
1144
|
+
// Wire and start OrchestratorHeartbeatMonitorService for auto-restart
|
|
1145
|
+
try {
|
|
1146
|
+
const orchHbSessionBackend = getSessionBackendSync();
|
|
1147
|
+
if (orchHbSessionBackend) {
|
|
1148
|
+
const orchHeartbeatMonitor = OrchestratorHeartbeatMonitorService.getInstance();
|
|
1149
|
+
orchHeartbeatMonitor.setDependencies(orchHbSessionBackend, () => this.messageQueueService.hasPending() || this.queueProcessorService.isProcessingMessage());
|
|
1150
|
+
orchHeartbeatMonitor.start();
|
|
1151
|
+
this.logger.info('OrchestratorHeartbeatMonitorService started');
|
|
1152
|
+
}
|
|
1153
|
+
}
|
|
1154
|
+
catch (error) {
|
|
1155
|
+
this.logger.warn('Failed to start OrchestratorHeartbeatMonitorService (non-critical)', {
|
|
1156
|
+
error: error instanceof Error ? error.message : String(error),
|
|
1157
|
+
});
|
|
1158
|
+
}
|
|
1159
|
+
// Wire AgentSuspendService with registration service for rehydration
|
|
1160
|
+
try {
|
|
1161
|
+
AgentSuspendService.getInstance().setDependencies(this.apiController.agentRegistrationService);
|
|
1162
|
+
this.logger.info('AgentSuspendService wired with dependencies');
|
|
1163
|
+
}
|
|
1164
|
+
catch (error) {
|
|
1165
|
+
this.logger.warn('Failed to wire AgentSuspendService (non-critical)', {
|
|
1166
|
+
error: error instanceof Error ? error.message : String(error),
|
|
1167
|
+
});
|
|
1168
|
+
}
|
|
1169
|
+
// Wire and start AgentHeartbeatMonitorService
|
|
1170
|
+
try {
|
|
1171
|
+
const agentHbSessionBackend = getSessionBackendSync();
|
|
1172
|
+
if (agentHbSessionBackend) {
|
|
1173
|
+
const agentHeartbeatMonitor = AgentHeartbeatMonitorService.getInstance();
|
|
1174
|
+
agentHeartbeatMonitor.setDependencies(agentHbSessionBackend, this.apiController.agentRegistrationService, this.storageService);
|
|
1175
|
+
agentHeartbeatMonitor.start();
|
|
1176
|
+
this.logger.info('AgentHeartbeatMonitorService started');
|
|
1177
|
+
}
|
|
1178
|
+
}
|
|
1179
|
+
catch (error) {
|
|
1180
|
+
this.logger.warn('Failed to start AgentHeartbeatMonitorService (non-critical)', {
|
|
1181
|
+
error: error instanceof Error ? error.message : String(error),
|
|
1182
|
+
});
|
|
1183
|
+
}
|
|
1184
|
+
// Wire and start ContextWindowMonitorService
|
|
1185
|
+
try {
|
|
1186
|
+
const ctxSessionBackend = getSessionBackendSync();
|
|
1187
|
+
if (ctxSessionBackend) {
|
|
1188
|
+
const contextWindowMonitor = ContextWindowMonitorService.getInstance();
|
|
1189
|
+
contextWindowMonitor.setDependencies(ctxSessionBackend, this.apiController.agentRegistrationService, this.storageService, this.eventBusService);
|
|
1190
|
+
contextWindowMonitor.start();
|
|
1191
|
+
this.logger.info('ContextWindowMonitorService started');
|
|
1192
|
+
}
|
|
1193
|
+
}
|
|
1194
|
+
catch (error) {
|
|
1195
|
+
this.logger.warn('Failed to start ContextWindowMonitorService (non-critical)', {
|
|
1196
|
+
error: error instanceof Error ? error.message : String(error),
|
|
1197
|
+
});
|
|
1198
|
+
}
|
|
1199
|
+
// Wire OAuthReloginMonitorService EventBus dependency
|
|
1200
|
+
try {
|
|
1201
|
+
OAuthReloginMonitorService.getInstance().setEventBusService(this.eventBusService);
|
|
1202
|
+
}
|
|
1203
|
+
catch (error) {
|
|
1204
|
+
this.logger.warn('Failed to wire OAuthReloginMonitorService EventBus (non-critical)', {
|
|
1205
|
+
error: error instanceof Error ? error.message : String(error),
|
|
1206
|
+
});
|
|
1207
|
+
}
|
|
1208
|
+
// Wire RuntimeExitMonitorService dependencies for task-aware restart
|
|
1209
|
+
try {
|
|
1210
|
+
const runtimeExitMonitor = RuntimeExitMonitorService.getInstance();
|
|
1211
|
+
runtimeExitMonitor.setAgentRegistrationService(this.apiController.agentRegistrationService);
|
|
1212
|
+
runtimeExitMonitor.setEventBusService(this.eventBusService);
|
|
1213
|
+
}
|
|
1214
|
+
catch (error) {
|
|
1215
|
+
this.logger.warn('Failed to wire RuntimeExitMonitorService dependencies (non-critical)', {
|
|
1216
|
+
error: error instanceof Error ? error.message : String(error),
|
|
1217
|
+
});
|
|
1218
|
+
}
|
|
1219
|
+
// Start Crewly in Chrome WebSocket bridge
|
|
1220
|
+
try {
|
|
1221
|
+
const { BrowserBridgeService } = await import('./services/browser/browser-bridge.service.js');
|
|
1222
|
+
const browserBridge = BrowserBridgeService.getInstance();
|
|
1223
|
+
browserBridge.attach(this.httpServer);
|
|
1224
|
+
this.logger.info('Crewly in Chrome WebSocket bridge started');
|
|
1225
|
+
}
|
|
1226
|
+
catch (error) {
|
|
1227
|
+
this.logger.warn('Failed to start Crewly in Chrome bridge (non-critical)', {
|
|
1228
|
+
error: error instanceof Error ? error.message : String(error),
|
|
1229
|
+
});
|
|
1230
|
+
}
|
|
1231
|
+
// Start chat-v2 WebSocket gateway + dispatcher (Phase 1 Chat MVP).
|
|
1232
|
+
// The gateway fans `message`/`presence` frames to subscribers of
|
|
1233
|
+
// `/ws/chat?channelId=...`. The dispatcher pushes user-origin
|
|
1234
|
+
// messages into the bound agent session so it can reply via the
|
|
1235
|
+
// `reply-channel` skill. See chat-v2.gateway.ts for the contract.
|
|
1236
|
+
try {
|
|
1237
|
+
const [{ ChatV2Gateway, devAnonymousTokenVerifier }, { ChatV2DispatcherService }, { ChatV2MentionResolver }, { getChatV2Service }, { setChatV2RealtimeDeps }, { verifyHs256Token },] = await Promise.all([
|
|
1238
|
+
import('./websocket/chat-v2.gateway.js'),
|
|
1239
|
+
import('./services/chat-v2/chat-v2.dispatcher.service.js'),
|
|
1240
|
+
import('./services/chat-v2/chat-v2.mention-resolver.js'),
|
|
1241
|
+
import('./services/chat-v2/chat-v2.singleton.js'),
|
|
1242
|
+
import('./services/chat-v2/chat-v2.realtime-holder.js'),
|
|
1243
|
+
import('./middleware/require-auth.middleware.js'),
|
|
1244
|
+
]);
|
|
1245
|
+
const chatService = getChatV2Service();
|
|
1246
|
+
const jwtSecret = process.env['CREWLY_JWT_SECRET'];
|
|
1247
|
+
const verifyToken = jwtSecret
|
|
1248
|
+
? async (token) => {
|
|
1249
|
+
if (!token)
|
|
1250
|
+
return null;
|
|
1251
|
+
const payload = verifyHs256Token(token, jwtSecret);
|
|
1252
|
+
if (!payload?.sub)
|
|
1253
|
+
return null;
|
|
1254
|
+
return { userId: payload.sub };
|
|
1255
|
+
}
|
|
1256
|
+
: devAnonymousTokenVerifier;
|
|
1257
|
+
const chatGateway = new ChatV2Gateway({ service: chatService, verifyToken });
|
|
1258
|
+
chatGateway.attach(this.httpServer);
|
|
1259
|
+
// Phase C BE.3 — inject the mention resolver so type='channel'
|
|
1260
|
+
// messages fan out to @-mentioned recipients instead of
|
|
1261
|
+
// short-circuiting with strategy='skip' at the dispatcher.
|
|
1262
|
+
// Pattern matches LiveTeamHealthDataProvider wiring (~line 487):
|
|
1263
|
+
// `getTeams: async () => StorageService.getInstance().getTeams()`.
|
|
1264
|
+
const chatMentionResolver = new ChatV2MentionResolver({
|
|
1265
|
+
loadTeams: async () => StorageService.getInstance().getTeams(),
|
|
1266
|
+
});
|
|
1267
|
+
const chatDispatcher = new ChatV2DispatcherService({
|
|
1268
|
+
agentSink: this.apiController.agentRegistrationService,
|
|
1269
|
+
mentionResolver: chatMentionResolver,
|
|
1270
|
+
// Phase B-2 — huddle roster lookup. ChatV2Service owns
|
|
1271
|
+
// the chat_channel_members table; the dispatcher just
|
|
1272
|
+
// needs the list of session names for a given channel
|
|
1273
|
+
// to fan-out a user message to every huddle member.
|
|
1274
|
+
huddleMembersFor: (channelId) => chatService.queryHuddleMembersForDispatch(channelId),
|
|
1275
|
+
});
|
|
1276
|
+
this.chatV2Gateway = chatGateway;
|
|
1277
|
+
this.chatV2Dispatcher = chatDispatcher;
|
|
1278
|
+
// The chat-v2 router mounted earlier reads realtime deps from
|
|
1279
|
+
// this holder at request time, so it picks up broadcast +
|
|
1280
|
+
// dispatch without a re-mount.
|
|
1281
|
+
setChatV2RealtimeDeps({ gateway: chatGateway, dispatcher: chatDispatcher });
|
|
1282
|
+
this.logger.info('chat-v2 WebSocket gateway + dispatcher started', {
|
|
1283
|
+
path: '/ws/chat',
|
|
1284
|
+
authMode: jwtSecret ? 'jwt' : 'dev-anonymous',
|
|
1285
|
+
});
|
|
1286
|
+
// LLM-wiki Phase 1 (redesign 2026-05-22): the prior auto-write
|
|
1287
|
+
// subscriber was REMOVED. Steve's direction: agents decide what
|
|
1288
|
+
// is wiki-worthy from inside the conversation and call the
|
|
1289
|
+
// `wiki-queue-add` skill explicitly. No keyword routing, no
|
|
1290
|
+
// blanket "every chat → log.md." See WikiQueueService for the
|
|
1291
|
+
// queue + the orchestrator system prompt for the agent rule.
|
|
1292
|
+
// Cloud Portal relay bridge — gives the Crewly Portal at
|
|
1293
|
+
// crewlyai.com the same /agents experience by tunnelling chat-v2
|
|
1294
|
+
// RPC calls through the Cloud relay queue + forwarding gateway
|
|
1295
|
+
// broadcasts as `chat_event` messages. Only wired when Cloud Sync
|
|
1296
|
+
// is running (BrowserRelayAdapter pattern).
|
|
1297
|
+
try {
|
|
1298
|
+
const { ChatV2RelayAdapter } = await import('./services/chat-v2/chat-v2.relay-adapter.service.js');
|
|
1299
|
+
const { CloudSyncService } = await import('./services/cloud/cloud-sync.service.js');
|
|
1300
|
+
const { createOssAgentDirectoryProvider, createOssAgentPresenceProvider, } = await import('./services/chat-v2/chat-v2.providers.js');
|
|
1301
|
+
const sync = CloudSyncService.getInstance();
|
|
1302
|
+
if (sync) {
|
|
1303
|
+
const chatRelayAdapter = new ChatV2RelayAdapter({
|
|
1304
|
+
service: chatService,
|
|
1305
|
+
gateway: chatGateway,
|
|
1306
|
+
cloudSync: sync,
|
|
1307
|
+
// Wire the dispatcher so Portal-sent user messages also fire the
|
|
1308
|
+
// agent-side prompt (parity with the HTTP controller path).
|
|
1309
|
+
// Without this, Portal user-messages persist but the bound agent
|
|
1310
|
+
// never receives the `[CHAT:<id>]` prompt — orc/etc. stay silent.
|
|
1311
|
+
dispatcher: chatDispatcher,
|
|
1312
|
+
directory: createOssAgentDirectoryProvider(this.storageService),
|
|
1313
|
+
presence: createOssAgentPresenceProvider(this.storageService),
|
|
1314
|
+
});
|
|
1315
|
+
chatRelayAdapter.start();
|
|
1316
|
+
this.logger.info('ChatV2RelayAdapter started — Cloud Portal can now drive chat-v2 via relay');
|
|
1317
|
+
}
|
|
1318
|
+
}
|
|
1319
|
+
catch (err) {
|
|
1320
|
+
// Adapter wiring failure is non-fatal — local OSS UI still works.
|
|
1321
|
+
this.logger.warn('ChatV2RelayAdapter wiring skipped', {
|
|
1322
|
+
error: err instanceof Error ? err.message : String(err),
|
|
1323
|
+
});
|
|
1324
|
+
}
|
|
1325
|
+
// Onboarding v3 (B1) — wire the cold-start detector with the
|
|
1326
|
+
// chat-v2 service we just stood up. The orc bootstrap path
|
|
1327
|
+
// (CrewlyAgentExternalRuntimeService.detectOnboardingMode) probes this
|
|
1328
|
+
// singleton; null means "skip the cold-start probe", so this
|
|
1329
|
+
// wiring is what flips onboarding mode on for the demo path.
|
|
1330
|
+
try {
|
|
1331
|
+
const { OnboardingBootstrapService, setOnboardingBootstrapService } = await import('./services/orchestrator/onboarding-bootstrap.service.js');
|
|
1332
|
+
setOnboardingBootstrapService(new OnboardingBootstrapService({
|
|
1333
|
+
storage: this.storageService,
|
|
1334
|
+
chat: { countAllMessages: () => chatService.countAllMessages() },
|
|
1335
|
+
}));
|
|
1336
|
+
this.logger.info('OnboardingBootstrapService wired with storage + chat probes');
|
|
1337
|
+
}
|
|
1338
|
+
catch (wireErr) {
|
|
1339
|
+
this.logger.warn('Failed to wire OnboardingBootstrapService (non-critical)', {
|
|
1340
|
+
error: wireErr instanceof Error ? wireErr.message : String(wireErr),
|
|
1341
|
+
});
|
|
1342
|
+
}
|
|
1343
|
+
}
|
|
1344
|
+
catch (error) {
|
|
1345
|
+
// F-CYCLE7-1: a native-binding failure (e.g. better-sqlite3 built
|
|
1346
|
+
// for the wrong arch) MUST crash the boot rather than be downgraded
|
|
1347
|
+
// to a JSON-file fallback. The audit on 2026-05-07 caught this
|
|
1348
|
+
// exact path: chat.db went stale at 11:17Z because dlopen errors
|
|
1349
|
+
// were swallowed here as "non-critical", so operators had no signal
|
|
1350
|
+
// to run `npm rebuild better-sqlite3 --build-from-source`.
|
|
1351
|
+
//
|
|
1352
|
+
// `isNativeBindingFatalError` matches structurally (not just via
|
|
1353
|
+
// instanceof) so realm-boundary cases — same module loaded via
|
|
1354
|
+
// two require paths — still trip the rethrow.
|
|
1355
|
+
if (isNativeBindingFatalError(error)) {
|
|
1356
|
+
this.logger.error('FATAL native binding failed at chat-v2 boot — refusing to downgrade to JSON fallback. Run the printed remediation and restart.', { error: error.message });
|
|
1357
|
+
throw error;
|
|
1358
|
+
}
|
|
1359
|
+
this.logger.warn('Failed to start chat-v2 WS gateway (non-critical)', {
|
|
1360
|
+
error: error instanceof Error ? error.message : String(error),
|
|
1361
|
+
});
|
|
1362
|
+
}
|
|
1363
|
+
// Connect BrowserProxyService to Cloud Relay (lazy — does not block startup)
|
|
1364
|
+
try {
|
|
1365
|
+
const { BrowserProxyService } = await import('./services/browser/browser-proxy.service.js');
|
|
1366
|
+
const { CloudClientService } = await import('./services/cloud/cloud-client.service.js');
|
|
1367
|
+
const cloudClient = CloudClientService.getInstance();
|
|
1368
|
+
const browserProxy = BrowserProxyService.getInstance();
|
|
1369
|
+
// Wire up the token resolver that reconnects call to obtain a fresh token.
|
|
1370
|
+
// Reconnects must use the freshest RELAY token (NOT the access token,
|
|
1371
|
+
// per the RELAY-TOKEN-TYPE invariant). The relay only accepts a relay-
|
|
1372
|
+
// signed access JWT; the access token churns the socket.
|
|
1373
|
+
browserProxy.setTokenResolver(() => cloudClient.getRelayToken());
|
|
1374
|
+
// Subscribe to RELAY-token refresh events (distinct channel from the
|
|
1375
|
+
// access-token refresh) so the proxy re-registers in place with the
|
|
1376
|
+
// fresh relay token before its exp.
|
|
1377
|
+
cloudClient.onRelayTokenRefresh((newRelayToken) => {
|
|
1378
|
+
browserProxy.updateToken(newRelayToken);
|
|
1379
|
+
});
|
|
1380
|
+
const relayToken = cloudClient.getRelayToken();
|
|
1381
|
+
if (relayToken) {
|
|
1382
|
+
browserProxy.connect(relayToken);
|
|
1383
|
+
this.logger.info('BrowserProxyService connecting to Cloud Relay');
|
|
1384
|
+
}
|
|
1385
|
+
else {
|
|
1386
|
+
// No relay token yet (connectLocal mints it asynchronously). Defer
|
|
1387
|
+
// connect to the onRelayTokenRefresh callback above rather than
|
|
1388
|
+
// connecting with the wrong (access) token.
|
|
1389
|
+
this.logger.debug('BrowserProxyService deferred — no relay token yet, will connect on relay-token refresh');
|
|
1390
|
+
}
|
|
1391
|
+
}
|
|
1392
|
+
catch (error) {
|
|
1393
|
+
this.logger.warn('Failed to initialize BrowserProxyService (non-critical)', {
|
|
1394
|
+
error: error instanceof Error ? error.message : String(error),
|
|
1395
|
+
});
|
|
1396
|
+
}
|
|
1397
|
+
// Start team activity WebSocket service
|
|
1398
|
+
this.logger.info('Starting team activity WebSocket service...');
|
|
1399
|
+
this.teamActivityWebSocketService.start();
|
|
1400
|
+
// Start teams.json file watcher for real-time updates
|
|
1401
|
+
this.logger.info('Starting teams.json file watcher...');
|
|
1402
|
+
this.teamsJsonWatcherService.start();
|
|
1403
|
+
this.logger.info('Teams.json file watcher started for real-time updates');
|
|
1404
|
+
// Generate orchestrator skills catalog
|
|
1405
|
+
try {
|
|
1406
|
+
const skillCatalogProjectRoot = findPackageRoot(__dirname);
|
|
1407
|
+
const catalogService = SkillCatalogService.getInstance(skillCatalogProjectRoot);
|
|
1408
|
+
const catalogResult = await catalogService.generateCatalog();
|
|
1409
|
+
this.logger.info('Orchestrator skills catalog generated', {
|
|
1410
|
+
catalogPath: catalogResult.catalogPath,
|
|
1411
|
+
skillCount: catalogResult.skillCount,
|
|
1412
|
+
});
|
|
1413
|
+
const agentCatalogResult = await catalogService.generateAgentCatalog();
|
|
1414
|
+
this.logger.info('Agent skills catalog generated', {
|
|
1415
|
+
catalogPath: agentCatalogResult.catalogPath,
|
|
1416
|
+
skillCount: agentCatalogResult.skillCount,
|
|
1417
|
+
});
|
|
1418
|
+
}
|
|
1419
|
+
catch (error) {
|
|
1420
|
+
this.logger.warn('Failed to generate skills catalog (non-critical)', {
|
|
1421
|
+
error: error instanceof Error ? error.message : String(error),
|
|
1422
|
+
});
|
|
1423
|
+
}
|
|
1424
|
+
// Restore persisted message queue state (pending messages survive restarts)
|
|
1425
|
+
this.logger.info('Loading persisted message queue state...');
|
|
1426
|
+
try {
|
|
1427
|
+
await this.messageQueueService.loadPersistedState();
|
|
1428
|
+
const queueStatus = this.messageQueueService.getStatus();
|
|
1429
|
+
if (queueStatus.pendingCount > 0) {
|
|
1430
|
+
this.logger.info('Restored pending messages from previous session', {
|
|
1431
|
+
pendingCount: queueStatus.pendingCount,
|
|
1432
|
+
});
|
|
1433
|
+
}
|
|
1434
|
+
}
|
|
1435
|
+
catch (error) {
|
|
1436
|
+
this.logger.warn('Failed to load persisted queue state', {
|
|
1437
|
+
error: error instanceof Error ? error.message : String(error),
|
|
1438
|
+
});
|
|
1439
|
+
}
|
|
1440
|
+
// Load thread status queue from disk so replay can check terminal statuses
|
|
1441
|
+
try {
|
|
1442
|
+
await this.threadStatusQueueService.loadPersistedState();
|
|
1443
|
+
}
|
|
1444
|
+
catch (err) {
|
|
1445
|
+
this.logger.warn('Failed to load thread status queue state (non-critical)', {
|
|
1446
|
+
error: err instanceof Error ? err.message : String(err),
|
|
1447
|
+
});
|
|
1448
|
+
}
|
|
1449
|
+
// Backfill: mark Slack threads for done Requests as terminal so the
|
|
1450
|
+
// resume notification won't re-send already-answered conversations.
|
|
1451
|
+
try {
|
|
1452
|
+
const { RequestService } = await import('./services/v3/request.service.js');
|
|
1453
|
+
const { extractSlackChannelId, extractSlackThreadTs } = await import('./services/v3/request-sla.subscriber.js');
|
|
1454
|
+
const reqSvc = RequestService.getInstance();
|
|
1455
|
+
const allReqs = await reqSvc.listAll();
|
|
1456
|
+
let backfilled = 0;
|
|
1457
|
+
for (const req of allReqs) {
|
|
1458
|
+
if (req.status !== 'done')
|
|
1459
|
+
continue;
|
|
1460
|
+
const scid = req.sourceConversationItemId || '';
|
|
1461
|
+
// `extractSlack*` strips the optional `-msg-{ts}` thread-reply
|
|
1462
|
+
// suffix before parsing, so both top-level and in-thread
|
|
1463
|
+
// Requests resolve to the canonical `{channelId}:{threadRoot}`.
|
|
1464
|
+
// Previously a local regex was used here and its greedy `.+`
|
|
1465
|
+
// swallowed the suffix, producing a malformed threadKey that
|
|
1466
|
+
// missed the dedup check and bloated the persistence file.
|
|
1467
|
+
const channelId = extractSlackChannelId(scid);
|
|
1468
|
+
const threadTs = extractSlackThreadTs(scid);
|
|
1469
|
+
if (!channelId || !threadTs)
|
|
1470
|
+
continue;
|
|
1471
|
+
const threadKey = `${channelId}:${threadTs}`;
|
|
1472
|
+
if (this.threadStatusQueueService.get(threadKey))
|
|
1473
|
+
continue;
|
|
1474
|
+
this.threadStatusQueueService.trackInbound({
|
|
1475
|
+
threadKey,
|
|
1476
|
+
conversationId: scid,
|
|
1477
|
+
source: 'slack',
|
|
1478
|
+
messagePreview: req.title.slice(0, 200),
|
|
1479
|
+
});
|
|
1480
|
+
this.threadStatusQueueService.markReplied(threadKey, 'replied_completed');
|
|
1481
|
+
backfilled++;
|
|
1482
|
+
}
|
|
1483
|
+
if (backfilled > 0) {
|
|
1484
|
+
this.logger.info('Backfilled thread status for done Requests', { count: backfilled });
|
|
1485
|
+
}
|
|
1486
|
+
}
|
|
1487
|
+
catch (err) {
|
|
1488
|
+
this.logger.warn('Thread status backfill failed (non-critical)', {
|
|
1489
|
+
error: err instanceof Error ? err.message : String(err),
|
|
1490
|
+
});
|
|
1491
|
+
}
|
|
1492
|
+
// #247: Replay pending messages that arrived while the orchestrator was offline.
|
|
1493
|
+
// This must happen after loadPersistedState() (so we know what's already queued)
|
|
1494
|
+
// but before the queue processor starts (so replayed messages are ready for delivery).
|
|
1495
|
+
try {
|
|
1496
|
+
const { MessageReplayService } = await import('./services/messaging/message-replay.service.js');
|
|
1497
|
+
const replayService = new MessageReplayService(this.messageQueueService, this.config.crewlyHome);
|
|
1498
|
+
const replayResult = await replayService.replayPendingMessages();
|
|
1499
|
+
if (replayResult.replayedCount > 0) {
|
|
1500
|
+
this.logger.info('Replayed pending messages from offline period (#247)', {
|
|
1501
|
+
replayed: replayResult.replayedCount,
|
|
1502
|
+
found: replayResult.foundCount,
|
|
1503
|
+
skipped: replayResult.skippedDuplicate,
|
|
1504
|
+
offlineSince: replayResult.offlineSince,
|
|
1505
|
+
offlineDurationMs: replayResult.offlineDurationMs,
|
|
1506
|
+
});
|
|
1507
|
+
}
|
|
1508
|
+
}
|
|
1509
|
+
catch (replayErr) {
|
|
1510
|
+
this.logger.warn('Failed to replay pending messages (non-critical)', {
|
|
1511
|
+
error: replayErr instanceof Error ? replayErr.message : String(replayErr),
|
|
1512
|
+
});
|
|
1513
|
+
}
|
|
1514
|
+
// Start message queue processor
|
|
1515
|
+
this.logger.info('Starting message queue processor...');
|
|
1516
|
+
this.queueProcessorService.start();
|
|
1517
|
+
// Thread Status Queue: recover pending threads (persisted state was already loaded above, before replay)
|
|
1518
|
+
try {
|
|
1519
|
+
const recoveryResult = await this.threadStatusQueueService.recoverPendingThreads(this.messageQueueService, {
|
|
1520
|
+
agentStatusChecker: {
|
|
1521
|
+
getAgentWorkingStatus: async (sessionName) => {
|
|
1522
|
+
const status = await this.activityMonitorService.getWorkingStatusForSession(sessionName);
|
|
1523
|
+
if (status === null)
|
|
1524
|
+
return 'unknown';
|
|
1525
|
+
return status;
|
|
1526
|
+
},
|
|
1527
|
+
},
|
|
1528
|
+
});
|
|
1529
|
+
if (recoveryResult.reEnqueued > 0 || recoveryResult.followUpRestored > 0 || recoveryResult.delegationsCompleted > 0) {
|
|
1530
|
+
this.logger.info('Thread status queue recovery complete', recoveryResult);
|
|
1531
|
+
}
|
|
1532
|
+
if (recoveryResult.expired > 0 || recoveryResult.cleaned > 0) {
|
|
1533
|
+
this.logger.info('Thread status queue maintenance', {
|
|
1534
|
+
expired: recoveryResult.expired,
|
|
1535
|
+
cleaned: recoveryResult.cleaned,
|
|
1536
|
+
});
|
|
1537
|
+
}
|
|
1538
|
+
}
|
|
1539
|
+
catch (err) {
|
|
1540
|
+
this.logger.warn('Thread status queue recovery failed (non-critical)', {
|
|
1541
|
+
error: err instanceof Error ? err.message : String(err),
|
|
1542
|
+
});
|
|
1543
|
+
}
|
|
1544
|
+
// #286: Start cron task service with agent status/start callbacks
|
|
1545
|
+
try {
|
|
1546
|
+
const cronTaskService = CronTaskService.getInstance();
|
|
1547
|
+
const storageRef = this.storageService;
|
|
1548
|
+
const registrationRef = this.apiController.agentRegistrationService;
|
|
1549
|
+
cronTaskService.setExecutionCallback(async (task) => {
|
|
1550
|
+
this.logger.info('Executing cron task', { id: task.id, target: task.targetAgent });
|
|
1551
|
+
await registrationRef.sendMessageToAgent(task.targetAgent, `[CRON_TASK:${task.id}] ${task.taskDescription}`);
|
|
1552
|
+
});
|
|
1553
|
+
// Issue #307: cron tasks created with `targetTeamId` set to a
|
|
1554
|
+
// name slug (e.g. "stock-ops-team") instead of the UUID would
|
|
1555
|
+
// silently 404 on every fire — `teams.find(t => t.id === teamId)`
|
|
1556
|
+
// returned undefined and both callbacks returned `false` with
|
|
1557
|
+
// no log surface. `resolveTeamByIdOrSlug` (imported statically
|
|
1558
|
+
// at the top of the file) tries UUID first, then falls back
|
|
1559
|
+
// to a slug match against `name`. Misses now surface a distinct
|
|
1560
|
+
// warn-log with the available slugs so the cause is visible
|
|
1561
|
+
// instead of hiding behind the generic "agent offline" warn
|
|
1562
|
+
// from cron-task.service.
|
|
1563
|
+
cronTaskService.setAgentStatusCallback(async (sessionName, teamId) => {
|
|
1564
|
+
// Handle orchestrator separately — it's not in regular teams
|
|
1565
|
+
if (sessionName === CREWLY_CONSTANTS.SESSIONS.ORCHESTRATOR_NAME || teamId === 'orchestrator') {
|
|
1566
|
+
const orchStatus = await storageRef.getOrchestratorStatus();
|
|
1567
|
+
return orchStatus?.agentStatus === 'active' || orchStatus?.agentStatus === 'started';
|
|
1568
|
+
}
|
|
1569
|
+
const teams = await storageRef.getTeams();
|
|
1570
|
+
const team = resolveTeamByIdOrSlug(teams, teamId);
|
|
1571
|
+
if (!team) {
|
|
1572
|
+
this.logger.warn('CronTask: targetTeamId resolves to no team', {
|
|
1573
|
+
sessionName,
|
|
1574
|
+
targetTeamId: teamId,
|
|
1575
|
+
availableSlugs: teams.slice(0, 10).map((t) => slugifyTeamName(t.name)),
|
|
1576
|
+
hint: 'Set targetTeamId to either the team UUID or one of availableSlugs (lowercase, spaces→-)',
|
|
1577
|
+
});
|
|
1578
|
+
return false;
|
|
1579
|
+
}
|
|
1580
|
+
const member = team.members.find((m) => m.sessionName === sessionName);
|
|
1581
|
+
if (!member)
|
|
1582
|
+
return false;
|
|
1583
|
+
// #286 Root Cause C: treat both 'active' and 'started' as online
|
|
1584
|
+
return member.agentStatus === 'active' || member.agentStatus === 'started';
|
|
1585
|
+
});
|
|
1586
|
+
cronTaskService.setAgentStartCallback(async (sessionName, teamId) => {
|
|
1587
|
+
try {
|
|
1588
|
+
const teams = await storageRef.getTeams();
|
|
1589
|
+
const team = resolveTeamByIdOrSlug(teams, teamId);
|
|
1590
|
+
if (!team) {
|
|
1591
|
+
this.logger.warn('CronTask auto-start: targetTeamId resolves to no team', {
|
|
1592
|
+
sessionName,
|
|
1593
|
+
targetTeamId: teamId,
|
|
1594
|
+
availableSlugs: teams.slice(0, 10).map((t) => slugifyTeamName(t.name)),
|
|
1595
|
+
hint: 'Set targetTeamId to either the team UUID or one of availableSlugs (lowercase, spaces→-)',
|
|
1596
|
+
});
|
|
1597
|
+
return false;
|
|
1598
|
+
}
|
|
1599
|
+
const member = team.members.find((m) => m.sessionName === sessionName);
|
|
1600
|
+
if (!member)
|
|
1601
|
+
return false;
|
|
1602
|
+
await registrationRef.createAgentSession({
|
|
1603
|
+
sessionName: member.sessionName,
|
|
1604
|
+
role: member.role,
|
|
1605
|
+
// Use the resolved team's UUID — not the user-supplied identifier
|
|
1606
|
+
// — so downstream agent-registration always sees the canonical id.
|
|
1607
|
+
teamId: team.id,
|
|
1608
|
+
memberId: member.id,
|
|
1609
|
+
});
|
|
1610
|
+
return true;
|
|
1611
|
+
}
|
|
1612
|
+
catch {
|
|
1613
|
+
return false;
|
|
1614
|
+
}
|
|
1615
|
+
});
|
|
1616
|
+
// Self-heal stale nextRunAt values from pre-timezone-fix versions
|
|
1617
|
+
await cronTaskService.recalculateAllNextRunTimes();
|
|
1618
|
+
cronTaskService.start();
|
|
1619
|
+
this.logger.info('CronTaskService started');
|
|
1620
|
+
}
|
|
1621
|
+
catch (cronErr) {
|
|
1622
|
+
this.logger.warn('CronTaskService initialization failed (non-critical)', {
|
|
1623
|
+
error: cronErr instanceof Error ? cronErr.message : String(cronErr),
|
|
1624
|
+
});
|
|
1625
|
+
}
|
|
1626
|
+
// Start TriggerEngine (V3 unified trigger system) and wire action handler
|
|
1627
|
+
try {
|
|
1628
|
+
const { TriggerEngine } = await import('./services/v3/trigger-engine.service.js');
|
|
1629
|
+
const { TaskProjectionService } = await import('./services/v3/task-projection.service.js');
|
|
1630
|
+
const triggerEngine = TriggerEngine.getInstance();
|
|
1631
|
+
const taskProjection = TaskProjectionService.getInstance();
|
|
1632
|
+
// Load TaskProjection state from disk
|
|
1633
|
+
await taskProjection.load();
|
|
1634
|
+
// Wire EventBus so signal triggers can subscribe to events
|
|
1635
|
+
triggerEngine.setEventBus(this.eventBusService);
|
|
1636
|
+
// Wire action handler — executes the effect when a trigger fires
|
|
1637
|
+
triggerEngine.setActionHandler(async (trigger, action) => {
|
|
1638
|
+
const triggerId = trigger.id;
|
|
1639
|
+
const logger = this.logger;
|
|
1640
|
+
// 1. sendMessage — enqueue a message to the orchestrator session
|
|
1641
|
+
if (action.sendMessage) {
|
|
1642
|
+
const { target, message } = action.sendMessage;
|
|
1643
|
+
try {
|
|
1644
|
+
// Format with trigger context so Agent knows why it was woken
|
|
1645
|
+
const formattedContent = `[SYSTEM ALERT] Trigger '${triggerId}' says: ${message}`;
|
|
1646
|
+
this.messageQueueService.enqueue({
|
|
1647
|
+
content: formattedContent,
|
|
1648
|
+
conversationId: target || 'system',
|
|
1649
|
+
source: 'system_event',
|
|
1650
|
+
});
|
|
1651
|
+
logger.info('TriggerEngine: sendMessage enqueued', { triggerId, target });
|
|
1652
|
+
}
|
|
1653
|
+
catch (err) {
|
|
1654
|
+
logger.warn('TriggerEngine: sendMessage failed', {
|
|
1655
|
+
triggerId,
|
|
1656
|
+
error: err instanceof Error ? err.message : String(err),
|
|
1657
|
+
});
|
|
1658
|
+
}
|
|
1659
|
+
}
|
|
1660
|
+
// 2. createWorkItem — push a new WorkItem into the task pool
|
|
1661
|
+
if (action.createWorkItem) {
|
|
1662
|
+
try {
|
|
1663
|
+
const { TaskPoolService } = await import('./services/task-pool/task-pool.service.js');
|
|
1664
|
+
const { createWorkItem } = await import('./types/v2/work-item.types.js');
|
|
1665
|
+
const template = action.createWorkItem;
|
|
1666
|
+
const workItem = createWorkItem({
|
|
1667
|
+
title: template.title || `Triggered task (${trigger.id})`,
|
|
1668
|
+
description: template.description || `Auto-created by trigger ${trigger.id}`,
|
|
1669
|
+
type: template.type ?? 'delegate',
|
|
1670
|
+
owner: template.owner ?? 'orchestrator',
|
|
1671
|
+
target: template.target,
|
|
1672
|
+
triggerId,
|
|
1673
|
+
requestId: template.requestId,
|
|
1674
|
+
});
|
|
1675
|
+
await TaskPoolService.getInstance().addToPool(workItem);
|
|
1676
|
+
// Project as a trigger_action TaskRecord
|
|
1677
|
+
await taskProjection.createRecord({
|
|
1678
|
+
title: workItem.title,
|
|
1679
|
+
type: 'trigger_action',
|
|
1680
|
+
ownerAgent: 'system',
|
|
1681
|
+
triggerId,
|
|
1682
|
+
workItemId: workItem.id,
|
|
1683
|
+
});
|
|
1684
|
+
logger.info('TriggerEngine: WorkItem enqueued', { triggerId, workItemId: workItem.id });
|
|
1685
|
+
}
|
|
1686
|
+
catch (err) {
|
|
1687
|
+
logger.warn('TriggerEngine: createWorkItem failed', {
|
|
1688
|
+
triggerId,
|
|
1689
|
+
error: err instanceof Error ? err.message : String(err),
|
|
1690
|
+
});
|
|
1691
|
+
}
|
|
1692
|
+
}
|
|
1693
|
+
// 3. wakeWorkItemId — re-queue a suspended/blocked WorkItem with context note
|
|
1694
|
+
if (action.wakeWorkItemId) {
|
|
1695
|
+
try {
|
|
1696
|
+
const { TaskPoolService } = await import('./services/task-pool/task-pool.service.js');
|
|
1697
|
+
const taskPool = TaskPoolService.getInstance();
|
|
1698
|
+
// System note explaining why the item was woken; it is only attached to the log entry emitted below
|
|
1699
|
+
const wakeNote = `\n\n[SYSTEM NOTE] Woken automatically by Trigger '${triggerId}' at ${new Date().toISOString()}.`;
|
|
1700
|
+
await taskPool.updateItemStatus(action.wakeWorkItemId, 'queued');
|
|
1701
|
+
// Record the wake in the task projection (the WorkItem description itself is not modified)
|
|
1702
|
+
try {
|
|
1703
|
+
const item = (await taskPool.getAllItems()).find(wi => wi.id === action.wakeWorkItemId);
|
|
1704
|
+
if (item) {
|
|
1705
|
+
await taskPool.updateTokenUsage(action.wakeWorkItemId, item.inputTokens, item.outputTokens, item.cost);
|
|
1706
|
+
// We use the storage directly through the service — patch description via a minimal re-add isn't feasible,
|
|
1707
|
+
// so we write the note to the task projection instead:
|
|
1708
|
+
await taskProjection.createRecord({
|
|
1709
|
+
title: `[TRIGGER WAKE] ${item.title}`,
|
|
1710
|
+
type: 'trigger_action',
|
|
1711
|
+
ownerAgent: 'system',
|
|
1712
|
+
triggerId,
|
|
1713
|
+
workItemId: item.id,
|
|
1714
|
+
requestId: item.requestId,
|
|
1715
|
+
});
|
|
1716
|
+
}
|
|
1717
|
+
}
|
|
1718
|
+
catch { /* non-critical */ }
|
|
1719
|
+
logger.info('TriggerEngine: WorkItem woken', { triggerId, workItemId: action.wakeWorkItemId, note: wakeNote });
|
|
1720
|
+
}
|
|
1721
|
+
catch (err) {
|
|
1722
|
+
logger.warn('TriggerEngine: wakeWorkItemId failed', {
|
|
1723
|
+
triggerId,
|
|
1724
|
+
workItemId: action.wakeWorkItemId,
|
|
1725
|
+
error: err instanceof Error ? err.message : String(err),
|
|
1726
|
+
});
|
|
1727
|
+
}
|
|
1728
|
+
}
|
|
1729
|
+
// 4. runReconciler — trigger a targeted reconciliation cycle
|
|
1730
|
+
if (action.runReconciler && this.reconcilerService) {
|
|
1731
|
+
try {
|
|
1732
|
+
await this.reconcilerService.runFull();
|
|
1733
|
+
logger.info('TriggerEngine: Reconciler run triggered', { triggerId });
|
|
1734
|
+
}
|
|
1735
|
+
catch (err) {
|
|
1736
|
+
logger.warn('TriggerEngine: runReconciler failed', {
|
|
1737
|
+
triggerId,
|
|
1738
|
+
error: err instanceof Error ? err.message : String(err),
|
|
1739
|
+
});
|
|
1740
|
+
}
|
|
1741
|
+
}
|
|
1742
|
+
});
|
|
1743
|
+
await triggerEngine.start();
|
|
1744
|
+
this.logger.info('TriggerEngine started with action handler wired');
|
|
1745
|
+
// Wire team-scoped triggers: reconcile declarative Team.triggers spec
|
|
1746
|
+
// against the running engine on every team save, and cancel all of a
|
|
1747
|
+
// team's triggers when it's deleted. Listener is fire-and-forget.
|
|
1748
|
+
try {
|
|
1749
|
+
const { TeamTriggerReconciler } = await import('./services/v3/team-trigger-reconciler.service.js');
|
|
1750
|
+
const reconciler = new TeamTriggerReconciler(triggerEngine);
|
|
1751
|
+
// Initial converge: reconcile every team that already exists on disk.
|
|
1752
|
+
const existingTeams = await this.storageService.getTeams();
|
|
1753
|
+
for (const team of existingTeams) {
|
|
1754
|
+
try {
|
|
1755
|
+
await reconciler.reconcile(team);
|
|
1756
|
+
}
|
|
1757
|
+
catch (recErr) {
|
|
1758
|
+
this.logger.warn('Initial team-trigger reconcile failed', {
|
|
1759
|
+
teamId: team.id,
|
|
1760
|
+
error: recErr instanceof Error ? recErr.message : String(recErr),
|
|
1761
|
+
});
|
|
1762
|
+
}
|
|
1763
|
+
}
|
|
1764
|
+
// Ongoing: subscribe to storage events.
|
|
1765
|
+
this.storageService.onStorageEvent(async (event) => {
|
|
1766
|
+
if (event.kind === 'team-saved') {
|
|
1767
|
+
await reconciler.reconcile(event.team);
|
|
1768
|
+
}
|
|
1769
|
+
else if (event.kind === 'team-deleted') {
|
|
1770
|
+
await reconciler.unregisterAll(event.teamId);
|
|
1771
|
+
}
|
|
1772
|
+
});
|
|
1773
|
+
this.logger.info('TeamTriggerReconciler subscribed to storage events', {
|
|
1774
|
+
initialTeams: existingTeams.length,
|
|
1775
|
+
});
|
|
1776
|
+
}
|
|
1777
|
+
catch (recErr) {
|
|
1778
|
+
this.logger.warn('TeamTriggerReconciler wiring failed (non-critical)', {
|
|
1779
|
+
error: recErr instanceof Error ? recErr.message : String(recErr),
|
|
1780
|
+
});
|
|
1781
|
+
}
|
|
1782
|
+
}
|
|
1783
|
+
catch (triggerErr) {
|
|
1784
|
+
this.logger.warn('TriggerEngine initialization failed (non-critical)', {
|
|
1785
|
+
error: triggerErr instanceof Error ? triggerErr.message : String(triggerErr),
|
|
1786
|
+
});
|
|
1787
|
+
}
|
|
1788
|
+
// Start V3DataService — listens for v3:task_delegated / v3:task_completed events
|
|
1789
|
+
// and links WorkItems to their parent Requests via requestService.linkWorkItem().
|
|
1790
|
+
// Must be initialized after EventBusService is ready.
|
|
1791
|
+
try {
|
|
1792
|
+
const { V3DataService } = await import('./services/v3/v3-data.service.js');
|
|
1793
|
+
new V3DataService(this.eventBusService, process.cwd());
|
|
1794
|
+
this.logger.info('V3DataService started — WorkItem↔Request linking active');
|
|
1795
|
+
}
|
|
1796
|
+
catch (v3Err) {
|
|
1797
|
+
this.logger.warn('V3DataService initialization failed (non-critical)', {
|
|
1798
|
+
error: v3Err instanceof Error ? v3Err.message : String(v3Err),
|
|
1799
|
+
});
|
|
1800
|
+
}
|
|
1801
|
+
// Start WorkItemDispatchSubscriber FIRST — AgentAutoClaim's recovery
|
|
1802
|
+
// path delegates to its dispatchTo() for the "active target session"
|
|
1803
|
+
// branch, so the singleton must be reachable when recovery fires.
|
|
1804
|
+
try {
|
|
1805
|
+
const { WorkItemDispatchSubscriber } = await import('./services/v3/workitem-dispatch.subscriber.js');
|
|
1806
|
+
const dispatchSubscriber = WorkItemDispatchSubscriber.getInstance();
|
|
1807
|
+
dispatchSubscriber.initialize(this.eventBusService);
|
|
1808
|
+
dispatchSubscriber.start();
|
|
1809
|
+
this.logger.info('WorkItemDispatchSubscriber started — workitem:queued events push to target sessions');
|
|
1810
|
+
}
|
|
1811
|
+
catch (dispatchErr) {
|
|
1812
|
+
this.logger.warn('WorkItemDispatchSubscriber initialization failed (non-critical)', {
|
|
1813
|
+
error: dispatchErr instanceof Error ? dispatchErr.message : String(dispatchErr),
|
|
1814
|
+
});
|
|
1815
|
+
}
|
|
1816
|
+
// Start AgentAutoClaimService — auto-assign work to idle agents
|
|
1817
|
+
try {
|
|
1818
|
+
const { AgentAutoClaimService } = await import('./services/v3/agent-auto-claim.service.js');
|
|
1819
|
+
const autoClaimService = AgentAutoClaimService.getInstance();
|
|
1820
|
+
autoClaimService.initialize(this.eventBusService);
|
|
1821
|
+
await autoClaimService.start();
|
|
1822
|
+
this.logger.info('AgentAutoClaimService started — idle agents will auto-claim work');
|
|
1823
|
+
}
|
|
1824
|
+
catch (autoClaimErr) {
|
|
1825
|
+
this.logger.warn('AgentAutoClaimService initialization failed (non-critical)', {
|
|
1826
|
+
error: autoClaimErr instanceof Error ? autoClaimErr.message : String(autoClaimErr),
|
|
1827
|
+
});
|
|
1828
|
+
}
|
|
1829
|
+
// Start TLAutoVerifyService — auto-trigger TL verification on worker task completion
|
|
1830
|
+
try {
|
|
1831
|
+
const { TLAutoVerifyService } = await import('./services/v3/tl-auto-verify.service.js');
|
|
1832
|
+
const tlVerifyService = TLAutoVerifyService.getInstance();
|
|
1833
|
+
tlVerifyService.initialize(this.eventBusService);
|
|
1834
|
+
tlVerifyService.start();
|
|
1835
|
+
this.logger.info('TLAutoVerifyService started — worker completions trigger TL verification');
|
|
1836
|
+
}
|
|
1837
|
+
catch (tlVerifyErr) {
|
|
1838
|
+
this.logger.warn('TLAutoVerifyService initialization failed (non-critical)', {
|
|
1839
|
+
error: tlVerifyErr instanceof Error ? tlVerifyErr.message : String(tlVerifyErr),
|
|
1840
|
+
});
|
|
1841
|
+
}
|
|
1842
|
+
// Initialize MissionExecutorService — Mission lifecycle + decomposition processing
|
|
1843
|
+
try {
|
|
1844
|
+
const { MissionExecutorService } = await import('./services/v3/mission-executor.service.js');
|
|
1845
|
+
MissionExecutorService.getInstance();
|
|
1846
|
+
this.logger.info('MissionExecutorService initialized — Mission decomposition + progress tracking ready');
|
|
1847
|
+
}
|
|
1848
|
+
catch (missionErr) {
|
|
1849
|
+
this.logger.warn('MissionExecutorService initialization failed (non-critical)', {
|
|
1850
|
+
error: missionErr instanceof Error ? missionErr.message : String(missionErr),
|
|
1851
|
+
});
|
|
1852
|
+
}
|
|
1853
|
+
// Start marketplace auto-update (check registry every 6 hours)
|
|
1854
|
+
try {
|
|
1855
|
+
const { startAutoUpdate } = await import('./services/marketplace/marketplace-auto-update.service.js');
|
|
1856
|
+
startAutoUpdate();
|
|
1857
|
+
}
|
|
1858
|
+
catch (autoUpdateErr) {
|
|
1859
|
+
this.logger.warn('Marketplace auto-update startup failed (non-fatal)', {
|
|
1860
|
+
error: autoUpdateErr instanceof Error ? autoUpdateErr.message : String(autoUpdateErr),
|
|
1861
|
+
});
|
|
1862
|
+
}
|
|
1863
|
+
// Start Slack image cleanup (download temp files)
|
|
1864
|
+
try {
|
|
1865
|
+
const { getSlackImageService: getImgService } = await import('./services/slack/slack-image.service.js');
|
|
1866
|
+
const imgService = getImgService();
|
|
1867
|
+
await imgService.cleanupOnStartup();
|
|
1868
|
+
imgService.startCleanup();
|
|
1869
|
+
}
|
|
1870
|
+
catch (err) {
|
|
1871
|
+
this.logger.warn('Failed to initialize Slack image service', {
|
|
1872
|
+
error: err instanceof Error ? err.message : String(err),
|
|
1873
|
+
});
|
|
1874
|
+
}
|
|
1875
|
+
// Initialize Slack if configured
|
|
1876
|
+
await this.initializeSlackIfConfigured();
|
|
1877
|
+
// Initialize WhatsApp if configured
|
|
1878
|
+
await this.initializeWhatsAppIfConfigured();
|
|
1879
|
+
// Initialize Google Chat if saved credentials exist
|
|
1880
|
+
await this.initializeGoogleChatIfConfigured();
|
|
1881
|
+
// Initialize Telegram if configured
|
|
1882
|
+
await this.initializeTelegramIfConfigured();
|
|
1883
|
+
// Restore Cloud connection from persisted config (non-blocking)
|
|
1884
|
+
initializeCloudIfConfigured().catch((err) => {
|
|
1885
|
+
this.logger.warn('Cloud initialization failed (non-fatal)', {
|
|
1886
|
+
error: err instanceof Error ? err.message : String(err),
|
|
1887
|
+
});
|
|
1888
|
+
});
|
|
1889
|
+
// Start NOTIFY reconciliation service (retries failed Slack deliveries)
|
|
1890
|
+
this.notifyReconciliationService = new NotifyReconciliationService();
|
|
1891
|
+
this.notifyReconciliationService.start();
|
|
1892
|
+
// Start system resource alert monitoring (proactive disk/memory/CPU alerts)
|
|
1893
|
+
this.systemResourceAlertService.startMonitoring();
|
|
1894
|
+
// Fire-and-forget background version check (populates cache for /health)
|
|
1895
|
+
VersionCheckService.getInstance().checkForUpdate().catch(() => {
|
|
1896
|
+
// Silently ignore — version check is non-critical
|
|
1897
|
+
});
|
|
1898
|
+
// V3-only as of spec 2026-05-06-task-management-v1-deprecation.md.
|
|
1899
|
+
// The legacy `TaskTrackingService.startAutoSync()` is gone — V3
|
|
1900
|
+
// task-pool reconciler owns lifecycle cleanup now.
|
|
1901
|
+
// Initialize token usage tracking: load persisted data and start periodic flush
|
|
1902
|
+
try {
|
|
1903
|
+
const tokenUsageService = TokenUsageService.getInstance();
|
|
1904
|
+
await tokenUsageService.loadFromDisk();
|
|
1905
|
+
tokenUsageService.startPeriodicFlush();
|
|
1906
|
+
// Sync Claude Code session JSONL files → TokenUsageService
|
|
1907
|
+
// so the Usage dashboard has data from claude-code runtime agents
|
|
1908
|
+
const { syncSessionsToTokenUsageService } = await import('./services/monitoring/claude-session-tokens.service.js');
|
|
1909
|
+
const synced = await syncSessionsToTokenUsageService(this.config.crewlyHome, 7);
|
|
1910
|
+
this.logger.info('Token usage tracking initialized', { syncedClaudeSessions: synced });
|
|
1911
|
+
}
|
|
1912
|
+
catch (tokenErr) {
|
|
1913
|
+
this.logger.warn('Token usage initialization failed (non-fatal)', {
|
|
1914
|
+
error: tokenErr instanceof Error ? tokenErr.message : String(tokenErr),
|
|
1915
|
+
});
|
|
1916
|
+
}
|
|
1917
|
+
// Start Reconciler: run initial full reconcile and start loops
|
|
1918
|
+
if (this.reconcilerService) {
|
|
1919
|
+
try {
|
|
1920
|
+
this.logger.info('Running initial full reconciliation...');
|
|
1921
|
+
const initialResult = await this.reconcilerService.runFull();
|
|
1922
|
+
this.logger.info('Initial reconciliation complete', {
|
|
1923
|
+
durationMs: initialResult.durationMs,
|
|
1924
|
+
corrections: initialResult.corrections.length,
|
|
1925
|
+
errors: initialResult.errors.length,
|
|
1926
|
+
});
|
|
1927
|
+
this.reconcilerService.start();
|
|
1928
|
+
this.logger.info('Reconciler loops started (fast: 10s, full: 60s)');
|
|
1929
|
+
}
|
|
1930
|
+
catch (reconcilerErr) {
|
|
1931
|
+
this.logger.warn('Reconciler startup failed (non-fatal)', {
|
|
1932
|
+
error: reconcilerErr instanceof Error ? reconcilerErr.message : String(reconcilerErr),
|
|
1933
|
+
});
|
|
1934
|
+
}
|
|
1935
|
+
}
|
|
1936
|
+
// C1 — boot-time state invariant check (Persistence P0 spec).
|
|
1937
|
+
// Refuses to start serving traffic if the live teams directory
|
|
1938
|
+
// is empty but a healthy backup snapshot exists. Override via
|
|
1939
|
+
// CREWLY_FORCE_EMPTY_BOOT=1 for legitimate fresh-install / reset.
|
|
1940
|
+
try {
|
|
1941
|
+
await this.storageService.verifyStateInvariantOnBoot();
|
|
1942
|
+
}
|
|
1943
|
+
catch (invariantErr) {
|
|
1944
|
+
const { StateInvariantViolation } = await import('./services/core/state-invariant.types.js');
|
|
1945
|
+
if (invariantErr instanceof StateInvariantViolation) {
|
|
1946
|
+
this.logger.error('Boot aborted by state invariant check — refusing to serve traffic with wiped state', {
|
|
1947
|
+
currentTeamCount: invariantErr.currentTeamCount,
|
|
1948
|
+
backupTeamCount: invariantErr.backupTeamCount,
|
|
1949
|
+
backupTimestamp: invariantErr.backupTimestamp,
|
|
1950
|
+
message: invariantErr.message,
|
|
1951
|
+
});
|
|
1952
|
+
}
|
|
1953
|
+
throw invariantErr;
|
|
1954
|
+
}
|
|
1955
|
+
// Start HTTP server with enhanced error handling
|
|
1956
|
+
await this.startHttpServer();
|
|
1957
|
+
// Load addons from ~/.crewly/addons/ (Pro features, extensions, etc.)
|
|
1958
|
+
try {
|
|
1959
|
+
const addonLoader = AddonLoaderService.getInstance();
|
|
1960
|
+
const loadedAddons = await addonLoader.loadAddons(this.app, this.httpServer);
|
|
1961
|
+
if (loadedAddons.length > 0) {
|
|
1962
|
+
this.logger.info('Addons loaded successfully', { addons: loadedAddons });
|
|
1963
|
+
}
|
|
1964
|
+
}
|
|
1965
|
+
catch (addonErr) {
|
|
1966
|
+
this.logger.warn('Addon loading encountered an error (non-fatal)', {
|
|
1967
|
+
error: addonErr instanceof Error ? addonErr.message : String(addonErr),
|
|
1968
|
+
});
|
|
1969
|
+
}
|
|
1970
|
+
// Register cleanup handlers
|
|
1971
|
+
this.registerSignalHandlers();
|
|
1972
|
+
// Start health monitoring
|
|
1973
|
+
this.startHealthMonitoring();
|
|
1974
|
+
// Auto-start orchestrator if enabled in settings
|
|
1975
|
+
await this.autoStartOrchestratorIfEnabled();
|
|
1976
|
+
// Auto-restore agent sessions that were running before the last shutdown
|
|
1977
|
+
await this.autoRestoreAgentSessionsIfEnabled();
|
|
1978
|
+
// #166: Auto-recover in-progress tasks after restart.
|
|
1979
|
+
// #196: Skip tasks older than 1 hour to avoid re-sending stale work.
|
|
1980
|
+
// V3-only as of spec 2026-05-06-task-management-v1-deprecation.md —
|
|
1981
|
+
// reads WorkItems from TaskPoolService (replaces the prior
|
|
1982
|
+
// `TaskTrackingService.getAllInProgressTasks()` call).
|
|
1983
|
+
try {
|
|
1984
|
+
const TASK_RECOVERY_MAX_AGE_MS = 60 * 60 * 1000; // 1 hour
|
|
1985
|
+
const { TaskPoolService } = await import('./services/task-pool/task-pool.service.js');
|
|
1986
|
+
const allItems = await TaskPoolService.getInstance().getAllItems();
|
|
1987
|
+
const now = Date.now();
|
|
1988
|
+
const activeTasks = allItems.filter(wi => {
|
|
1989
|
+
if (wi.status !== 'queued' && wi.status !== 'accepted' && wi.status !== 'running')
|
|
1990
|
+
return false;
|
|
1991
|
+
if (!wi.target)
|
|
1992
|
+
return false;
|
|
1993
|
+
// Skip stale tasks — startedAt/createdAt older than threshold
|
|
1994
|
+
const taskTime = new Date(wi.startedAt || wi.createdAt || 0).getTime();
|
|
1995
|
+
if (now - taskTime > TASK_RECOVERY_MAX_AGE_MS) {
|
|
1996
|
+
this.logger.info('Skipping stale task recovery (older than 1 hour)', {
|
|
1997
|
+
workItemId: wi.id,
|
|
1998
|
+
taskName: wi.title,
|
|
1999
|
+
age: `${Math.round((now - taskTime) / 60000)} minutes`,
|
|
2000
|
+
});
|
|
2001
|
+
return false;
|
|
2002
|
+
}
|
|
2003
|
+
return true;
|
|
2004
|
+
});
|
|
2005
|
+
if (activeTasks.length > 0) {
|
|
2006
|
+
this.logger.info('Found in-progress WorkItems to recover after restart', {
|
|
2007
|
+
count: activeTasks.length,
|
|
2008
|
+
});
|
|
2009
|
+
for (const wi of activeTasks) {
|
|
2010
|
+
try {
|
|
2011
|
+
const recoveryMessage = `[SYSTEM — TASK RECOVERY] You were working on this task before the server restarted. Please continue:\n\nTask: ${wi.title}\nWorkItem: ${wi.id}\n\nFetch full brief: bash config/skills/agent/core/read-task/execute.sh '{"workItemId":"${wi.id}"}'\n\nPlease check the current state and continue working.`;
|
|
2012
|
+
await this.apiController.agentRegistrationService.sendMessageToAgent(wi.target, recoveryMessage, undefined);
|
|
2013
|
+
this.logger.info('Task recovery message sent', {
|
|
2014
|
+
workItemId: wi.id,
|
|
2015
|
+
sessionName: wi.target,
|
|
2016
|
+
taskName: wi.title,
|
|
2017
|
+
});
|
|
2018
|
+
}
|
|
2019
|
+
catch (err) {
|
|
2020
|
+
// Agent might not be online yet — DLQ in scheduler will handle it
|
|
2021
|
+
this.logger.warn('Task recovery delivery deferred (agent may not be online yet)', {
|
|
2022
|
+
workItemId: wi.id,
|
|
2023
|
+
sessionName: wi.target,
|
|
2024
|
+
error: err instanceof Error ? err.message : String(err),
|
|
2025
|
+
});
|
|
2026
|
+
}
|
|
2027
|
+
}
|
|
2028
|
+
}
|
|
2029
|
+
}
|
|
2030
|
+
catch (err) {
|
|
2031
|
+
this.logger.warn('Task auto-recovery failed (non-critical)', {
|
|
2032
|
+
error: err instanceof Error ? err.message : String(err),
|
|
2033
|
+
});
|
|
2034
|
+
}
|
|
2035
|
+
// Start log rotation service (non-critical — logs cleanup)
|
|
2036
|
+
try {
|
|
2037
|
+
const logRotation = LogRotationService.getInstance();
|
|
2038
|
+
const backend = getSessionBackendSync();
|
|
2039
|
+
const activeNames = backend ? backend.listSessions() : [];
|
|
2040
|
+
await logRotation.start(activeNames);
|
|
2041
|
+
this.logger.info('LogRotationService started');
|
|
2042
|
+
}
|
|
2043
|
+
catch (error) {
|
|
2044
|
+
this.logger.warn('Failed to start LogRotationService (non-critical)', {
|
|
2045
|
+
error: error instanceof Error ? error.message : String(error),
|
|
2046
|
+
});
|
|
2047
|
+
}
|
|
2048
|
+
// Start AuditorSchedulerService (non-critical — audit scheduling)
|
|
2049
|
+
// Priority: env var > settings.json > ENABLED_BY_DEFAULT constant
|
|
2050
|
+
const envValue = process.env[AUDITOR_CONSTANTS.ENV_VAR]?.toLowerCase();
|
|
2051
|
+
let auditorEnabled;
|
|
2052
|
+
if (envValue !== undefined) {
|
|
2053
|
+
// Env var explicitly set — use it
|
|
2054
|
+
auditorEnabled = envValue === 'true';
|
|
2055
|
+
}
|
|
2056
|
+
else {
|
|
2057
|
+
// Check persisted settings (settings.json)
|
|
2058
|
+
try {
|
|
2059
|
+
const settingsForAuditor = await getSettingsService().getSettings();
|
|
2060
|
+
auditorEnabled = settingsForAuditor.general.enableAuditor ?? AUDITOR_CONSTANTS.ENABLED_BY_DEFAULT;
|
|
2061
|
+
}
|
|
2062
|
+
catch {
|
|
2063
|
+
auditorEnabled = AUDITOR_CONSTANTS.ENABLED_BY_DEFAULT;
|
|
2064
|
+
}
|
|
2065
|
+
}
|
|
2066
|
+
if (auditorEnabled) {
|
|
2067
|
+
try {
|
|
2068
|
+
const auditorScheduler = AuditorSchedulerService.getInstance();
|
|
2069
|
+
auditorScheduler.setAgentRegistrationService(this.apiController.agentRegistrationService);
|
|
2070
|
+
auditorScheduler.setEventBusService(this.eventBusService);
|
|
2071
|
+
setAuditorSchedulerService(auditorScheduler);
|
|
2072
|
+
auditorScheduler.start();
|
|
2073
|
+
this.logger.info('AuditorSchedulerService started (Claude Code PTY mode)');
|
|
2074
|
+
}
|
|
2075
|
+
catch (error) {
|
|
2076
|
+
this.logger.warn('Failed to start AuditorSchedulerService (non-critical)', {
|
|
2077
|
+
error: error instanceof Error ? error.message : String(error),
|
|
2078
|
+
});
|
|
2079
|
+
}
|
|
2080
|
+
}
|
|
2081
|
+
else {
|
|
2082
|
+
this.logger.info('Auditor disabled (enable via Settings > General or CREWLY_ENABLE_AUDITOR=true)');
|
|
2083
|
+
}
|
|
2084
|
+
}
|
|
2085
|
+
catch (error) {
|
|
2086
|
+
this.logger.error('Failed to start server', { error: error instanceof Error ? error.message : String(error) });
|
|
2087
|
+
if (error instanceof Error && error.message.includes('EADDRINUSE')) {
|
|
2088
|
+
this.logger.error('Port already in use', { port: this.config.webPort });
|
|
2089
|
+
this.logger.info('Try killing existing processes or use a different port');
|
|
2090
|
+
await this.handlePortConflict();
|
|
2091
|
+
}
|
|
2092
|
+
throw error;
|
|
2093
|
+
}
|
|
2094
|
+
}
|
|
2095
|
+
/**
|
|
2096
|
+
* Initialize Slack integration if environment variables are configured.
|
|
2097
|
+
* Gracefully handles missing configuration or connection failures.
|
|
2098
|
+
*/
|
|
2099
|
+
async initializeSlackIfConfigured() {
|
|
2100
|
+
try {
|
|
2101
|
+
this.logger.info('Checking Slack configuration...');
|
|
2102
|
+
const result = await initializeSlackIfConfigured({
|
|
2103
|
+
messageQueueService: this.messageQueueService,
|
|
2104
|
+
});
|
|
2105
|
+
if (result.success) {
|
|
2106
|
+
// Wire thread store into the bridge for persistent thread tracking
|
|
2107
|
+
const threadStore = getSlackThreadStore();
|
|
2108
|
+
if (threadStore) {
|
|
2109
|
+
const { getSlackOrchestratorBridge } = await import('./services/slack/slack-orchestrator-bridge.js');
|
|
2110
|
+
const bridge = getSlackOrchestratorBridge();
|
|
2111
|
+
bridge.setSlackThreadStore(threadStore);
|
|
2112
|
+
bridge.setThreadStatusQueue(this.threadStatusQueueService);
|
|
2113
|
+
}
|
|
2114
|
+
this.logger.info('Slack integration initialized successfully');
|
|
2115
|
+
}
|
|
2116
|
+
else if (result.attempted) {
|
|
2117
|
+
this.logger.warn('Slack initialization failed', { error: result.error });
|
|
2118
|
+
}
|
|
2119
|
+
else {
|
|
2120
|
+
this.logger.info('Slack not configured, skipping initialization');
|
|
2121
|
+
}
|
|
2122
|
+
}
|
|
2123
|
+
catch (error) {
|
|
2124
|
+
this.logger.error('Error initializing Slack integration', {
|
|
2125
|
+
error: error instanceof Error ? error.message : String(error),
|
|
2126
|
+
});
|
|
2127
|
+
// Don't fail startup if Slack fails
|
|
2128
|
+
}
|
|
2129
|
+
}
|
|
2130
|
+
/**
|
|
2131
|
+
* Initialize WhatsApp integration if environment variables are configured.
|
|
2132
|
+
* Gracefully handles missing configuration or connection failures.
|
|
2133
|
+
*/
|
|
2134
|
+
async initializeWhatsAppIfConfigured() {
|
|
2135
|
+
try {
|
|
2136
|
+
this.logger.info('Checking WhatsApp configuration...');
|
|
2137
|
+
const result = await initializeWhatsAppIfConfigured({
|
|
2138
|
+
messageQueueService: this.messageQueueService,
|
|
2139
|
+
});
|
|
2140
|
+
if (result.success) {
|
|
2141
|
+
this.logger.info('WhatsApp integration initialized successfully');
|
|
2142
|
+
}
|
|
2143
|
+
else if (result.attempted) {
|
|
2144
|
+
this.logger.warn('WhatsApp initialization failed', { error: result.error });
|
|
2145
|
+
}
|
|
2146
|
+
else {
|
|
2147
|
+
this.logger.info('WhatsApp not configured, skipping initialization');
|
|
2148
|
+
}
|
|
2149
|
+
}
|
|
2150
|
+
catch (error) {
|
|
2151
|
+
this.logger.error('Error initializing WhatsApp integration', {
|
|
2152
|
+
error: error instanceof Error ? error.message : String(error),
|
|
2153
|
+
});
|
|
2154
|
+
// Don't fail startup if WhatsApp fails
|
|
2155
|
+
}
|
|
2156
|
+
}
|
|
2157
|
+
/**
|
|
2158
|
+
* Initialize Google Chat adapter from saved credentials if available.
|
|
2159
|
+
* Restarts the Pub/Sub pull loop automatically on backend restart.
|
|
2160
|
+
*/
|
|
2161
|
+
async initializeGoogleChatIfConfigured() {
|
|
2162
|
+
try {
|
|
2163
|
+
this.logger.info('Checking Google Chat saved credentials...');
|
|
2164
|
+
const result = await initializeGoogleChatIfConfigured({
|
|
2165
|
+
messageQueueService: this.messageQueueService,
|
|
2166
|
+
});
|
|
2167
|
+
if (result.success) {
|
|
2168
|
+
this.logger.info('Google Chat auto-reconnect successful');
|
|
2169
|
+
}
|
|
2170
|
+
else if (result.attempted) {
|
|
2171
|
+
this.logger.warn('Google Chat auto-reconnect failed', { error: result.error });
|
|
2172
|
+
}
|
|
2173
|
+
else {
|
|
2174
|
+
this.logger.info('Google Chat not configured, skipping initialization');
|
|
2175
|
+
}
|
|
2176
|
+
}
|
|
2177
|
+
catch (error) {
|
|
2178
|
+
this.logger.error('Error initializing Google Chat integration', {
|
|
2179
|
+
error: error instanceof Error ? error.message : String(error),
|
|
2180
|
+
});
|
|
2181
|
+
// Don't fail startup if Google Chat fails
|
|
2182
|
+
}
|
|
2183
|
+
}
|
|
2184
|
+
/**
|
|
2185
|
+
* Initialize Telegram bot from environment variables or saved credentials.
|
|
2186
|
+
* Starts long-polling for incoming messages automatically on backend restart.
|
|
2187
|
+
*/
|
|
2188
|
+
async initializeTelegramIfConfigured() {
|
|
2189
|
+
try {
|
|
2190
|
+
this.logger.info('Checking Telegram configuration...');
|
|
2191
|
+
const result = await initializeTelegramIfConfigured({
|
|
2192
|
+
messageQueueService: this.messageQueueService,
|
|
2193
|
+
});
|
|
2194
|
+
if (result.success) {
|
|
2195
|
+
this.logger.info('Telegram bot connected and polling started');
|
|
2196
|
+
}
|
|
2197
|
+
else if (result.attempted) {
|
|
2198
|
+
this.logger.warn('Telegram initialization failed', { error: result.error });
|
|
2199
|
+
}
|
|
2200
|
+
else {
|
|
2201
|
+
this.logger.info('Telegram not configured, skipping initialization');
|
|
2202
|
+
}
|
|
2203
|
+
}
|
|
2204
|
+
catch (error) {
|
|
2205
|
+
this.logger.error('Error initializing Telegram integration', {
|
|
2206
|
+
error: error instanceof Error ? error.message : String(error),
|
|
2207
|
+
});
|
|
2208
|
+
// Don't fail startup if Telegram fails
|
|
2209
|
+
}
|
|
2210
|
+
}
|
|
2211
|
+
/**
|
|
2212
|
+
* Auto-start the orchestrator if the autoStartOrchestrator setting is enabled.
|
|
2213
|
+
* Reads the setting from persistent storage and triggers orchestrator setup.
|
|
2214
|
+
* Failures are logged but do not prevent the server from starting.
|
|
2215
|
+
*/
|
|
2216
|
+
async autoStartOrchestratorIfEnabled() {
|
|
2217
|
+
try {
|
|
2218
|
+
const settingsService = getSettingsService();
|
|
2219
|
+
const settings = await settingsService.getSettings();
|
|
2220
|
+
if (!settings.general.autoStartOrchestrator) {
|
|
2221
|
+
this.logger.info('Auto-start orchestrator is disabled, skipping');
|
|
2222
|
+
return;
|
|
2223
|
+
}
|
|
2224
|
+
this.logger.info('Auto-start orchestrator is enabled, starting orchestrator...');
|
|
2225
|
+
// Determine runtime type: env var OVERRIDES stored status > default (claude-code)
|
|
2226
|
+
// DEFAULT_RUNTIME env var is the authoritative config for Docker/headless deployments.
|
|
2227
|
+
let runtimeType = RUNTIME_TYPES.CLAUDE_CODE;
|
|
2228
|
+
// Step 1: Check stored orchestrator status (user changed via UI in previous session)
|
|
2229
|
+
try {
|
|
2230
|
+
const orchestratorStatus = await this.storageService.getOrchestratorStatus();
|
|
2231
|
+
if (orchestratorStatus?.runtimeType) {
|
|
2232
|
+
runtimeType = orchestratorStatus.runtimeType;
|
|
2233
|
+
}
|
|
2234
|
+
}
|
|
2235
|
+
catch {
|
|
2236
|
+
// Use default runtime type
|
|
2237
|
+
}
|
|
2238
|
+
// Step 2: DEFAULT_RUNTIME env var OVERRIDES stored status (product-level config)
|
|
2239
|
+
// This ensures Docker/headless deployments always use the configured runtime
|
|
2240
|
+
// regardless of what was stored from a previous (possibly different) deployment.
|
|
2241
|
+
const envRuntime = process.env.DEFAULT_RUNTIME;
|
|
2242
|
+
if (envRuntime && Object.values(RUNTIME_TYPES).includes(envRuntime)) {
|
|
2243
|
+
const previousRuntime = runtimeType;
|
|
2244
|
+
runtimeType = envRuntime;
|
|
2245
|
+
this.logger.info('DEFAULT_RUNTIME env overrides stored runtime', { runtimeType, previousRuntime });
|
|
2246
|
+
// #183: Persist the override so stored status stays in sync
|
|
2247
|
+
if (previousRuntime !== runtimeType) {
|
|
2248
|
+
try {
|
|
2249
|
+
await this.storageService.updateOrchestratorRuntimeType(runtimeType);
|
|
2250
|
+
this.logger.info('Synced orchestrator runtime to storage', { runtimeType });
|
|
2251
|
+
}
|
|
2252
|
+
catch {
|
|
2253
|
+
this.logger.warn('Failed to sync orchestrator runtime to storage');
|
|
2254
|
+
}
|
|
2255
|
+
}
|
|
2256
|
+
}
|
|
2257
|
+
// Create orchestrator agent session
|
|
2258
|
+
const result = await this.apiController.agentRegistrationService.createAgentSession({
|
|
2259
|
+
sessionName: ORCHESTRATOR_SESSION_NAME,
|
|
2260
|
+
role: ORCHESTRATOR_ROLE,
|
|
2261
|
+
projectPath: this.config.crewlyHome,
|
|
2262
|
+
windowName: ORCHESTRATOR_WINDOW_NAME,
|
|
2263
|
+
runtimeType,
|
|
2264
|
+
forceRecreate: true,
|
|
2265
|
+
});
|
|
2266
|
+
if (!result.success) {
|
|
2267
|
+
this.logger.warn('Auto-start orchestrator failed to create session', {
|
|
2268
|
+
error: result.error,
|
|
2269
|
+
});
|
|
2270
|
+
return;
|
|
2271
|
+
}
|
|
2272
|
+
// Initialize orchestrator memory
|
|
2273
|
+
try {
|
|
2274
|
+
const memoryService = MemoryService.getInstance();
|
|
2275
|
+
await memoryService.initializeForSession(ORCHESTRATOR_SESSION_NAME, ORCHESTRATOR_ROLE, this.config.crewlyHome);
|
|
2276
|
+
}
|
|
2277
|
+
catch (memoryError) {
|
|
2278
|
+
this.logger.warn('Failed to initialize orchestrator memory during auto-start', {
|
|
2279
|
+
error: memoryError instanceof Error ? memoryError.message : String(memoryError),
|
|
2280
|
+
});
|
|
2281
|
+
}
|
|
2282
|
+
// Start persistent chat monitoring
|
|
2283
|
+
if (this.terminalGateway) {
|
|
2284
|
+
this.terminalGateway.startOrchestratorChatMonitoring(ORCHESTRATOR_SESSION_NAME);
|
|
2285
|
+
}
|
|
2286
|
+
this.logger.info('Orchestrator auto-started successfully');
|
|
2287
|
+
}
|
|
2288
|
+
catch (error) {
|
|
2289
|
+
this.logger.error('Failed to auto-start orchestrator', {
|
|
2290
|
+
error: error instanceof Error ? error.message : String(error),
|
|
2291
|
+
});
|
|
2292
|
+
// Don't fail startup if auto-start fails
|
|
2293
|
+
}
|
|
2294
|
+
}
|
|
2295
|
+
/**
|
|
2296
|
+
* Auto-restore agent sessions that were running before the last shutdown.
|
|
2297
|
+
* Loads persisted session state and calls createAgentSession() for each
|
|
2298
|
+
* non-orchestrator session. Gated by the autoResumeOnRestart setting.
|
|
2299
|
+
* Runs after orchestrator auto-start so the orchestrator is available.
|
|
2300
|
+
*/
|
|
2301
|
+
async autoRestoreAgentSessionsIfEnabled() {
|
|
2302
|
+
try {
|
|
2303
|
+
const settingsService = getSettingsService();
|
|
2304
|
+
const settings = await settingsService.getSettings();
|
|
2305
|
+
if (!settings.general.autoResumeOnRestart) {
|
|
2306
|
+
this.logger.info('Auto-resume on restart is disabled, skipping agent session restore');
|
|
2307
|
+
return;
|
|
2308
|
+
}
|
|
2309
|
+
const persistence = getSessionStatePersistence();
|
|
2310
|
+
const state = await persistence.loadState();
|
|
2311
|
+
if (!state || state.sessions.length === 0) {
|
|
2312
|
+
this.logger.debug('No persisted agent sessions to restore');
|
|
2313
|
+
return;
|
|
2314
|
+
}
|
|
2315
|
+
// Filter out orchestrator sessions (already auto-started separately)
|
|
2316
|
+
// and auditor sessions when auditor is disabled
|
|
2317
|
+
const isAuditorEnabled = process.env[AUDITOR_CONSTANTS.ENV_VAR]?.toLowerCase() === 'true'
|
|
2318
|
+
|| (process.env[AUDITOR_CONSTANTS.ENV_VAR] === undefined && AUDITOR_CONSTANTS.ENABLED_BY_DEFAULT);
|
|
2319
|
+
const baselineSessions = state.sessions.filter((s) => {
|
|
2320
|
+
if (s.role === ORCHESTRATOR_ROLE)
|
|
2321
|
+
return false;
|
|
2322
|
+
if (!isAuditorEnabled && s.name === AUDITOR_SCHEDULER_CONSTANTS.AUDITOR_SESSION_NAME)
|
|
2323
|
+
return false;
|
|
2324
|
+
return true;
|
|
2325
|
+
});
|
|
2326
|
+
// 2026-05-17 — gate by task-pool work. Pre-fix the boot path
|
|
2327
|
+
// blindly resurrected every persisted session even when none had
|
|
2328
|
+
// pending work, defeating the wake-gate philosophy (PR #574/#585)
|
|
2329
|
+
// and bloating RAM until IdleDetection eventually drained them
|
|
2330
|
+
// back. Now: only restore a session if the pool has at least one
|
|
2331
|
+
// non-terminal WorkItem with `target === sessionName`. Idle
|
|
2332
|
+
// agents stay dead until orc dispatches new work, at which point
|
|
2333
|
+
// the dispatcher / wake path raises them on demand.
|
|
2334
|
+
//
|
|
2335
|
+
// Safety valve: if the pool lookup throws (e.g. SQLite not yet
|
|
2336
|
+
// open during early boot), preserve the legacy behaviour rather
|
|
2337
|
+
// than block all restores — better to over-restore than to
|
|
2338
|
+
// silently strand work.
|
|
2339
|
+
let agentSessions = baselineSessions;
|
|
2340
|
+
try {
|
|
2341
|
+
const pool = TaskPoolService.getInstance();
|
|
2342
|
+
const allItems = await pool.getAllItems();
|
|
2343
|
+
const targetedSessions = new Set();
|
|
2344
|
+
for (const wi of allItems) {
|
|
2345
|
+
if (wi.status === 'done' || wi.status === 'cancelled')
|
|
2346
|
+
continue;
|
|
2347
|
+
const t = wi.target;
|
|
2348
|
+
if (typeof t === 'string' && t.length > 0)
|
|
2349
|
+
targetedSessions.add(t);
|
|
2350
|
+
}
|
|
2351
|
+
const filtered = baselineSessions.filter((s) => targetedSessions.has(s.name));
|
|
2352
|
+
const skipped = baselineSessions
|
|
2353
|
+
.filter((s) => !targetedSessions.has(s.name))
|
|
2354
|
+
.map((s) => s.name);
|
|
2355
|
+
if (skipped.length > 0) {
|
|
2356
|
+
this.logger.info('Skipping auto-restore for sessions with no pending WorkItem (idle agents stay dead until dispatched work arrives)', {
|
|
2357
|
+
skippedCount: skipped.length,
|
|
2358
|
+
skipped: skipped.slice(0, 20),
|
|
2359
|
+
truncated: skipped.length > 20,
|
|
2360
|
+
});
|
|
2361
|
+
}
|
|
2362
|
+
agentSessions = filtered;
|
|
2363
|
+
}
|
|
2364
|
+
catch (poolErr) {
|
|
2365
|
+
this.logger.warn('Auto-restore could not query task pool; falling back to restoring every persisted session', { error: poolErr instanceof Error ? poolErr.message : String(poolErr) });
|
|
2366
|
+
}
|
|
2367
|
+
if (agentSessions.length === 0) {
|
|
2368
|
+
this.logger.info('No persisted agent sessions to restore (all idle, no pending WorkItems)');
|
|
2369
|
+
return;
|
|
2370
|
+
}
|
|
2371
|
+
this.logger.info('Auto-restoring agent sessions from persisted state', {
|
|
2372
|
+
count: agentSessions.length,
|
|
2373
|
+
sessions: agentSessions.map((s) => s.name),
|
|
2374
|
+
});
|
|
2375
|
+
let restored = 0;
|
|
2376
|
+
const failed = [];
|
|
2377
|
+
const RESTORE_DELAY_MS = 10_000; // 10 seconds between each session restore to avoid resource pressure
|
|
2378
|
+
for (let i = 0; i < agentSessions.length; i++) {
|
|
2379
|
+
const session = agentSessions[i];
|
|
2380
|
+
// Wait between session restores to avoid SIGTERM from resource pressure
|
|
2381
|
+
if (i > 0) {
|
|
2382
|
+
this.logger.info('Waiting before restoring next session to avoid resource pressure', {
|
|
2383
|
+
delayMs: RESTORE_DELAY_MS,
|
|
2384
|
+
nextSession: session.name,
|
|
2385
|
+
progress: `${i}/${agentSessions.length}`,
|
|
2386
|
+
});
|
|
2387
|
+
await new Promise((resolve) => setTimeout(resolve, RESTORE_DELAY_MS));
|
|
2388
|
+
}
|
|
2389
|
+
try {
|
|
2390
|
+
const result = await this.apiController.agentRegistrationService.createAgentSession({
|
|
2391
|
+
sessionName: session.name,
|
|
2392
|
+
role: session.role || 'developer',
|
|
2393
|
+
projectPath: session.cwd || process.cwd(),
|
|
2394
|
+
runtimeType: session.runtimeType,
|
|
2395
|
+
teamId: session.teamId,
|
|
2396
|
+
memberId: session.memberId,
|
|
2397
|
+
forceRecreate: true,
|
|
2398
|
+
});
|
|
2399
|
+
if (result.success) {
|
|
2400
|
+
restored++;
|
|
2401
|
+
this.logger.info('Restored agent session', {
|
|
2402
|
+
name: session.name,
|
|
2403
|
+
role: session.role,
|
|
2404
|
+
runtimeType: session.runtimeType,
|
|
2405
|
+
progress: `${restored}/${agentSessions.length}`,
|
|
2406
|
+
});
|
|
2407
|
+
}
|
|
2408
|
+
else {
|
|
2409
|
+
failed.push(session.name);
|
|
2410
|
+
this.logger.warn('Failed to restore agent session', {
|
|
2411
|
+
name: session.name,
|
|
2412
|
+
error: result.error,
|
|
2413
|
+
});
|
|
2414
|
+
}
|
|
2415
|
+
}
|
|
2416
|
+
catch (error) {
|
|
2417
|
+
failed.push(session.name);
|
|
2418
|
+
this.logger.error('Error restoring agent session', {
|
|
2419
|
+
name: session.name,
|
|
2420
|
+
error: error instanceof Error ? error.message : String(error),
|
|
2421
|
+
});
|
|
2422
|
+
}
|
|
2423
|
+
}
|
|
2424
|
+
this.logger.info('Agent session restore complete', {
|
|
2425
|
+
restored,
|
|
2426
|
+
total: agentSessions.length,
|
|
2427
|
+
failed: failed.length > 0 ? failed : undefined,
|
|
2428
|
+
});
|
|
2429
|
+
// Clear persisted state after restore attempt to avoid double-restore
|
|
2430
|
+
await persistence.clearState();
|
|
2431
|
+
}
|
|
2432
|
+
catch (error) {
|
|
2433
|
+
this.logger.error('Failed to auto-restore agent sessions', {
|
|
2434
|
+
error: error instanceof Error ? error.message : String(error),
|
|
2435
|
+
});
|
|
2436
|
+
// Don't fail startup if auto-restore fails
|
|
2437
|
+
}
|
|
2438
|
+
}
|
|
2439
|
+
/**
|
|
2440
|
+
* Check for and handle pending self-improvement from hot-reload.
|
|
2441
|
+
* This runs at startup to validate or rollback any changes made
|
|
2442
|
+
* before the process was restarted.
|
|
2443
|
+
*/
|
|
2444
|
+
async checkPendingSelfImprovement() {
|
|
2445
|
+
try {
|
|
2446
|
+
const startupService = getImprovementStartupService();
|
|
2447
|
+
const result = await startupService.runStartupCheck();
|
|
2448
|
+
if (result.hadPendingImprovement) {
|
|
2449
|
+
this.logger.info('Handled pending self-improvement', {
|
|
2450
|
+
improvementId: result.improvementId,
|
|
2451
|
+
action: result.action,
|
|
2452
|
+
validationPassed: result.validationPassed,
|
|
2453
|
+
});
|
|
2454
|
+
if (result.action === 'rolled_back') {
|
|
2455
|
+
this.logger.warn('Self-improvement rollback performed', {
|
|
2456
|
+
error: result.error,
|
|
2457
|
+
});
|
|
2458
|
+
}
|
|
2459
|
+
}
|
|
2460
|
+
}
|
|
2461
|
+
catch (error) {
|
|
2462
|
+
this.logger.error('Error checking pending self-improvement', {
|
|
2463
|
+
error: error instanceof Error ? error.message : String(error),
|
|
2464
|
+
});
|
|
2465
|
+
// Continue startup even if self-improvement check fails
|
|
2466
|
+
}
|
|
2467
|
+
}
|
|
2468
|
+
async checkPortAvailability() {
|
|
2469
|
+
const { createServer } = await import('net');
|
|
2470
|
+
const testServer = createServer();
|
|
2471
|
+
return new Promise((resolve, reject) => {
|
|
2472
|
+
testServer.listen(this.config.webPort, () => {
|
|
2473
|
+
testServer.close(() => {
|
|
2474
|
+
this.logger.info('Port is available', { port: this.config.webPort });
|
|
2475
|
+
resolve();
|
|
2476
|
+
});
|
|
2477
|
+
});
|
|
2478
|
+
testServer.on('error', (error) => {
|
|
2479
|
+
if (error.code === 'EADDRINUSE') {
|
|
2480
|
+
reject(new Error(`Port ${this.config.webPort} is already in use`));
|
|
2481
|
+
}
|
|
2482
|
+
else {
|
|
2483
|
+
reject(error);
|
|
2484
|
+
}
|
|
2485
|
+
});
|
|
2486
|
+
});
|
|
2487
|
+
}
|
|
2488
|
+
async startHttpServer() {
|
|
2489
|
+
return new Promise((resolve, reject) => {
|
|
2490
|
+
const startTime = Date.now();
|
|
2491
|
+
this.httpServer.listen(this.config.webPort, () => {
|
|
2492
|
+
const duration = Date.now() - startTime;
|
|
2493
|
+
this.logger.info('Crewly server started', {
|
|
2494
|
+
port: this.config.webPort,
|
|
2495
|
+
durationMs: duration,
|
|
2496
|
+
dashboardUrl: `http://localhost:${this.config.webPort}`,
|
|
2497
|
+
websocketUrl: `ws://localhost:${this.config.webPort}`,
|
|
2498
|
+
home: this.config.crewlyHome
|
|
2499
|
+
});
|
|
2500
|
+
// B0 (interim) per `.crewly/specs/2026-05-05-trigger-persistence-bug.md`:
|
|
2501
|
+
// Broadcast `system:backend_restarted` exactly once per boot. The
|
|
2502
|
+
// trigger engine (`backend/src/services/v3/trigger-engine.service.ts`)
|
|
2503
|
+
// stores all `schedule-followup` / `watch-for-event` triggers in an
|
|
2504
|
+
// in-memory `Map<string, Trigger>` that is wiped on every restart.
|
|
2505
|
+
// Subscribers (e.g. self-watch-scribe, any TL using §3.0 universal
|
|
2506
|
+
// delegator-rule) listen for this event as a freshness signal and
|
|
2507
|
+
// re-arm their watchdogs. Re-arm latency drops from "manual cycle"
|
|
2508
|
+
// to "next event tick" — closes the wipe-coverage-gap to seconds.
|
|
2509
|
+
// B1 (full fix) is disk-backed declarative trigger config per the
|
|
2510
|
+
// spec Path A; B0 is the unblock-first interim until B1 lands.
|
|
2511
|
+
try {
|
|
2512
|
+
// AgentEvent shape (`backend/src/types/event-bus.types.ts:198`)
|
|
2513
|
+
// requires a fixed set of string fields. For system-scoped
|
|
2514
|
+
// events we use 'system' for member/session and leave team
|
|
2515
|
+
// fields empty — subscribers MUST gate on `type` rather than
|
|
2516
|
+
// team/member identity. Boot diagnostics (port, duration) are
|
|
2517
|
+
// already in the preceding `Crewly server started` log;
|
|
2518
|
+
// callers needing them can correlate by `timestamp`.
|
|
2519
|
+
this.eventBusService.publish({
|
|
2520
|
+
id: `system-backend-restarted-${Date.now()}`,
|
|
2521
|
+
type: 'system:backend_restarted',
|
|
2522
|
+
timestamp: new Date().toISOString(),
|
|
2523
|
+
teamId: '',
|
|
2524
|
+
teamName: '',
|
|
2525
|
+
memberId: '',
|
|
2526
|
+
memberName: 'system',
|
|
2527
|
+
sessionName: 'system',
|
|
2528
|
+
previousValue: 'stopped',
|
|
2529
|
+
newValue: 'started',
|
|
2530
|
+
changedField: 'agentStatus'
|
|
2531
|
+
});
|
|
2532
|
+
this.logger.info('Broadcast system:backend_restarted event', {
|
|
2533
|
+
port: this.config.webPort,
|
|
2534
|
+
bootDurationMs: duration
|
|
2535
|
+
});
|
|
2536
|
+
}
|
|
2537
|
+
catch (emitError) {
|
|
2538
|
+
// Failure isolation — never block boot on this telemetry.
|
|
2539
|
+
this.logger.warn('Failed to broadcast system:backend_restarted (non-fatal)', {
|
|
2540
|
+
error: emitError instanceof Error ? emitError.message : String(emitError)
|
|
2541
|
+
});
|
|
2542
|
+
}
|
|
2543
|
+
resolve();
|
|
2544
|
+
});
|
|
2545
|
+
this.httpServer.on('error', (error) => {
|
|
2546
|
+
this.logger.error('HTTP Server error', { error: error.message, code: error.code });
|
|
2547
|
+
if (error.code === 'EADDRINUSE') {
|
|
2548
|
+
this.logger.error('Port already in use by another process', { port: this.config.webPort });
|
|
2549
|
+
this.logger.info('Suggestion: Kill the existing process or change the port');
|
|
2550
|
+
}
|
|
2551
|
+
else if (error.code === 'EACCES') {
|
|
2552
|
+
this.logger.error('Permission denied for port', { port: this.config.webPort });
|
|
2553
|
+
this.logger.info('Suggestion: Try a port above 1024 or run with appropriate permissions');
|
|
2554
|
+
}
|
|
2555
|
+
reject(error);
|
|
2556
|
+
});
|
|
2557
|
+
});
|
|
2558
|
+
}
|
|
2559
|
+
async handlePortConflict() {
|
|
2560
|
+
this.logger.info('Attempting to identify conflicting process...');
|
|
2561
|
+
try {
|
|
2562
|
+
const { execSync } = await import('child_process');
|
|
2563
|
+
const result = execSync(`lsof -ti :${this.config.webPort}`, { encoding: 'utf8' }).trim();
|
|
2564
|
+
if (result) {
|
|
2565
|
+
this.logger.info('Process using port identified', { port: this.config.webPort, pid: result });
|
|
2566
|
+
this.logger.info('To kill it manually', { command: `kill -9 ${result}` });
|
|
2567
|
+
}
|
|
2568
|
+
}
|
|
2569
|
+
catch (error) {
|
|
2570
|
+
this.logger.info('Could not identify the conflicting process');
|
|
2571
|
+
}
|
|
2572
|
+
}
|
|
2573
|
+
sigintCount = 0;
|
|
2574
|
+
registerSignalHandlers() {
|
|
2575
|
+
this.logger.info('Registering signal handlers...');
|
|
2576
|
+
process.on('SIGTERM', () => {
|
|
2577
|
+
this.logger.info('Received SIGTERM signal');
|
|
2578
|
+
this.shutdown();
|
|
2579
|
+
});
|
|
2580
|
+
process.on('SIGINT', () => {
|
|
2581
|
+
this.sigintCount++;
|
|
2582
|
+
if (this.sigintCount === 1) {
|
|
2583
|
+
this.logger.info('Received SIGINT signal (Ctrl+C) - shutting down gracefully. Press Ctrl+C again to force exit.');
|
|
2584
|
+
this.shutdown();
|
|
2585
|
+
}
|
|
2586
|
+
else {
|
|
2587
|
+
this.logger.info('Received second SIGINT - forcing immediate exit');
|
|
2588
|
+
process.exit(1);
|
|
2589
|
+
}
|
|
2590
|
+
});
|
|
2591
|
+
process.on('uncaughtException', (error) => {
|
|
2592
|
+
this.logger.error('Uncaught exception', { error: error.message, stack: error.stack });
|
|
2593
|
+
this.logMemoryUsage();
|
|
2594
|
+
this.shutdown();
|
|
2595
|
+
});
|
|
2596
|
+
process.on('unhandledRejection', (reason, promise) => {
|
|
2597
|
+
const message = reason instanceof Error ? reason.message : String(reason);
|
|
2598
|
+
// Non-fatal rejections from third-party libraries (e.g., Slack Socket Mode
|
|
2599
|
+
// state machine errors) should be logged but not trigger a full shutdown.
|
|
2600
|
+
const nonFatalPatterns = [
|
|
2601
|
+
'Unhandled event', // finity state machine (Slack Socket Mode)
|
|
2602
|
+
'socket hang up', // transient network errors
|
|
2603
|
+
'ECONNRESET', // connection reset by peer
|
|
2604
|
+
];
|
|
2605
|
+
const isNonFatal = nonFatalPatterns.some(p => message.includes(p));
|
|
2606
|
+
if (isNonFatal) {
|
|
2607
|
+
this.logger.warn('Non-fatal unhandled rejection (suppressed shutdown)', {
|
|
2608
|
+
reason: message,
|
|
2609
|
+
});
|
|
2610
|
+
return;
|
|
2611
|
+
}
|
|
2612
|
+
this.logger.error('Unhandled rejection', {
|
|
2613
|
+
reason: message,
|
|
2614
|
+
stack: reason instanceof Error ? reason.stack : undefined
|
|
2615
|
+
});
|
|
2616
|
+
this.logMemoryUsage();
|
|
2617
|
+
this.shutdown();
|
|
2618
|
+
});
|
|
2619
|
+
}
|
|
2620
|
+
startHealthMonitoring() {
|
|
2621
|
+
this.logger.info('Starting health monitoring...');
|
|
2622
|
+
// Monitor memory usage every 30 seconds
|
|
2623
|
+
this.healthMonitoringInterval = setInterval(() => {
|
|
2624
|
+
this.logMemoryUsage();
|
|
2625
|
+
}, 30000);
|
|
2626
|
+
// V3: Periodic TTL-based auto-close for open Requests (every 2 min)
|
|
2627
|
+
// Catches direct orchestrator responses that finish within a single poll cycle
|
|
2628
|
+
// and never trigger the EventBus agent:idle event
|
|
2629
|
+
setInterval(() => this.autoCloseOpenRequests(), 2 * 60 * 1000);
|
|
2630
|
+
// V3: Mission OKR Reminders (every hour)
|
|
2631
|
+
// Scans active missions and sends Slack alerts for off-track KRs
|
|
2632
|
+
setInterval(async () => {
|
|
2633
|
+
try {
|
|
2634
|
+
const { MissionReminderService } = await import('./services/v3/mission-reminder.service.js');
|
|
2635
|
+
await MissionReminderService.getInstance().runSweep();
|
|
2636
|
+
}
|
|
2637
|
+
catch (err) {
|
|
2638
|
+
this.logger.warn('Mission OKR reminder sweep failed', { error: String(err) });
|
|
2639
|
+
}
|
|
2640
|
+
}, 60 * 60 * 1000);
|
|
2641
|
+
// Purge done Requests and WorkItems older than 24h (every hour)
|
|
2642
|
+
setInterval(() => this.purgeCompletedData(), 60 * 60 * 1000);
|
|
2643
|
+
// Run once at startup after a short delay
|
|
2644
|
+
setTimeout(() => this.purgeCompletedData(), 30 * 1000);
|
|
2645
|
+
setTimeout(async () => {
|
|
2646
|
+
try {
|
|
2647
|
+
const { MissionReminderService } = await import('./services/v3/mission-reminder.service.js');
|
|
2648
|
+
await MissionReminderService.getInstance().runSweep();
|
|
2649
|
+
}
|
|
2650
|
+
catch (err) {
|
|
2651
|
+
// Non-critical
|
|
2652
|
+
}
|
|
2653
|
+
}, 60 * 1000);
|
|
2654
|
+
}
|
|
2655
|
+
/**
|
|
2656
|
+
* Removes done/cancelled Requests older than 24h from disk,
|
|
2657
|
+
* and purges done/cancelled/failed WorkItems from the task pool.
|
|
2658
|
+
* Memory, knowledge, and learnings are never purged.
|
|
2659
|
+
*/
|
|
2660
|
+
purgeCompletedData() {
|
|
2661
|
+
setImmediate(async () => {
|
|
2662
|
+
const RETENTION_MS = 24 * 60 * 60 * 1000;
|
|
2663
|
+
const cutoff = Date.now() - RETENTION_MS;
|
|
2664
|
+
// 1. Purge done Requests
|
|
2665
|
+
try {
|
|
2666
|
+
const { RequestService } = await import('./services/v3/request.service.js');
|
|
2667
|
+
const svc = RequestService.getInstance();
|
|
2668
|
+
const all = await svc.listAll();
|
|
2669
|
+
let purgedRequests = 0;
|
|
2670
|
+
for (const req of all) {
|
|
2671
|
+
if (req.status !== 'done' && req.status !== 'cancelled')
|
|
2672
|
+
continue;
|
|
2673
|
+
const completedAt = req.completedAt ? new Date(req.completedAt).getTime() : 0;
|
|
2674
|
+
const createdAt = new Date(req.createdAt).getTime();
|
|
2675
|
+
const age = completedAt || createdAt;
|
|
2676
|
+
if (age < cutoff) {
|
|
2677
|
+
await svc.delete(req.id);
|
|
2678
|
+
purgedRequests++;
|
|
2679
|
+
}
|
|
2680
|
+
}
|
|
2681
|
+
if (purgedRequests > 0) {
|
|
2682
|
+
this.logger.info('Purged old completed Requests', { count: purgedRequests });
|
|
2683
|
+
}
|
|
2684
|
+
}
|
|
2685
|
+
catch (err) {
|
|
2686
|
+
this.logger.warn('Request purge failed (non-critical)', {
|
|
2687
|
+
error: err instanceof Error ? err.message : String(err),
|
|
2688
|
+
});
|
|
2689
|
+
}
|
|
2690
|
+
// 2. Purge done/cancelled/failed WorkItems from pool
|
|
2691
|
+
try {
|
|
2692
|
+
const { TaskPoolService } = await import('./services/task-pool/task-pool.service.js');
|
|
2693
|
+
const pool = TaskPoolService.getInstance();
|
|
2694
|
+
const allItems = await pool.getAllItems();
|
|
2695
|
+
const terminalStatuses = new Set(['done', 'cancelled', 'failed']);
|
|
2696
|
+
let purgedItems = 0;
|
|
2697
|
+
for (const wi of allItems) {
|
|
2698
|
+
if (!terminalStatuses.has(wi.status))
|
|
2699
|
+
continue;
|
|
2700
|
+
const completedAt = wi.completedAt ? new Date(wi.completedAt).getTime() : 0;
|
|
2701
|
+
const createdAt = new Date(wi.createdAt).getTime();
|
|
2702
|
+
const age = completedAt || createdAt;
|
|
2703
|
+
if (age < cutoff) {
|
|
2704
|
+
await pool.removeItem(wi.id);
|
|
2705
|
+
purgedItems++;
|
|
2706
|
+
}
|
|
2707
|
+
}
|
|
2708
|
+
if (purgedItems > 0) {
|
|
2709
|
+
this.logger.info('Purged old completed WorkItems', { count: purgedItems });
|
|
2710
|
+
}
|
|
2711
|
+
}
|
|
2712
|
+
catch (err) {
|
|
2713
|
+
this.logger.warn('WorkItem purge failed (non-critical)', {
|
|
2714
|
+
error: err instanceof Error ? err.message : String(err),
|
|
2715
|
+
});
|
|
2716
|
+
}
|
|
2717
|
+
});
|
|
2718
|
+
}
|
|
2719
|
+
/**
|
|
2720
|
+
* Closes open Requests that were created within the last 10 minutes.
|
|
2721
|
+
* Used to handle direct orchestrator responses that don't go through WorkItems.
|
|
2722
|
+
* Also rolls up orchestrator token usage and sets ownerAgent for the Request.
|
|
2723
|
+
*
|
|
2724
|
+
* Token source varies by runtime:
|
|
2725
|
+
* - claude-code: reads session JSONL (TUI status bar not capturable from PTY)
|
|
2726
|
+
* - gemini-cli / codex-cli: reads from TokenUsageService (fed by PTY parser)
|
|
2727
|
+
* - crewly-agent: reads from TokenUsageService (fed by SDK)
|
|
2728
|
+
*/
|
|
2729
|
+
autoCloseOpenRequests() {
|
|
2730
|
+
setImmediate(async () => {
|
|
2731
|
+
try {
|
|
2732
|
+
const { RequestService } = await import('./services/v3/request.service.js');
|
|
2733
|
+
const { TokenUsageService } = await import('./services/monitoring/token-usage.service.js');
|
|
2734
|
+
const { getTokensSince } = await import('./services/monitoring/claude-session-tokens.service.js');
|
|
2735
|
+
const { getSessionStatePersistence } = await import('./services/session/session-state-persistence.js');
|
|
2736
|
+
const svc = RequestService.getInstance();
|
|
2737
|
+
const tokenSvc = TokenUsageService.getInstance();
|
|
2738
|
+
const all = await svc.listAll();
|
|
2739
|
+
const cutoff = Date.now() - 10 * 60 * 1000; // 10 min window
|
|
2740
|
+
const minAgeMs = 3 * 60 * 1000; // Don't close requests younger than 3 min — gives orchestrator time to delegate
|
|
2741
|
+
// Resolve orchestrator runtime type once per cycle
|
|
2742
|
+
const persistence = getSessionStatePersistence();
|
|
2743
|
+
const orcMeta = persistence.getSessionMetadata(ORCHESTRATOR_SESSION_NAME);
|
|
2744
|
+
const orcRuntimeType = orcMeta?.runtimeType || 'claude-code';
|
|
2745
|
+
for (const req of all) {
|
|
2746
|
+
if (req.status !== 'open')
|
|
2747
|
+
continue;
|
|
2748
|
+
const reqAge = Date.now() - new Date(req.createdAt).getTime();
|
|
2749
|
+
if (reqAge > 10 * 60 * 1000)
|
|
2750
|
+
continue; // older than 10 min — skip
|
|
2751
|
+
if (reqAge < minAgeMs)
|
|
2752
|
+
continue; // too young — orchestrator may still be delegating
|
|
2753
|
+
const update = { status: 'done' };
|
|
2754
|
+
// Roll up orchestrator tokens only for direct responses (no WorkItem delegation)
|
|
2755
|
+
if (req.workItemIds.length === 0) {
|
|
2756
|
+
const since = new Date(req.createdAt);
|
|
2757
|
+
let inputTokens = 0;
|
|
2758
|
+
let outputTokens = 0;
|
|
2759
|
+
let cost = 0;
|
|
2760
|
+
if (orcRuntimeType === 'claude-code') {
|
|
2761
|
+
// Claude Code: read from session JSONL (ground truth from API)
|
|
2762
|
+
// Falls back to auto-detecting the latest session file if ID unknown.
|
|
2763
|
+
// Use current time as upper bound to avoid counting tokens from
|
|
2764
|
+
// subsequent requests in the same session.
|
|
2765
|
+
const sessionId = persistence.getSessionId(ORCHESTRATOR_SESSION_NAME) || null;
|
|
2766
|
+
const summary = await getTokensSince(this.config.crewlyHome, sessionId, since, new Date());
|
|
2767
|
+
if (summary && summary.turnCount > 0) {
|
|
2768
|
+
inputTokens = summary.inputTokens;
|
|
2769
|
+
outputTokens = summary.outputTokens;
|
|
2770
|
+
cost = summary.cost;
|
|
2771
|
+
}
|
|
2772
|
+
}
|
|
2773
|
+
else {
|
|
2774
|
+
// Gemini CLI / Codex CLI / crewly-agent: read from TokenUsageService
|
|
2775
|
+
// (fed by PTY terminal output parser or SDK)
|
|
2776
|
+
const usage = tokenSvc.getSessionUsageSince(ORCHESTRATOR_SESSION_NAME, since);
|
|
2777
|
+
inputTokens = usage.inputTokens;
|
|
2778
|
+
outputTokens = usage.outputTokens;
|
|
2779
|
+
cost = usage.cost;
|
|
2780
|
+
}
|
|
2781
|
+
if (inputTokens > 0 || outputTokens > 0) {
|
|
2782
|
+
update.totalInputTokens = (req.totalInputTokens || 0) + inputTokens;
|
|
2783
|
+
update.totalOutputTokens = (req.totalOutputTokens || 0) + outputTokens;
|
|
2784
|
+
update.totalCost = (req.totalCost || 0) + cost;
|
|
2785
|
+
}
|
|
2786
|
+
update.ownerAgent = ORCHESTRATOR_SESSION_NAME;
|
|
2787
|
+
}
|
|
2788
|
+
await svc.update(req.id, update);
|
|
2789
|
+
// Mark the corresponding Slack/chat thread as terminal so the
|
|
2790
|
+
// SessionHandoff resume notification won't re-send it after restart.
|
|
2791
|
+
if (req.sourceConversationItemId.startsWith('slack-')) {
|
|
2792
|
+
try {
|
|
2793
|
+
const { ThreadStatusQueueService } = await import('./services/messaging/thread-status-queue.service.js');
|
|
2794
|
+
const { extractSlackChannelId, extractSlackThreadTs } = await import('./services/v3/request-sla.subscriber.js');
|
|
2795
|
+
const tsq = ThreadStatusQueueService.getInstance();
|
|
2796
|
+
// Use the canonical parser (handles both `slack-{ch}-{ts}` and
|
|
2797
|
+
// the thread-reply `slack-{ch}-{root}-msg-{msgTs}` shapes).
|
|
2798
|
+
const channelId = extractSlackChannelId(req.sourceConversationItemId);
|
|
2799
|
+
const threadTs = extractSlackThreadTs(req.sourceConversationItemId);
|
|
2800
|
+
if (channelId && threadTs) {
|
|
2801
|
+
const threadKey = `${channelId}:${threadTs}`;
|
|
2802
|
+
// Create entry if not tracked, then mark terminal
|
|
2803
|
+
if (!tsq.get(threadKey)) {
|
|
2804
|
+
tsq.trackInbound({
|
|
2805
|
+
threadKey,
|
|
2806
|
+
conversationId: req.sourceConversationItemId,
|
|
2807
|
+
source: 'slack',
|
|
2808
|
+
messagePreview: req.title,
|
|
2809
|
+
});
|
|
2810
|
+
}
|
|
2811
|
+
tsq.markReplied(threadKey, 'replied_completed');
|
|
2812
|
+
}
|
|
2813
|
+
}
|
|
2814
|
+
catch {
|
|
2815
|
+
// Non-critical — thread status is best-effort
|
|
2816
|
+
}
|
|
2817
|
+
}
|
|
2818
|
+
this.logger.debug('V3 Request auto-closed', {
|
|
2819
|
+
requestId: req.id,
|
|
2820
|
+
ownerAgent: update.ownerAgent,
|
|
2821
|
+
inputTokens: update.totalInputTokens,
|
|
2822
|
+
outputTokens: update.totalOutputTokens,
|
|
2823
|
+
cost: update.totalCost,
|
|
2824
|
+
});
|
|
2825
|
+
}
|
|
2826
|
+
}
|
|
2827
|
+
catch (err) {
|
|
2828
|
+
this.logger.warn('V3 Request auto-close failed (non-critical)', {
|
|
2829
|
+
error: err instanceof Error ? err.message : String(err),
|
|
2830
|
+
});
|
|
2831
|
+
}
|
|
2832
|
+
});
|
|
2833
|
+
}
|
|
2834
|
+
logMemoryUsage() {
|
|
2835
|
+
const usage = process.memoryUsage();
|
|
2836
|
+
const heapUsed = Math.round(usage.heapUsed / 1024 / 1024);
|
|
2837
|
+
const heapTotal = Math.round(usage.heapTotal / 1024 / 1024);
|
|
2838
|
+
const external = Math.round(usage.external / 1024 / 1024);
|
|
2839
|
+
this.logger.debug('Memory usage', { heapUsedMB: heapUsed, heapTotalMB: heapTotal, externalMB: external });
|
|
2840
|
+
// Warn if memory usage is high
|
|
2841
|
+
if (heapUsed > 500) {
|
|
2842
|
+
this.logger.warn('High memory usage detected', { heapUsedMB: heapUsed });
|
|
2843
|
+
}
|
|
2844
|
+
}
|
|
2845
|
+
async shutdown() {
|
|
2846
|
+
// Prevent double shutdown
|
|
2847
|
+
if (this.isShuttingDown) {
|
|
2848
|
+
this.logger.info('Shutdown already in progress, skipping...');
|
|
2849
|
+
return;
|
|
2850
|
+
}
|
|
2851
|
+
this.isShuttingDown = true;
|
|
2852
|
+
this.logger.info('Shutting down Crewly server...');
|
|
2853
|
+
// Set a hard timeout to force exit if graceful shutdown takes too long.
|
|
2854
|
+
// Use SIGKILL on self as the ultimate fallback — this is uncatchable and
|
|
2855
|
+
// guarantees death even if native node-pty handles keep the event loop alive.
|
|
2856
|
+
const isDev = process.env.NODE_ENV !== 'production';
|
|
2857
|
+
const timeoutMs = isDev ? 5000 : 10000;
|
|
2858
|
+
const forceExitTimeout = setTimeout(() => {
|
|
2859
|
+
this.logger.warn('Graceful shutdown timed out, sending SIGKILL to self...');
|
|
2860
|
+
process.kill(process.pid, 'SIGKILL');
|
|
2861
|
+
}, timeoutMs);
|
|
2862
|
+
try {
|
|
2863
|
+
// Clear health monitoring interval first
|
|
2864
|
+
if (this.healthMonitoringInterval) {
|
|
2865
|
+
clearInterval(this.healthMonitoringInterval);
|
|
2866
|
+
this.healthMonitoringInterval = null;
|
|
2867
|
+
}
|
|
2868
|
+
// Unload addons (call their unregister hooks)
|
|
2869
|
+
try {
|
|
2870
|
+
await AddonLoaderService.getInstance().unloadAddons();
|
|
2871
|
+
}
|
|
2872
|
+
catch (addonErr) {
|
|
2873
|
+
this.logger.warn('Error unloading addons during shutdown', {
|
|
2874
|
+
error: addonErr instanceof Error ? addonErr.message : String(addonErr),
|
|
2875
|
+
});
|
|
2876
|
+
}
|
|
2877
|
+
// Generate session handoff summary before killing processes
|
|
2878
|
+
// This captures active thread state and agent status for restart recovery
|
|
2879
|
+
try {
|
|
2880
|
+
const { SessionHandoffService } = await import('./services/session/session-handoff.service.js');
|
|
2881
|
+
await SessionHandoffService.getInstance().generateSummary(this.storageService);
|
|
2882
|
+
}
|
|
2883
|
+
catch (error) {
|
|
2884
|
+
this.logger.warn('Failed to generate session handoff summary', {
|
|
2885
|
+
error: error instanceof Error ? error.message : String(error),
|
|
2886
|
+
});
|
|
2887
|
+
}
|
|
2888
|
+
// Disconnect Redis cache
|
|
2889
|
+
try {
|
|
2890
|
+
RedisCacheService.getInstance().disconnect();
|
|
2891
|
+
}
|
|
2892
|
+
catch {
|
|
2893
|
+
// Non-critical — ignore
|
|
2894
|
+
}
|
|
2895
|
+
// Save PTY session state and force-kill all child processes
|
|
2896
|
+
this.logger.info('Saving PTY session state and force-killing child processes...');
|
|
2897
|
+
try {
|
|
2898
|
+
const sessionBackend = getSessionBackendSync();
|
|
2899
|
+
if (sessionBackend) {
|
|
2900
|
+
// Save state for resume-on-restart
|
|
2901
|
+
const persistence = getSessionStatePersistence();
|
|
2902
|
+
const savedCount = await persistence.saveState(sessionBackend);
|
|
2903
|
+
if (savedCount > 0) {
|
|
2904
|
+
this.logger.info('Saved PTY sessions for later restoration', { count: savedCount });
|
|
2905
|
+
}
|
|
2906
|
+
// Collect PIDs before destroying for belt-and-suspenders cleanup
|
|
2907
|
+
let collectedPids = [];
|
|
2908
|
+
if (sessionBackend instanceof PtySessionBackend) {
|
|
2909
|
+
collectedPids = sessionBackend.getAllSessionPids();
|
|
2910
|
+
this.logger.info('Collected PTY PIDs for shutdown', { pids: collectedPids });
|
|
2911
|
+
// Use forceDestroyAll for SIGTERM → SIGKILL escalation
|
|
2912
|
+
await sessionBackend.forceDestroyAll();
|
|
2913
|
+
}
|
|
2914
|
+
else {
|
|
2915
|
+
await sessionBackend.destroy();
|
|
2916
|
+
}
|
|
2917
|
+
// Belt-and-suspenders: SIGKILL any remaining PIDs
|
|
2918
|
+
for (const pid of collectedPids) {
|
|
2919
|
+
try {
|
|
2920
|
+
process.kill(pid, 'SIGKILL');
|
|
2921
|
+
}
|
|
2922
|
+
catch {
|
|
2923
|
+
// ESRCH = already dead, which is expected
|
|
2924
|
+
}
|
|
2925
|
+
}
|
|
2926
|
+
}
|
|
2927
|
+
// Clear the factory singleton
|
|
2928
|
+
await destroySessionBackend();
|
|
2929
|
+
}
|
|
2930
|
+
catch (error) {
|
|
2931
|
+
this.logger.warn('Failed to save PTY session state', { error: error instanceof Error ? error.message : String(error) });
|
|
2932
|
+
}
|
|
2933
|
+
// Flush message queue to disk before stopping processor
|
|
2934
|
+
this.logger.info('Flushing message queue to disk...');
|
|
2935
|
+
try {
|
|
2936
|
+
await this.messageQueueService.flushPersist();
|
|
2937
|
+
}
|
|
2938
|
+
catch (error) {
|
|
2939
|
+
this.logger.warn('Failed to flush message queue', {
|
|
2940
|
+
error: error instanceof Error ? error.message : String(error),
|
|
2941
|
+
});
|
|
2942
|
+
}
|
|
2943
|
+
// Flush thread status queue to disk
|
|
2944
|
+
try {
|
|
2945
|
+
await this.threadStatusQueueService.persist();
|
|
2946
|
+
}
|
|
2947
|
+
catch (error) {
|
|
2948
|
+
this.logger.warn('Failed to flush thread status queue', {
|
|
2949
|
+
error: error instanceof Error ? error.message : String(error),
|
|
2950
|
+
});
|
|
2951
|
+
}
|
|
2952
|
+
// Flush task pool (WorkItems) to disk — prevents data loss on restart
|
|
2953
|
+
try {
|
|
2954
|
+
const { TaskPoolService } = await import('./services/task-pool/task-pool.service.js');
|
|
2955
|
+
const pool = TaskPoolService.getInstance();
|
|
2956
|
+
await pool.flush();
|
|
2957
|
+
this.logger.info('Task pool flushed to disk');
|
|
2958
|
+
}
|
|
2959
|
+
catch (error) {
|
|
2960
|
+
this.logger.warn('Failed to flush task pool', {
|
|
2961
|
+
error: error instanceof Error ? error.message : String(error),
|
|
2962
|
+
});
|
|
2963
|
+
}
|
|
2964
|
+
// Stop system resource alert monitoring
|
|
2965
|
+
if (this.systemResourceAlertService) {
|
|
2966
|
+
this.systemResourceAlertService.stopMonitoring();
|
|
2967
|
+
}
|
|
2968
|
+
// Stop Reconciler loops (V2)
|
|
2969
|
+
if (this.reconcilerService) {
|
|
2970
|
+
this.reconcilerService.stop();
|
|
2971
|
+
this.logger.info('Reconciler stopped');
|
|
2972
|
+
}
|
|
2973
|
+
// Stop Team-Health-Watchdog sweep loop (Layer 4)
|
|
2974
|
+
if (this.teamHealthWatchdog) {
|
|
2975
|
+
this.teamHealthWatchdog.stop();
|
|
2976
|
+
this.logger.info('TeamHealthWatchdog stopped');
|
|
2977
|
+
}
|
|
2978
|
+
// Stop NOTIFY reconciliation service
|
|
2979
|
+
if (this.notifyReconciliationService) {
|
|
2980
|
+
this.notifyReconciliationService.stop();
|
|
2981
|
+
}
|
|
2982
|
+
// Stop message queue processor
|
|
2983
|
+
this.queueProcessorService.stop();
|
|
2984
|
+
// Stop the EventToWorkItemBridge BEFORE cleaning the event bus so
|
|
2985
|
+
// in-flight handler dispatches drain against a still-live bus.
|
|
2986
|
+
if (this.eventToWorkItemBridge) {
|
|
2987
|
+
this.eventToWorkItemBridge.stop();
|
|
2988
|
+
this.eventToWorkItemBridge = null;
|
|
2989
|
+
}
|
|
2990
|
+
// LEARN-1: stop the AutoLearningSubscriber on the same window as the
|
|
2991
|
+
// bridge so its in-flight recordLearning calls drain before the bus
|
|
2992
|
+
// is cleaned.
|
|
2993
|
+
if (this.autoLearningSubscriber) {
|
|
2994
|
+
this.autoLearningSubscriber.stop();
|
|
2995
|
+
this.autoLearningSubscriber = null;
|
|
2996
|
+
}
|
|
2997
|
+
// DF-1 #438: same shutdown window as auto-learning above.
|
|
2998
|
+
if (this.milestoneNotificationSubscriber) {
|
|
2999
|
+
this.milestoneNotificationSubscriber.stop();
|
|
3000
|
+
this.milestoneNotificationSubscriber = null;
|
|
3001
|
+
}
|
|
3002
|
+
// INBOUND-1: stop the SLA subscriber and unset the module-level
|
|
3003
|
+
// references so a follow-up start() doesn't see stale singletons.
|
|
3004
|
+
if (this.requestSlaSubscriber) {
|
|
3005
|
+
this.requestSlaSubscriber.stop();
|
|
3006
|
+
this.requestSlaSubscriber = null;
|
|
3007
|
+
}
|
|
3008
|
+
setRequestSlaSubscriber(null);
|
|
3009
|
+
// Pipeline-#4 follow-up: stop the decompose subscriber and clear
|
|
3010
|
+
// its module-level reference on the same shutdown window as SLA.
|
|
3011
|
+
if (this.requestDecomposeSubscriber) {
|
|
3012
|
+
this.requestDecomposeSubscriber.stop();
|
|
3013
|
+
this.requestDecomposeSubscriber = null;
|
|
3014
|
+
}
|
|
3015
|
+
setRequestDecomposeSubscriber(null);
|
|
3016
|
+
setRequestServiceEventBus(null);
|
|
3017
|
+
// Clean up event bus service
|
|
3018
|
+
this.eventBusService.cleanup();
|
|
3019
|
+
// Clean up schedulers
|
|
3020
|
+
this.schedulerService.cleanup();
|
|
3021
|
+
this.messageSchedulerService.cleanup();
|
|
3022
|
+
// Stop activity monitoring
|
|
3023
|
+
this.activityMonitorService.stopPolling();
|
|
3024
|
+
// Stop idle detection
|
|
3025
|
+
IdleDetectionService.getInstance().stop();
|
|
3026
|
+
// Stop agent heartbeat monitor
|
|
3027
|
+
AgentHeartbeatMonitorService.getInstance().stop();
|
|
3028
|
+
// Stop context window monitor
|
|
3029
|
+
ContextWindowMonitorService.getInstance().stop();
|
|
3030
|
+
// Stop OAuth relogin monitor
|
|
3031
|
+
OAuthReloginMonitorService.getInstance().destroy();
|
|
3032
|
+
// Stop orchestrator heartbeat monitor
|
|
3033
|
+
OrchestratorHeartbeatMonitorService.getInstance().stop();
|
|
3034
|
+
// Stop Crewly in Chrome WebSocket bridge
|
|
3035
|
+
try {
|
|
3036
|
+
const { BrowserBridgeService } = await import('./services/browser/browser-bridge.service.js');
|
|
3037
|
+
BrowserBridgeService.getInstance().stop();
|
|
3038
|
+
}
|
|
3039
|
+
catch {
|
|
3040
|
+
// May not have been initialized
|
|
3041
|
+
}
|
|
3042
|
+
// Disconnect BrowserProxyService from Cloud Relay
|
|
3043
|
+
try {
|
|
3044
|
+
const { BrowserProxyService } = await import('./services/browser/browser-proxy.service.js');
|
|
3045
|
+
BrowserProxyService.getInstance().disconnect();
|
|
3046
|
+
}
|
|
3047
|
+
catch {
|
|
3048
|
+
// May not have been initialized
|
|
3049
|
+
}
|
|
3050
|
+
// Stop team activity WebSocket service
|
|
3051
|
+
this.teamActivityWebSocketService.stop();
|
|
3052
|
+
// Stop teams.json file watcher
|
|
3053
|
+
this.teamsJsonWatcherService.stop();
|
|
3054
|
+
// Stop log rotation service
|
|
3055
|
+
LogRotationService.getInstance().stop();
|
|
3056
|
+
// Stop auditor scheduler
|
|
3057
|
+
AuditorSchedulerService.getInstance().stop();
|
|
3058
|
+
// Flush and shutdown OpenTelemetry tracing
|
|
3059
|
+
try {
|
|
3060
|
+
const { TracingService: TracingSvc } = await import('./services/core/tracing.service.js');
|
|
3061
|
+
await TracingSvc.getInstance().shutdown();
|
|
3062
|
+
}
|
|
3063
|
+
catch {
|
|
3064
|
+
// Ignore if not initialized
|
|
3065
|
+
}
|
|
3066
|
+
// Clean up tmux service resources
|
|
3067
|
+
this.tmuxService.destroy();
|
|
3068
|
+
// Stop Slack image cleanup timer
|
|
3069
|
+
try {
|
|
3070
|
+
const { getSlackImageService: getImgSvc } = await import('./services/slack/slack-image.service.js');
|
|
3071
|
+
getImgSvc().stopCleanup();
|
|
3072
|
+
}
|
|
3073
|
+
catch {
|
|
3074
|
+
// Ignore if not initialized
|
|
3075
|
+
}
|
|
3076
|
+
// Shutdown Slack integration
|
|
3077
|
+
this.logger.info('Shutting down Slack integration...');
|
|
3078
|
+
await shutdownSlack();
|
|
3079
|
+
// Shutdown WhatsApp integration
|
|
3080
|
+
this.logger.info('Shutting down WhatsApp integration...');
|
|
3081
|
+
await shutdownWhatsApp();
|
|
3082
|
+
// Shutdown Telegram integration
|
|
3083
|
+
this.logger.info('Shutting down Telegram integration...');
|
|
3084
|
+
await shutdownTelegram();
|
|
3085
|
+
// Note: Cloud Task Processor has been migrated to services/tasks/
|
|
3086
|
+
// Kill all tmux sessions
|
|
3087
|
+
const sessions = await this.tmuxService.listSessions();
|
|
3088
|
+
for (const session of sessions) {
|
|
3089
|
+
if (session.sessionName.startsWith('crewly_')) {
|
|
3090
|
+
await this.tmuxService.killSession(session.sessionName);
|
|
3091
|
+
}
|
|
3092
|
+
}
|
|
3093
|
+
// Close all socket.io connections
|
|
3094
|
+
this.logger.info('Closing WebSocket connections...');
|
|
3095
|
+
this.io.close();
|
|
3096
|
+
// Close HTTP server with timeout
|
|
3097
|
+
this.logger.info('Closing HTTP server...');
|
|
3098
|
+
await new Promise((resolve) => {
|
|
3099
|
+
this.httpServer.close(() => {
|
|
3100
|
+
this.logger.info('Server shut down gracefully');
|
|
3101
|
+
resolve();
|
|
3102
|
+
});
|
|
3103
|
+
// If server doesn't close in 3 seconds, continue anyway
|
|
3104
|
+
setTimeout(resolve, 3000);
|
|
3105
|
+
});
|
|
3106
|
+
clearTimeout(forceExitTimeout);
|
|
3107
|
+
process.exit(0);
|
|
3108
|
+
}
|
|
3109
|
+
catch (error) {
|
|
3110
|
+
this.logger.error('Error during shutdown', { error: error instanceof Error ? error.message : String(error) });
|
|
3111
|
+
clearTimeout(forceExitTimeout);
|
|
3112
|
+
process.exit(1);
|
|
3113
|
+
}
|
|
3114
|
+
}
|
|
3115
|
+
getConfig() {
|
|
3116
|
+
return { ...this.config };
|
|
3117
|
+
}
|
|
3118
|
+
}
|
|
3119
|
+
// Start server if this file is run directly.
// The entry script path is matched with either separator, because on Windows
// process.argv[1] uses backslashes and would never end with '/index.js'.
const isMainModule = typeof process.argv[1] === 'string' && /[\\/]index\.(?:ts|js)$/.test(process.argv[1]);
if (isMainModule) {
    const server = new CrewlyServer();
    const logger = LoggerService.getInstance().createComponentLogger('CrewlyServer');
    // A failed startup is fatal: log the reason and exit non-zero.
    server.start().catch((error) => {
        logger.error('Failed to start Crewly server', { error: error instanceof Error ? error.message : String(error) });
        process.exit(1);
    });
}
export default CrewlyServer;
|
|
3130
|
+
//# sourceMappingURL=index.js.map
|