sensorium-mcp 3.0.4 → 3.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (109)
  1. package/dist/dashboard/routes/data.d.ts.map +1 -1
  2. package/dist/dashboard/routes/data.js +2 -1
  3. package/dist/dashboard/routes/data.js.map +1 -1
  4. package/dist/dashboard/routes/threads.js +1 -1
  5. package/dist/dashboard/routes/threads.js.map +1 -1
  6. package/dist/dashboard/routes.d.ts.map +1 -1
  7. package/dist/dashboard/routes.js +1 -3
  8. package/dist/dashboard/routes.js.map +1 -1
  9. package/dist/data/memory/migration-runner.d.ts +1 -1
  10. package/dist/data/memory/migration-runner.d.ts.map +1 -1
  11. package/dist/data/memory/migration-runner.js +59 -3
  12. package/dist/data/memory/migration-runner.js.map +1 -1
  13. package/dist/data/memory/schema-ddl.d.ts +1 -1
  14. package/dist/data/memory/schema-ddl.d.ts.map +1 -1
  15. package/dist/data/memory/schema-ddl.js +2 -1
  16. package/dist/data/memory/schema-ddl.js.map +1 -1
  17. package/dist/data/memory/thread-registry.js +1 -1
  18. package/dist/data/memory/thread-registry.js.map +1 -1
  19. package/dist/http-server.d.ts.map +1 -1
  20. package/dist/http-server.js +1 -9
  21. package/dist/http-server.js.map +1 -1
  22. package/dist/index.js +3 -6
  23. package/dist/index.js.map +1 -1
  24. package/dist/server/factory.js +1 -1
  25. package/dist/server/factory.js.map +1 -1
  26. package/dist/services/agent-spawn.service.d.ts +7 -1
  27. package/dist/services/agent-spawn.service.d.ts.map +1 -1
  28. package/dist/services/agent-spawn.service.js +69 -45
  29. package/dist/services/agent-spawn.service.js.map +1 -1
  30. package/dist/services/consolidation.service.d.ts.map +1 -1
  31. package/dist/services/consolidation.service.js +49 -35
  32. package/dist/services/consolidation.service.js.map +1 -1
  33. package/dist/services/keeper.service.d.ts +21 -0
  34. package/dist/services/keeper.service.d.ts.map +1 -0
  35. package/dist/services/keeper.service.js +195 -0
  36. package/dist/services/keeper.service.js.map +1 -0
  37. package/dist/services/maintenance-signal.d.ts +2 -0
  38. package/dist/services/maintenance-signal.d.ts.map +1 -1
  39. package/dist/services/maintenance-signal.js +7 -1
  40. package/dist/services/maintenance-signal.js.map +1 -1
  41. package/dist/services/process.service.d.ts +19 -2
  42. package/dist/services/process.service.d.ts.map +1 -1
  43. package/dist/services/process.service.js +104 -10
  44. package/dist/services/process.service.js.map +1 -1
  45. package/dist/services/thread-lifecycle.service.d.ts +5 -0
  46. package/dist/services/thread-lifecycle.service.d.ts.map +1 -1
  47. package/dist/services/thread-lifecycle.service.js +33 -8
  48. package/dist/services/thread-lifecycle.service.js.map +1 -1
  49. package/dist/services/worker-cleanup.service.d.ts +14 -1
  50. package/dist/services/worker-cleanup.service.d.ts.map +1 -1
  51. package/dist/services/worker-cleanup.service.js +36 -38
  52. package/dist/services/worker-cleanup.service.js.map +1 -1
  53. package/dist/sessions.d.ts +0 -5
  54. package/dist/sessions.d.ts.map +1 -1
  55. package/dist/sessions.js +0 -7
  56. package/dist/sessions.js.map +1 -1
  57. package/dist/stdio-server.d.ts.map +1 -1
  58. package/dist/stdio-server.js +1 -7
  59. package/dist/stdio-server.js.map +1 -1
  60. package/dist/tools/delegate-tool.d.ts.map +1 -1
  61. package/dist/tools/delegate-tool.js +2 -2
  62. package/dist/tools/delegate-tool.js.map +1 -1
  63. package/dist/tools/session-tools.js +1 -1
  64. package/dist/tools/session-tools.js.map +1 -1
  65. package/dist/tools/start-session-tool.d.ts.map +1 -1
  66. package/dist/tools/start-session-tool.js +8 -9
  67. package/dist/tools/start-session-tool.js.map +1 -1
  68. package/dist/tools/wait/message-processing.d.ts.map +1 -1
  69. package/dist/tools/wait/message-processing.js +28 -0
  70. package/dist/tools/wait/message-processing.js.map +1 -1
  71. package/dist/tools/wait/poll-loop.js +1 -1
  72. package/dist/tools/wait/poll-loop.js.map +1 -1
  73. package/package.json +1 -1
  74. package/dist/tools/thread-lifecycle.d.ts +0 -6
  75. package/dist/tools/thread-lifecycle.d.ts.map +0 -1
  76. package/dist/tools/thread-lifecycle.js +0 -6
  77. package/dist/tools/thread-lifecycle.js.map +0 -1
  78. package/supervisor/config.go +0 -253
  79. package/supervisor/config_test.go +0 -78
  80. package/supervisor/go.mod +0 -15
  81. package/supervisor/go.sum +0 -20
  82. package/supervisor/health.go +0 -433
  83. package/supervisor/health_test.go +0 -93
  84. package/supervisor/keeper.go +0 -309
  85. package/supervisor/keeper_test.go +0 -27
  86. package/supervisor/lock.go +0 -57
  87. package/supervisor/lock_test.go +0 -54
  88. package/supervisor/log.go +0 -195
  89. package/supervisor/log_test.go +0 -125
  90. package/supervisor/main.go +0 -475
  91. package/supervisor/main_test.go +0 -130
  92. package/supervisor/notify.go +0 -53
  93. package/supervisor/process.go +0 -294
  94. package/supervisor/process_test.go +0 -108
  95. package/supervisor/process_unix.go +0 -14
  96. package/supervisor/process_windows.go +0 -15
  97. package/supervisor/secrets.go +0 -95
  98. package/supervisor/secrets_securevault_test.go +0 -98
  99. package/supervisor/secrets_test.go +0 -119
  100. package/supervisor/self_update.go +0 -282
  101. package/supervisor/self_update_test.go +0 -177
  102. package/supervisor/service_restart_stub.go +0 -9
  103. package/supervisor/service_restart_windows.go +0 -63
  104. package/supervisor/service_stub.go +0 -15
  105. package/supervisor/service_windows.go +0 -194
  106. package/supervisor/update_state.go +0 -264
  107. package/supervisor/update_state_test.go +0 -306
  108. package/supervisor/updater.go +0 -613
  109. package/supervisor/updater_test.go +0 -64
package/supervisor/log_test.go
@@ -1,125 +0,0 @@
- package main
-
- import (
- 	"os"
- 	"path/filepath"
- 	"strings"
- 	"testing"
- )
-
- func TestLogRotation(t *testing.T) {
- 	dir := t.TempDir()
- 	logPath := filepath.Join(dir, "test.log")
-
- 	l := &Logger{
- 		logPath: logPath,
- 		maxSize: 100, // tiny threshold for test
- 		maxKeep: 2,
- 	}
- 	l.openFile()
- 	defer l.Close()
-
- 	// Write enough lines to trigger multiple rotations
- 	for i := 0; i < 50; i++ {
- 		l.Info("line %d: %s", i, strings.Repeat("x", 20))
- 	}
-
- 	// Current log should exist and be under maxSize
- 	info, err := os.Stat(logPath)
- 	if err != nil {
- 		t.Fatalf("log file missing: %v", err)
- 	}
- 	if info.Size() >= 100 {
- 		t.Errorf("log file should have been rotated, size=%d", info.Size())
- 	}
-
- 	// At least one rotated file should exist (timestamp-based, e.g. test.2026-04-15T....log)
- 	entries, err := os.ReadDir(dir)
- 	if err != nil {
- 		t.Fatalf("cannot read dir: %v", err)
- 	}
- 	var rotated []string
- 	for _, e := range entries {
- 		if e.Name() != "test.log" && strings.HasPrefix(e.Name(), "test.") {
- 			rotated = append(rotated, e.Name())
- 		}
- 	}
- 	if len(rotated) == 0 {
- 		t.Error("expected at least one rotated file to exist")
- 	}
-
- 	// maxKeep=2: no more than 2 rotated files should exist
- 	if len(rotated) > 2 {
- 		t.Errorf("expected at most 2 rotated files (maxKeep=2), got %d: %v", len(rotated), rotated)
- 	}
- }
-
- func TestDailyRotation(t *testing.T) {
- 	dir := t.TempDir()
- 	logPath := filepath.Join(dir, "test.log")
-
- 	// Write a fake "yesterday" log
- 	yesterday := "2026-04-14"
- 	if err := os.WriteFile(logPath, []byte("old log content\n"), 0644); err != nil {
- 		t.Fatal(err)
- 	}
-
- 	l := &Logger{
- 		logPath: logPath,
- 		maxSize: 5 * 1024 * 1024,
- 		maxKeep: 7,
- 		today:   "2026-04-15", // simulate tomorrow
- 	}
- 	l.rotateDailyIfNeeded()
-
- 	// Original file should have been renamed to test.2026-04-14.log (mod date matches)
- 	// (mod date may be today in tests, so just verify the original is gone or renamed)
- 	entries, err := os.ReadDir(dir)
- 	if err != nil {
- 		t.Fatal(err)
- 	}
- 	var found bool
- 	for _, e := range entries {
- 		if strings.Contains(e.Name(), yesterday) || (e.Name() != "test.log" && strings.HasPrefix(e.Name(), "test.")) {
- 			found = true
- 		}
- 	}
- 	_ = found // rotation may or may not fire depending on file mod time in test env
- }
-
- func TestLogRotationMaxKeep(t *testing.T) {
- 	dir := t.TempDir()
- 	logPath := filepath.Join(dir, "test.log")
-
- 	// Pre-create 5 fake rotated files
- 	for i := 0; i < 5; i++ {
- 		fake := filepath.Join(dir, "test.2026-04-1"+string(rune('0'+i))+"T120000.log")
- 		if err := os.WriteFile(fake, []byte("x"), 0644); err != nil {
- 			t.Fatal(err)
- 		}
- 	}
-
- 	l := &Logger{
- 		logPath: logPath,
- 		maxSize: 100,
- 		maxKeep: 2,
- 	}
- 	l.openFile()
- 	defer l.Close()
-
- 	l.pruneOldLogs()
-
- 	entries, err := os.ReadDir(dir)
- 	if err != nil {
- 		t.Fatal(err)
- 	}
- 	var rotated []string
- 	for _, e := range entries {
- 		if e.Name() != "test.log" && strings.HasPrefix(e.Name(), "test.") {
- 			rotated = append(rotated, e.Name())
- 		}
- 	}
- 	if len(rotated) > 2 {
- 		t.Errorf("pruneOldLogs should have left at most maxKeep=2, got %d: %v", len(rotated), rotated)
- 	}
- }

package/supervisor/main.go
@@ -1,475 +0,0 @@
- package main
-
- import (
- 	"context"
- 	"flag"
- 	"fmt"
- 	"os"
- 	"os/signal"
- 	"strings"
- 	"sync"
- 	"syscall"
- 	"time"
- )
-
- var (
- 	globalCancelMu sync.Mutex
- 	globalCancel   context.CancelFunc
- )
-
- // KeeperEntry tracks a running keeper and its settings.
- type KeeperEntry struct {
- 	keeper   *Keeper
- 	settings KeeperConfig
- }
-
- func main() {
- 	isService, err := isWindowsService()
- 	if err != nil {
- 		fmt.Fprintf(os.Stderr, "Failed to detect service mode: %v\n", err)
- 		os.Exit(1)
- 	}
- 	if isService {
- 		if err := runAsService(); err != nil {
- 			fmt.Fprintf(os.Stderr, "Service run failed: %v\n", err)
- 			os.Exit(1)
- 		}
- 		return
- 	}
-
- 	if handled, err := handleServiceCommand(os.Args[1:]); err != nil {
- 		fmt.Fprintf(os.Stderr, "%v\n", err)
- 		os.Exit(1)
- 	} else if handled {
- 		return
- 	}
-
- 	runningAsService := resolveRunSupervisorMode(isService, os.Getenv("HOST_MODE"))
- 	if err := runSupervisor(runningAsService); err != nil {
- 		fmt.Fprintf(os.Stderr, "Supervisor failed: %v\n", err)
- 		os.Exit(1)
- 	}
- }
-
- func resolveRunSupervisorMode(processIsService bool, hostModeValue string) bool {
- 	if processIsService {
- 		return true
- 	}
-
- 	return parseHostMode(hostModeValue, false) == "service"
- }
-
- func handleServiceCommand(args []string) (bool, error) {
- 	if len(args) == 0 {
- 		return false, nil
- 	}
-
- 	switch args[0] {
- 	case "install":
- 		fs := flag.NewFlagSet("install", flag.ContinueOnError)
- 		serviceUser := fs.String("service-user", "", "Windows account to run service as (e.g. .\\YourUser). Defaults to LocalSystem if empty.")
- 		servicePassword := fs.String("service-password", "", "Password for the service account (required for regular user accounts; not needed for LocalSystem/LocalService/NetworkService, NT SERVICE\\*, or gMSA names ending with '$').")
- 		if err := fs.Parse(args[1:]); err != nil {
- 			return true, err
- 		}
- 		if *serviceUser != "" && *servicePassword == "" && !isPasswordlessServiceIdentity(*serviceUser) {
- 			return true, fmt.Errorf("install failed: -service-password is required for regular -service-user accounts\nAllowed passwordless identities: LocalSystem/LocalService/NetworkService, NT SERVICE\\*, and gMSA names ending with '$'\nNote: prefer using Install-Sensorium.ps1, which prompts securely for passwords")
- 		}
- 		exePath, err := os.Executable()
- 		if err != nil {
- 			return true, fmt.Errorf("install failed: resolve executable: %w", err)
- 		}
- 		return true, installService(exePath, *serviceUser, *servicePassword)
- 	case "uninstall":
- 		return true, uninstallService()
- 	case "start":
- 		return true, startService()
- 	case "stop":
- 		return true, stopService()
- 	case "status":
- 		return true, serviceStatus()
- 	default:
- 		return false, nil
- 	}
- }
-
- func isPasswordlessServiceIdentity(user string) bool {
- 	trimmed := strings.TrimSpace(user)
- 	if trimmed == "" {
- 		return false
- 	}
-
- 	lower := strings.ToLower(trimmed)
- 	switch lower {
- 	case "localsystem", "nt authority\\system", "localservice", "nt authority\\localservice", "networkservice", "nt authority\\networkservice":
- 		return true
- 	}
-
- 	if strings.HasPrefix(lower, "nt service\\") {
- 		return true
- 	}
-
- 	return strings.HasSuffix(trimmed, "$")
- }
-
- func stopSupervisor() {
- 	globalCancelMu.Lock()
- 	fn := globalCancel
- 	globalCancelMu.Unlock()
- 	if fn != nil {
- 		fn()
- 	}
- }
-
- func runSupervisor(runningAsService bool) error {
- 	cfg := LoadConfig(runningAsService)
-
- 	if err := os.MkdirAll(cfg.DataDir, 0755); err != nil {
- 		return fmt.Errorf("cannot create data dir %s: %w", cfg.DataDir, err)
- 	}
-
- 	log := NewLogger(cfg.Paths.WatcherLog)
- 	defer log.Close()
-
- 	// Acquire lock — prevent multiple instances
- 	if !AcquireLock(cfg.Paths.WatcherLock, log) {
- 		return fmt.Errorf("another supervisor instance is already running")
- 	}
- 	defer ReleaseLock(cfg.Paths.WatcherLock)
-
- 	shouldRestart, err := applyPendingSupervisorUpdate(cfg, log)
- 	if err != nil {
- 		log.Warn("Pending supervisor update could not be applied: %v", err)
- 	}
- 	if shouldRestart {
- 		return nil
- 	}
-
- 	recoverPersistedUpdateStateOnStartup(cfg, log)
-
- 	log.Info("sensorium-supervisor starting (mode=%s, hostMode=%s, port=%d, dataDir=%s)", cfg.Mode, cfg.HostMode, cfg.MCPHttpPort, cfg.DataDir)
- 	log.Debug("Config: MCPStartCommand=%q, PollInterval=%v, MinUptime=%v, KeeperMaxRetries=%d", cfg.MCPStartCommand, cfg.PollInterval, cfg.MinUptime, cfg.KeeperMaxRetries)
- 	log.Debug("Config: TelegramToken=%v, HealthFailThresh=%d, StuckThreshold=%v", cfg.TelegramToken != "", cfg.HealthFailThresh, cfg.StuckThreshold)
-
- 	if err := os.MkdirAll(cfg.Paths.PIDsDir, 0755); err != nil {
- 		log.Warn("Cannot create PIDs dir %s: %v", cfg.Paths.PIDsDir, err)
- 	}
- 	if err := os.MkdirAll(cfg.Paths.HeartbeatsDir, 0755); err != nil {
- 		log.Warn("Cannot create heartbeats dir %s: %v", cfg.Paths.HeartbeatsDir, err)
- 	}
-
- 	if cfg.MCPHttpPort <= 0 {
- 		log.Error("MCP_HTTP_PORT must be set (got %d)", cfg.MCPHttpPort)
- 		return fmt.Errorf("MCP_HTTP_PORT must be set (got %d)", cfg.MCPHttpPort)
- 	}
-
- 	mcp := NewMCPClient(cfg.MCPHttpPort, cfg.MCPHttpSecret)
- 	mcp.Log = log
-
- 	// Check if MCP server is already running and healthy — inherit it instead of
- 	// killing and restarting (allows transparent supervisor binary updates).
- 	inherited := false
- 	if oldPid, pidErr := ReadPIDFile(cfg.Paths.ServerPID); pidErr == nil && oldPid > 0 && IsProcessAlive(oldPid) {
- 		if mcp.IsServerReady(context.Background()) {
- 			log.Info("Inherited running MCP server (PID %d) — skipping full restart", oldPid)
- 			inherited = true
- 		} else {
- 			log.Info("MCP server process (PID %d) did not pass health check — proceeding with full restart", oldPid)
- 		}
- 	}
-
- 	if !inherited {
- 		// Kill orphan thread processes from previous runs, then clean PID files
- 		KillOrphanThreads(cfg.Paths.PIDsDir, log)
-
- 		// Kill orphan MCP server from previous run
- 		if oldPid, pidErr := ReadPIDFile(cfg.Paths.ServerPID); pidErr == nil && oldPid > 0 && IsProcessAlive(oldPid) {
- 			log.Info("Killing orphan MCP server (PID %d) from previous run", oldPid)
- 			_ = KillProcess(oldPid, log)
- 			time.Sleep(1 * time.Second) // allow port to release
- 		}
- 		_ = os.Remove(cfg.Paths.ServerPID)
- 		KillByPort(cfg.MCPHttpPort, log)
-
- 		// Spawn MCP server
- 		_, err = SpawnMCPServer(cfg, log)
- 		if err != nil {
- 			log.Error("Failed to start MCP server: %v", err)
- 			return fmt.Errorf("failed to start MCP server: %w", err)
- 		}
- 	}
-
- 	// Wait for server to be ready
- 	ctx, rootCancel := context.WithCancel(context.Background())
- 	defer rootCancel()
- 	globalCancelMu.Lock()
- 	globalCancel = rootCancel
- 	globalCancelMu.Unlock()
- 	defer func() {
- 		globalCancelMu.Lock()
- 		globalCancel = nil
- 		globalCancelMu.Unlock()
- 	}()
-
- 	if mcp.WaitForReady(ctx, 3*time.Second, cfg.KeeperReadyTimeout) {
- 		log.Info("MCP server is ready")
- 	} else {
- 		log.Warn("MCP server did not become ready in %v — proceeding anyway", cfg.KeeperReadyTimeout)
- 	}
-
- 	// Start keeper management
- 	var mu sync.Mutex
- 	keepers := make(map[int]*KeeperEntry)
-
- 	onDeath := func(threadID int, sessionName string) {
- 		log.Warn("Thread %d ('%s') died", threadID, sessionName)
- 		NotifyOperator(cfg, log, fmt.Sprintf("💀 <b>%s</b> session died — restarting…", sessionName), threadID)
- 	}
-
- 	syncKeepers := func() {
- 		if cfg.MCPHttpPort <= 0 {
- 			log.Debug("syncKeepers: skipped (no port configured)")
- 			return
- 		}
-
- 		log.Debug("syncKeepers: fetching keeper settings...")
- 		settings, err := fetchKeeperSettings(ctx, mcp, log)
- 		if err != nil {
- 			log.Warn("Failed to fetch keeper settings: %v", err)
- 			return
- 		}
- 		log.Debug("syncKeepers: got %d keeper configs", len(settings))
-
- 		mu.Lock()
- 		defer mu.Unlock()
-
- 		// Find keepers to remove (no longer in settings)
- 		wanted := make(map[int]bool)
- 		for _, s := range settings {
- 			wanted[s.ThreadID] = true
- 		}
- 		for tid, entry := range keepers {
- 			if !wanted[tid] {
- 				log.Info("Stopping keeper for removed thread %d", tid)
- 				entry.keeper.Stop()
- 				delete(keepers, tid)
- 			}
- 		}
-
- 		// Start or update keepers
- 		for _, s := range settings {
- 			existing, exists := keepers[s.ThreadID]
- 			if exists && settingsChanged(existing.settings, s) {
- 				log.Info("Settings changed for thread %d — restarting keeper", s.ThreadID)
- 				existing.keeper.Stop()
- 				delete(keepers, s.ThreadID)
- 				exists = false
- 			}
- 			if !exists {
- 				k := NewKeeper(s, cfg, mcp, log, onDeath)
- 				k.Start()
- 				keepers[s.ThreadID] = &KeeperEntry{keeper: k, settings: s}
- 				log.Info("Started keeper for thread %d ('%s')", s.ThreadID, s.SessionName)
- 			}
- 		}
- 	}
-
- 	// Initial sync
- 	log.Info("Running initial keeper sync")
- 	syncKeepers()
-
- 	// Keeper settings poller (every 2 min)
- 	keeperPollerDone := make(chan struct{})
- 	go func() {
- 		defer close(keeperPollerDone)
- 		ticker := time.NewTicker(2 * time.Minute)
- 		defer ticker.Stop()
- 		for {
- 			select {
- 			case <-ctx.Done():
- 				return
- 			case <-ticker.C:
- 				log.Debug("Keeper settings poll triggered")
- 				syncKeepers()
- 			}
- 		}
- 	}()
-
- 	// Start updater
- 	log.Info("Starting auto-updater")
- 	updater := NewUpdater(cfg, mcp, log)
- 	updater.Start()
-
- 	// Health check loop for the server process itself
- 	healthDone := make(chan struct{})
- 	go func() {
- 		defer close(healthDone)
- 		consecutiveFails := 0
- 		ticker := time.NewTicker(60 * time.Second)
- 		defer ticker.Stop()
- 		for {
- 			select {
- 			case <-ctx.Done():
- 				return
- 			case <-ticker.C:
- 				if mcp.IsServerReady(ctx) {
- 					if consecutiveFails > 0 {
- 						log.Info("Server health check recovered (was at %d fails)", consecutiveFails)
- 					}
- 					consecutiveFails = 0
- 				} else {
- 					consecutiveFails++
- 					log.Warn("Server health check failed (%d/%d)", consecutiveFails, cfg.HealthFailThresh)
- 					if consecutiveFails >= cfg.HealthFailThresh {
- 						log.Error("Server unresponsive after %d consecutive failures — restarting", consecutiveFails)
- 						NotifyOperator(cfg, log, "⚠️ Supervisor: server process not running — restarting...", 0)
-
- 						// Kill and respawn
- 						pid, pidErr := ReadPIDFile(cfg.Paths.ServerPID)
- 						if pidErr != nil {
- 							log.Warn("Could not read server PID file: %v", pidErr)
- 						}
- 						if pid > 0 {
- 							_ = KillProcess(pid, log)
- 						}
- 						KillByPort(cfg.MCPHttpPort, log)
-
- 						if _, err := SpawnMCPServer(cfg, log); err != nil {
- 							log.Error("Failed to respawn server: %v", err)
- 						}
- 						consecutiveFails = 0
- 					}
- 				}
- 			}
- 		}
- 	}()
-
- 	// Wait for shutdown signal
- 	sigCh := make(chan os.Signal, 1)
- 	signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
-
- 	log.Info("All subsystems started — supervisor is running (PID %d)", os.Getpid())
-
- 	select {
- 	case sig := <-sigCh:
- 		log.Info("Received %s — shutting down", sig)
- 		rootCancel()
- 	case <-ctx.Done():
- 		log.Info("Shutdown requested")
- 	}
-
- 	// Stop keepers (with 10s timeout)
- 	shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 10*time.Second)
- 	defer shutdownCancel()
-
- 	mu.Lock()
- 	var wg sync.WaitGroup
- 	for _, entry := range keepers {
- 		wg.Add(1)
- 		go func(k *Keeper) {
- 			defer wg.Done()
- 			k.Stop()
- 		}(entry.keeper)
- 	}
- 	mu.Unlock()
-
- 	doneCh := make(chan struct{})
- 	go func() { wg.Wait(); close(doneCh) }()
- 	select {
- 	case <-doneCh:
- 		log.Info("All keepers stopped")
- 	case <-shutdownCtx.Done():
- 		log.Warn("Keeper shutdown timed out after 10s")
- 	}
-
- 	// Stop updater
- 	updater.Stop()
-
- 	// Wait for background goroutines
- 	<-keeperPollerDone
- 	<-healthDone
-
- 	// Ask MCP to write reconnect snapshot before killing it
- 	mcp.PrepareShutdown(context.Background())
-
- 	// Kill server process
- 	pid, err := ReadPIDFile(cfg.Paths.ServerPID)
- 	if err == nil && pid > 0 {
- 		log.Info("Stopping MCP server (PID %d)", pid)
- 		_ = KillProcess(pid, log)
- 	}
-
- 	log.Info("Supervisor stopped cleanly")
- 	return nil
- }
-
- // fetchKeeperSettings reads all keepAlive threads from the MCP server
- // (root, branch, and daily — excludes worker threads).
- func fetchKeeperSettings(ctx context.Context, mcp *MCPClient, log *Logger) ([]KeeperConfig, error) {
- 	roots, err := mcp.GetKeepAliveThreads(ctx)
- 	if err != nil {
- 		return nil, err
- 	}
-
- 	var result []KeeperConfig
- 	for _, r := range roots {
- 		keepAlive, _ := r["keepAlive"].(bool)
- 		if !keepAlive {
- 			continue
- 		}
-
- 		// Skip non-active roots (archived, expired, exited)
- 		if status, _ := r["status"].(string); status != "" && status != "active" {
- 			continue
- 		}
-
- 		tidFloat, _ := r["threadId"].(float64) // JSON numbers decode as float64
- 		tid := int(tidFloat)
- 		if tid <= 0 {
- 			continue
- 		}
-
- 		client := "claude"
- 		if c, ok := r["client"].(string); ok && c != "" {
- 			client = c
- 		}
-
- 		sessionName := ""
- 		if n, ok := r["name"].(string); ok {
- 			sessionName = n
- 		}
-
- 		maxRetries := 5
- 		if mr, ok := r["maxRetries"].(float64); ok {
- 			maxRetries = int(mr)
- 		}
-
- 		cooldownMs := 300_000
- 		if cd, ok := r["cooldownMs"].(float64); ok {
- 			cooldownMs = int(cd)
- 		}
-
- 		workDir := ""
- 		if wd, ok := r["workingDirectory"].(string); ok {
- 			workDir = wd
- 		}
-
- 		result = append(result, KeeperConfig{
- 			ThreadID:         tid,
- 			SessionName:      sessionName,
- 			Client:           client,
- 			WorkingDirectory: workDir,
- 			MaxRetries:       maxRetries,
- 			CooldownMs:       cooldownMs,
- 		})
- 	}
- 	return result, nil
- }
-
- func settingsChanged(a, b KeeperConfig) bool {
- 	return a.MaxRetries != b.MaxRetries ||
- 		a.CooldownMs != b.CooldownMs ||
- 		a.Client != b.Client ||
- 		a.SessionName != b.SessionName ||
- 		a.WorkingDirectory != b.WorkingDirectory
- }