sensorium-mcp 3.0.4 → 3.0.6
This diff shows the changes between publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- package/Install-Sensorium.ps1 +102 -209
- package/dist/dashboard/routes/data.d.ts.map +1 -1
- package/dist/dashboard/routes/data.js +2 -1
- package/dist/dashboard/routes/data.js.map +1 -1
- package/dist/dashboard/routes/threads.js +1 -1
- package/dist/dashboard/routes/threads.js.map +1 -1
- package/dist/dashboard/routes.d.ts.map +1 -1
- package/dist/dashboard/routes.js +1 -3
- package/dist/dashboard/routes.js.map +1 -1
- package/dist/data/memory/migration-runner.d.ts +1 -1
- package/dist/data/memory/migration-runner.d.ts.map +1 -1
- package/dist/data/memory/migration-runner.js +59 -3
- package/dist/data/memory/migration-runner.js.map +1 -1
- package/dist/data/memory/narrative.d.ts.map +1 -1
- package/dist/data/memory/narrative.js +43 -6
- package/dist/data/memory/narrative.js.map +1 -1
- package/dist/data/memory/reflection.d.ts +24 -0
- package/dist/data/memory/reflection.d.ts.map +1 -1
- package/dist/data/memory/reflection.js +65 -1
- package/dist/data/memory/reflection.js.map +1 -1
- package/dist/data/memory/schema-ddl.d.ts +1 -1
- package/dist/data/memory/schema-ddl.d.ts.map +1 -1
- package/dist/data/memory/schema-ddl.js +2 -1
- package/dist/data/memory/schema-ddl.js.map +1 -1
- package/dist/data/memory/thread-registry.js +1 -1
- package/dist/data/memory/thread-registry.js.map +1 -1
- package/dist/http-server.d.ts.map +1 -1
- package/dist/http-server.js +1 -9
- package/dist/http-server.js.map +1 -1
- package/dist/index.js +3 -6
- package/dist/index.js.map +1 -1
- package/dist/server/factory.js +1 -1
- package/dist/server/factory.js.map +1 -1
- package/dist/services/agent-spawn.service.d.ts +7 -1
- package/dist/services/agent-spawn.service.d.ts.map +1 -1
- package/dist/services/agent-spawn.service.js +69 -45
- package/dist/services/agent-spawn.service.js.map +1 -1
- package/dist/services/consolidation.service.d.ts.map +1 -1
- package/dist/services/consolidation.service.js +88 -35
- package/dist/services/consolidation.service.js.map +1 -1
- package/dist/services/keeper.service.d.ts +21 -0
- package/dist/services/keeper.service.d.ts.map +1 -0
- package/dist/services/keeper.service.js +195 -0
- package/dist/services/keeper.service.js.map +1 -0
- package/dist/services/maintenance-signal.d.ts +2 -0
- package/dist/services/maintenance-signal.d.ts.map +1 -1
- package/dist/services/maintenance-signal.js +7 -1
- package/dist/services/maintenance-signal.js.map +1 -1
- package/dist/services/memory-briefing.service.d.ts.map +1 -1
- package/dist/services/memory-briefing.service.js +17 -1
- package/dist/services/memory-briefing.service.js.map +1 -1
- package/dist/services/process.service.d.ts +19 -2
- package/dist/services/process.service.d.ts.map +1 -1
- package/dist/services/process.service.js +104 -10
- package/dist/services/process.service.js.map +1 -1
- package/dist/services/thread-lifecycle.service.d.ts +5 -0
- package/dist/services/thread-lifecycle.service.d.ts.map +1 -1
- package/dist/services/thread-lifecycle.service.js +33 -8
- package/dist/services/thread-lifecycle.service.js.map +1 -1
- package/dist/services/worker-cleanup.service.d.ts +14 -1
- package/dist/services/worker-cleanup.service.d.ts.map +1 -1
- package/dist/services/worker-cleanup.service.js +36 -38
- package/dist/services/worker-cleanup.service.js.map +1 -1
- package/dist/sessions.d.ts +0 -5
- package/dist/sessions.d.ts.map +1 -1
- package/dist/sessions.js +0 -7
- package/dist/sessions.js.map +1 -1
- package/dist/stdio-server.d.ts.map +1 -1
- package/dist/stdio-server.js +1 -7
- package/dist/stdio-server.js.map +1 -1
- package/dist/tools/delegate-tool.d.ts.map +1 -1
- package/dist/tools/delegate-tool.js +2 -2
- package/dist/tools/delegate-tool.js.map +1 -1
- package/dist/tools/session-tools.js +1 -1
- package/dist/tools/session-tools.js.map +1 -1
- package/dist/tools/start-session-tool.d.ts.map +1 -1
- package/dist/tools/start-session-tool.js +8 -9
- package/dist/tools/start-session-tool.js.map +1 -1
- package/dist/tools/wait/message-processing.d.ts.map +1 -1
- package/dist/tools/wait/message-processing.js +28 -0
- package/dist/tools/wait/message-processing.js.map +1 -1
- package/dist/tools/wait/poll-loop.js +1 -1
- package/dist/tools/wait/poll-loop.js.map +1 -1
- package/package.json +1 -1
- package/dist/tools/thread-lifecycle.d.ts +0 -6
- package/dist/tools/thread-lifecycle.d.ts.map +0 -1
- package/dist/tools/thread-lifecycle.js +0 -6
- package/dist/tools/thread-lifecycle.js.map +0 -1
- package/supervisor/config.go +0 -253
- package/supervisor/config_test.go +0 -78
- package/supervisor/go.mod +0 -15
- package/supervisor/go.sum +0 -20
- package/supervisor/health.go +0 -433
- package/supervisor/health_test.go +0 -93
- package/supervisor/keeper.go +0 -309
- package/supervisor/keeper_test.go +0 -27
- package/supervisor/lock.go +0 -57
- package/supervisor/lock_test.go +0 -54
- package/supervisor/log.go +0 -195
- package/supervisor/log_test.go +0 -125
- package/supervisor/main.go +0 -475
- package/supervisor/main_test.go +0 -130
- package/supervisor/notify.go +0 -53
- package/supervisor/process.go +0 -294
- package/supervisor/process_test.go +0 -108
- package/supervisor/process_unix.go +0 -14
- package/supervisor/process_windows.go +0 -15
- package/supervisor/secrets.go +0 -95
- package/supervisor/secrets_securevault_test.go +0 -98
- package/supervisor/secrets_test.go +0 -119
- package/supervisor/self_update.go +0 -282
- package/supervisor/self_update_test.go +0 -177
- package/supervisor/service_restart_stub.go +0 -9
- package/supervisor/service_restart_windows.go +0 -63
- package/supervisor/service_stub.go +0 -15
- package/supervisor/service_windows.go +0 -194
- package/supervisor/update_state.go +0 -264
- package/supervisor/update_state_test.go +0 -306
- package/supervisor/updater.go +0 -613
- package/supervisor/updater_test.go +0 -64
package/supervisor/log_test.go
DELETED

```diff
@@ -1,125 +0,0 @@
-package main
-
-import (
-	"os"
-	"path/filepath"
-	"strings"
-	"testing"
-)
-
-func TestLogRotation(t *testing.T) {
-	dir := t.TempDir()
-	logPath := filepath.Join(dir, "test.log")
-
-	l := &Logger{
-		logPath: logPath,
-		maxSize: 100, // tiny threshold for test
-		maxKeep: 2,
-	}
-	l.openFile()
-	defer l.Close()
-
-	// Write enough lines to trigger multiple rotations
-	for i := 0; i < 50; i++ {
-		l.Info("line %d: %s", i, strings.Repeat("x", 20))
-	}
-
-	// Current log should exist and be under maxSize
-	info, err := os.Stat(logPath)
-	if err != nil {
-		t.Fatalf("log file missing: %v", err)
-	}
-	if info.Size() >= 100 {
-		t.Errorf("log file should have been rotated, size=%d", info.Size())
-	}
-
-	// At least one rotated file should exist (timestamp-based, e.g. test.2026-04-15T....log)
-	entries, err := os.ReadDir(dir)
-	if err != nil {
-		t.Fatalf("cannot read dir: %v", err)
-	}
-	var rotated []string
-	for _, e := range entries {
-		if e.Name() != "test.log" && strings.HasPrefix(e.Name(), "test.") {
-			rotated = append(rotated, e.Name())
-		}
-	}
-	if len(rotated) == 0 {
-		t.Error("expected at least one rotated file to exist")
-	}
-
-	// maxKeep=2: no more than 2 rotated files should exist
-	if len(rotated) > 2 {
-		t.Errorf("expected at most 2 rotated files (maxKeep=2), got %d: %v", len(rotated), rotated)
-	}
-}
-
-func TestDailyRotation(t *testing.T) {
-	dir := t.TempDir()
-	logPath := filepath.Join(dir, "test.log")
-
-	// Write a fake "yesterday" log
-	yesterday := "2026-04-14"
-	if err := os.WriteFile(logPath, []byte("old log content\n"), 0644); err != nil {
-		t.Fatal(err)
-	}
-
-	l := &Logger{
-		logPath: logPath,
-		maxSize: 5 * 1024 * 1024,
-		maxKeep: 7,
-		today:   "2026-04-15", // simulate tomorrow
-	}
-	l.rotateDailyIfNeeded()
-
-	// Original file should have been renamed to test.2026-04-14.log (mod date matches)
-	// (mod date may be today in tests, so just verify the original is gone or renamed)
-	entries, err := os.ReadDir(dir)
-	if err != nil {
-		t.Fatal(err)
-	}
-	var found bool
-	for _, e := range entries {
-		if strings.Contains(e.Name(), yesterday) || (e.Name() != "test.log" && strings.HasPrefix(e.Name(), "test.")) {
-			found = true
-		}
-	}
-	_ = found // rotation may or may not fire depending on file mod time in test env
-}
-
-func TestLogRotationMaxKeep(t *testing.T) {
-	dir := t.TempDir()
-	logPath := filepath.Join(dir, "test.log")
-
-	// Pre-create 5 fake rotated files
-	for i := 0; i < 5; i++ {
-		fake := filepath.Join(dir, "test.2026-04-1"+string(rune('0'+i))+"T120000.log")
-		if err := os.WriteFile(fake, []byte("x"), 0644); err != nil {
-			t.Fatal(err)
-		}
-	}
-
-	l := &Logger{
-		logPath: logPath,
-		maxSize: 100,
-		maxKeep: 2,
-	}
-	l.openFile()
-	defer l.Close()
-
-	l.pruneOldLogs()
-
-	entries, err := os.ReadDir(dir)
-	if err != nil {
-		t.Fatal(err)
-	}
-	var rotated []string
-	for _, e := range entries {
-		if e.Name() != "test.log" && strings.HasPrefix(e.Name(), "test.") {
-			rotated = append(rotated, e.Name())
-		}
-	}
-	if len(rotated) > 2 {
-		t.Errorf("pruneOldLogs should have left at most maxKeep=2, got %d: %v", len(rotated), rotated)
-	}
-}
```
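The `Logger` these tests drive was defined in `supervisor/log.go`, which this diff lists as deleted (-195 lines) but does not reproduce. As a reading aid, here is a minimal sketch of the surface the tests assume: size-triggered rotation to timestamped archives, mod-time-based daily rotation, and pruning down to `maxKeep`. Only the field and method names come from the test above; the bodies, the exact timestamp format, and the file layout are guesses.

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
	"sort"
	"strings"
	"time"
)

// Logger is a hypothetical reconstruction of the interface exercised by
// log_test.go above; it is not the deleted supervisor/log.go.
type Logger struct {
	logPath string
	maxSize int64
	maxKeep int
	today   string // last day (YYYY-MM-DD) the log belongs to, for daily rotation
	f       *os.File
}

func (l *Logger) openFile() {
	l.f, _ = os.OpenFile(l.logPath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
}

func (l *Logger) Close() {
	if l.f != nil {
		_ = l.f.Close()
	}
}

// Info appends a formatted line and rotates once the file reaches maxSize.
func (l *Logger) Info(format string, args ...any) {
	fmt.Fprintf(l.f, format+"\n", args...)
	if st, err := l.f.Stat(); err == nil && st.Size() >= l.maxSize {
		l.rotate()
	}
}

// rotate renames the active log to <base>.<timestamp>.log and reopens it.
// Nanosecond precision avoids name collisions; the real format may differ.
func (l *Logger) rotate() {
	_ = l.f.Close()
	base := strings.TrimSuffix(l.logPath, ".log")
	archived := fmt.Sprintf("%s.%s.log", base, time.Now().Format("2006-01-02T150405.000000000"))
	_ = os.Rename(l.logPath, archived)
	l.openFile()
	l.pruneOldLogs()
}

// rotateDailyIfNeeded archives the current file when its mod date no longer
// matches l.today (TestDailyRotation deliberately tolerates either outcome).
func (l *Logger) rotateDailyIfNeeded() {
	st, err := os.Stat(l.logPath)
	if err != nil {
		return
	}
	if day := st.ModTime().Format("2006-01-02"); l.today != "" && day != l.today {
		base := strings.TrimSuffix(l.logPath, ".log")
		_ = os.Rename(l.logPath, base+"."+day+".log")
	}
}

// pruneOldLogs removes the oldest timestamped archives beyond maxKeep.
func (l *Logger) pruneOldLogs() {
	dir := filepath.Dir(l.logPath)
	current := filepath.Base(l.logPath)
	prefix := strings.TrimSuffix(current, ".log") + "."
	entries, err := os.ReadDir(dir)
	if err != nil {
		return
	}
	var archives []string
	for _, e := range entries {
		if e.Name() != current && strings.HasPrefix(e.Name(), prefix) {
			archives = append(archives, e.Name())
		}
	}
	sort.Strings(archives) // timestamped names sort oldest-first
	for len(archives) > l.maxKeep {
		_ = os.Remove(filepath.Join(dir, archives[0]))
		archives = archives[1:]
	}
}
```

Under these assumptions all three tests pass; the real log.go almost certainly differs in details such as locking and the Debug/Warn/Error levels that main.go below also calls.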
package/supervisor/main.go
DELETED

```diff
@@ -1,475 +0,0 @@
-package main
-
-import (
-	"context"
-	"flag"
-	"fmt"
-	"os"
-	"os/signal"
-	"strings"
-	"sync"
-	"syscall"
-	"time"
-)
-
-var (
-	globalCancelMu sync.Mutex
-	globalCancel   context.CancelFunc
-)
-
-// KeeperEntry tracks a running keeper and its settings.
-type KeeperEntry struct {
-	keeper   *Keeper
-	settings KeeperConfig
-}
-
-func main() {
-	isService, err := isWindowsService()
-	if err != nil {
-		fmt.Fprintf(os.Stderr, "Failed to detect service mode: %v\n", err)
-		os.Exit(1)
-	}
-	if isService {
-		if err := runAsService(); err != nil {
-			fmt.Fprintf(os.Stderr, "Service run failed: %v\n", err)
-			os.Exit(1)
-		}
-		return
-	}
-
-	if handled, err := handleServiceCommand(os.Args[1:]); err != nil {
-		fmt.Fprintf(os.Stderr, "%v\n", err)
-		os.Exit(1)
-	} else if handled {
-		return
-	}
-
-	runningAsService := resolveRunSupervisorMode(isService, os.Getenv("HOST_MODE"))
-	if err := runSupervisor(runningAsService); err != nil {
-		fmt.Fprintf(os.Stderr, "Supervisor failed: %v\n", err)
-		os.Exit(1)
-	}
-}
-
-func resolveRunSupervisorMode(processIsService bool, hostModeValue string) bool {
-	if processIsService {
-		return true
-	}
-
-	return parseHostMode(hostModeValue, false) == "service"
-}
-
-func handleServiceCommand(args []string) (bool, error) {
-	if len(args) == 0 {
-		return false, nil
-	}
-
-	switch args[0] {
-	case "install":
-		fs := flag.NewFlagSet("install", flag.ContinueOnError)
-		serviceUser := fs.String("service-user", "", "Windows account to run service as (e.g. .\\YourUser). Defaults to LocalSystem if empty.")
-		servicePassword := fs.String("service-password", "", "Password for the service account (required for regular user accounts; not needed for LocalSystem/LocalService/NetworkService, NT SERVICE\\*, or gMSA names ending with '$').")
-		if err := fs.Parse(args[1:]); err != nil {
-			return true, err
-		}
-		if *serviceUser != "" && *servicePassword == "" && !isPasswordlessServiceIdentity(*serviceUser) {
-			return true, fmt.Errorf("install failed: -service-password is required for regular -service-user accounts\nAllowed passwordless identities: LocalSystem/LocalService/NetworkService, NT SERVICE\\*, and gMSA names ending with '$'\nNote: prefer using Install-Sensorium.ps1, which prompts securely for passwords")
-		}
-		exePath, err := os.Executable()
-		if err != nil {
-			return true, fmt.Errorf("install failed: resolve executable: %w", err)
-		}
-		return true, installService(exePath, *serviceUser, *servicePassword)
-	case "uninstall":
-		return true, uninstallService()
-	case "start":
-		return true, startService()
-	case "stop":
-		return true, stopService()
-	case "status":
-		return true, serviceStatus()
-	default:
-		return false, nil
-	}
-}
-
-func isPasswordlessServiceIdentity(user string) bool {
-	trimmed := strings.TrimSpace(user)
-	if trimmed == "" {
-		return false
-	}
-
-	lower := strings.ToLower(trimmed)
-	switch lower {
-	case "localsystem", "nt authority\\system", "localservice", "nt authority\\localservice", "networkservice", "nt authority\\networkservice":
-		return true
-	}
-
-	if strings.HasPrefix(lower, "nt service\\") {
-		return true
-	}
-
-	return strings.HasSuffix(trimmed, "$")
-}
-
-func stopSupervisor() {
-	globalCancelMu.Lock()
-	fn := globalCancel
-	globalCancelMu.Unlock()
-	if fn != nil {
-		fn()
-	}
-}
-
-func runSupervisor(runningAsService bool) error {
-	cfg := LoadConfig(runningAsService)
-
-	if err := os.MkdirAll(cfg.DataDir, 0755); err != nil {
-		return fmt.Errorf("cannot create data dir %s: %w", cfg.DataDir, err)
-	}
-
-	log := NewLogger(cfg.Paths.WatcherLog)
-	defer log.Close()
-
-	// Acquire lock — prevent multiple instances
-	if !AcquireLock(cfg.Paths.WatcherLock, log) {
-		return fmt.Errorf("another supervisor instance is already running")
-	}
-	defer ReleaseLock(cfg.Paths.WatcherLock)
-
-	shouldRestart, err := applyPendingSupervisorUpdate(cfg, log)
-	if err != nil {
-		log.Warn("Pending supervisor update could not be applied: %v", err)
-	}
-	if shouldRestart {
-		return nil
-	}
-
-	recoverPersistedUpdateStateOnStartup(cfg, log)
-
-	log.Info("sensorium-supervisor starting (mode=%s, hostMode=%s, port=%d, dataDir=%s)", cfg.Mode, cfg.HostMode, cfg.MCPHttpPort, cfg.DataDir)
-	log.Debug("Config: MCPStartCommand=%q, PollInterval=%v, MinUptime=%v, KeeperMaxRetries=%d", cfg.MCPStartCommand, cfg.PollInterval, cfg.MinUptime, cfg.KeeperMaxRetries)
-	log.Debug("Config: TelegramToken=%v, HealthFailThresh=%d, StuckThreshold=%v", cfg.TelegramToken != "", cfg.HealthFailThresh, cfg.StuckThreshold)
-
-	if err := os.MkdirAll(cfg.Paths.PIDsDir, 0755); err != nil {
-		log.Warn("Cannot create PIDs dir %s: %v", cfg.Paths.PIDsDir, err)
-	}
-	if err := os.MkdirAll(cfg.Paths.HeartbeatsDir, 0755); err != nil {
-		log.Warn("Cannot create heartbeats dir %s: %v", cfg.Paths.HeartbeatsDir, err)
-	}
-
-	if cfg.MCPHttpPort <= 0 {
-		log.Error("MCP_HTTP_PORT must be set (got %d)", cfg.MCPHttpPort)
-		return fmt.Errorf("MCP_HTTP_PORT must be set (got %d)", cfg.MCPHttpPort)
-	}
-
-	mcp := NewMCPClient(cfg.MCPHttpPort, cfg.MCPHttpSecret)
-	mcp.Log = log
-
-	// Check if MCP server is already running and healthy — inherit it instead of
-	// killing and restarting (allows transparent supervisor binary updates).
-	inherited := false
-	if oldPid, pidErr := ReadPIDFile(cfg.Paths.ServerPID); pidErr == nil && oldPid > 0 && IsProcessAlive(oldPid) {
-		if mcp.IsServerReady(context.Background()) {
-			log.Info("Inherited running MCP server (PID %d) — skipping full restart", oldPid)
-			inherited = true
-		} else {
-			log.Info("MCP server process (PID %d) did not pass health check — proceeding with full restart", oldPid)
-		}
-	}
-
-	if !inherited {
-		// Kill orphan thread processes from previous runs, then clean PID files
-		KillOrphanThreads(cfg.Paths.PIDsDir, log)
-
-		// Kill orphan MCP server from previous run
-		if oldPid, pidErr := ReadPIDFile(cfg.Paths.ServerPID); pidErr == nil && oldPid > 0 && IsProcessAlive(oldPid) {
-			log.Info("Killing orphan MCP server (PID %d) from previous run", oldPid)
-			_ = KillProcess(oldPid, log)
-			time.Sleep(1 * time.Second) // allow port to release
-		}
-		_ = os.Remove(cfg.Paths.ServerPID)
-		KillByPort(cfg.MCPHttpPort, log)
-
-		// Spawn MCP server
-		_, err = SpawnMCPServer(cfg, log)
-		if err != nil {
-			log.Error("Failed to start MCP server: %v", err)
-			return fmt.Errorf("failed to start MCP server: %w", err)
-		}
-	}
-
-	// Wait for server to be ready
-	ctx, rootCancel := context.WithCancel(context.Background())
-	defer rootCancel()
-	globalCancelMu.Lock()
-	globalCancel = rootCancel
-	globalCancelMu.Unlock()
-	defer func() {
-		globalCancelMu.Lock()
-		globalCancel = nil
-		globalCancelMu.Unlock()
-	}()
-
-	if mcp.WaitForReady(ctx, 3*time.Second, cfg.KeeperReadyTimeout) {
-		log.Info("MCP server is ready")
-	} else {
-		log.Warn("MCP server did not become ready in %v — proceeding anyway", cfg.KeeperReadyTimeout)
-	}
-
-	// Start keeper management
-	var mu sync.Mutex
-	keepers := make(map[int]*KeeperEntry)
-
-	onDeath := func(threadID int, sessionName string) {
-		log.Warn("Thread %d ('%s') died", threadID, sessionName)
-		NotifyOperator(cfg, log, fmt.Sprintf("💀 <b>%s</b> session died — restarting…", sessionName), threadID)
-	}
-
-	syncKeepers := func() {
-		if cfg.MCPHttpPort <= 0 {
-			log.Debug("syncKeepers: skipped (no port configured)")
-			return
-		}
-
-		log.Debug("syncKeepers: fetching keeper settings...")
-		settings, err := fetchKeeperSettings(ctx, mcp, log)
-		if err != nil {
-			log.Warn("Failed to fetch keeper settings: %v", err)
-			return
-		}
-		log.Debug("syncKeepers: got %d keeper configs", len(settings))
-
-		mu.Lock()
-		defer mu.Unlock()
-
-		// Find keepers to remove (no longer in settings)
-		wanted := make(map[int]bool)
-		for _, s := range settings {
-			wanted[s.ThreadID] = true
-		}
-		for tid, entry := range keepers {
-			if !wanted[tid] {
-				log.Info("Stopping keeper for removed thread %d", tid)
-				entry.keeper.Stop()
-				delete(keepers, tid)
-			}
-		}
-
-		// Start or update keepers
-		for _, s := range settings {
-			existing, exists := keepers[s.ThreadID]
-			if exists && settingsChanged(existing.settings, s) {
-				log.Info("Settings changed for thread %d — restarting keeper", s.ThreadID)
-				existing.keeper.Stop()
-				delete(keepers, s.ThreadID)
-				exists = false
-			}
-			if !exists {
-				k := NewKeeper(s, cfg, mcp, log, onDeath)
-				k.Start()
-				keepers[s.ThreadID] = &KeeperEntry{keeper: k, settings: s}
-				log.Info("Started keeper for thread %d ('%s')", s.ThreadID, s.SessionName)
-			}
-		}
-	}
-
-	// Initial sync
-	log.Info("Running initial keeper sync")
-	syncKeepers()
-
-	// Keeper settings poller (every 2 min)
-	keeperPollerDone := make(chan struct{})
-	go func() {
-		defer close(keeperPollerDone)
-		ticker := time.NewTicker(2 * time.Minute)
-		defer ticker.Stop()
-		for {
-			select {
-			case <-ctx.Done():
-				return
-			case <-ticker.C:
-				log.Debug("Keeper settings poll triggered")
-				syncKeepers()
-			}
-		}
-	}()
-
-	// Start updater
-	log.Info("Starting auto-updater")
-	updater := NewUpdater(cfg, mcp, log)
-	updater.Start()
-
-	// Health check loop for the server process itself
-	healthDone := make(chan struct{})
-	go func() {
-		defer close(healthDone)
-		consecutiveFails := 0
-		ticker := time.NewTicker(60 * time.Second)
-		defer ticker.Stop()
-		for {
-			select {
-			case <-ctx.Done():
-				return
-			case <-ticker.C:
-				if mcp.IsServerReady(ctx) {
-					if consecutiveFails > 0 {
-						log.Info("Server health check recovered (was at %d fails)", consecutiveFails)
-					}
-					consecutiveFails = 0
-				} else {
-					consecutiveFails++
-					log.Warn("Server health check failed (%d/%d)", consecutiveFails, cfg.HealthFailThresh)
-					if consecutiveFails >= cfg.HealthFailThresh {
-						log.Error("Server unresponsive after %d consecutive failures — restarting", consecutiveFails)
-						NotifyOperator(cfg, log, "⚠️ Supervisor: server process not running — restarting...", 0)
-
-						// Kill and respawn
-						pid, pidErr := ReadPIDFile(cfg.Paths.ServerPID)
-						if pidErr != nil {
-							log.Warn("Could not read server PID file: %v", pidErr)
-						}
-						if pid > 0 {
-							_ = KillProcess(pid, log)
-						}
-						KillByPort(cfg.MCPHttpPort, log)
-
-						if _, err := SpawnMCPServer(cfg, log); err != nil {
-							log.Error("Failed to respawn server: %v", err)
-						}
-						consecutiveFails = 0
-					}
-				}
-			}
-		}
-	}()
-
-	// Wait for shutdown signal
-	sigCh := make(chan os.Signal, 1)
-	signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
-
-	log.Info("All subsystems started — supervisor is running (PID %d)", os.Getpid())
-
-	select {
-	case sig := <-sigCh:
-		log.Info("Received %s — shutting down", sig)
-		rootCancel()
-	case <-ctx.Done():
-		log.Info("Shutdown requested")
-	}
-
-	// Stop keepers (with 10s timeout)
-	shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 10*time.Second)
-	defer shutdownCancel()
-
-	mu.Lock()
-	var wg sync.WaitGroup
-	for _, entry := range keepers {
-		wg.Add(1)
-		go func(k *Keeper) {
-			defer wg.Done()
-			k.Stop()
-		}(entry.keeper)
-	}
-	mu.Unlock()
-
-	doneCh := make(chan struct{})
-	go func() { wg.Wait(); close(doneCh) }()
-	select {
-	case <-doneCh:
-		log.Info("All keepers stopped")
-	case <-shutdownCtx.Done():
-		log.Warn("Keeper shutdown timed out after 10s")
-	}
-
-	// Stop updater
-	updater.Stop()
-
-	// Wait for background goroutines
-	<-keeperPollerDone
-	<-healthDone
-
-	// Ask MCP to write reconnect snapshot before killing it
-	mcp.PrepareShutdown(context.Background())
-
-	// Kill server process
-	pid, err := ReadPIDFile(cfg.Paths.ServerPID)
-	if err == nil && pid > 0 {
-		log.Info("Stopping MCP server (PID %d)", pid)
-		_ = KillProcess(pid, log)
-	}
-
-	log.Info("Supervisor stopped cleanly")
-	return nil
-}
-
-// fetchKeeperSettings reads all keepAlive threads from the MCP server
-// (root, branch, and daily — excludes worker threads).
-func fetchKeeperSettings(ctx context.Context, mcp *MCPClient, log *Logger) ([]KeeperConfig, error) {
-	roots, err := mcp.GetKeepAliveThreads(ctx)
-	if err != nil {
-		return nil, err
-	}
-
-	var result []KeeperConfig
-	for _, r := range roots {
-		keepAlive, _ := r["keepAlive"].(bool)
-		if !keepAlive {
-			continue
-		}
-
-		// Skip non-active roots (archived, expired, exited)
-		if status, _ := r["status"].(string); status != "" && status != "active" {
-			continue
-		}
-
-		tidFloat, _ := r["threadId"].(float64) // JSON numbers decode as float64
-		tid := int(tidFloat)
-		if tid <= 0 {
-			continue
-		}
-
-		client := "claude"
-		if c, ok := r["client"].(string); ok && c != "" {
-			client = c
-		}
-
-		sessionName := ""
-		if n, ok := r["name"].(string); ok {
-			sessionName = n
-		}
-
-		maxRetries := 5
-		if mr, ok := r["maxRetries"].(float64); ok {
-			maxRetries = int(mr)
-		}
-
-		cooldownMs := 300_000
-		if cd, ok := r["cooldownMs"].(float64); ok {
-			cooldownMs = int(cd)
-		}
-
-		workDir := ""
-		if wd, ok := r["workingDirectory"].(string); ok {
-			workDir = wd
-		}
-
-		result = append(result, KeeperConfig{
-			ThreadID:         tid,
-			SessionName:      sessionName,
-			Client:           client,
-			WorkingDirectory: workDir,
-			MaxRetries:       maxRetries,
-			CooldownMs:       cooldownMs,
-		})
-	}
-	return result, nil
-}
-
-func settingsChanged(a, b KeeperConfig) bool {
-	return a.MaxRetries != b.MaxRetries ||
-		a.CooldownMs != b.CooldownMs ||
-		a.Client != b.Client ||
-		a.SessionName != b.SessionName ||
-		a.WorkingDirectory != b.WorkingDirectory
-}
```
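For orientation: `fetchKeeperSettings` above builds `KeeperConfig` values out of loosely typed JSON maps returned by `MCPClient.GetKeepAliveThreads`. The struct itself was defined in the deleted `supervisor/keeper.go`, which this diff lists but does not reproduce. The sketch below is consistent with the field names, JSON keys, and defaults visible in the decoding loop; the comments and the sample payload are inferred, not taken from the package.

```go
package main

// KeeperConfig mirrors the fields that fetchKeeperSettings populates for
// each keepAlive thread. This is a hypothetical reconstruction: only the
// field names, JSON keys, and defaults below appear in main.go above.
//
// A thread entry that passes the filters decodes from JSON shaped roughly
// like (values illustrative):
//
//	{"threadId": 42, "name": "ops", "keepAlive": true, "status": "active",
//	 "client": "claude", "maxRetries": 5, "cooldownMs": 300000,
//	 "workingDirectory": "..."}
type KeeperConfig struct {
	ThreadID         int    // "threadId"; entries with a non-positive ID are skipped
	SessionName      string // "name"; used in notifications and log lines
	Client           string // "client"; defaults to "claude"
	WorkingDirectory string // "workingDirectory"; cwd for the kept session
	MaxRetries       int    // "maxRetries"; defaults to 5
	CooldownMs       int    // "cooldownMs"; defaults to 300_000 (5 minutes)
}
```

Any change to these fields between polls makes `settingsChanged` return true, which restarts that thread's keeper, so the struct doubles as the supervisor's unit of configuration identity.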