sensorium-mcp 2.17.26 → 2.17.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/dist/dashboard/routes/threads.d.ts.map +1 -1
  2. package/dist/dashboard/routes/threads.js +16 -5
  3. package/dist/dashboard/routes/threads.js.map +1 -1
  4. package/dist/data/memory/bootstrap.js +2 -2
  5. package/dist/data/memory/bootstrap.js.map +1 -1
  6. package/dist/data/memory/consolidation.d.ts.map +1 -1
  7. package/dist/data/memory/consolidation.js +74 -4
  8. package/dist/data/memory/consolidation.js.map +1 -1
  9. package/dist/data/memory/semantic.d.ts +11 -0
  10. package/dist/data/memory/semantic.d.ts.map +1 -1
  11. package/dist/data/memory/semantic.js +37 -0
  12. package/dist/data/memory/semantic.js.map +1 -1
  13. package/dist/data/memory/thread-registry.d.ts +7 -0
  14. package/dist/data/memory/thread-registry.d.ts.map +1 -1
  15. package/dist/data/memory/thread-registry.js +11 -1
  16. package/dist/data/memory/thread-registry.js.map +1 -1
  17. package/dist/index.js +17 -5
  18. package/dist/index.js.map +1 -1
  19. package/dist/tools/shared-agent-utils.d.ts.map +1 -1
  20. package/dist/tools/shared-agent-utils.js +3 -4
  21. package/dist/tools/shared-agent-utils.js.map +1 -1
  22. package/dist/tools/thread-lifecycle.d.ts.map +1 -1
  23. package/dist/tools/thread-lifecycle.js +33 -15
  24. package/dist/tools/thread-lifecycle.js.map +1 -1
  25. package/package.json +10 -2
  26. package/scripts/install-supervisor.ps1 +67 -0
  27. package/scripts/install-supervisor.sh +43 -0
  28. package/scripts/start-supervisor.ps1 +46 -0
  29. package/scripts/start-supervisor.sh +20 -0
  30. package/supervisor/config.go +140 -0
  31. package/supervisor/go.mod +3 -0
  32. package/supervisor/health.go +389 -0
  33. package/supervisor/health_test.go +93 -0
  34. package/supervisor/keeper.go +303 -0
  35. package/supervisor/keeper_test.go +27 -0
  36. package/supervisor/lock.go +56 -0
  37. package/supervisor/lock_test.go +54 -0
  38. package/supervisor/log.go +114 -0
  39. package/supervisor/log_test.go +45 -0
  40. package/supervisor/main.go +325 -0
  41. package/supervisor/notify.go +53 -0
  42. package/supervisor/process.go +222 -0
  43. package/supervisor/process_test.go +94 -0
  44. package/supervisor/process_unix.go +14 -0
  45. package/supervisor/process_windows.go +15 -0
  46. package/supervisor/updater.go +281 -0
  47. package/dist/claude-keeper.d.ts +0 -24
  48. package/dist/claude-keeper.d.ts.map +0 -1
  49. package/dist/claude-keeper.js +0 -374
  50. package/dist/claude-keeper.js.map +0 -1
  51. package/dist/watcher-service.d.ts +0 -2
  52. package/dist/watcher-service.d.ts.map +0 -1
  53. package/dist/watcher-service.js +0 -997
  54. package/dist/watcher-service.js.map +0 -1
@@ -0,0 +1,15 @@
1
+ //go:build windows
2
+
3
+ package main
4
+
5
+ import (
6
+ "os/exec"
7
+ "syscall"
8
+ )
9
+
10
+ func setSysProcAttr(cmd *exec.Cmd) {
11
+ cmd.SysProcAttr = &syscall.SysProcAttr{
12
+ CreationFlags: syscall.CREATE_NEW_PROCESS_GROUP,
13
+ HideWindow: true,
14
+ }
15
+ }
@@ -0,0 +1,281 @@
1
+ package main
2
+
3
+ import (
4
+ "context"
5
+ "encoding/json"
6
+ "fmt"
7
+ "net/http"
8
+ "os"
9
+ "path/filepath"
10
+ "runtime"
11
+ "strings"
12
+ "time"
13
+ )
14
+
15
+ const registryURL = "https://registry.npmjs.org/sensorium-mcp/latest"
16
+
17
+ // Updater checks the npm registry for new versions and performs updates.
18
+ type Updater struct {
19
+ cfg Config
20
+ mcp *MCPClient
21
+ log *Logger
22
+ startAt time.Time
23
+ cancel context.CancelFunc
24
+ done chan struct{}
25
+ }
26
+
27
+ func NewUpdater(cfg Config, mcp *MCPClient, log *Logger) *Updater {
28
+ return &Updater{
29
+ cfg: cfg,
30
+ mcp: mcp,
31
+ log: log,
32
+ startAt: time.Now(),
33
+ done: make(chan struct{}),
34
+ }
35
+ }
36
+
37
+ // Start begins the update check loop.
38
+ func (u *Updater) Start() {
39
+ ctx, cancel := context.WithCancel(context.Background())
40
+ u.cancel = cancel
41
+ go u.run(ctx)
42
+ }
43
+
44
+ // Stop signals the updater to shut down and waits.
45
+ func (u *Updater) Stop() {
46
+ if u.cancel != nil {
47
+ u.cancel()
48
+ }
49
+ <-u.done
50
+ }
51
+
52
+ func (u *Updater) run(ctx context.Context) {
53
+ defer close(u.done)
54
+ u.log.Info("Updater started (mode=%s)", u.cfg.Mode)
55
+
56
+ // In development mode, check every PollInterval.
57
+ // In production, check once per day at PollAtHour.
58
+ for {
59
+ var sleepDuration time.Duration
60
+ if u.cfg.Mode == "development" {
61
+ sleepDuration = u.cfg.PollInterval
62
+ } else {
63
+ sleepDuration = u.timeUntilNextPoll()
64
+ }
65
+ u.log.Debug("Updater: next version check in %v", sleepDuration.Round(time.Second))
66
+
67
+ select {
68
+ case <-ctx.Done():
69
+ return
70
+ case <-time.After(sleepDuration):
71
+ }
72
+
73
+ if ctx.Err() != nil {
74
+ return
75
+ }
76
+
77
+ u.checkAndUpdate(ctx)
78
+ }
79
+ }
80
+
81
+ func (u *Updater) timeUntilNextPoll() time.Duration {
82
+ now := time.Now()
83
+ next := time.Date(now.Year(), now.Month(), now.Day(), u.cfg.PollAtHour, 0, 0, 0, now.Location())
84
+ if next.Before(now) {
85
+ next = next.Add(24 * time.Hour)
86
+ }
87
+ return time.Until(next)
88
+ }
89
+
90
+ // getRemoteVersion fetches the latest version from npm registry.
91
+ func (u *Updater) getRemoteVersion(ctx context.Context) (string, error) {
92
+ ctx2, cancel := context.WithTimeout(ctx, 15*time.Second)
93
+ defer cancel()
94
+
95
+ req, err := http.NewRequestWithContext(ctx2, "GET", registryURL, nil)
96
+ if err != nil {
97
+ return "", err
98
+ }
99
+
100
+ resp, err := http.DefaultClient.Do(req)
101
+ if err != nil {
102
+ return "", err
103
+ }
104
+ defer resp.Body.Close()
105
+
106
+ if resp.StatusCode != 200 {
107
+ return "", fmt.Errorf("npm registry HTTP %d", resp.StatusCode)
108
+ }
109
+
110
+ var pkg struct {
111
+ Version string `json:"version"`
112
+ }
113
+ if err := json.NewDecoder(resp.Body).Decode(&pkg); err != nil {
114
+ return "", err
115
+ }
116
+ return pkg.Version, nil
117
+ }
118
+
119
+ // getLocalVersion reads the current version from the version file.
120
+ func (u *Updater) getLocalVersion() string {
121
+ data, err := os.ReadFile(u.cfg.Paths.VersionFile)
122
+ if err != nil {
123
+ return ""
124
+ }
125
+ return strings.TrimSpace(string(data))
126
+ }
127
+
128
+ func (u *Updater) setLocalVersion(v string) {
129
+ os.MkdirAll(u.cfg.DataDir, 0755)
130
+ if err := atomicWrite(u.cfg.Paths.VersionFile, []byte(v)); err != nil {
131
+ u.log.Warn("Failed to write version file: %v", err)
132
+ }
133
+ }
134
+
135
+ func (u *Updater) checkAndUpdate(ctx context.Context) {
136
+ // Enforce minimum uptime before updating
137
+ uptime := time.Since(u.startAt)
138
+ if uptime < u.cfg.MinUptime {
139
+ u.log.Info("Deferring update — too early (uptime %v < %v)", uptime.Round(time.Second), u.cfg.MinUptime)
140
+ return
141
+ }
142
+
143
+ remote, err := u.getRemoteVersion(ctx)
144
+ if err != nil {
145
+ u.log.Warn("Failed to check npm registry: %v", err)
146
+ return
147
+ }
148
+
149
+ local := u.getLocalVersion()
150
+ if local == "" {
151
+ u.log.Info("No local version recorded — storing %s", remote)
152
+ u.setLocalVersion(remote)
153
+ return
154
+ }
155
+
156
+ if local == remote {
157
+ u.log.Debug("Updater: version %s is up to date", local)
158
+ return
159
+ }
160
+
161
+ u.log.Info("Update available: %s → %s", local, remote)
162
+ NotifyOperator(u.cfg, u.log, fmt.Sprintf("⚙️ Supervisor: updating sensorium v%s → v%s. Grace period %v...", local, remote, u.cfg.GracePeriod), 0)
163
+
164
+ // Grace period
165
+ u.log.Info("Grace period %v...", u.cfg.GracePeriod)
166
+ select {
167
+ case <-ctx.Done():
168
+ return
169
+ case <-time.After(u.cfg.GracePeriod):
170
+ }
171
+
172
+ // Set maintenance flag — always clean up on exit
173
+ if err := atomicWrite(u.cfg.Paths.MaintenanceFlag, []byte(time.Now().Format(time.RFC3339))); err != nil {
174
+ u.log.Warn("Failed to write maintenance flag: %v", err)
175
+ }
176
+ defer os.Remove(u.cfg.Paths.MaintenanceFlag)
177
+
178
+ // Kill the current MCP server
179
+ if ctx.Err() != nil {
180
+ return
181
+ }
182
+ u.killServer()
183
+
184
+ // Clean npx cache
185
+ if ctx.Err() != nil {
186
+ return
187
+ }
188
+ u.clearNpxCache()
189
+
190
+ // Spawn new server — retry up to 3 times on failure
191
+ var pid int
192
+ for attempt := 1; attempt <= 3; attempt++ {
193
+ if ctx.Err() != nil {
194
+ return
195
+ }
196
+ pid, err = SpawnMCPServer(u.cfg, u.log)
197
+ if err == nil {
198
+ break
199
+ }
200
+ u.log.Error("Failed to spawn updated MCP server (attempt %d/3): %v", attempt, err)
201
+ if attempt < 3 {
202
+ time.Sleep(2 * time.Second)
203
+ }
204
+ }
205
+ if err != nil {
206
+ u.log.Error("All spawn attempts failed — server is down!")
207
+ NotifyOperator(u.cfg, u.log, "🔴 Supervisor: update FAILED — server is down! Manual intervention required.", 0)
208
+ return
209
+ }
210
+
211
+ // Wait for new server to be ready
212
+ if u.mcp.WaitForReady(ctx, 3*time.Second, 60*time.Second) {
213
+ u.log.Info("Updated MCP server ready (PID %d)", pid)
214
+ } else {
215
+ u.log.Warn("Updated server did not become ready in 60s")
216
+ }
217
+
218
+ u.setLocalVersion(remote)
219
+
220
+ NotifyOperator(u.cfg, u.log, fmt.Sprintf("✅ Supervisor: update to v%s complete. Server ready.", remote), 0)
221
+ u.log.Info("Update complete: v%s → v%s", local, remote)
222
+
223
+ // Reset start time for min uptime tracking
224
+ u.startAt = time.Now()
225
+ }
226
+
227
+ func (u *Updater) killServer() {
228
+ u.log.Info("Updater: stopping current MCP server for update")
229
+ pid, err := ReadPIDFile(u.cfg.Paths.ServerPID)
230
+ if err != nil {
231
+ u.log.Warn("Could not read server PID file: %v", err)
232
+ // Try killing by port as fallback
233
+ KillByPort(u.cfg.MCPHttpPort, u.log)
234
+ return
235
+ }
236
+ if err := KillProcess(pid, u.log); err != nil {
237
+ u.log.Error("Failed to kill server (PID %d): %v", pid, err)
238
+ KillByPort(u.cfg.MCPHttpPort, u.log)
239
+ }
240
+ }
241
+
242
+ // clearNpxCache removes the cached sensorium-mcp package from the npx cache
243
+ // so the next `npx -y sensorium-mcp@latest` fetches the new version.
244
+ func (u *Updater) clearNpxCache() {
245
+ u.log.Info("Updater: clearing npx cache")
246
+ var base string
247
+ if runtime.GOOS == "windows" {
248
+ localAppData := os.Getenv("LOCALAPPDATA")
249
+ if localAppData == "" {
250
+ home, _ := os.UserHomeDir()
251
+ localAppData = filepath.Join(home, "AppData", "Local")
252
+ }
253
+ base = filepath.Join(localAppData, "npm-cache", "_npx")
254
+ } else {
255
+ home, _ := os.UserHomeDir()
256
+ base = filepath.Join(home, ".npm", "_npx")
257
+ }
258
+
259
+ u.log.Info("Clearing sensorium-mcp from npx cache (%s)", base)
260
+
261
+ entries, err := os.ReadDir(base)
262
+ if err != nil {
263
+ return
264
+ }
265
+
266
+ for _, e := range entries {
267
+ if !e.IsDir() {
268
+ continue
269
+ }
270
+ pkgDir := filepath.Join(base, e.Name(), "node_modules", "sensorium-mcp")
271
+ // Validate path doesn't escape base directory
272
+ if !strings.HasPrefix(pkgDir, base) {
273
+ continue
274
+ }
275
+ if _, err := os.Stat(pkgDir); err == nil {
276
+ if err := os.RemoveAll(pkgDir); err != nil {
277
+ u.log.Warn("Failed to clear npx cache entry %s: %v", pkgDir, err)
278
+ }
279
+ }
280
+ }
281
+ }
@@ -1,24 +0,0 @@
1
- /**
2
- * Thread keeper — monitors keep-alive threads and restarts them
3
- * via the start_thread MCP tool when they stop running.
4
- *
5
- * No direct process spawning — delegates to start_thread which handles
6
- * all lifecycle concerns (PID tracking, registry updates, MCP config).
7
- */
8
/** Settings for one thread keeper instance. */
export interface KeeperConfig {
    /** ID of the thread to monitor; passed to start_thread as `targetThreadId`. */
    threadId: number;
    /** Session name; passed to start_thread as `name`. */
    sessionName: string;
    /** Agent/client type; passed to start_thread as `agentType`. */
    client: string;
    /** Port of the MCP HTTP server, contacted on 127.0.0.1. */
    mcpHttpPort: number;
    /** Bearer token for the MCP HTTP server, or null to send no Authorization header. */
    mcpHttpSecret: string | null;
    /** Forwarded to start_thread only when defined. */
    workingDirectory?: string;
    /** Consecutive start attempts allowed before a cool-down. */
    maxRetries?: number;
    /** Length of that cool-down, in milliseconds. */
    cooldownMs?: number;
    /** Called when the keeper detects the thread process has died. */
    onDeath?: (threadId: number, sessionName: string) => void;
}
20
/** Handle returned by startClaudeKeeper, used to shut the keeper down. */
export interface KeeperHandle {
    /** Stops the keeper's health-check loop. */
    stop(): Promise<void>;
}
23
/**
 * Starts a keeper for the thread described by `config` and resolves with a
 * handle that can stop it.
 */
export declare function startClaudeKeeper(config: KeeperConfig): Promise<KeeperHandle>;
24
- //# sourceMappingURL=claude-keeper.d.ts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"claude-keeper.d.ts","sourceRoot":"","sources":["../src/claude-keeper.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAuBH,MAAM,WAAW,YAAY;IAC3B,QAAQ,EAAE,MAAM,CAAC;IACjB,WAAW,EAAE,MAAM,CAAC;IACpB,MAAM,EAAE,MAAM,CAAC;IACf,WAAW,EAAE,MAAM,CAAC;IACpB,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7B,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,kEAAkE;IAClE,OAAO,CAAC,EAAE,CAAC,QAAQ,EAAE,MAAM,EAAE,WAAW,EAAE,MAAM,KAAK,IAAI,CAAC;CAC3D;AAED,MAAM,WAAW,YAAY;IAC3B,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;CACvB;AA4OD,wBAAsB,iBAAiB,CAAC,MAAM,EAAE,YAAY,GAAG,OAAO,CAAC,YAAY,CAAC,CAgHnF"}
@@ -1,374 +0,0 @@
1
- /**
2
- * Thread keeper — monitors keep-alive threads and restarts them
3
- * via the start_thread MCP tool when they stop running.
4
- *
5
- * No direct process spawning — delegates to start_thread which handles
6
- * all lifecycle concerns (PID tracking, registry updates, MCP config).
7
- */
8
- import { LATEST_PROTOCOL_VERSION } from "@modelcontextprotocol/sdk/types.js";
9
- import { readFileSync } from "node:fs";
10
- import { homedir } from "node:os";
11
- import { join } from "node:path";
12
- import { errorMessage } from "./utils.js";
13
- // ─── Constants ───────────────────────────────────────────────────────────────
14
// Retry delay after a failed start_thread call: BASE * 2^retryCount, capped at MAX.
const BASE_BACKOFF_MS = 5_000;
const MAX_BACKOFF_MS = 5 * 60_000;
// Interval between routine health checks of a thread.
const HEALTH_CHECK_INTERVAL_MS = 2 * 60_000;
// Defaults applied when KeeperConfig omits maxRetries / cooldownMs.
const DEFAULT_MAX_RETRIES = 5;
const DEFAULT_COOLDOWN_MS = 300_000;
// Poll interval and overall deadline while waiting for the MCP server.
const MCP_READY_POLL_INTERVAL_MS = 3_000;
const MCP_READY_TIMEOUT_MS = 120_000;
// A process that dies within this window after being started is a "fast exit".
const FAST_EXIT_THRESHOLD_MS = 60_000;
// This many consecutive fast exits trigger an escalating cooldown.
const FAST_EXIT_MAX_COUNT = 3;
// First escalating cooldown is 10 minutes; it doubles on each escalation.
const FAST_EXIT_BASE_COOLDOWN_MS = 10 * 60_000;
24
- // ─── Helpers ──────────────────────────────────────────────────────────────────
25
/**
 * Writes one keeper log line to stdout, prefixed with an ISO-8601 timestamp
 * and the severity level.
 */
function keeperLog(level, msg) {
    const stamp = new Date().toISOString();
    const line = `[${stamp}] [KEEPER/${level}] ${msg}`;
    console.log(line);
}
29
/**
 * Builds the request headers for the MCP HTTP server. An Authorization
 * bearer header is added only when `secret` is truthy.
 */
function authHeaders(secret) {
    const base = {
        "Content-Type": "application/json",
        "Accept": "application/json, text/event-stream",
    };
    return secret ? { ...base, "Authorization": `Bearer ${secret}` } : base;
}
38
/**
 * Reads a fetch Response whose body is either plain JSON or an SSE stream
 * (text/event-stream).
 *
 * For SSE, every line starting with "data:" is a candidate JSON-RPC message;
 * candidates are tried from last to first and the first one that parses is
 * returned. If none parses, an empty object is returned. Any other content
 * type is decoded with res.json().
 */
async function parseJsonOrSse(res) {
    const contentType = res.headers.get("content-type") ?? "";
    if (!contentType.includes("text/event-stream")) {
        return await res.json();
    }
    const body = await res.text();
    const payloads = [];
    for (const line of body.split("\n")) {
        if (line.startsWith("data:")) {
            payloads.push(line.slice(5).trim());
        }
    }
    // Walk backwards: the final result is expected on the last data line.
    while (payloads.length > 0) {
        const candidate = payloads.pop();
        try {
            return JSON.parse(candidate);
        }
        catch { /* not JSON — fall back to the previous data line */ }
    }
    return {};
}
62
/**
 * Polls the MCP server's /api/threads/roots endpoint until it answers with a
 * 2xx status or MCP_READY_TIMEOUT_MS elapses.
 *
 * Resolves true as soon as the server responds OK, false on timeout. Each
 * probe is limited to 5 seconds and probes are MCP_READY_POLL_INTERVAL_MS apart.
 */
async function waitForMcpReady(port, secret) {
    const giveUpAt = Date.now() + MCP_READY_TIMEOUT_MS;
    const pause = () => new Promise(resolve => setTimeout(resolve, MCP_READY_POLL_INTERVAL_MS));
    for (;;) {
        if (Date.now() >= giveUpAt) {
            return false;
        }
        let up = false;
        try {
            const res = await fetch(`http://127.0.0.1:${port}/api/threads/roots`, {
                headers: authHeaders(secret),
                signal: AbortSignal.timeout(5_000),
            });
            up = res.ok;
        }
        catch { /* server not ready */ }
        if (up) {
            return true;
        }
        await pause();
    }
}
78
/**
 * Asks the MCP server whether the given thread is running.
 *
 * Resolves true only when the endpoint answers 2xx with `running === true`.
 * Any HTTP error, timeout (5 s) or malformed body counts as "not running".
 */
async function isThreadRunning(port, secret, threadId) {
    const url = `http://127.0.0.1:${port}/api/threads/${threadId}/running`;
    try {
        const res = await fetch(url, {
            headers: authHeaders(secret),
            signal: AbortSignal.timeout(5_000),
        });
        if (res.ok) {
            const payload = await res.json();
            return payload.running === true;
        }
        return false;
    }
    catch {
        return false;
    }
}
93
// A thread with no MCP activity for this long (10 minutes) is considered stuck.
const STUCK_THRESHOLD_MS = 10 * 60_000;
/**
 * Reports whether the thread's last recorded MCP activity is older than
 * STUCK_THRESHOLD_MS.
 *
 * Resolves false when the heartbeat endpoint fails, times out (5 s), or has
 * no `lastActivity` yet (the thread has only just started).
 */
async function isThreadStuck(port, secret, threadId) {
    const url = `http://127.0.0.1:${port}/api/threads/${threadId}/heartbeat`;
    try {
        const res = await fetch(url, {
            headers: authHeaders(secret),
            signal: AbortSignal.timeout(5_000),
        });
        if (res.ok) {
            const heartbeat = await res.json();
            if (heartbeat.lastActivity != null) {
                const idleMs = Date.now() - heartbeat.lastActivity;
                return idleMs > STUCK_THRESHOLD_MS;
            }
        }
        return false;
    }
    catch {
        return false;
    }
}
111
/**
 * Extracts a usable process ID from the contents of a PID file.
 *
 * Accepts either a JSON object with a numeric `pid` property or a bare
 * number. Returns null unless the result is a positive integer. PID 0 and
 * negative values address process groups in process.kill and must never be
 * signalled from here.
 */
function parsePidFile(raw) {
    const text = raw.trim();
    let value;
    try {
        const parsed = JSON.parse(text);
        // A bare number is valid JSON too; only objects carry a `.pid`.
        value = parsed !== null && typeof parsed === "object" ? parsed.pid : parsed;
    }
    catch {
        value = Number(text);
    }
    return Number.isInteger(value) && value > 0 ? value : null;
}
/**
 * Force-kills the process recorded in the thread's PID file
 * (~/.remote-copilot-mcp/pids/<threadId>.pid).
 *
 * Best-effort: a missing or unreadable PID file, an invalid PID, or an
 * already-dead process are all silently ignored. `port` and `secret` are
 * unused here.
 */
async function killThread(port, secret, threadId) {
    try {
        const pidFile = join(homedir(), ".remote-copilot-mcp", "pids", `${threadId}.pid`);
        const pid = parsePidFile(readFileSync(pidFile, "utf-8"));
        if (pid === null) {
            return;
        }
        if (process.platform === "win32") {
            // On Windows, process.kill throws EPERM for child processes.
            // Use taskkill which works reliably.
            const { execSync } = await import("child_process");
            try {
                execSync(`taskkill /F /T /PID ${pid}`, { stdio: "ignore", timeout: 10_000 });
            }
            catch { /* already dead */ }
        }
        else {
            try {
                process.kill(pid, "SIGKILL");
            }
            catch { /* already dead */ }
        }
        keeperLog("INFO", `Killed stuck process PID=${pid} for thread ${threadId}`);
    }
    catch { /* PID file missing or unreadable */ }
}
144
/**
 * Opens an MCP session over HTTP: sends `initialize`, reads the session ID
 * from the `mcp-session-id` response header, then sends the
 * `notifications/initialized` notification.
 *
 * Resolves with the session ID, or null if the initialize call fails, returns
 * a non-2xx status, or does not provide a session ID. Each request is limited
 * to 30 seconds.
 */
async function openMcpSession(port, secret) {
    const endpoint = `http://127.0.0.1:${port}/mcp`;
    try {
        const initRequest = {
            jsonrpc: "2.0",
            id: `keeper-init-${Date.now()}`,
            method: "initialize",
            params: {
                protocolVersion: LATEST_PROTOCOL_VERSION,
                capabilities: {},
                clientInfo: {
                    name: "thread-keeper",
                    version: "1.0.0",
                },
            },
        };
        const initResponse = await fetch(endpoint, {
            method: "POST",
            headers: authHeaders(secret),
            body: JSON.stringify(initRequest),
            signal: AbortSignal.timeout(30_000),
        });
        if (!initResponse.ok) {
            keeperLog("WARN", `initialize HTTP ${initResponse.status}: ${initResponse.statusText}`);
            return null;
        }
        // Drain the body (SSE or JSON) so the connection does not stall.
        await initResponse.text();
        const sessionId = initResponse.headers.get("mcp-session-id");
        if (!sessionId) {
            keeperLog("WARN", "initialize succeeded but did not return an MCP session ID");
            return null;
        }
        const initializedNotification = {
            jsonrpc: "2.0",
            method: "notifications/initialized",
            params: {},
        };
        await fetch(endpoint, {
            method: "POST",
            headers: {
                ...authHeaders(secret),
                "mcp-session-id": sessionId,
            },
            body: JSON.stringify(initializedNotification),
            signal: AbortSignal.timeout(30_000),
        });
        return sessionId;
    }
    catch (err) {
        keeperLog("ERROR", `initialize failed: ${errorMessage(err)}`);
        return null;
    }
}
195
/**
 * Closes an MCP session by sending DELETE /mcp with the session ID header.
 * Best-effort: any failure (including the 10 s timeout) is ignored.
 */
async function closeMcpSession(port, secret, sessionId) {
    try {
        const headers = {
            ...authHeaders(secret),
            "mcp-session-id": sessionId,
        };
        const endpoint = `http://127.0.0.1:${port}/mcp`;
        await fetch(endpoint, {
            method: "DELETE",
            headers,
            signal: AbortSignal.timeout(10_000),
        });
    }
    catch {
        // Cleanup only — nothing useful to do on failure.
    }
}
210
/**
 * Invokes the `start_thread` MCP tool in "resume" mode for the configured
 * thread, inside a short-lived MCP session that is always closed afterwards.
 *
 * Resolves true when the tool returns non-empty text that does not contain
 * the word "error" (case-insensitive). Resolves false when no session could
 * be opened, on HTTP or JSON-RPC errors, on an empty response, or when the
 * request throws.
 */
async function callStartThread(config) {
    const { mcpHttpPort: port, mcpHttpSecret: secret, threadId, sessionName, client, workingDirectory } = config;
    const sessionId = await openMcpSession(port, secret);
    if (!sessionId) {
        return false;
    }
    try {
        const toolArgs = {
            name: sessionName,
            targetThreadId: threadId,
            agentType: client,
            mode: "resume",
        };
        // Only forward a working directory the caller actually specified.
        if (workingDirectory !== undefined) {
            toolArgs.workingDirectory = workingDirectory;
        }
        const rpcRequest = {
            jsonrpc: "2.0",
            method: "tools/call",
            params: {
                name: "start_thread",
                arguments: toolArgs,
            },
            id: `keeper-${threadId}-${Date.now()}`,
        };
        const res = await fetch(`http://127.0.0.1:${port}/mcp`, {
            method: "POST",
            headers: {
                ...authHeaders(secret),
                "mcp-session-id": sessionId,
            },
            body: JSON.stringify(rpcRequest),
            signal: AbortSignal.timeout(30_000),
        });
        if (!res.ok) {
            keeperLog("WARN", `start_thread HTTP ${res.status}: ${res.statusText}`);
            return false;
        }
        const reply = await parseJsonOrSse(res);
        if (reply.error) {
            keeperLog("WARN", `start_thread RPC error ${reply.error.code ?? "unknown"}: ${reply.error.message ?? "unknown error"}`);
            return false;
        }
        const text = reply?.result?.content?.[0]?.text ?? "";
        if (!text.trim()) {
            keeperLog("WARN", "start_thread returned an empty response");
            return false;
        }
        keeperLog("INFO", `start_thread response: ${text.slice(0, 200)}`);
        const mentionsError = text.toLowerCase().includes("error");
        return !mentionsError;
    }
    catch (err) {
        keeperLog("ERROR", `start_thread call failed: ${errorMessage(err)}`);
        return false;
    }
    finally {
        await closeMcpSession(port, secret, sessionId);
    }
}
265
- // ─── Core keeper ──────────────────────────────────────────────────────────────
266
/**
 * Starts a keeper for one thread. After waiting for the MCP server to come
 * up, it checks periodically whether the thread is running and:
 *
 *  - kills the process when it is alive but has looked stuck on 3 consecutive
 *    checks;
 *  - calls start_thread when the thread is confirmed not running (2
 *    consecutive checks), with exponential backoff between failed attempts, a
 *    cool-down after `maxRetries` attempts, and an escalating cool-down after
 *    repeated fast exits.
 *
 * Resolves with a handle whose stop() halts the loop. Once stop() has been
 * called, no further timers are armed and an in-flight check does not go on
 * to kill or restart the thread.
 */
export async function startClaudeKeeper(config) {
    const maxRetries = config.maxRetries ?? DEFAULT_MAX_RETRIES;
    const cooldownMs = config.cooldownMs ?? DEFAULT_COOLDOWN_MS;
    keeperLog("INFO", `Starting keeper for thread ${config.threadId} ('${config.sessionName}') [client=${config.client}]`);
    let stopped = false;
    let retryCount = 0;
    let consecutiveNotRunning = 0;
    let consecutiveStuck = 0;
    let timer = null;
    let lastStartTime = 0; // epoch ms when the last process was started
    let fastExitCount = 0; // consecutive fast exits (< 60s lifetime)
    let fastExitEscalation = 0; // escalation level for exponential backoff
    keeperLog("INFO", "Waiting for MCP server to be ready...");
    const ready = await waitForMcpReady(config.mcpHttpPort, config.mcpHttpSecret);
    if (!ready)
        keeperLog("WARN", "MCP server did not respond in time — attempting start_thread anyway.");
    else
        keeperLog("INFO", "MCP server ready.");
    // Single place where timers are armed, so that stop() is honoured on
    // every path (regular checks, quick rechecks, backoff and cool-downs).
    function scheduleIn(delayMs) {
        if (stopped)
            return;
        timer = setTimeout(() => void checkAndStart(), delayMs);
    }
    function scheduleCheck() {
        scheduleIn(HEALTH_CHECK_INTERVAL_MS);
    }
    async function checkAndStart() {
        if (stopped)
            return;
        const running = await isThreadRunning(config.mcpHttpPort, config.mcpHttpSecret, config.threadId);
        // stop() may have been called while the request was in flight.
        if (stopped)
            return;
        if (running) {
            consecutiveNotRunning = 0;
            // Check for stuck process (alive but no MCP activity for 10+ min)
            const stuck = await isThreadStuck(config.mcpHttpPort, config.mcpHttpSecret, config.threadId);
            if (stopped)
                return;
            if (stuck) {
                consecutiveStuck++;
                if (consecutiveStuck < 3) {
                    // Require 3 consecutive stuck checks (~6 min) before killing
                    keeperLog("INFO", `Thread ${config.threadId} appears stuck (${consecutiveStuck}/3) — rechecking in 2 min`);
                    scheduleCheck();
                    return;
                }
                keeperLog("WARN", `Thread ${config.threadId} confirmed stuck (no MCP activity for ${STUCK_THRESHOLD_MS / 60_000}+ min, ${consecutiveStuck} checks) — killing process`);
                await killThread(config.mcpHttpPort, config.mcpHttpSecret, config.threadId);
                consecutiveStuck = 0;
                // Wait for process to actually die, then restart on next normal check
                scheduleCheck();
                return;
            }
            consecutiveStuck = 0;
            scheduleCheck();
            return;
        }
        consecutiveNotRunning++;
        if (consecutiveNotRunning === 1) {
            // First detection — notify operator immediately
            config.onDeath?.(config.threadId, config.sessionName);
        }
        if (consecutiveNotRunning < 2) {
            // Single failure may be a timeout; recheck quickly before restarting
            scheduleIn(10_000);
            return;
        }
        if (retryCount >= maxRetries) {
            keeperLog("WARN", `Max retries (${maxRetries}) exceeded — cooling down for ${Math.round(cooldownMs / 1000)}s`);
            retryCount = 0;
            scheduleIn(cooldownMs);
            return;
        }
        retryCount++;
        keeperLog("INFO", `Thread ${config.threadId} not running — calling start_thread (attempt ${retryCount}/${maxRetries})`);
        // Fast-exit detection: if the process died quickly, use exponential backoff.
        if (lastStartTime > 0 && (Date.now() - lastStartTime) < FAST_EXIT_THRESHOLD_MS) {
            fastExitCount++;
            if (fastExitCount >= FAST_EXIT_MAX_COUNT) {
                const cooldown = Math.min(FAST_EXIT_BASE_COOLDOWN_MS * 2 ** fastExitEscalation, 4 * 60 * 60_000); // cap 4h
                fastExitEscalation++;
                keeperLog("WARN", `Thread ${config.threadId}: ${fastExitCount} consecutive fast exits — backing off ${Math.round(cooldown / 60_000)} min`);
                config.onDeath?.(config.threadId, `${config.sessionName} (repeated fast exits — check credits/API key)`);
                fastExitCount = 0;
                retryCount = 0;
                scheduleIn(cooldown);
                return;
            }
        }
        else {
            fastExitCount = 0;
            fastExitEscalation = 0; // reset if the process ran long enough
        }
        lastStartTime = Date.now();
        const ok = await callStartThread(config);
        if (ok) {
            retryCount = 0;
            consecutiveNotRunning = 0;
            scheduleCheck();
        }
        else {
            const delay = Math.min(BASE_BACKOFF_MS * 2 ** retryCount, MAX_BACKOFF_MS);
            keeperLog("INFO", `Scheduling retry in ${delay}ms`);
            scheduleIn(delay);
        }
    }
    void checkAndStart();
    return {
        async stop() {
            stopped = true;
            if (timer) {
                clearTimeout(timer);
                timer = null;
            }
        },
    };
}
374
- //# sourceMappingURL=claude-keeper.js.map