sensorium-mcp 2.17.26 → 2.17.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/dashboard/routes/threads.d.ts.map +1 -1
- package/dist/dashboard/routes/threads.js +16 -5
- package/dist/dashboard/routes/threads.js.map +1 -1
- package/dist/data/memory/bootstrap.js +2 -2
- package/dist/data/memory/bootstrap.js.map +1 -1
- package/dist/data/memory/consolidation.d.ts.map +1 -1
- package/dist/data/memory/consolidation.js +74 -4
- package/dist/data/memory/consolidation.js.map +1 -1
- package/dist/data/memory/semantic.d.ts +11 -0
- package/dist/data/memory/semantic.d.ts.map +1 -1
- package/dist/data/memory/semantic.js +37 -0
- package/dist/data/memory/semantic.js.map +1 -1
- package/dist/data/memory/thread-registry.d.ts +7 -0
- package/dist/data/memory/thread-registry.d.ts.map +1 -1
- package/dist/data/memory/thread-registry.js +11 -1
- package/dist/data/memory/thread-registry.js.map +1 -1
- package/dist/index.js +17 -5
- package/dist/index.js.map +1 -1
- package/dist/tools/shared-agent-utils.d.ts.map +1 -1
- package/dist/tools/shared-agent-utils.js +3 -4
- package/dist/tools/shared-agent-utils.js.map +1 -1
- package/dist/tools/thread-lifecycle.d.ts.map +1 -1
- package/dist/tools/thread-lifecycle.js +33 -15
- package/dist/tools/thread-lifecycle.js.map +1 -1
- package/package.json +10 -2
- package/scripts/install-supervisor.ps1 +67 -0
- package/scripts/install-supervisor.sh +43 -0
- package/scripts/start-supervisor.ps1 +46 -0
- package/scripts/start-supervisor.sh +20 -0
- package/supervisor/config.go +140 -0
- package/supervisor/go.mod +3 -0
- package/supervisor/health.go +389 -0
- package/supervisor/health_test.go +93 -0
- package/supervisor/keeper.go +303 -0
- package/supervisor/keeper_test.go +27 -0
- package/supervisor/lock.go +56 -0
- package/supervisor/lock_test.go +54 -0
- package/supervisor/log.go +114 -0
- package/supervisor/log_test.go +45 -0
- package/supervisor/main.go +325 -0
- package/supervisor/notify.go +53 -0
- package/supervisor/process.go +222 -0
- package/supervisor/process_test.go +94 -0
- package/supervisor/process_unix.go +14 -0
- package/supervisor/process_windows.go +15 -0
- package/supervisor/updater.go +281 -0
- package/dist/claude-keeper.d.ts +0 -24
- package/dist/claude-keeper.d.ts.map +0 -1
- package/dist/claude-keeper.js +0 -374
- package/dist/claude-keeper.js.map +0 -1
- package/dist/watcher-service.d.ts +0 -2
- package/dist/watcher-service.d.ts.map +0 -1
- package/dist/watcher-service.js +0 -997
- package/dist/watcher-service.js.map +0 -1

package/supervisor/updater.go
ADDED
@@ -0,0 +1,281 @@
+package main
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"os"
+	"path/filepath"
+	"runtime"
+	"strings"
+	"time"
+)
+
+const registryURL = "https://registry.npmjs.org/sensorium-mcp/latest"
+
+// Updater checks the npm registry for new versions and performs updates.
+type Updater struct {
+	cfg     Config
+	mcp     *MCPClient
+	log     *Logger
+	startAt time.Time
+	cancel  context.CancelFunc
+	done    chan struct{}
+}
+
+func NewUpdater(cfg Config, mcp *MCPClient, log *Logger) *Updater {
+	return &Updater{
+		cfg:     cfg,
+		mcp:     mcp,
+		log:     log,
+		startAt: time.Now(),
+		done:    make(chan struct{}),
+	}
+}
+
+// Start begins the update check loop.
+func (u *Updater) Start() {
+	ctx, cancel := context.WithCancel(context.Background())
+	u.cancel = cancel
+	go u.run(ctx)
+}
+
+// Stop signals the updater to shut down and waits.
+func (u *Updater) Stop() {
+	if u.cancel != nil {
+		u.cancel()
+	}
+	<-u.done
+}
+
+func (u *Updater) run(ctx context.Context) {
+	defer close(u.done)
+	u.log.Info("Updater started (mode=%s)", u.cfg.Mode)
+
+	// In development mode, check every PollInterval.
+	// In production, check once per day at PollAtHour.
+	for {
+		var sleepDuration time.Duration
+		if u.cfg.Mode == "development" {
+			sleepDuration = u.cfg.PollInterval
+		} else {
+			sleepDuration = u.timeUntilNextPoll()
+		}
+		u.log.Debug("Updater: next version check in %v", sleepDuration.Round(time.Second))
+
+		select {
+		case <-ctx.Done():
+			return
+		case <-time.After(sleepDuration):
+		}
+
+		if ctx.Err() != nil {
+			return
+		}
+
+		u.checkAndUpdate(ctx)
+	}
+}
+
+func (u *Updater) timeUntilNextPoll() time.Duration {
+	now := time.Now()
+	next := time.Date(now.Year(), now.Month(), now.Day(), u.cfg.PollAtHour, 0, 0, 0, now.Location())
+	if next.Before(now) {
+		next = next.Add(24 * time.Hour)
+	}
+	return time.Until(next)
+}
+
+// getRemoteVersion fetches the latest version from npm registry.
+func (u *Updater) getRemoteVersion(ctx context.Context) (string, error) {
+	ctx2, cancel := context.WithTimeout(ctx, 15*time.Second)
+	defer cancel()
+
+	req, err := http.NewRequestWithContext(ctx2, "GET", registryURL, nil)
+	if err != nil {
+		return "", err
+	}
+
+	resp, err := http.DefaultClient.Do(req)
+	if err != nil {
+		return "", err
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != 200 {
+		return "", fmt.Errorf("npm registry HTTP %d", resp.StatusCode)
+	}
+
+	var pkg struct {
+		Version string `json:"version"`
+	}
+	if err := json.NewDecoder(resp.Body).Decode(&pkg); err != nil {
+		return "", err
+	}
+	return pkg.Version, nil
+}
+
+// getLocalVersion reads the current version from the version file.
+func (u *Updater) getLocalVersion() string {
+	data, err := os.ReadFile(u.cfg.Paths.VersionFile)
+	if err != nil {
+		return ""
+	}
+	return strings.TrimSpace(string(data))
+}
+
+func (u *Updater) setLocalVersion(v string) {
+	os.MkdirAll(u.cfg.DataDir, 0755)
+	if err := atomicWrite(u.cfg.Paths.VersionFile, []byte(v)); err != nil {
+		u.log.Warn("Failed to write version file: %v", err)
+	}
+}
+
+func (u *Updater) checkAndUpdate(ctx context.Context) {
+	// Enforce minimum uptime before updating
+	uptime := time.Since(u.startAt)
+	if uptime < u.cfg.MinUptime {
+		u.log.Info("Deferring update — too early (uptime %v < %v)", uptime.Round(time.Second), u.cfg.MinUptime)
+		return
+	}
+
+	remote, err := u.getRemoteVersion(ctx)
+	if err != nil {
+		u.log.Warn("Failed to check npm registry: %v", err)
+		return
+	}
+
+	local := u.getLocalVersion()
+	if local == "" {
+		u.log.Info("No local version recorded — storing %s", remote)
+		u.setLocalVersion(remote)
+		return
+	}
+
+	if local == remote {
+		u.log.Debug("Updater: version %s is up to date", local)
+		return
+	}
+
+	u.log.Info("Update available: %s → %s", local, remote)
+	NotifyOperator(u.cfg, u.log, fmt.Sprintf("⚙️ Supervisor: updating sensorium v%s → v%s. Grace period %v...", local, remote, u.cfg.GracePeriod), 0)
+
+	// Grace period
+	u.log.Info("Grace period %v...", u.cfg.GracePeriod)
+	select {
+	case <-ctx.Done():
+		return
+	case <-time.After(u.cfg.GracePeriod):
+	}
+
+	// Set maintenance flag — always clean up on exit
+	if err := atomicWrite(u.cfg.Paths.MaintenanceFlag, []byte(time.Now().Format(time.RFC3339))); err != nil {
+		u.log.Warn("Failed to write maintenance flag: %v", err)
+	}
+	defer os.Remove(u.cfg.Paths.MaintenanceFlag)
+
+	// Kill the current MCP server
+	if ctx.Err() != nil {
+		return
+	}
+	u.killServer()
+
+	// Clean npx cache
+	if ctx.Err() != nil {
+		return
+	}
+	u.clearNpxCache()
+
+	// Spawn new server — retry up to 3 times on failure
+	var pid int
+	for attempt := 1; attempt <= 3; attempt++ {
+		if ctx.Err() != nil {
+			return
+		}
+		pid, err = SpawnMCPServer(u.cfg, u.log)
+		if err == nil {
+			break
+		}
+		u.log.Error("Failed to spawn updated MCP server (attempt %d/3): %v", attempt, err)
+		if attempt < 3 {
+			time.Sleep(2 * time.Second)
+		}
+	}
+	if err != nil {
+		u.log.Error("All spawn attempts failed — server is down!")
+		NotifyOperator(u.cfg, u.log, "🔴 Supervisor: update FAILED — server is down! Manual intervention required.", 0)
+		return
+	}
+
+	// Wait for new server to be ready
+	if u.mcp.WaitForReady(ctx, 3*time.Second, 60*time.Second) {
+		u.log.Info("Updated MCP server ready (PID %d)", pid)
+	} else {
+		u.log.Warn("Updated server did not become ready in 60s")
+	}
+
+	u.setLocalVersion(remote)
+
+	NotifyOperator(u.cfg, u.log, fmt.Sprintf("✅ Supervisor: update to v%s complete. Server ready.", remote), 0)
+	u.log.Info("Update complete: v%s → v%s", local, remote)
+
+	// Reset start time for min uptime tracking
+	u.startAt = time.Now()
+}
+
+func (u *Updater) killServer() {
+	u.log.Info("Updater: stopping current MCP server for update")
+	pid, err := ReadPIDFile(u.cfg.Paths.ServerPID)
+	if err != nil {
+		u.log.Warn("Could not read server PID file: %v", err)
+		// Try killing by port as fallback
+		KillByPort(u.cfg.MCPHttpPort, u.log)
+		return
+	}
+	if err := KillProcess(pid, u.log); err != nil {
+		u.log.Error("Failed to kill server (PID %d): %v", pid, err)
+		KillByPort(u.cfg.MCPHttpPort, u.log)
+	}
+}
+
+// clearNpxCache removes the cached sensorium-mcp package from the npx cache
+// so the next `npx -y sensorium-mcp@latest` fetches the new version.
+func (u *Updater) clearNpxCache() {
+	u.log.Info("Updater: clearing npx cache")
+	var base string
+	if runtime.GOOS == "windows" {
+		localAppData := os.Getenv("LOCALAPPDATA")
+		if localAppData == "" {
+			home, _ := os.UserHomeDir()
+			localAppData = filepath.Join(home, "AppData", "Local")
+		}
+		base = filepath.Join(localAppData, "npm-cache", "_npx")
+	} else {
+		home, _ := os.UserHomeDir()
+		base = filepath.Join(home, ".npm", "_npx")
+	}
+
+	u.log.Info("Clearing sensorium-mcp from npx cache (%s)", base)
+
+	entries, err := os.ReadDir(base)
+	if err != nil {
+		return
+	}
+
+	for _, e := range entries {
+		if !e.IsDir() {
+			continue
+		}
+		pkgDir := filepath.Join(base, e.Name(), "node_modules", "sensorium-mcp")
+		// Validate path doesn't escape base directory
+		if !strings.HasPrefix(pkgDir, base) {
+			continue
+		}
+		if _, err := os.Stat(pkgDir); err == nil {
+			if err := os.RemoveAll(pkgDir); err != nil {
+				u.log.Warn("Failed to clear npx cache entry %s: %v", pkgDir, err)
+			}
+		}
+	}
+}
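
The piece of the new updater most worth sanity-checking is the production schedule in timeUntilNextPoll: it targets today's occurrence of PollAtHour and rolls over to tomorrow only when that instant has already passed. Below is a standalone sketch of the same computation (the poll hour of 4 is a made-up example; the real value comes from Config):

package main

import (
	"fmt"
	"time"
)

// nextPoll mirrors the updater's timeUntilNextPoll: aim for today's
// occurrence of pollAtHour; if it has already passed, use tomorrow's.
func nextPoll(now time.Time, pollAtHour int) time.Time {
	next := time.Date(now.Year(), now.Month(), now.Day(), pollAtHour, 0, 0, 0, now.Location())
	if next.Before(now) {
		next = next.Add(24 * time.Hour)
	}
	return next
}

func main() {
	for _, now := range []time.Time{
		time.Date(2025, 6, 1, 2, 30, 0, 0, time.UTC), // before 04:00, so the poll fires later today
		time.Date(2025, 6, 1, 17, 0, 0, 0, time.UTC), // after 04:00, so the poll fires tomorrow
	} {
		n := nextPoll(now, 4)
		fmt.Printf("now=%s  next=%s  sleep=%s\n", now.Format(time.RFC3339), n.Format(time.RFC3339), n.Sub(now))
	}
}

One caveat: next.Add(24 * time.Hour) advances by a fixed duration, so a sleep that spans a DST transition lands an hour off the configured wall-clock hour. The run loop recomputes from time.Date on each iteration, so the drift does not accumulate.
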
package/dist/claude-keeper.d.ts
DELETED
@@ -1,24 +0,0 @@
-/**
- * Thread keeper — monitors keep-alive threads and restarts them
- * via the start_thread MCP tool when they stop running.
- *
- * No direct process spawning — delegates to start_thread which handles
- * all lifecycle concerns (PID tracking, registry updates, MCP config).
- */
-export interface KeeperConfig {
-    threadId: number;
-    sessionName: string;
-    client: string;
-    mcpHttpPort: number;
-    mcpHttpSecret: string | null;
-    workingDirectory?: string;
-    maxRetries?: number;
-    cooldownMs?: number;
-    /** Called when the keeper detects the thread process has died. */
-    onDeath?: (threadId: number, sessionName: string) => void;
-}
-export interface KeeperHandle {
-    stop(): Promise<void>;
-}
-export declare function startClaudeKeeper(config: KeeperConfig): Promise<KeeperHandle>;
-//# sourceMappingURL=claude-keeper.d.ts.map
package/dist/claude-keeper.d.ts.map
DELETED
@@ -1 +0,0 @@
-{"version":3,"file":"claude-keeper.d.ts","sourceRoot":"","sources":["../src/claude-keeper.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAuBH,MAAM,WAAW,YAAY;IAC3B,QAAQ,EAAE,MAAM,CAAC;IACjB,WAAW,EAAE,MAAM,CAAC;IACpB,MAAM,EAAE,MAAM,CAAC;IACf,WAAW,EAAE,MAAM,CAAC;IACpB,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7B,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,kEAAkE;IAClE,OAAO,CAAC,EAAE,CAAC,QAAQ,EAAE,MAAM,EAAE,WAAW,EAAE,MAAM,KAAK,IAAI,CAAC;CAC3D;AAED,MAAM,WAAW,YAAY;IAC3B,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;CACvB;AA4OD,wBAAsB,iBAAiB,CAAC,MAAM,EAAE,YAAY,GAAG,OAAO,CAAC,YAAY,CAAC,CAgHnF"}
package/dist/claude-keeper.js
DELETED
@@ -1,374 +0,0 @@
-/**
- * Thread keeper — monitors keep-alive threads and restarts them
- * via the start_thread MCP tool when they stop running.
- *
- * No direct process spawning — delegates to start_thread which handles
- * all lifecycle concerns (PID tracking, registry updates, MCP config).
- */
-import { LATEST_PROTOCOL_VERSION } from "@modelcontextprotocol/sdk/types.js";
-import { readFileSync } from "node:fs";
-import { homedir } from "node:os";
-import { join } from "node:path";
-import { errorMessage } from "./utils.js";
-// ─── Constants ───────────────────────────────────────────────────────────────
-const BASE_BACKOFF_MS = 5_000;
-const MAX_BACKOFF_MS = 5 * 60_000;
-const HEALTH_CHECK_INTERVAL_MS = 2 * 60_000;
-const DEFAULT_MAX_RETRIES = 5;
-const DEFAULT_COOLDOWN_MS = 300_000;
-const MCP_READY_POLL_INTERVAL_MS = 3_000;
-const MCP_READY_TIMEOUT_MS = 120_000;
-const FAST_EXIT_THRESHOLD_MS = 60_000; // process died within 60s = fast exit
-const FAST_EXIT_MAX_COUNT = 3; // 3 fast exits = escalating cooldown
-const FAST_EXIT_BASE_COOLDOWN_MS = 10 * 60_000; // 10 min initial, doubles each time
-// ─── Helpers ──────────────────────────────────────────────────────────────────
-function keeperLog(level, msg) {
-    const ts = new Date().toISOString();
-    console.log(`[${ts}] [KEEPER/${level}] ${msg}`);
-}
-function authHeaders(secret) {
-    const h = {
-        "Content-Type": "application/json",
-        "Accept": "application/json, text/event-stream",
-    };
-    if (secret)
-        h["Authorization"] = `Bearer ${secret}`;
-    return h;
-}
-/**
- * Parse a fetch Response that may be JSON or SSE (text/event-stream).
- * The MCP Streamable HTTP transport returns SSE when the Accept header
- * includes text/event-stream. Each SSE "data:" line contains a JSON-RPC
- * message. We extract the last data line as the result.
- */
-async function parseJsonOrSse(res) {
-    const ct = res.headers.get("content-type") ?? "";
-    if (ct.includes("text/event-stream")) {
-        const text = await res.text();
-        // Extract all "data:" lines and parse the last one (the final result)
-        const dataLines = text.split("\n")
-            .filter(line => line.startsWith("data:"))
-            .map(line => line.slice(5).trim());
-        for (let i = dataLines.length - 1; i >= 0; i--) {
-            try {
-                return JSON.parse(dataLines[i]);
-            }
-            catch { /* try previous line */ }
-        }
-        return {};
-    }
-    return await res.json();
-}
-async function waitForMcpReady(port, secret) {
-    const deadline = Date.now() + MCP_READY_TIMEOUT_MS;
-    while (Date.now() < deadline) {
-        try {
-            const res = await fetch(`http://127.0.0.1:${port}/api/threads/roots`, {
-                headers: authHeaders(secret),
-                signal: AbortSignal.timeout(5_000),
-            });
-            if (res.ok)
-                return true;
-        }
-        catch { /* server not ready */ }
-        await new Promise(r => setTimeout(r, MCP_READY_POLL_INTERVAL_MS));
-    }
-    return false;
-}
-async function isThreadRunning(port, secret, threadId) {
-    try {
-        const res = await fetch(`http://127.0.0.1:${port}/api/threads/${threadId}/running`, {
-            headers: authHeaders(secret),
-            signal: AbortSignal.timeout(5_000),
-        });
-        if (!res.ok)
-            return false;
-        const data = await res.json();
-        return data.running === true;
-    }
-    catch {
-        return false;
-    }
-}
-const STUCK_THRESHOLD_MS = 10 * 60_000; // 10 minutes without MCP activity = stuck
-async function isThreadStuck(port, secret, threadId) {
-    try {
-        const res = await fetch(`http://127.0.0.1:${port}/api/threads/${threadId}/heartbeat`, {
-            headers: authHeaders(secret),
-            signal: AbortSignal.timeout(5_000),
-        });
-        if (!res.ok)
-            return false;
-        const data = await res.json();
-        if (data.lastActivity == null)
-            return false; // no heartbeat yet, just started
-        return Date.now() - data.lastActivity > STUCK_THRESHOLD_MS;
-    }
-    catch {
-        return false;
-    }
-}
-async function killThread(port, secret, threadId) {
-    // Read PID file and kill the process directly
-    try {
-        const pidFile = join(homedir(), ".remote-copilot-mcp", "pids", `${threadId}.pid`);
-        const raw = readFileSync(pidFile, "utf-8").trim();
-        let pid;
-        try {
-            pid = JSON.parse(raw).pid;
-        }
-        catch {
-            pid = Number(raw);
-        }
-        if (Number.isFinite(pid)) {
-            if (process.platform === "win32") {
-                // On Windows, process.kill throws EPERM for child processes.
-                // Use taskkill which works reliably.
-                const { execSync } = await import("child_process");
-                try {
-                    execSync(`taskkill /F /T /PID ${pid}`, { stdio: "ignore", timeout: 10_000 });
-                }
-                catch { /* already dead */ }
-            }
-            else {
-                try {
-                    process.kill(pid, "SIGKILL");
-                }
-                catch { /* already dead */ }
-            }
-            keeperLog("INFO", `Killed stuck process PID=${pid} for thread ${threadId}`);
-        }
-    }
-    catch { /* PID file missing or unreadable */ }
-}
-async function openMcpSession(port, secret) {
-    try {
-        const res = await fetch(`http://127.0.0.1:${port}/mcp`, {
-            method: "POST",
-            headers: authHeaders(secret),
-            body: JSON.stringify({
-                jsonrpc: "2.0",
-                id: `keeper-init-${Date.now()}`,
-                method: "initialize",
-                params: {
-                    protocolVersion: LATEST_PROTOCOL_VERSION,
-                    capabilities: {},
-                    clientInfo: {
-                        name: "thread-keeper",
-                        version: "1.0.0",
-                    },
-                },
-            }),
-            signal: AbortSignal.timeout(30_000),
-        });
-        if (!res.ok) {
-            keeperLog("WARN", `initialize HTTP ${res.status}: ${res.statusText}`);
-            return null;
-        }
-        // Consume the body (may be SSE or JSON) to avoid connection stalls
-        await res.text();
-        const sessionId = res.headers.get("mcp-session-id");
-        if (!sessionId) {
-            keeperLog("WARN", "initialize succeeded but did not return an MCP session ID");
-            return null;
-        }
-        await fetch(`http://127.0.0.1:${port}/mcp`, {
-            method: "POST",
-            headers: {
-                ...authHeaders(secret),
-                "mcp-session-id": sessionId,
-            },
-            body: JSON.stringify({
-                jsonrpc: "2.0",
-                method: "notifications/initialized",
-                params: {},
-            }),
-            signal: AbortSignal.timeout(30_000),
-        });
-        return sessionId;
-    }
-    catch (err) {
-        keeperLog("ERROR", `initialize failed: ${errorMessage(err)}`);
-        return null;
-    }
-}
-async function closeMcpSession(port, secret, sessionId) {
-    try {
-        await fetch(`http://127.0.0.1:${port}/mcp`, {
-            method: "DELETE",
-            headers: {
-                ...authHeaders(secret),
-                "mcp-session-id": sessionId,
-            },
-            signal: AbortSignal.timeout(10_000),
-        });
-    }
-    catch {
-        // Best-effort cleanup.
-    }
-}
-async function callStartThread(config) {
-    const { mcpHttpPort: port, mcpHttpSecret: secret, threadId, sessionName, client, workingDirectory } = config;
-    const sessionId = await openMcpSession(port, secret);
-    if (!sessionId)
-        return false;
-    try {
-        const body = JSON.stringify({
-            jsonrpc: "2.0",
-            method: "tools/call",
-            params: {
-                name: "start_thread",
-                arguments: {
-                    name: sessionName,
-                    targetThreadId: threadId,
-                    agentType: client,
-                    mode: "resume",
-                    ...(workingDirectory !== undefined ? { workingDirectory } : {}),
-                },
-            },
-            id: `keeper-${threadId}-${Date.now()}`,
-        });
-        const res = await fetch(`http://127.0.0.1:${port}/mcp`, {
-            method: "POST",
-            headers: {
-                ...authHeaders(secret),
-                "mcp-session-id": sessionId,
-            },
-            body,
-            signal: AbortSignal.timeout(30_000),
-        });
-        if (!res.ok) {
-            keeperLog("WARN", `start_thread HTTP ${res.status}: ${res.statusText}`);
-            return false;
-        }
-        const result = await parseJsonOrSse(res);
-        if (result.error) {
-            keeperLog("WARN", `start_thread RPC error ${result.error.code ?? "unknown"}: ${result.error.message ?? "unknown error"}`);
-            return false;
-        }
-        const text = result?.result?.content?.[0]?.text ?? "";
-        if (!text.trim()) {
-            keeperLog("WARN", "start_thread returned an empty response");
-            return false;
-        }
-        keeperLog("INFO", `start_thread response: ${text.slice(0, 200)}`);
-        return !text.toLowerCase().includes("error");
-    }
-    catch (err) {
-        keeperLog("ERROR", `start_thread call failed: ${errorMessage(err)}`);
-        return false;
-    }
-    finally {
-        await closeMcpSession(port, secret, sessionId);
-    }
-}
-// ─── Core keeper ──────────────────────────────────────────────────────────────
-export async function startClaudeKeeper(config) {
-    const maxRetries = config.maxRetries ?? DEFAULT_MAX_RETRIES;
-    const cooldownMs = config.cooldownMs ?? DEFAULT_COOLDOWN_MS;
-    keeperLog("INFO", `Starting keeper for thread ${config.threadId} ('${config.sessionName}') [client=${config.client}]`);
-    let stopped = false;
-    let retryCount = 0;
-    let consecutiveNotRunning = 0;
-    let consecutiveStuck = 0;
-    let timer = null;
-    let lastStartTime = 0; // epoch ms when the last process was started
-    let fastExitCount = 0; // consecutive fast exits (< 60s lifetime)
-    let fastExitEscalation = 0; // escalation level for exponential backoff
-    keeperLog("INFO", "Waiting for MCP server to be ready...");
-    const ready = await waitForMcpReady(config.mcpHttpPort, config.mcpHttpSecret);
-    if (!ready)
-        keeperLog("WARN", "MCP server did not respond in time — attempting start_thread anyway.");
-    else
-        keeperLog("INFO", "MCP server ready.");
-    async function checkAndStart() {
-        if (stopped)
-            return;
-        const running = await isThreadRunning(config.mcpHttpPort, config.mcpHttpSecret, config.threadId);
-        if (running) {
-            consecutiveNotRunning = 0;
-            // Check for stuck process (alive but no MCP activity for 10+ min)
-            const stuck = await isThreadStuck(config.mcpHttpPort, config.mcpHttpSecret, config.threadId);
-            if (stuck) {
-                consecutiveStuck++;
-                if (consecutiveStuck < 3) {
-                    // Require 3 consecutive stuck checks (~6 min) before killing
-                    keeperLog("INFO", `Thread ${config.threadId} appears stuck (${consecutiveStuck}/3) — rechecking in 2 min`);
-                    scheduleCheck();
-                    return;
-                }
-                keeperLog("WARN", `Thread ${config.threadId} confirmed stuck (no MCP activity for ${STUCK_THRESHOLD_MS / 60_000}+ min, ${consecutiveStuck} checks) — killing process`);
-                await killThread(config.mcpHttpPort, config.mcpHttpSecret, config.threadId);
-                consecutiveStuck = 0;
-                // Wait for process to actually die, then restart on next normal check
-                scheduleCheck();
-                return;
-            }
-            consecutiveStuck = 0;
-            scheduleCheck();
-            return;
-        }
-        consecutiveNotRunning++;
-        if (consecutiveNotRunning === 1) {
-            // First detection — notify operator immediately
-            config.onDeath?.(config.threadId, config.sessionName);
-        }
-        if (consecutiveNotRunning < 2) {
-            // Single failure may be a timeout; recheck quickly before restarting
-            timer = setTimeout(() => void checkAndStart(), 10_000);
-            return;
-        }
-        if (retryCount >= maxRetries) {
-            keeperLog("WARN", `Max retries (${maxRetries}) exceeded — cooling down for ${Math.round(cooldownMs / 1000)}s`);
-            retryCount = 0;
-            timer = setTimeout(() => void checkAndStart(), cooldownMs);
-            return;
-        }
-        retryCount++;
-        keeperLog("INFO", `Thread ${config.threadId} not running — calling start_thread (attempt ${retryCount}/${maxRetries})`);
-        // Fast-exit detection: if the process died quickly, use exponential backoff.
-        if (lastStartTime > 0 && (Date.now() - lastStartTime) < FAST_EXIT_THRESHOLD_MS) {
-            fastExitCount++;
-            if (fastExitCount >= FAST_EXIT_MAX_COUNT) {
-                const cooldown = Math.min(FAST_EXIT_BASE_COOLDOWN_MS * 2 ** fastExitEscalation, 4 * 60 * 60_000); // cap 4h
-                fastExitEscalation++;
-                keeperLog("WARN", `Thread ${config.threadId}: ${fastExitCount} consecutive fast exits — backing off ${Math.round(cooldown / 60_000)} min`);
-                config.onDeath?.(config.threadId, `${config.sessionName} (repeated fast exits — check credits/API key)`);
-                fastExitCount = 0;
-                retryCount = 0;
-                timer = setTimeout(() => void checkAndStart(), cooldown);
-                return;
-            }
-        }
-        else {
-            fastExitCount = 0;
-            fastExitEscalation = 0; // reset if the process ran long enough
-        }
-        lastStartTime = Date.now();
-        const ok = await callStartThread(config);
-        if (ok) {
-            retryCount = 0;
-            consecutiveNotRunning = 0;
-            scheduleCheck();
-        }
-        else {
-            const delay = Math.min(BASE_BACKOFF_MS * 2 ** retryCount, MAX_BACKOFF_MS);
-            keeperLog("INFO", `Scheduling retry in ${delay}ms`);
-            timer = setTimeout(() => void checkAndStart(), delay);
-        }
-    }
-    function scheduleCheck() {
-        if (stopped)
-            return;
-        timer = setTimeout(() => void checkAndStart(), HEALTH_CHECK_INTERVAL_MS);
-    }
-    void checkAndStart();
-    return {
-        async stop() {
-            stopped = true;
-            if (timer)
-                clearTimeout(timer);
-        },
-    };
-}
-//# sourceMappingURL=claude-keeper.js.map
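
For reference, the escalating cooldown the deleted keeper applied after repeated fast exits (FAST_EXIT_BASE_COOLDOWN_MS doubling per escalation level, capped at four hours) works out to a fixed schedule. A small sketch of the same arithmetic:

package main

import (
	"fmt"
	"time"
)

func main() {
	// Mirrors the deleted keeper's cooldown formula:
	// Math.min(FAST_EXIT_BASE_COOLDOWN_MS * 2 ** fastExitEscalation, 4h)
	base := 10 * time.Minute
	maxCooldown := 4 * time.Hour
	for esc := 0; esc <= 6; esc++ {
		cooldown := base << uint(esc) // base * 2^esc
		if cooldown > maxCooldown {
			cooldown = maxCooldown
		}
		fmt.Printf("escalation %d: back off %v\n", esc, cooldown)
	}
}

So a thread that keeps dying within 60 seconds is retried after 10, 20, 40, 80, and 160 minutes, then pinned at 4 hours, with the keeper notifying the operator ("check credits/API key") at each escalation.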