sensorium-mcp 2.17.26 → 2.17.28
- package/dist/dashboard/routes/threads.d.ts.map +1 -1
- package/dist/dashboard/routes/threads.js +16 -5
- package/dist/dashboard/routes/threads.js.map +1 -1
- package/dist/data/memory/bootstrap.js +2 -2
- package/dist/data/memory/bootstrap.js.map +1 -1
- package/dist/data/memory/consolidation.d.ts.map +1 -1
- package/dist/data/memory/consolidation.js +74 -4
- package/dist/data/memory/consolidation.js.map +1 -1
- package/dist/data/memory/semantic.d.ts +11 -0
- package/dist/data/memory/semantic.d.ts.map +1 -1
- package/dist/data/memory/semantic.js +37 -0
- package/dist/data/memory/semantic.js.map +1 -1
- package/dist/data/memory/thread-registry.d.ts +7 -0
- package/dist/data/memory/thread-registry.d.ts.map +1 -1
- package/dist/data/memory/thread-registry.js +11 -1
- package/dist/data/memory/thread-registry.js.map +1 -1
- package/dist/index.js +17 -5
- package/dist/index.js.map +1 -1
- package/dist/tools/shared-agent-utils.d.ts.map +1 -1
- package/dist/tools/shared-agent-utils.js +3 -4
- package/dist/tools/shared-agent-utils.js.map +1 -1
- package/dist/tools/thread-lifecycle.d.ts.map +1 -1
- package/dist/tools/thread-lifecycle.js +33 -15
- package/dist/tools/thread-lifecycle.js.map +1 -1
- package/package.json +10 -2
- package/scripts/install-supervisor.ps1 +67 -0
- package/scripts/install-supervisor.sh +43 -0
- package/scripts/start-supervisor.ps1 +46 -0
- package/scripts/start-supervisor.sh +20 -0
- package/supervisor/config.go +140 -0
- package/supervisor/go.mod +3 -0
- package/supervisor/health.go +389 -0
- package/supervisor/health_test.go +93 -0
- package/supervisor/keeper.go +303 -0
- package/supervisor/keeper_test.go +27 -0
- package/supervisor/lock.go +56 -0
- package/supervisor/lock_test.go +54 -0
- package/supervisor/log.go +114 -0
- package/supervisor/log_test.go +45 -0
- package/supervisor/main.go +325 -0
- package/supervisor/notify.go +53 -0
- package/supervisor/process.go +222 -0
- package/supervisor/process_test.go +94 -0
- package/supervisor/process_unix.go +14 -0
- package/supervisor/process_windows.go +15 -0
- package/supervisor/updater.go +281 -0
- package/dist/claude-keeper.d.ts +0 -24
- package/dist/claude-keeper.d.ts.map +0 -1
- package/dist/claude-keeper.js +0 -374
- package/dist/claude-keeper.js.map +0 -1
- package/dist/watcher-service.d.ts +0 -2
- package/dist/watcher-service.d.ts.map +0 -1
- package/dist/watcher-service.js +0 -997
- package/dist/watcher-service.js.map +0 -1

package/supervisor/keeper.go
@@ -0,0 +1,303 @@
package main

import (
    "context"
    "encoding/json"
    "fmt"
    "math"
    "sync"
    "time"
)

// KeeperConfig describes a thread that must be kept alive.
type KeeperConfig struct {
    ThreadID         int
    SessionName      string
    Client           string // e.g. "claude-code", "codex"
    WorkingDirectory string
    MaxRetries       int
    CooldownMs       int
}

// Keeper supervises a single thread, restarting it via the MCP server's
// start_thread tool when it stops running. One goroutine per keeper.
type Keeper struct {
    cfg     KeeperConfig
    global  Config
    mcp     *MCPClient
    log     *Logger
    onDeath func(threadID int, sessionName string)

    mu      sync.Mutex
    stopped bool
    cancel  context.CancelFunc
    done    chan struct{}
}

func NewKeeper(cfg KeeperConfig, global Config, mcp *MCPClient, log *Logger, onDeath func(int, string)) *Keeper {
    maxRetries := cfg.MaxRetries
    if maxRetries <= 0 {
        maxRetries = global.KeeperMaxRetries
    }
    cfg.MaxRetries = maxRetries
    return &Keeper{
        cfg:     cfg,
        global:  global,
        mcp:     mcp,
        log:     log,
        onDeath: onDeath,
        done:    make(chan struct{}),
    }
}

// Start begins the keeper loop in a separate goroutine.
func (k *Keeper) Start() {
    ctx, cancel := context.WithCancel(context.Background())
    k.mu.Lock()
    k.cancel = cancel
    k.mu.Unlock()

    go k.run(ctx)
}

// Stop signals the keeper to shut down and waits for it to finish.
func (k *Keeper) Stop() {
    k.mu.Lock()
    k.stopped = true
    if k.cancel != nil {
        k.cancel()
    }
    k.mu.Unlock()
    <-k.done
}

func (k *Keeper) isStopped() bool {
    k.mu.Lock()
    defer k.mu.Unlock()
    return k.stopped
}

func (k *Keeper) run(ctx context.Context) {
    defer close(k.done)
    defer func() {
        if r := recover(); r != nil {
            k.log.Error("Keeper panicked for thread %d: %v", k.cfg.ThreadID, r)
        }
    }()
    defer k.log.Info("Keeper stopped for thread %d", k.cfg.ThreadID)

    k.log.Info("Keeper started for thread %d ('%s') [client=%s]", k.cfg.ThreadID, k.cfg.SessionName, k.cfg.Client)

    // Wait for MCP server to be ready
    ready := k.mcp.WaitForReady(ctx, k.global.KeeperReadyPollInterval, k.global.KeeperReadyTimeout)
    if !ready && !k.isStopped() {
        k.log.Warn("MCP server not ready after %v — attempting start_thread anyway", k.global.KeeperReadyTimeout)
    }

    retryCount := 0
    fastExitCount := 0
    fastExitEscalation := 0
    var lastStartTime time.Time
    activeThreadID := k.cfg.ThreadID // may differ from root after start_thread

    checkAndStart := func() {
        if k.isStopped() {
            return
        }

        // Check if thread is running — use activeThreadID (worker) not root
        running := k.mcp.IsThreadRunning(ctx, activeThreadID)
        if running {
            // Check if stuck
            if k.mcp.IsThreadStuck(ctx, activeThreadID, k.global.StuckThreshold) {
                k.log.Warn("Thread %d (worker %d) is stuck (no heartbeat for %v) — restarting", k.cfg.ThreadID, activeThreadID, k.global.StuckThreshold)
                // Kill via MCP API, then fall through to restart
                k.killThread(ctx, activeThreadID)
                activeThreadID = k.cfg.ThreadID // reset — will get new worker ID on restart
            } else {
                // Healthy — reset counters
                if retryCount > 0 {
                    k.log.Info("Thread %d is healthy again (was at retry %d)", k.cfg.ThreadID, retryCount)
                } else {
                    k.log.Debug("Thread %d is healthy", k.cfg.ThreadID)
                }
                retryCount = 0
                return
            }
        }

        // Thread is not running (or was stuck and killed)
        if retryCount >= k.cfg.MaxRetries {
            cooldown := k.global.KeeperCooldown
            if k.cfg.CooldownMs > 0 {
                cooldown = time.Duration(k.cfg.CooldownMs) * time.Millisecond
            }
            k.log.Warn("Max retries (%d) exceeded — cooling down for %v", k.cfg.MaxRetries, cooldown)
            if k.onDeath != nil {
                k.onDeath(k.cfg.ThreadID, k.cfg.SessionName)
            }
            k.sleep(ctx, cooldown)
            retryCount = 0
            fastExitCount = 0
            return
        }

        k.log.Info("Thread %d not running — calling start_thread (attempt %d/%d)", k.cfg.ThreadID, retryCount+1, k.cfg.MaxRetries)

        // Re-verify root still has keepAlive before restarting
        // (user may have archived the worker, which disables keepAlive on the root)
        if !k.isRootKeepAlive(ctx) {
            k.log.Info("Thread %d root has keepAlive=false — stopping keeper", k.cfg.ThreadID)
            k.mu.Lock()
            k.stopped = true
            k.mu.Unlock()
            return
        }

        lastStartTime = time.Now()
        ok, workerID := k.callStartThread(ctx)

        if ok {
            if workerID > 0 {
                activeThreadID = workerID
                k.log.Info("Thread %d start_thread succeeded (worker %d)", k.cfg.ThreadID, workerID)
            } else {
                k.log.Info("Thread %d start_thread succeeded", k.cfg.ThreadID)
            }
            retryCount = 0
            // Check for fast exit on next check
        } else {
            // Check for fast exit pattern
            if !lastStartTime.IsZero() && time.Since(lastStartTime) < k.global.FastExitThreshold {
                fastExitCount++
                if fastExitCount >= k.global.FastExitMaxCount {
                    cooldown := time.Duration(float64(k.global.FastExitBaseCooldown) * math.Pow(2, float64(fastExitEscalation)))
                    if cooldown > k.global.FastExitMaxCooldown {
                        cooldown = k.global.FastExitMaxCooldown
                    }
                    k.log.Warn("Thread %d: %d consecutive fast exits — backing off %v", k.cfg.ThreadID, fastExitCount, cooldown)
                    if k.onDeath != nil {
                        k.onDeath(k.cfg.ThreadID, k.cfg.SessionName+" (repeated fast exits — check credits/API key)")
                    }
                    fastExitEscalation++
                    k.sleep(ctx, cooldown)
                    fastExitCount = 0
                    retryCount = 0
                    return
                }
            } else {
                fastExitCount = 0
                fastExitEscalation = 0
            }

            retryCount++
            delay := k.backoff(retryCount)
            k.log.Info("Backing off %v before next attempt", delay)
            k.sleep(ctx, delay)
        }
    }

    // Initial check
    checkAndStart()

    // Health check loop
    for {
        if k.isStopped() {
            return
        }

        select {
        case <-ctx.Done():
            return
        case <-time.After(k.global.KeeperHealthCheckInterval):
            checkAndStart()
        }
    }
}

// callStartThread starts the thread via MCP. Returns (success, workerThreadID).
// workerThreadID is extracted from the response JSON when available.
func (k *Keeper) callStartThread(ctx context.Context) (bool, int) {
    sessionID, err := k.mcp.OpenMCPSession(ctx)
    if err != nil {
        k.log.Error("Failed to open MCP session: %v", err)
        return false, 0
    }
    defer k.mcp.CloseMCPSession(ctx, sessionID)

    text, err := k.mcp.CallStartThread(ctx, sessionID, k.cfg.ThreadID, k.cfg.SessionName, k.cfg.Client, k.cfg.WorkingDirectory)
    if err != nil {
        k.log.Error("start_thread failed: %v", err)
        return false, 0
    }

    if text != "" {
        k.log.Info("start_thread response: %.200s", text)
    }

    // Parse the worker threadId from the response JSON
    workerID := parseWorkerThreadID(text)
    return true, workerID
}

func (k *Keeper) killThread(ctx context.Context, threadID int) {
    k.log.Info("Killing stuck thread %d", threadID)
    // Read PID from thread PID file
    pidFile := k.global.Paths.PIDsDir + "/" + fmt.Sprintf("%d.pid", threadID)
    pid, err := ReadPIDFile(pidFile)
    if err != nil {
        k.log.Warn("Cannot read PID for thread %d: %v", k.cfg.ThreadID, err)
        return
    }
    if err := KillProcess(pid, k.log); err != nil {
        k.log.Error("Failed to kill thread %d (PID %d): %v", k.cfg.ThreadID, pid, err)
    }
}

func (k *Keeper) backoff(retry int) time.Duration {
    delay := time.Duration(float64(k.global.KeeperBaseBackoff) * math.Pow(2, float64(retry)))
    if delay > k.global.KeeperMaxBackoff {
        delay = k.global.KeeperMaxBackoff
    }
    return delay
}

func (k *Keeper) sleep(ctx context.Context, d time.Duration) {
    select {
    case <-ctx.Done():
    case <-time.After(d):
    }
}

// isRootKeepAlive checks whether the root thread still has keepAlive=true.
func (k *Keeper) isRootKeepAlive(ctx context.Context) bool {
    roots, err := k.mcp.GetRootThreads(ctx)
    if err != nil {
        k.log.Debug("isRootKeepAlive(%d): failed to fetch roots: %v — assuming still alive", k.cfg.ThreadID, err)
        return true // fail-open: don't stop keeper if we can't check
    }
    for _, r := range roots {
        tidFloat, _ := r["threadId"].(float64)
        if int(tidFloat) == k.cfg.ThreadID {
            keepAlive, _ := r["keepAlive"].(bool)
            status, _ := r["status"].(string)
            return keepAlive && (status == "" || status == "active")
        }
    }
    k.log.Debug("isRootKeepAlive(%d): root thread not found in response", k.cfg.ThreadID)
    return false // root thread gone → stop keeper
}

// parseWorkerThreadID extracts the "threadId" field from a start_thread JSON response.
func parseWorkerThreadID(text string) int {
    if text == "" {
        return 0
    }
    var resp struct {
        ThreadID int `json:"threadId"`
    }
    if json.Unmarshal([]byte(text), &resp) == nil && resp.ThreadID > 0 {
        return resp.ThreadID
    }
    return 0
}
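
The restart delay in Keeper.backoff doubles with each retry and is clamped to a ceiling. Below is a minimal standalone sketch of that curve, assuming a 1-second base and a 60-second cap; the actual values come from the KeeperBaseBackoff and KeeperMaxBackoff fields of Config (defined in supervisor/config.go, not reproduced in this diff).

package main

import (
    "fmt"
    "math"
    "time"
)

// backoff mirrors Keeper.backoff: base * 2^retry, clamped to max.
func backoff(base, max time.Duration, retry int) time.Duration {
    d := time.Duration(float64(base) * math.Pow(2, float64(retry)))
    if d > max {
        d = max
    }
    return d
}

func main() {
    // Assumed values for illustration only.
    base, max := 1*time.Second, 60*time.Second
    for retry := 1; retry <= 7; retry++ {
        fmt.Printf("retry %d -> %v\n", retry, backoff(base, max, retry))
    }
    // Prints: 2s, 4s, 8s, 16s, 32s, 1m0s, 1m0s (clamped at the cap)
}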

package/supervisor/keeper_test.go
@@ -0,0 +1,27 @@
package main

import "testing"

func TestParseWorkerThreadID(t *testing.T) {
    tests := []struct {
        name string
        text string
        want int
    }{
        {"normal response", `{"threadId":11226,"status":"restarted","name":"Sensorium 2","pid":87108}`, 11226},
        {"already_running", `{"threadId":11226,"status":"already_running","name":"Sensorium 2","pid":40568}`, 11226},
        {"empty string", "", 0},
        {"no threadId", `{"status":"error"}`, 0},
        {"threadId zero", `{"threadId":0}`, 0},
        {"invalid JSON", `not json`, 0},
        {"negative threadId", `{"threadId":-5}`, 0},
    }
    for _, tt := range tests {
        t.Run(tt.name, func(t *testing.T) {
            got := parseWorkerThreadID(tt.text)
            if got != tt.want {
                t.Errorf("parseWorkerThreadID(%q) = %d, want %d", tt.text, got, tt.want)
            }
        })
    }
}

package/supervisor/lock.go
@@ -0,0 +1,56 @@
package main

import (
    "fmt"
    "os"
    "strconv"
    "strings"
)

// AcquireLock creates a lock file to prevent multiple supervisor instances.
// Uses O_CREATE|O_EXCL for atomic creation. If a stale lock exists (PID not
// running), it reclaims the lock.
func AcquireLock(lockPath string, log *Logger) bool {
    // Try atomic create first
    f, err := os.OpenFile(lockPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0644)
    if err == nil {
        // Lock acquired — write our PID
        fmt.Fprintf(f, "%d", os.Getpid())
        f.Close()
        log.Info("Lock acquired: %s (PID %d)", lockPath, os.Getpid())
        return true
    }

    // Lock file exists — check if the PID is still alive
    data, err := os.ReadFile(lockPath)
    if err != nil {
        log.Error("Failed to read lockfile %s: %v", lockPath, err)
        return false
    }

    pidStr := strings.TrimSpace(string(data))
    pid, err := strconv.Atoi(pidStr)
    if err == nil && pid > 0 && IsProcessAlive(pid) {
        log.Error("Another supervisor is running (PID %d). Lockfile: %s", pid, lockPath)
        return false
    }

    // Stale lock — reclaim
    log.Warn("Reclaimed stale supervisor lockfile (old PID %s)", pidStr)
    _ = os.Remove(lockPath)

    // Re-acquire atomically
    f, err = os.OpenFile(lockPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0644)
    if err != nil {
        log.Error("Failed to acquire lockfile after reclaim: %v", err)
        return false
    }
    fmt.Fprintf(f, "%d", os.Getpid())
    f.Close()
    return true
}

// ReleaseLock removes the lock file.
func ReleaseLock(lockPath string) {
    _ = os.Remove(lockPath)
}
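
Note that the reclaim path is only atomic at its final O_CREATE|O_EXCL step: two supervisors that both read the same stale PID can race on the os.Remove, so the stale-lock window is narrowed rather than fully eliminated, while the normal fresh-create path is fully atomic. A hedged sketch of how startup code might pair these calls; the lock path is illustrative, not necessarily what main.go uses.

package main

import "os"

// startupSketch shows the intended AcquireLock/ReleaseLock pairing.
// The path below is hypothetical; main.go presumably derives its own.
func startupSketch() {
    log := NewLogger("") // empty path: log to stderr only
    lockPath := "/tmp/sensorium-supervisor.lock"
    if !AcquireLock(lockPath, log) {
        os.Exit(1) // another live supervisor already holds the lock
    }
    defer ReleaseLock(lockPath)
    // ... start keepers, health checks, updater ...
}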

package/supervisor/lock_test.go
@@ -0,0 +1,54 @@
package main

import (
    "os"
    "path/filepath"
    "testing"
)

func TestAcquireLock_Fresh(t *testing.T) {
    dir := t.TempDir()
    lockPath := filepath.Join(dir, "test.lock")
    log := NewLogger("")

    if !AcquireLock(lockPath, log) {
        t.Fatal("expected lock acquisition to succeed")
    }
    defer ReleaseLock(lockPath)

    // Verify lock file was created with our PID
    data, err := os.ReadFile(lockPath)
    if err != nil {
        t.Fatalf("lock file not created: %v", err)
    }
    if len(data) == 0 {
        t.Fatal("lock file is empty")
    }
}

func TestAcquireLock_StalePID(t *testing.T) {
    dir := t.TempDir()
    lockPath := filepath.Join(dir, "test.lock")
    log := NewLogger("")

    // Write a stale lock with PID 99999999, which is almost certainly not running
    os.WriteFile(lockPath, []byte("99999999"), 0644)

    if !AcquireLock(lockPath, log) {
        t.Fatal("expected stale lock to be reclaimed")
    }
    defer ReleaseLock(lockPath)
}

func TestReleaseLock(t *testing.T) {
    dir := t.TempDir()
    lockPath := filepath.Join(dir, "test.lock")
    os.WriteFile(lockPath, []byte("12345"), 0644)

    ReleaseLock(lockPath)

    if _, err := os.Stat(lockPath); !os.IsNotExist(err) {
        t.Error("expected lock file to be removed")
    }
}

package/supervisor/log.go
@@ -0,0 +1,114 @@
package main

import (
    "fmt"
    "os"
    "sync"
    "time"
)

// Logger writes to both stderr and a rotating log file.
// Rotates when the file exceeds maxSize bytes.
type Logger struct {
    mu      sync.Mutex
    logPath string
    file    *os.File
    debug   bool
    size    int64
    maxSize int64 // default 5 MB
    maxKeep int   // max rotated files to keep
}

func NewLogger(logPath string) *Logger {
    l := &Logger{
        logPath: logPath,
        debug:   os.Getenv("SUPERVISOR_DEBUG") == "1" || os.Getenv("SUPERVISOR_DEBUG") == "true",
        maxSize: 5 * 1024 * 1024, // 5 MB
        maxKeep: 3,
    }
    l.openFile()
    return l
}

func (l *Logger) openFile() {
    if l.logPath == "" {
        return
    }
    f, err := os.OpenFile(l.logPath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
    if err != nil {
        fmt.Fprintf(os.Stderr, "[WARN] cannot open log file %s: %v\n", l.logPath, err)
        return
    }
    l.file = f
    // Seed current size for rotation checks
    if info, err := f.Stat(); err == nil {
        l.size = info.Size()
    }
}

func (l *Logger) log(level, format string, args ...any) {
    ts := time.Now().Format("2006-01-02 15:04:05")
    msg := fmt.Sprintf(format, args...)
    line := fmt.Sprintf("[%s] [%s] %s\n", ts, level, msg)

    l.mu.Lock()
    defer l.mu.Unlock()
    fmt.Fprint(os.Stderr, line)
    if l.file != nil {
        n, err := l.file.WriteString(line)
        if err != nil {
            fmt.Fprintf(os.Stderr, "[ERR] log write failed: %v\n", err)
        }
        l.size += int64(n)
        if l.maxSize > 0 && l.size >= l.maxSize {
            l.rotate()
        }
    }
}

func (l *Logger) Info(format string, args ...any)  { l.log("INFO", format, args...) }
func (l *Logger) Warn(format string, args ...any)  { l.log("WARN", format, args...) }
func (l *Logger) Error(format string, args ...any) { l.log("ERROR", format, args...) }
func (l *Logger) Debug(format string, args ...any) {
    if l.debug {
        l.log("DEBUG", format, args...)
    }
}

// rotate closes the current log file, renames it with a .1 suffix (shifting
// older rotated files), and opens a fresh log. Called with mu held.
func (l *Logger) rotate() {
    if l.file != nil {
        l.file.Close()
        l.file = nil
    }

    // Shift existing rotated logs: .3 → delete, .2 → .3, .1 → .2, current → .1
    for i := l.maxKeep; i >= 1; i-- {
        old := fmt.Sprintf("%s.%d", l.logPath, i)
        if i == l.maxKeep {
            os.Remove(old)
        } else {
            os.Rename(old, fmt.Sprintf("%s.%d", l.logPath, i+1))
        }
    }
    os.Rename(l.logPath, l.logPath+".1")

    // Open a fresh file
    l.size = 0
    f, err := os.OpenFile(l.logPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
    if err != nil {
        fmt.Fprintf(os.Stderr, "[WARN] log rotate: cannot create fresh log: %v\n", err)
        return
    }
    l.file = f
}

func (l *Logger) Close() {
    l.mu.Lock()
    defer l.mu.Unlock()
    if l.file != nil {
        l.file.Close()
        l.file = nil
    }
}
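
A short usage sketch: Debug output is gated on the SUPERVISOR_DEBUG environment variable (read once in NewLogger), and rotation kicks in automatically once the file reaches 5 MB. The log path below is illustrative.

package main

import (
    "os"
    "path/filepath"
)

// loggerSketch demonstrates the Logger API defined above.
func loggerSketch() {
    os.Setenv("SUPERVISOR_DEBUG", "1") // must be set before NewLogger runs
    log := NewLogger(filepath.Join(os.TempDir(), "supervisor.log"))
    defer log.Close()

    log.Info("supervisor starting, pid=%d", os.Getpid())
    log.Debug("printed only when SUPERVISOR_DEBUG is 1 or true")
    // Each line goes to stderr and the file, formatted like:
    //   [2025-01-02 15:04:05] [INFO] supervisor starting, pid=1234
}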

package/supervisor/log_test.go
@@ -0,0 +1,45 @@
package main

import (
    "os"
    "path/filepath"
    "strings"
    "testing"
)

func TestLogRotation(t *testing.T) {
    dir := t.TempDir()
    logPath := filepath.Join(dir, "test.log")

    l := &Logger{
        logPath: logPath,
        maxSize: 100, // tiny threshold for test
        maxKeep: 2,
    }
    l.openFile()
    defer l.Close()

    // Write enough lines to trigger multiple rotations
    for i := 0; i < 50; i++ {
        l.Info("line %d: %s", i, strings.Repeat("x", 20))
    }

    // Current log should exist and be under maxSize
    info, err := os.Stat(logPath)
    if err != nil {
        t.Fatalf("log file missing: %v", err)
    }
    if info.Size() >= 100 {
        t.Errorf("log file should have been rotated, size=%d", info.Size())
    }

    // At least .1 should exist
    if _, err := os.Stat(logPath + ".1"); err != nil {
        t.Error("expected .1 rotated file to exist")
    }

    // .3 should NOT exist (maxKeep=2)
    if _, err := os.Stat(logPath + ".3"); err == nil {
        t.Error("expected .3 file to not exist (maxKeep=2)")
    }
}