opencastle 0.13.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/convoy/engine.d.ts +38 -0
- package/dist/cli/convoy/engine.d.ts.map +1 -0
- package/dist/cli/convoy/engine.js +416 -0
- package/dist/cli/convoy/engine.js.map +1 -0
- package/dist/cli/convoy/engine.test.d.ts +2 -0
- package/dist/cli/convoy/engine.test.d.ts.map +1 -0
- package/dist/cli/convoy/engine.test.js +1140 -0
- package/dist/cli/convoy/engine.test.js.map +1 -0
- package/dist/cli/convoy/health.d.ts +23 -0
- package/dist/cli/convoy/health.d.ts.map +1 -0
- package/dist/cli/convoy/health.js +69 -0
- package/dist/cli/convoy/health.js.map +1 -0
- package/dist/cli/convoy/health.test.d.ts +2 -0
- package/dist/cli/convoy/health.test.d.ts.map +1 -0
- package/dist/cli/convoy/health.test.js +392 -0
- package/dist/cli/convoy/health.test.js.map +1 -0
- package/package.json +1 -1
- package/src/cli/convoy/engine.test.ts +1349 -0
- package/src/cli/convoy/engine.ts +521 -0
- package/src/cli/convoy/health.test.ts +456 -0
- package/src/cli/convoy/health.ts +111 -0
- package/src/dashboard/node_modules/.vite/deps/_metadata.json +6 -6
|
@@ -0,0 +1,456 @@
|
|
|
1
|
+
import { mkdtempSync, rmSync } from 'node:fs'
|
|
2
|
+
import { tmpdir } from 'node:os'
|
|
3
|
+
import { join } from 'node:path'
|
|
4
|
+
import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest'
|
|
5
|
+
import { createConvoyStore } from './store.js'
|
|
6
|
+
import type { ConvoyStore } from './store.js'
|
|
7
|
+
import type { ConvoyEventEmitter } from './events.js'
|
|
8
|
+
import { createHealthMonitor } from './health.js'
|
|
9
|
+
import type { HealthMonitorOptions } from './health.js'
|
|
10
|
+
|
|
11
|
+
// ── fixtures ──────────────────────────────────────────────────────────────────
|
|
12
|
+
|
|
13
|
+
const CONVOY_ID = 'convoy-1'
|
|
14
|
+
|
|
15
|
+
type EmittedEvent = {
|
|
16
|
+
type: string
|
|
17
|
+
data?: Record<string, unknown>
|
|
18
|
+
ids?: { convoy_id?: string; task_id?: string; worker_id?: string }
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
let tmpDir: string
|
|
22
|
+
let dbPath: string
|
|
23
|
+
let store: ConvoyStore
|
|
24
|
+
let emittedEvents: EmittedEvent[]
|
|
25
|
+
let mockEvents: ConvoyEventEmitter
|
|
26
|
+
|
|
27
|
+
beforeEach(() => {
|
|
28
|
+
tmpDir = mkdtempSync(join(tmpdir(), 'health-test-'))
|
|
29
|
+
dbPath = join(tmpDir, 'test.db')
|
|
30
|
+
store = createConvoyStore(dbPath)
|
|
31
|
+
emittedEvents = []
|
|
32
|
+
mockEvents = {
|
|
33
|
+
emit(type, data, ids) {
|
|
34
|
+
emittedEvents.push({ type, data, ids })
|
|
35
|
+
},
|
|
36
|
+
}
|
|
37
|
+
store.insertConvoy({
|
|
38
|
+
id: CONVOY_ID,
|
|
39
|
+
name: 'Test Convoy',
|
|
40
|
+
spec_hash: 'abc123',
|
|
41
|
+
status: 'running',
|
|
42
|
+
branch: null,
|
|
43
|
+
created_at: new Date().toISOString(),
|
|
44
|
+
spec_yaml: 'name: test',
|
|
45
|
+
})
|
|
46
|
+
})
|
|
47
|
+
|
|
48
|
+
afterEach(() => {
|
|
49
|
+
store.close()
|
|
50
|
+
rmSync(tmpDir, { recursive: true, force: true })
|
|
51
|
+
})
|
|
52
|
+
|
|
53
|
+
function makeOptions(overrides: Partial<HealthMonitorOptions> = {}): HealthMonitorOptions {
|
|
54
|
+
return { store, events: mockEvents, convoyId: CONVOY_ID, ...overrides }
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
function makeTask(
|
|
58
|
+
overrides: Partial<Parameters<ConvoyStore['insertTask']>[0]> = {},
|
|
59
|
+
): Parameters<ConvoyStore['insertTask']>[0] {
|
|
60
|
+
return {
|
|
61
|
+
id: 'task-1',
|
|
62
|
+
convoy_id: CONVOY_ID,
|
|
63
|
+
phase: 0,
|
|
64
|
+
prompt: 'Do something',
|
|
65
|
+
agent: 'developer',
|
|
66
|
+
model: null,
|
|
67
|
+
timeout_ms: 60_000,
|
|
68
|
+
status: 'running' as const,
|
|
69
|
+
retries: 0,
|
|
70
|
+
max_retries: 1,
|
|
71
|
+
files: null,
|
|
72
|
+
depends_on: null,
|
|
73
|
+
...overrides,
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
function makeWorker(
|
|
78
|
+
overrides: Partial<Parameters<ConvoyStore['insertWorker']>[0]> = {},
|
|
79
|
+
): Parameters<ConvoyStore['insertWorker']>[0] {
|
|
80
|
+
return {
|
|
81
|
+
id: 'worker-1',
|
|
82
|
+
task_id: 'task-1',
|
|
83
|
+
adapter: 'copilot',
|
|
84
|
+
pid: null,
|
|
85
|
+
session_id: null,
|
|
86
|
+
status: 'running' as const,
|
|
87
|
+
worktree: null,
|
|
88
|
+
created_at: new Date().toISOString(),
|
|
89
|
+
...overrides,
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
/** Insert a running task linked to a worker. */
|
|
94
|
+
function setupRunning(
|
|
95
|
+
taskOverrides: Partial<Parameters<ConvoyStore['insertTask']>[0]> = {},
|
|
96
|
+
workerOverrides: Partial<Parameters<ConvoyStore['insertWorker']>[0]> = {},
|
|
97
|
+
) {
|
|
98
|
+
store.insertTask(makeTask(taskOverrides))
|
|
99
|
+
store.insertWorker(makeWorker(workerOverrides))
|
|
100
|
+
store.updateTaskStatus('task-1', CONVOY_ID, 'running', { worker_id: 'worker-1' })
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
/** ISO timestamp `msAgo` milliseconds in the past. */
|
|
104
|
+
function msBefore(msAgo: number): string {
|
|
105
|
+
return new Date(Date.now() - msAgo).toISOString()
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
// ── creation ──────────────────────────────────────────────────────────────────
|
|
109
|
+
|
|
110
|
+
describe('createHealthMonitor', () => {
|
|
111
|
+
it('returns an object with start, stop, and check methods', () => {
|
|
112
|
+
const monitor = createHealthMonitor(makeOptions())
|
|
113
|
+
expect(typeof monitor.start).toBe('function')
|
|
114
|
+
expect(typeof monitor.stop).toBe('function')
|
|
115
|
+
expect(typeof monitor.check).toBe('function')
|
|
116
|
+
})
|
|
117
|
+
|
|
118
|
+
it('accepts custom intervalMs and stuckFactor without error', () => {
|
|
119
|
+
expect(() =>
|
|
120
|
+
createHealthMonitor(makeOptions({ intervalMs: 5_000, stuckFactor: 3 })),
|
|
121
|
+
).not.toThrow()
|
|
122
|
+
})
|
|
123
|
+
|
|
124
|
+
it('accepts an onKill callback', () => {
|
|
125
|
+
expect(() => createHealthMonitor(makeOptions({ onKill: () => {} }))).not.toThrow()
|
|
126
|
+
})
|
|
127
|
+
})
|
|
128
|
+
|
|
129
|
+
// ── start/stop lifecycle ──────────────────────────────────────────────────────

describe('start/stop lifecycle', () => {
  it('start() does not throw', () => {
    const monitor = createHealthMonitor(makeOptions())
    expect(() => monitor.start()).not.toThrow()
    // Clear the interval so no live timer leaks into the next test.
    monitor.stop()
  })

  it('stop() after start() does not throw', () => {
    const monitor = createHealthMonitor(makeOptions())
    monitor.start()
    expect(() => monitor.stop()).not.toThrow()
  })

  it('stop() without start() does not throw (idempotent)', () => {
    const monitor = createHealthMonitor(makeOptions())
    expect(() => monitor.stop()).not.toThrow()
  })

  it('calling start() twice does not create duplicate intervals', () => {
    // Fake timers make the interval fire deterministically.
    vi.useFakeTimers()
    try {
      setupRunning()
      // Fresh heartbeat: a firing check() must stay silent.
      store.updateWorkerStatus('worker-1', 'running', { last_heartbeat: new Date().toISOString() })
      const monitor = createHealthMonitor(makeOptions({ intervalMs: 1_000 }))
      monitor.start()
      monitor.start() // second call is no-op
      vi.advanceTimersByTime(1_000)
      // Only one interval fires; fresh heartbeat → zero events
      expect(emittedEvents).toHaveLength(0)
      monitor.stop()
    } finally {
      // Restore real timers even if an assertion above failed.
      vi.useRealTimers()
    }
  })

  it('check() is invoked on each timer tick after start()', () => {
    vi.useFakeTimers()
    try {
      // Heartbeat 200s old vs 60s timeout × stuckFactor 2 → stuck on first tick.
      setupRunning({ timeout_ms: 60_000, max_retries: 5 })
      store.updateWorkerStatus('worker-1', 'running', { last_heartbeat: msBefore(200_000) })
      const monitor = createHealthMonitor(makeOptions({ intervalMs: 1_000, stuckFactor: 2 }))
      monitor.start()
      vi.advanceTimersByTime(1_000)
      // retries (0) < max_retries (5) → the stuck task is reset to pending.
      expect(store.getTask('task-1', CONVOY_ID)!.status).toBe('pending')
      monitor.stop()
    } finally {
      vi.useRealTimers()
    }
  })
})
|
|
181
|
+
|
|
182
|
+
// ── check() no-op cases ───────────────────────────────────────────────────────
|
|
183
|
+
|
|
184
|
+
describe('check() no-op cases', () => {
|
|
185
|
+
it('does nothing when no tasks exist for the convoy', () => {
|
|
186
|
+
createHealthMonitor(makeOptions()).check()
|
|
187
|
+
expect(emittedEvents).toHaveLength(0)
|
|
188
|
+
})
|
|
189
|
+
|
|
190
|
+
it('does nothing when all tasks are pending', () => {
|
|
191
|
+
store.insertTask(makeTask({ status: 'pending' }))
|
|
192
|
+
createHealthMonitor(makeOptions()).check()
|
|
193
|
+
expect(emittedEvents).toHaveLength(0)
|
|
194
|
+
})
|
|
195
|
+
|
|
196
|
+
it('does nothing when all tasks are done', () => {
|
|
197
|
+
store.insertTask(makeTask({ status: 'done' }))
|
|
198
|
+
createHealthMonitor(makeOptions()).check()
|
|
199
|
+
expect(emittedEvents).toHaveLength(0)
|
|
200
|
+
})
|
|
201
|
+
|
|
202
|
+
it('does nothing when all tasks are failed', () => {
|
|
203
|
+
store.insertTask(makeTask({ status: 'failed' }))
|
|
204
|
+
createHealthMonitor(makeOptions()).check()
|
|
205
|
+
expect(emittedEvents).toHaveLength(0)
|
|
206
|
+
})
|
|
207
|
+
|
|
208
|
+
it('does nothing for a running task with no worker_id', () => {
|
|
209
|
+
store.insertTask(makeTask()) // worker_id stays NULL (insertTask always sets NULL)
|
|
210
|
+
createHealthMonitor(makeOptions()).check()
|
|
211
|
+
expect(emittedEvents).toHaveLength(0)
|
|
212
|
+
})
|
|
213
|
+
|
|
214
|
+
it('does nothing for a running task when the worker record is not found', () => {
|
|
215
|
+
store.insertTask(makeTask())
|
|
216
|
+
store.updateTaskStatus('task-1', CONVOY_ID, 'running', { worker_id: 'ghost-worker' })
|
|
217
|
+
createHealthMonitor(makeOptions()).check()
|
|
218
|
+
expect(emittedEvents).toHaveLength(0)
|
|
219
|
+
})
|
|
220
|
+
|
|
221
|
+
it('does nothing for a running task with fresh heartbeat and no pid', () => {
|
|
222
|
+
setupRunning({ timeout_ms: 60_000 })
|
|
223
|
+
store.updateWorkerStatus('worker-1', 'running', { last_heartbeat: new Date().toISOString() })
|
|
224
|
+
createHealthMonitor(makeOptions({ stuckFactor: 2 })).check()
|
|
225
|
+
expect(emittedEvents).toHaveLength(0)
|
|
226
|
+
})
|
|
227
|
+
|
|
228
|
+
it('does nothing for a running task with no heartbeat and no pid', () => {
|
|
229
|
+
setupRunning() // no heartbeat set, no pid → neither check fires
|
|
230
|
+
createHealthMonitor(makeOptions()).check()
|
|
231
|
+
expect(emittedEvents).toHaveLength(0)
|
|
232
|
+
})
|
|
233
|
+
})
|
|
234
|
+
|
|
235
|
+
// ── stuck detection ───────────────────────────────────────────────────────────
|
|
236
|
+
|
|
237
|
+
describe('stuck detection', () => {
|
|
238
|
+
function setupStuck(
|
|
239
|
+
taskOverrides: Partial<Parameters<ConvoyStore['insertTask']>[0]> = {},
|
|
240
|
+
) {
|
|
241
|
+
setupRunning({ timeout_ms: 60_000, ...taskOverrides })
|
|
242
|
+
store.updateWorkerStatus('worker-1', 'running', { last_heartbeat: msBefore(200_000) })
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
it('resets task to pending and increments retries when retries < max_retries', () => {
|
|
246
|
+
setupStuck({ retries: 0, max_retries: 1 })
|
|
247
|
+
createHealthMonitor(makeOptions({ stuckFactor: 2 })).check()
|
|
248
|
+
const task = store.getTask('task-1', CONVOY_ID)!
|
|
249
|
+
expect(task.status).toBe('pending')
|
|
250
|
+
expect(task.retries).toBe(1)
|
|
251
|
+
})
|
|
252
|
+
|
|
253
|
+
it('marks worker as killed with a finished_at timestamp', () => {
|
|
254
|
+
setupStuck()
|
|
255
|
+
createHealthMonitor(makeOptions({ stuckFactor: 2 })).check()
|
|
256
|
+
const worker = store.getWorker('worker-1')!
|
|
257
|
+
expect(worker.status).toBe('killed')
|
|
258
|
+
expect(worker.finished_at).not.toBeNull()
|
|
259
|
+
})
|
|
260
|
+
|
|
261
|
+
it('marks task as failed when retries >= max_retries', () => {
|
|
262
|
+
setupStuck({ retries: 1, max_retries: 1 })
|
|
263
|
+
createHealthMonitor(makeOptions({ stuckFactor: 2 })).check()
|
|
264
|
+
expect(store.getTask('task-1', CONVOY_ID)!.status).toBe('failed')
|
|
265
|
+
})
|
|
266
|
+
|
|
267
|
+
it('does NOT trigger when heartbeat is within the stuck threshold', () => {
|
|
268
|
+
setupRunning({ timeout_ms: 60_000 })
|
|
269
|
+
store.updateWorkerStatus('worker-1', 'running', { last_heartbeat: msBefore(1_000) })
|
|
270
|
+
createHealthMonitor(makeOptions({ stuckFactor: 2 })).check()
|
|
271
|
+
expect(emittedEvents).toHaveLength(0)
|
|
272
|
+
expect(store.getTask('task-1', CONVOY_ID)!.status).toBe('running')
|
|
273
|
+
})
|
|
274
|
+
|
|
275
|
+
it('also catches tasks in assigned status', () => {
|
|
276
|
+
store.insertTask(makeTask({ timeout_ms: 60_000 }))
|
|
277
|
+
store.insertWorker(makeWorker())
|
|
278
|
+
store.updateTaskStatus('task-1', CONVOY_ID, 'assigned', { worker_id: 'worker-1' })
|
|
279
|
+
store.updateWorkerStatus('worker-1', 'running', { last_heartbeat: msBefore(200_000) })
|
|
280
|
+
createHealthMonitor(makeOptions({ stuckFactor: 2 })).check()
|
|
281
|
+
const status = store.getTask('task-1', CONVOY_ID)!.status
|
|
282
|
+
expect(['pending', 'failed']).toContain(status)
|
|
283
|
+
})
|
|
284
|
+
|
|
285
|
+
it('skips zombie check when stuck is already detected', () => {
|
|
286
|
+
setupStuck()
|
|
287
|
+
store.updateWorkerStatus('worker-1', 'running', { pid: 12_345 })
|
|
288
|
+
const killSpy = vi.spyOn(process, 'kill')
|
|
289
|
+
try {
|
|
290
|
+
createHealthMonitor(makeOptions({ stuckFactor: 2 })).check()
|
|
291
|
+
expect(killSpy).not.toHaveBeenCalled()
|
|
292
|
+
expect(emittedEvents[0].data?.reason).toBe('stuck')
|
|
293
|
+
} finally {
|
|
294
|
+
killSpy.mockRestore()
|
|
295
|
+
}
|
|
296
|
+
})
|
|
297
|
+
})
|
|
298
|
+
|
|
299
|
+
// ── zombie detection ──────────────────────────────────────────────────────────
|
|
300
|
+
|
|
301
|
+
describe('zombie detection', () => {
|
|
302
|
+
function setupZombie(
|
|
303
|
+
taskOverrides: Partial<Parameters<ConvoyStore['insertTask']>[0]> = {},
|
|
304
|
+
) {
|
|
305
|
+
setupRunning(taskOverrides, { pid: 999_999_999, status: 'running' })
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
it('resets task to pending and increments retries when retries < max_retries', () => {
|
|
309
|
+
setupZombie({ retries: 0, max_retries: 1 })
|
|
310
|
+
const killSpy = vi.spyOn(process, 'kill').mockImplementation((): true => {
|
|
311
|
+
throw Object.assign(new Error('ESRCH'), { code: 'ESRCH' })
|
|
312
|
+
})
|
|
313
|
+
try {
|
|
314
|
+
createHealthMonitor(makeOptions()).check()
|
|
315
|
+
const task = store.getTask('task-1', CONVOY_ID)!
|
|
316
|
+
expect(task.status).toBe('pending')
|
|
317
|
+
expect(task.retries).toBe(1)
|
|
318
|
+
} finally {
|
|
319
|
+
killSpy.mockRestore()
|
|
320
|
+
}
|
|
321
|
+
})
|
|
322
|
+
|
|
323
|
+
it('marks worker as killed with a finished_at timestamp', () => {
|
|
324
|
+
setupZombie()
|
|
325
|
+
const killSpy = vi.spyOn(process, 'kill').mockImplementation((): true => {
|
|
326
|
+
throw new Error('ESRCH')
|
|
327
|
+
})
|
|
328
|
+
try {
|
|
329
|
+
createHealthMonitor(makeOptions()).check()
|
|
330
|
+
const worker = store.getWorker('worker-1')!
|
|
331
|
+
expect(worker.status).toBe('killed')
|
|
332
|
+
expect(worker.finished_at).not.toBeNull()
|
|
333
|
+
} finally {
|
|
334
|
+
killSpy.mockRestore()
|
|
335
|
+
}
|
|
336
|
+
})
|
|
337
|
+
|
|
338
|
+
it('marks task as failed when retries >= max_retries', () => {
|
|
339
|
+
setupZombie({ retries: 1, max_retries: 1 })
|
|
340
|
+
const killSpy = vi.spyOn(process, 'kill').mockImplementation((): true => {
|
|
341
|
+
throw new Error('ESRCH')
|
|
342
|
+
})
|
|
343
|
+
try {
|
|
344
|
+
createHealthMonitor(makeOptions()).check()
|
|
345
|
+
expect(store.getTask('task-1', CONVOY_ID)!.status).toBe('failed')
|
|
346
|
+
} finally {
|
|
347
|
+
killSpy.mockRestore()
|
|
348
|
+
}
|
|
349
|
+
})
|
|
350
|
+
|
|
351
|
+
it('does NOT trigger when process is still alive', () => {
|
|
352
|
+
setupRunning({}, { pid: process.pid, status: 'running' })
|
|
353
|
+
// Real process.kill(process.pid, 0) must succeed — no mock needed
|
|
354
|
+
createHealthMonitor(makeOptions()).check()
|
|
355
|
+
expect(emittedEvents).toHaveLength(0)
|
|
356
|
+
expect(store.getTask('task-1', CONVOY_ID)!.status).toBe('running')
|
|
357
|
+
})
|
|
358
|
+
|
|
359
|
+
it('does NOT trigger when worker status is not running (e.g. spawned)', () => {
|
|
360
|
+
setupRunning({}, { pid: 999_999_999, status: 'spawned' })
|
|
361
|
+
const killSpy = vi.spyOn(process, 'kill').mockImplementation((): true => {
|
|
362
|
+
throw new Error('ESRCH')
|
|
363
|
+
})
|
|
364
|
+
try {
|
|
365
|
+
createHealthMonitor(makeOptions()).check()
|
|
366
|
+
expect(emittedEvents).toHaveLength(0)
|
|
367
|
+
} finally {
|
|
368
|
+
killSpy.mockRestore()
|
|
369
|
+
}
|
|
370
|
+
})
|
|
371
|
+
|
|
372
|
+
it('does NOT call process.kill when pid is null', () => {
|
|
373
|
+
setupRunning({}, { pid: null })
|
|
374
|
+
const killSpy = vi.spyOn(process, 'kill')
|
|
375
|
+
try {
|
|
376
|
+
createHealthMonitor(makeOptions()).check()
|
|
377
|
+
expect(killSpy).not.toHaveBeenCalled()
|
|
378
|
+
} finally {
|
|
379
|
+
killSpy.mockRestore()
|
|
380
|
+
}
|
|
381
|
+
})
|
|
382
|
+
})
|
|
383
|
+
|
|
384
|
+
// ── onKill callback ───────────────────────────────────────────────────────────
|
|
385
|
+
|
|
386
|
+
describe('onKill callback', () => {
|
|
387
|
+
it('is called with workerId and taskId on stuck detection', () => {
|
|
388
|
+
setupRunning({ timeout_ms: 60_000 })
|
|
389
|
+
store.updateWorkerStatus('worker-1', 'running', { last_heartbeat: msBefore(200_000) })
|
|
390
|
+
const onKill = vi.fn()
|
|
391
|
+
createHealthMonitor(makeOptions({ stuckFactor: 2, onKill })).check()
|
|
392
|
+
expect(onKill).toHaveBeenCalledOnce()
|
|
393
|
+
expect(onKill).toHaveBeenCalledWith('worker-1', 'task-1')
|
|
394
|
+
})
|
|
395
|
+
|
|
396
|
+
it('is called with workerId and taskId on zombie detection', () => {
|
|
397
|
+
setupRunning({}, { pid: 999_999_999, status: 'running' })
|
|
398
|
+
const killSpy = vi.spyOn(process, 'kill').mockImplementation((): true => {
|
|
399
|
+
throw new Error('ESRCH')
|
|
400
|
+
})
|
|
401
|
+
try {
|
|
402
|
+
const onKill = vi.fn()
|
|
403
|
+
createHealthMonitor(makeOptions({ onKill })).check()
|
|
404
|
+
expect(onKill).toHaveBeenCalledOnce()
|
|
405
|
+
expect(onKill).toHaveBeenCalledWith('worker-1', 'task-1')
|
|
406
|
+
} finally {
|
|
407
|
+
killSpy.mockRestore()
|
|
408
|
+
}
|
|
409
|
+
})
|
|
410
|
+
|
|
411
|
+
it('does not throw when onKill is not provided', () => {
|
|
412
|
+
setupRunning({ timeout_ms: 60_000 })
|
|
413
|
+
store.updateWorkerStatus('worker-1', 'running', { last_heartbeat: msBefore(200_000) })
|
|
414
|
+
expect(() => createHealthMonitor(makeOptions({ stuckFactor: 2 })).check()).not.toThrow()
|
|
415
|
+
})
|
|
416
|
+
})
|
|
417
|
+
|
|
418
|
+
// ── event emission ────────────────────────────────────────────────────────────
|
|
419
|
+
|
|
420
|
+
describe('event emission', () => {
|
|
421
|
+
it('emits worker_killed with reason stuck on stuck detection', () => {
|
|
422
|
+
setupRunning({ timeout_ms: 60_000 })
|
|
423
|
+
store.updateWorkerStatus('worker-1', 'running', { last_heartbeat: msBefore(200_000) })
|
|
424
|
+
createHealthMonitor(makeOptions({ stuckFactor: 2 })).check()
|
|
425
|
+
expect(emittedEvents).toHaveLength(1)
|
|
426
|
+
expect(emittedEvents[0].type).toBe('worker_killed')
|
|
427
|
+
expect(emittedEvents[0].data?.reason).toBe('stuck')
|
|
428
|
+
expect(emittedEvents[0].data?.worker_id).toBe('worker-1')
|
|
429
|
+
expect(emittedEvents[0].data?.task_id).toBe('task-1')
|
|
430
|
+
})
|
|
431
|
+
|
|
432
|
+
it('emits worker_killed with reason zombie on zombie detection', () => {
|
|
433
|
+
setupRunning({}, { pid: 999_999_999, status: 'running' })
|
|
434
|
+
const killSpy = vi.spyOn(process, 'kill').mockImplementation((): true => {
|
|
435
|
+
throw new Error('ESRCH')
|
|
436
|
+
})
|
|
437
|
+
try {
|
|
438
|
+
createHealthMonitor(makeOptions()).check()
|
|
439
|
+
expect(emittedEvents).toHaveLength(1)
|
|
440
|
+
expect(emittedEvents[0].type).toBe('worker_killed')
|
|
441
|
+
expect(emittedEvents[0].data?.reason).toBe('zombie')
|
|
442
|
+
} finally {
|
|
443
|
+
killSpy.mockRestore()
|
|
444
|
+
}
|
|
445
|
+
})
|
|
446
|
+
|
|
447
|
+
it('includes convoy_id, task_id, worker_id in emitted event ids', () => {
|
|
448
|
+
setupRunning({ timeout_ms: 60_000 })
|
|
449
|
+
store.updateWorkerStatus('worker-1', 'running', { last_heartbeat: msBefore(200_000) })
|
|
450
|
+
createHealthMonitor(makeOptions({ stuckFactor: 2 })).check()
|
|
451
|
+
const { ids } = emittedEvents[0]
|
|
452
|
+
expect(ids?.convoy_id).toBe(CONVOY_ID)
|
|
453
|
+
expect(ids?.task_id).toBe('task-1')
|
|
454
|
+
expect(ids?.worker_id).toBe('worker-1')
|
|
455
|
+
})
|
|
456
|
+
})
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
import type { ConvoyStore } from './store.js'
|
|
2
|
+
import type { ConvoyEventEmitter } from './events.js'
|
|
3
|
+
|
|
4
|
+
/** Configuration for a convoy health monitor. */
export interface HealthMonitorOptions {
  /** Store the monitor reads tasks/workers from and writes status changes to. */
  store: ConvoyStore
  /** Emitter that receives a `worker_killed` event when a worker is reaped. */
  events: ConvoyEventEmitter
  /** ID of the convoy whose tasks are health-checked. */
  convoyId: string
  /** Interval between health checks in ms (default: 30000) */
  intervalMs?: number
  /** Factor of task timeout before declaring stuck (default: 2) */
  stuckFactor?: number
  /** Optional kill callback for killing a stuck worker's process */
  onKill?: (workerId: string, taskId: string) => void
}

/** Handle returned by `createHealthMonitor`. */
export interface HealthMonitor {
  /** Start periodic health checks. Returns immediately. */
  start(): void
  /** Stop periodic health checks and clean up. */
  stop(): void
  /** Run a single health check cycle (useful for testing). */
  check(): void
}
|
|
24
|
+
|
|
25
|
+
export function createHealthMonitor(options: HealthMonitorOptions): HealthMonitor {
|
|
26
|
+
const {
|
|
27
|
+
store,
|
|
28
|
+
events,
|
|
29
|
+
convoyId,
|
|
30
|
+
intervalMs = 30_000,
|
|
31
|
+
stuckFactor = 2,
|
|
32
|
+
onKill,
|
|
33
|
+
} = options
|
|
34
|
+
|
|
35
|
+
let timer: ReturnType<typeof setInterval> | null = null
|
|
36
|
+
|
|
37
|
+
function check(): void {
|
|
38
|
+
const activeTasks = store
|
|
39
|
+
.getTasksByConvoy(convoyId)
|
|
40
|
+
.filter(t => t.status === 'running' || t.status === 'assigned')
|
|
41
|
+
|
|
42
|
+
for (const task of activeTasks) {
|
|
43
|
+
if (!task.worker_id) continue
|
|
44
|
+
|
|
45
|
+
const worker = store.getWorker(task.worker_id)
|
|
46
|
+
if (!worker) continue
|
|
47
|
+
|
|
48
|
+
let reason: 'stuck' | 'zombie' | null = null
|
|
49
|
+
|
|
50
|
+
if (worker.last_heartbeat !== null) {
|
|
51
|
+
const elapsed = Date.now() - new Date(worker.last_heartbeat).getTime()
|
|
52
|
+
if (elapsed > task.timeout_ms * stuckFactor) {
|
|
53
|
+
reason = 'stuck'
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
if (reason === null && worker.pid !== null) {
|
|
58
|
+
let processGone = false
|
|
59
|
+
try {
|
|
60
|
+
process.kill(worker.pid, 0)
|
|
61
|
+
} catch {
|
|
62
|
+
processGone = true
|
|
63
|
+
}
|
|
64
|
+
if (processGone && worker.status === 'running') {
|
|
65
|
+
reason = 'zombie'
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
if (reason !== null) {
|
|
70
|
+
const workerId = worker.id
|
|
71
|
+
const taskId = task.id
|
|
72
|
+
|
|
73
|
+
onKill?.(workerId, taskId)
|
|
74
|
+
|
|
75
|
+
store.withTransaction(() => {
|
|
76
|
+
store.updateWorkerStatus(workerId, 'killed', {
|
|
77
|
+
finished_at: new Date().toISOString(),
|
|
78
|
+
})
|
|
79
|
+
|
|
80
|
+
if (task.retries < task.max_retries) {
|
|
81
|
+
store.updateTaskStatus(taskId, convoyId, 'pending', {
|
|
82
|
+
retries: task.retries + 1,
|
|
83
|
+
})
|
|
84
|
+
} else {
|
|
85
|
+
store.updateTaskStatus(taskId, convoyId, 'failed')
|
|
86
|
+
}
|
|
87
|
+
})
|
|
88
|
+
|
|
89
|
+
events.emit(
|
|
90
|
+
'worker_killed',
|
|
91
|
+
{ reason, worker_id: workerId, task_id: taskId },
|
|
92
|
+
{ convoy_id: convoyId, task_id: taskId, worker_id: workerId },
|
|
93
|
+
)
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
return {
|
|
99
|
+
start() {
|
|
100
|
+
if (timer !== null) return
|
|
101
|
+
timer = setInterval(check, intervalMs)
|
|
102
|
+
},
|
|
103
|
+
stop() {
|
|
104
|
+
if (timer !== null) {
|
|
105
|
+
clearInterval(timer)
|
|
106
|
+
timer = null
|
|
107
|
+
}
|
|
108
|
+
},
|
|
109
|
+
check,
|
|
110
|
+
}
|
|
111
|
+
}
|
|
@@ -1,25 +1,25 @@
|
|
|
1
1
|
{
|
|
2
|
-
"hash": "
|
|
2
|
+
"hash": "a6abe049",
|
|
3
3
|
"configHash": "30f8ea04",
|
|
4
|
-
"lockfileHash": "
|
|
5
|
-
"browserHash": "
|
|
4
|
+
"lockfileHash": "c72e777b",
|
|
5
|
+
"browserHash": "737e5c4c",
|
|
6
6
|
"optimized": {
|
|
7
7
|
"astro > cssesc": {
|
|
8
8
|
"src": "../../../../../node_modules/cssesc/cssesc.js",
|
|
9
9
|
"file": "astro___cssesc.js",
|
|
10
|
-
"fileHash": "
|
|
10
|
+
"fileHash": "3a9e4516",
|
|
11
11
|
"needsInterop": true
|
|
12
12
|
},
|
|
13
13
|
"astro > aria-query": {
|
|
14
14
|
"src": "../../../../../node_modules/aria-query/lib/index.js",
|
|
15
15
|
"file": "astro___aria-query.js",
|
|
16
|
-
"fileHash": "
|
|
16
|
+
"fileHash": "774ff115",
|
|
17
17
|
"needsInterop": true
|
|
18
18
|
},
|
|
19
19
|
"astro > axobject-query": {
|
|
20
20
|
"src": "../../../../../node_modules/axobject-query/lib/index.js",
|
|
21
21
|
"file": "astro___axobject-query.js",
|
|
22
|
-
"fileHash": "
|
|
22
|
+
"fileHash": "bd1844a6",
|
|
23
23
|
"needsInterop": true
|
|
24
24
|
}
|
|
25
25
|
},
|