opencastle 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,521 @@
1
+ import { execFile as execFileCb } from 'node:child_process'
2
+ import { createHash } from 'node:crypto'
3
+ import { mkdirSync } from 'node:fs'
4
+ import { dirname, join, resolve } from 'node:path'
5
+ import { promisify } from 'node:util'
6
+ import type { Task, TaskSpec, AgentAdapter, ExecuteResult } from '../types.js'
7
+ import { createConvoyStore, type ConvoyStore } from './store.js'
8
+ import { createEventEmitter, type ConvoyEventEmitter } from './events.js'
9
+ import { createWorktreeManager, type WorktreeManager } from './worktree.js'
10
+ import { createMergeQueue, type MergeQueue } from './merge.js'
11
+ import { createHealthMonitor } from './health.js'
12
+ import type { TaskRecord, ConvoyStatus } from './types.js'
13
+ import { buildPhases, formatDuration } from '../run/executor.js'
14
+ import { parseTimeout } from '../run/schema.js'
15
+
16
+ const execFile = promisify(execFileCb)
17
+
18
+ // ── Public interfaces ─────────────────────────────────────────────────────────
19
+
20
/**
 * Configuration accepted by `createConvoyEngine`.
 */
export interface ConvoyEngineOptions {
  /** Parsed task specification that the engine executes. */
  spec: TaskSpec

  /** Raw YAML text of the spec; it is SHA-256 hashed and stored with the convoy record. */
  specYaml: string

  /** Adapter that executes tasks (and kills them, when it exposes `kill`). */
  adapter: AgentAdapter

  /** Working directory of the run; resolved to an absolute path, defaults to `process.cwd()`. */
  basePath?: string

  /** Location of the convoy database; defaults to `<basePath>/.opencastle/convoy.db`. */
  dbPath?: string

  /** Forwarded unchanged to the event emitter factory. */
  logsDir?: string

  /** When true, progress lines and warnings are written to stdout/stderr. Defaults to `false`. */
  verbose?: boolean

  /** Replaces the worktree manager that would otherwise be created for `basePath`. */
  _worktreeManager?: WorktreeManager

  /** Replaces the merge queue that would otherwise be created for `basePath`. */
  _mergeQueue?: MergeQueue
}
31
+
32
/**
 * Outcome reported once a convoy run (or resume) has finished.
 */
export interface ConvoyResult {
  /** Identifier of the convoy the result belongs to. */
  convoyId: string

  /** Final status that was also written to the store. */
  status: ConvoyStatus

  /** Task counts grouped by final task status. */
  summary: {
    total: number
    done: number
    failed: number
    skipped: number
    timedOut: number
  }

  /** Wall-clock time of this invocation, already formatted for display. */
  duration: string

  /** One entry per gate command, in execution order; omitted when the spec defines no gates. */
  gateResults?: Array<{
    command: string
    exitCode: number
    passed: boolean
  }>
}
39
+
40
/**
 * Handle returned by `createConvoyEngine`.
 */
export interface ConvoyEngine {
  /** Registers a new convoy for the configured spec and executes it. */
  run(): Promise<ConvoyResult>

  /**
   * Continues a previously stored convoy.
   * Rejects when no convoy with `convoyId` exists in the store.
   */
  resume(convoyId: string): Promise<ConvoyResult>
}
44
+
45
+ // ── Internal helpers ──────────────────────────────────────────────────────────
46
+
47
/**
 * Renders a millisecond duration as a compact timeout string (`<n>h`, `<n>m` or `<n>s`).
 *
 * The largest unit that divides the value evenly is used. When neither hours
 * nor minutes fit, the value is expressed in seconds and rounded UP to a whole
 * second, so the result always carries an integer amount (e.g. 1500 → "2s")
 * instead of a fractional one such as "1.5s".
 *
 * @param ms Duration in milliseconds.
 * @returns The duration with a single-letter unit suffix.
 */
function msToTimeout(ms: number): string {
  const hourMs = 3_600_000
  const minuteMs = 60_000
  const secondMs = 1_000

  if (ms >= hourMs && ms % hourMs === 0) return `${ms / hourMs}h`
  if (ms >= minuteMs && ms % minuteMs === 0) return `${ms / minuteMs}m`
  // Round up rather than down: a slightly longer timeout is safer than a shorter one.
  return `${Math.ceil(ms / secondMs)}s`
}
52
+
53
/**
 * Rebuilds the in-memory `Task` shape from a persisted `TaskRecord`.
 *
 * `depends_on` and `files` are stored as JSON text (or an empty/null value when
 * there are none) and are decoded back into string arrays. The description is
 * not persisted, so it is restored as an empty string.
 */
function taskRecordToTask(record: TaskRecord): Task {
  // Decodes a JSON-encoded string list column; an empty/null column means "no entries".
  const decodeList = (raw: string | null | undefined): string[] => {
    if (!raw) return []
    return JSON.parse(raw) as string[]
  }

  const timeout = msToTimeout(record.timeout_ms)
  const dependsOn = decodeList(record.depends_on)
  const files = decodeList(record.files)

  return {
    id: record.id,
    prompt: record.prompt,
    agent: record.agent,
    timeout,
    depends_on: dependsOn,
    files,
    description: '',
    model: record.model ?? undefined,
    max_retries: record.max_retries,
  }
}
66
+
67
/**
 * Creates a promise that resolves (never rejects) with a timed-out result
 * after `ms` milliseconds, together with a `clear` function that cancels the
 * underlying timer.
 *
 * @param ms Delay before the promise resolves.
 * @returns `promise` resolving to a failure result flagged `_timedOut`, and `clear` to cancel it.
 */
function makeTimeoutPromise(ms: number): { promise: Promise<ExecuteResult>; clear: () => void } {
  let handle: ReturnType<typeof setTimeout> | undefined

  const promise = new Promise<ExecuteResult>((resolveTimedOut) => {
    handle = setTimeout(() => {
      resolveTimedOut({ _timedOut: true, success: false, output: 'Task timed out', exitCode: -1 })
    }, ms)
  })

  function clear(): void {
    if (handle === undefined) return
    clearTimeout(handle)
  }

  return { promise, clear }
}
77
+
78
+ // ── Core convoy execution ─────────────────────────────────────────────────────
79
+
80
/**
 * Executes the tasks of a convoy that is already persisted in `store`, then
 * runs the spec's validation gates and records the final convoy status.
 *
 * Flow:
 *  1. Start the health monitor (always stopped again, even when execution throws).
 *  2. Repeatedly fetch the ready tasks and execute them in batches of
 *     `spec.concurrency` (at least 1) until no task is ready any more.
 *  3. Run every gate command through `sh -c` in `basePath`.
 *  4. Derive the final status: `gate-failed` if any gate failed, otherwise
 *     `failed` if any task failed or timed out, otherwise `done`.
 *
 * @param convoyId   Identifier of the convoy whose tasks are executed.
 * @param spec       Task specification (concurrency, on_failure policy, gates).
 * @param adapter    Adapter that executes tasks and, if it exposes `kill`, terminates them.
 * @param store      Persistence for convoy, task and worker records.
 * @param events     Emitter for task lifecycle events.
 * @param wtManager  Creates/removes the per-worker worktrees.
 * @param mergeQueue Merges a finished worktree back into `baseBranch`.
 * @param basePath   Working directory for gates and for tasks that have no worktree.
 * @param baseBranch Branch that worktrees are created from and merged into.
 * @param verbose    When true, progress and warnings are written to stdout/stderr.
 * @param startTime  Epoch milliseconds used to compute the reported duration.
 * @returns Summary of the run, including gate results when the spec defines gates.
 */
async function runConvoy(
  convoyId: string,
  spec: TaskSpec,
  adapter: AgentAdapter,
  store: ConvoyStore,
  events: ConvoyEventEmitter,
  wtManager: WorktreeManager,
  mergeQueue: MergeQueue,
  basePath: string,
  baseBranch: string,
  verbose: boolean,
  startTime: number,
): Promise<ConvoyResult> {
  // Node's default maxBuffer makes execFile abort (and therefore "fail") any
  // gate whose stdout or stderr exceeds it; allow much chattier gate commands.
  const gateMaxBuffer = 64 * 1024 * 1024

  // Safe message extraction: a thrown value is not guaranteed to be an Error.
  const errorMessage = (err: unknown): string =>
    err instanceof Error ? err.message : String(err)

  // Decodes the JSON-encoded dependency list of a stored task.
  const parseDeps = (record: { depends_on?: string | null }): string[] =>
    record.depends_on ? (JSON.parse(record.depends_on) as string[]) : []

  // Tasks currently executing, keyed by task id, so the health monitor's kill
  // callback can hand the matching Task to the adapter.
  const activeTaskMap = new Map<string, Task>()

  const healthMonitor = createHealthMonitor({
    store,
    events,
    convoyId,
    onKill: (workerId, taskId) => {
      const task = activeTaskMap.get(taskId)
      if (task && typeof adapter.kill === 'function') {
        adapter.kill(task)
      }
      activeTaskMap.delete(taskId)
    },
  })
  healthMonitor.start()

  // ── Task skipping ─────────────────────────────────────────────────────────

  /**
   * Marks a pending task as skipped and recursively skips every task that
   * depends on it. `visited` guards against revisiting a task in one cascade.
   */
  function skipTask(taskId: string, reason: string, visited: Set<string> = new Set()): void {
    if (visited.has(taskId)) return
    visited.add(taskId)
    const allTasks = store.getTasksByConvoy(convoyId)
    const task = allTasks.find(t => t.id === taskId)
    // Only tasks that have not started yet can be skipped.
    if (!task || task.status !== 'pending') return
    store.updateTaskStatus(taskId, convoyId, 'skipped', { output: reason })
    if (verbose) process.stdout.write(`\u2298 ${taskId}\n`)
    events.emit('task_skipped', { reason }, { convoy_id: convoyId, task_id: taskId })
    for (const t of allTasks) {
      if (parseDeps(t).includes(taskId)) {
        skipTask(t.id, `dependency "${taskId}" was skipped/failed`, visited)
      }
    }
  }

  /**
   * Applies the failure policy after a task ended unsuccessfully:
   * `on_failure: stop` skips every pending task, otherwise only the
   * (transitive) dependents of the failed task are skipped.
   */
  function cascadeFailure(failedTaskId: string): void {
    if (spec.on_failure === 'stop') {
      const allPending = store.getTasksByConvoy(convoyId).filter(t => t.status === 'pending')
      for (const t of allPending) {
        skipTask(t.id, 'execution halted due to on_failure: stop')
      }
      return
    }
    for (const t of store.getTasksByConvoy(convoyId)) {
      if (parseDeps(t).includes(failedTaskId)) {
        skipTask(t.id, `dependency "${failedTaskId}" failed`)
      }
    }
  }

  // ── Single-task executor ──────────────────────────────────────────────────

  /**
   * Runs one task end to end: worktree + worker bookkeeping, execution raced
   * against the task timeout, then success / retry / terminal-failure handling.
   */
  async function executeOneTask(taskRecord: TaskRecord): Promise<void> {
    const workerId = `worker-${taskRecord.id}-${Date.now()}`
    const now = (): string => new Date().toISOString()

    // Create worktree (skip for copilot adapter). A failure here is not fatal:
    // the task then runs in basePath instead.
    let worktreePath: string | null = null
    if (adapter.name !== 'copilot') {
      try {
        worktreePath = await wtManager.create(workerId, baseBranch)
      } catch (err) {
        if (verbose) {
          process.stderr.write(
            `Warning: failed to create worktree for ${taskRecord.id}: ${errorMessage(err)}\n`,
          )
        }
      }
    }

    store.insertWorker({
      id: workerId,
      task_id: taskRecord.id,
      adapter: adapter.name,
      pid: null,
      session_id: null,
      status: 'spawned',
      worktree: worktreePath,
      created_at: now(),
    })

    // Mark assigned then running
    store.updateTaskStatus(taskRecord.id, convoyId, 'assigned', {
      worker_id: workerId,
      worktree: worktreePath,
    })
    store.updateTaskStatus(taskRecord.id, convoyId, 'running', { started_at: now() })
    store.updateWorkerStatus(workerId, 'running')

    const task = taskRecordToTask(taskRecord)
    activeTaskMap.set(taskRecord.id, task)

    if (verbose) process.stdout.write(`\u25b6 ${taskRecord.id}\n`)
    events.emit(
      'task_started',
      { worker_id: workerId },
      { convoy_id: convoyId, task_id: taskRecord.id, worker_id: workerId },
    )

    const taskStartTime = Date.now()
    const timeout = makeTimeoutPromise(taskRecord.timeout_ms)
    let result: ExecuteResult
    try {
      // Whichever settles first wins; the timeout promise only ever resolves.
      result = await Promise.race([
        adapter.execute(task, { verbose, cwd: worktreePath ?? basePath }),
        timeout.promise,
      ])
    } catch (err) {
      result = { success: false, output: errorMessage(err), exitCode: -1 }
    } finally {
      timeout.clear()
    }

    activeTaskMap.delete(taskRecord.id)
    const finishedAt = now()
    const elapsed = `(${formatDuration(Date.now() - taskStartTime)})`

    // Best-effort cleanup of this worker's worktree.
    async function removeWorktree(): Promise<void> {
      if (!worktreePath) return
      try {
        await wtManager.remove(worktreePath)
      } catch {
        /* ignore cleanup errors */
      }
    }

    // Latest stored state of the task; falls back to the snapshot we were
    // given if the record cannot be read back.
    const readFreshRecord = (): TaskRecord =>
      store.getTask(taskRecord.id, convoyId) ?? taskRecord

    // Retries are only allowed while attempts remain and the policy is not "stop".
    const canRetry = (record: TaskRecord): boolean =>
      record.retries < record.max_retries && spec.on_failure !== 'stop'

    // Puts the task back into the pending pool with an incremented retry count.
    function requeueForRetry(nextRetryCount: number): void {
      store.updateTaskStatus(taskRecord.id, convoyId, 'pending', {
        retries: nextRetryCount,
        worker_id: null,
        worktree: null,
        started_at: null,
        finished_at: null,
      })
    }

    // ── Timed out ───────────────────────────────────────────────────────────
    if (result._timedOut) {
      if (typeof adapter.kill === 'function') adapter.kill(task)
      await removeWorktree()

      const freshRecord = readFreshRecord()
      if (canRetry(freshRecord)) {
        requeueForRetry(freshRecord.retries + 1)
        store.updateWorkerStatus(workerId, 'killed', { finished_at: finishedAt })
        if (verbose) {
          process.stdout.write(
            `\u23f1 ${taskRecord.id} retry ${freshRecord.retries + 1}/${freshRecord.max_retries}\n`,
          )
        }
      } else {
        store.withTransaction(() => {
          store.updateTaskStatus(taskRecord.id, convoyId, 'timed-out', {
            finished_at: finishedAt,
            output: result.output,
          })
          store.updateWorkerStatus(workerId, 'failed', { finished_at: finishedAt })
        })
        if (verbose) process.stdout.write(`\u23f1 ${taskRecord.id}\n`)
        events.emit(
          'task_failed',
          { reason: 'timeout', worker_id: workerId },
          { convoy_id: convoyId, task_id: taskRecord.id, worker_id: workerId },
        )
        cascadeFailure(taskRecord.id)
      }
      return
    }

    // ── Success ─────────────────────────────────────────────────────────────
    if (result.success) {
      if (worktreePath) {
        try {
          await mergeQueue.merge(worktreePath, `convoy-${workerId}`, baseBranch)
        } catch (err) {
          // NOTE(review): a failed merge only produces a warning and the task
          // is still recorded as done below — confirm this is intended.
          if (verbose) {
            process.stderr.write(
              `Warning: merge failed for ${taskRecord.id}: ${errorMessage(err)}\n`,
            )
          }
        }
        await removeWorktree()
      }

      store.withTransaction(() => {
        store.updateTaskStatus(taskRecord.id, convoyId, 'done', {
          finished_at: finishedAt,
          output: result.output,
          exit_code: result.exitCode,
        })
        store.updateWorkerStatus(workerId, 'done', { finished_at: finishedAt })
      })
      if (verbose) process.stdout.write(`\u2713 ${taskRecord.id} ${elapsed}\n`)
      events.emit(
        'task_done',
        { exit_code: result.exitCode, worker_id: workerId },
        { convoy_id: convoyId, task_id: taskRecord.id, worker_id: workerId },
      )
      return
    }

    // ── Failure ─────────────────────────────────────────────────────────────
    if (typeof adapter.kill === 'function') adapter.kill(task)
    await removeWorktree()

    const freshRecord = readFreshRecord()
    if (canRetry(freshRecord)) {
      requeueForRetry(freshRecord.retries + 1)
      store.updateWorkerStatus(workerId, 'failed', { finished_at: finishedAt })
      if (verbose) {
        process.stdout.write(
          `\u2717 ${taskRecord.id} retry ${freshRecord.retries + 1}/${freshRecord.max_retries}\n`,
        )
      }
    } else {
      store.withTransaction(() => {
        store.updateTaskStatus(taskRecord.id, convoyId, 'failed', {
          finished_at: finishedAt,
          output: result.output,
          exit_code: result.exitCode,
        })
        store.updateWorkerStatus(workerId, 'failed', { finished_at: finishedAt })
      })
      if (verbose) process.stdout.write(`\u2717 ${taskRecord.id}\n`)
      events.emit(
        'task_failed',
        { reason: 'error', exit_code: result.exitCode, worker_id: workerId },
        { convoy_id: convoyId, task_id: taskRecord.id, worker_id: workerId },
      )
      cascadeFailure(taskRecord.id)
    }
  }

  // ── Main execution loop ───────────────────────────────────────────────────

  try {
    // A batch size below 1 (or NaN) would keep the batch loop from advancing.
    const requestedConcurrency = spec.concurrency ?? 1
    const concurrency = requestedConcurrency >= 1 ? Math.floor(requestedConcurrency) : 1

    let ready = store.getReadyTasks(convoyId)
    while (ready.length > 0) {
      for (let i = 0; i < ready.length; i += concurrency) {
        // `ready` was captured before earlier batches ran. A failure in one of
        // them may have skipped tasks that are still in this list (e.g. with
        // on_failure: stop), so drop anything that has been skipped meanwhile.
        const batch = ready
          .slice(i, i + concurrency)
          .filter(t => store.getTask(t.id, convoyId)?.status !== 'skipped')
        if (batch.length === 0) continue
        await Promise.all(batch.map(t => executeOneTask(t)))
      }
      ready = store.getReadyTasks(convoyId)
    }
  } finally {
    healthMonitor.stop()
  }

  // ── Validation gates ──────────────────────────────────────────────────────

  const gateResults: Array<{ command: string; exitCode: number; passed: boolean }> = []
  for (const command of spec.gates ?? []) {
    try {
      await execFile('sh', ['-c', command], { cwd: basePath, maxBuffer: gateMaxBuffer })
      gateResults.push({ command, exitCode: 0, passed: true })
    } catch (err) {
      // A numeric `code` is the command's exit status; anything else (spawn
      // error codes are strings, signals leave it null) is reported as 1.
      const rawCode = (err as { code?: unknown } | null | undefined)?.code
      const exitCode = typeof rawCode === 'number' ? rawCode : 1
      gateResults.push({ command, exitCode, passed: false })
    }
  }

  // ── Final status & summary ────────────────────────────────────────────────

  const allTasksFinal = store.getTasksByConvoy(convoyId)
  const summary = {
    total: allTasksFinal.length,
    done: allTasksFinal.filter(t => t.status === 'done').length,
    failed: allTasksFinal.filter(t => t.status === 'failed').length,
    skipped: allTasksFinal.filter(t => t.status === 'skipped').length,
    timedOut: allTasksFinal.filter(t => t.status === 'timed-out').length,
  }

  const anyGateFailed = gateResults.some(g => !g.passed)
  const finalStatus: ConvoyStatus = anyGateFailed
    ? 'gate-failed'
    : summary.failed > 0 || summary.timedOut > 0
      ? 'failed'
      : 'done'

  store.updateConvoyStatus(convoyId, finalStatus, { finished_at: new Date().toISOString() })

  return {
    convoyId,
    status: finalStatus,
    summary,
    duration: formatDuration(Date.now() - startTime),
    gateResults: spec.gates && spec.gates.length > 0 ? gateResults : undefined,
  }
}
388
+
389
+ // ── Factory ───────────────────────────────────────────────────────────────────
390
+
391
/**
 * Builds a convoy engine bound to the given options.
 *
 * The store is opened per call to `run` / `resume` and is always closed again
 * before the returned promise settles — including when setting up the event
 * emitter, worktree manager or merge queue throws.
 *
 * @param options Spec, adapter and optional path / collaborator overrides.
 * @returns An engine exposing `run` (new convoy) and `resume` (existing convoy).
 */
export function createConvoyEngine(options: ConvoyEngineOptions): ConvoyEngine {
  const { spec, specYaml, adapter, verbose = false } = options
  const basePath = resolve(options.basePath ?? process.cwd())
  const dbPath = options.dbPath ?? join(basePath, '.opencastle', 'convoy.db')

  /** Name of the branch checked out in `basePath`; falls back to 'main' when git fails. */
  async function getCurrentBranch(): Promise<string> {
    try {
      const { stdout } = await execFile('git', ['rev-parse', '--abbrev-ref', 'HEAD'], {
        cwd: basePath,
      })
      return stdout.trim()
    } catch {
      return 'main'
    }
  }

  /** Persists a new convoy with all of its tasks, then executes it. */
  async function run(): Promise<ConvoyResult> {
    const startTime = Date.now()
    const convoyId = `convoy-${startTime}`
    const specHash = createHash('sha256').update(specYaml).digest('hex')
    const baseBranch = spec.branch ?? (await getCurrentBranch())

    mkdirSync(dirname(dbPath), { recursive: true })
    const store = createConvoyStore(dbPath)

    try {
      // Collaborators are created inside the try so the store is closed even
      // if one of these factories throws.
      const events = createEventEmitter(store, options.logsDir)
      const wtManager = options._worktreeManager ?? createWorktreeManager(basePath)
      const mergeQueue = options._mergeQueue ?? createMergeQueue(basePath)

      store.insertConvoy({
        id: convoyId,
        name: spec.name,
        spec_hash: specHash,
        status: 'pending',
        branch: baseBranch,
        created_at: new Date().toISOString(),
        spec_yaml: specYaml,
      })

      // Each task is stored with the index of the phase it was grouped into.
      const tasks = spec.tasks ?? []
      const phases = buildPhases(tasks)
      for (let phaseIdx = 0; phaseIdx < phases.length; phaseIdx++) {
        for (const task of phases[phaseIdx]) {
          store.insertTask({
            id: task.id,
            convoy_id: convoyId,
            phase: phaseIdx,
            prompt: task.prompt,
            agent: task.agent,
            model: task.model ?? null,
            timeout_ms: parseTimeout(task.timeout),
            status: 'pending',
            retries: 0,
            max_retries: task.max_retries,
            // Empty lists are stored as null rather than "[]".
            files: task.files.length > 0 ? JSON.stringify(task.files) : null,
            depends_on: task.depends_on.length > 0 ? JSON.stringify(task.depends_on) : null,
          })
        }
      }

      store.updateConvoyStatus(convoyId, 'running', { started_at: new Date().toISOString() })
      events.emit('convoy_started', { name: spec.name }, { convoy_id: convoyId })

      return await runConvoy(
        convoyId, spec, adapter, store, events,
        wtManager, mergeQueue, basePath, baseBranch, verbose, startTime,
      )
    } finally {
      store.close()
    }
  }

  /**
   * Continues a stored convoy: interrupted tasks are reset to pending, their
   * workers are marked killed, leftover worktrees are removed, and execution
   * starts again.
   *
   * @throws Error when `convoyId` is not present in the store.
   */
  async function resume(convoyId: string): Promise<ConvoyResult> {
    const startTime = Date.now()

    mkdirSync(dirname(dbPath), { recursive: true })
    const store = createConvoyStore(dbPath)

    try {
      // Created inside the try so the store is closed even if a factory throws.
      const events = createEventEmitter(store, options.logsDir)
      const wtManager = options._worktreeManager ?? createWorktreeManager(basePath)
      const mergeQueue = options._mergeQueue ?? createMergeQueue(basePath)

      const convoy = store.getConvoy(convoyId)
      if (!convoy) {
        throw new Error(`Convoy "${convoyId}" not found in store`)
      }

      // Prefer the branch recorded with the convoy over the current spec/checkout.
      const baseBranch = convoy.branch ?? spec.branch ?? (await getCurrentBranch())

      // Reset interrupted tasks and mark their workers as killed
      const allTasks = store.getTasksByConvoy(convoyId)
      for (const task of allTasks) {
        const wasInterrupted = task.status === 'running' || task.status === 'assigned'
        if (!wasInterrupted) continue

        if (task.worker_id) {
          try {
            store.updateWorkerStatus(task.worker_id, 'killed', {
              finished_at: new Date().toISOString(),
            })
          } catch {
            // worker record may already be absent
          }
        }
        store.updateTaskStatus(task.id, convoyId, 'pending', {
          worker_id: null,
          worktree: null,
          started_at: null,
          finished_at: null,
        })
      }

      // Remove all orphaned worktrees from the crashed run
      await wtManager.removeAll()

      events.emit(
        'convoy_resumed',
        { original_created_at: convoy.created_at },
        { convoy_id: convoyId },
      )

      return await runConvoy(
        convoyId, spec, adapter, store, events,
        wtManager, mergeQueue, basePath, baseBranch, verbose, startTime,
      )
    } finally {
      store.close()
    }
  }

  return { run, resume }
}