ace-swarm 2.0.2 → 2.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -9
- package/assets/instructions/ACE.instructions.md +2 -2
- package/assets/tasks/README.md +1 -1
- package/assets/tasks/role_tasks.md +2 -2
- package/dist/agent-runtime/role-adapters.js +4 -4
- package/dist/agent-runtime/role-adapters.js.map +1 -1
- package/dist/helpers.d.ts.map +1 -1
- package/dist/helpers.js +2 -5
- package/dist/helpers.js.map +1 -1
- package/dist/job-scheduler.d.ts +31 -40
- package/dist/job-scheduler.d.ts.map +1 -1
- package/dist/job-scheduler.js +266 -320
- package/dist/job-scheduler.js.map +1 -1
- package/dist/resources.js +5 -5
- package/dist/resources.js.map +1 -1
- package/dist/server.d.ts.map +1 -1
- package/dist/server.js +12 -0
- package/dist/server.js.map +1 -1
- package/dist/tools-framework.js +2 -2
- package/dist/tools-framework.js.map +1 -1
- package/dist/tools-handoff.d.ts +15 -2
- package/dist/tools-handoff.d.ts.map +1 -1
- package/dist/tools-handoff.js +40 -4
- package/dist/tools-handoff.js.map +1 -1
- package/dist/tools-scheduler.d.ts.map +1 -1
- package/dist/tools-scheduler.js +22 -26
- package/dist/tools-scheduler.js.map +1 -1
- package/dist/tui/index.d.ts +1 -0
- package/dist/tui/index.d.ts.map +1 -1
- package/dist/tui/index.js +18 -3
- package/dist/tui/index.js.map +1 -1
- package/dist/tui/openai-compatible.d.ts +5 -0
- package/dist/tui/openai-compatible.d.ts.map +1 -1
- package/dist/tui/openai-compatible.js +54 -4
- package/dist/tui/openai-compatible.js.map +1 -1
- package/package.json +1 -1
package/dist/job-scheduler.js
CHANGED
|
@@ -38,7 +38,7 @@ async function flushTransitionEvents(buf) {
|
|
|
38
38
|
}
|
|
39
39
|
}
|
|
40
40
|
export const JOB_QUEUE_REL = "agent-state/job-queue.json";
|
|
41
|
-
export const
|
|
41
|
+
export const JOB_LOCK_TABLE_REL = "agent-state/job-locks.json";
|
|
42
42
|
export const SCHEDULER_LEASE_REL = "agent-state/scheduler-lease.json";
|
|
43
43
|
export const SCHEDULER_LOCK_REL = "agent-state/job-scheduler.lock";
|
|
44
44
|
export const PRIORITY_BANDS = ["P0", "P1", "P2", "P3"];
|
|
@@ -47,7 +47,6 @@ export const JOB_STATUS = [
|
|
|
47
47
|
"accepted",
|
|
48
48
|
"blocked",
|
|
49
49
|
"ready",
|
|
50
|
-
"reserved",
|
|
51
50
|
"running",
|
|
52
51
|
"done",
|
|
53
52
|
"failed",
|
|
@@ -55,9 +54,8 @@ export const JOB_STATUS = [
|
|
|
55
54
|
"canceled",
|
|
56
55
|
"unknown_recovery",
|
|
57
56
|
];
|
|
58
|
-
export const
|
|
59
|
-
"
|
|
60
|
-
"running",
|
|
57
|
+
export const JOB_LOCK_STATUS = [
|
|
58
|
+
"active",
|
|
61
59
|
"completed",
|
|
62
60
|
"released",
|
|
63
61
|
"failed",
|
|
@@ -104,14 +102,11 @@ function defaultQueueFile() {
|
|
|
104
102
|
jobs: [],
|
|
105
103
|
};
|
|
106
104
|
}
|
|
107
|
-
function
|
|
105
|
+
function defaultJobLockFile() {
|
|
108
106
|
return {
|
|
109
107
|
version: 1,
|
|
110
108
|
updated_at: nowIso(),
|
|
111
|
-
|
|
112
|
-
horizon_minutes: 30,
|
|
113
|
-
time_slice_seconds: 300,
|
|
114
|
-
reservations: [],
|
|
109
|
+
locks: [],
|
|
115
110
|
};
|
|
116
111
|
}
|
|
117
112
|
function defaultLease(owner = "scheduler") {
|
|
@@ -175,7 +170,6 @@ function parseQueueFile(raw) {
|
|
|
175
170
|
next_attempt_at: typeof job.next_attempt_at === "string" ? job.next_attempt_at : undefined,
|
|
176
171
|
created_at: typeof job.created_at === "string" ? job.created_at : nowIso(),
|
|
177
172
|
accepted_at: typeof job.accepted_at === "string" ? job.accepted_at : undefined,
|
|
178
|
-
reserved_at: typeof job.reserved_at === "string" ? job.reserved_at : undefined,
|
|
179
173
|
started_at: typeof job.started_at === "string" ? job.started_at : undefined,
|
|
180
174
|
completed_at: typeof job.completed_at === "string" ? job.completed_at : undefined,
|
|
181
175
|
evidence_ref: typeof job.evidence_ref === "string" ? job.evidence_ref : undefined,
|
|
@@ -193,57 +187,46 @@ function parseQueueFile(raw) {
|
|
|
193
187
|
return undefined;
|
|
194
188
|
}
|
|
195
189
|
}
|
|
196
|
-
function
|
|
190
|
+
function parseJobLockFile(raw) {
|
|
197
191
|
try {
|
|
198
192
|
const parsed = JSON.parse(raw);
|
|
199
193
|
if (!parsed || typeof parsed !== "object" || Array.isArray(parsed))
|
|
200
194
|
return undefined;
|
|
201
195
|
const candidate = parsed;
|
|
202
|
-
if (candidate.version !== 1 || !Array.isArray(candidate.
|
|
196
|
+
if (candidate.version !== 1 || !Array.isArray(candidate.locks))
|
|
203
197
|
return undefined;
|
|
204
|
-
const
|
|
205
|
-
for (const row of candidate.
|
|
198
|
+
const locks = [];
|
|
199
|
+
for (const row of candidate.locks) {
|
|
206
200
|
if (!row || typeof row !== "object")
|
|
207
201
|
continue;
|
|
208
202
|
const record = row;
|
|
209
|
-
if (typeof record.
|
|
203
|
+
if (typeof record.lock_id !== "string" ||
|
|
210
204
|
typeof record.job_id !== "string" ||
|
|
211
|
-
typeof record.resource !== "string"
|
|
212
|
-
typeof record.time_slice_start !== "string" ||
|
|
213
|
-
typeof record.time_slice_end !== "string") {
|
|
205
|
+
typeof record.resource !== "string") {
|
|
214
206
|
continue;
|
|
215
207
|
}
|
|
216
|
-
const status =
|
|
208
|
+
const status = JOB_LOCK_STATUS.includes(record.status)
|
|
217
209
|
? record.status
|
|
218
|
-
: "
|
|
219
|
-
|
|
220
|
-
|
|
210
|
+
: "active";
|
|
211
|
+
locks.push({
|
|
212
|
+
lock_id: record.lock_id,
|
|
221
213
|
job_id: record.job_id,
|
|
222
214
|
phase: typeof record.phase === "string" ? record.phase : "dispatch",
|
|
223
215
|
resource: record.resource,
|
|
224
|
-
time_slice_start: record.time_slice_start,
|
|
225
|
-
time_slice_end: record.time_slice_end,
|
|
226
216
|
status,
|
|
227
217
|
priority: toPriority(record.priority),
|
|
228
218
|
lease_id: typeof record.lease_id === "string" ? record.lease_id : undefined,
|
|
229
|
-
overrun: typeof record.overrun === "boolean" ? record.overrun : undefined,
|
|
230
219
|
reason_code: typeof record.reason_code === "string" ? record.reason_code : undefined,
|
|
220
|
+
acquired_at: typeof record.acquired_at === "string" ? record.acquired_at : undefined,
|
|
221
|
+
heartbeat_at: typeof record.heartbeat_at === "string" ? record.heartbeat_at : undefined,
|
|
222
|
+
released_at: typeof record.released_at === "string" ? record.released_at : undefined,
|
|
231
223
|
created_at: typeof record.created_at === "string" ? record.created_at : nowIso(),
|
|
232
224
|
});
|
|
233
225
|
}
|
|
234
226
|
return {
|
|
235
227
|
version: 1,
|
|
236
228
|
updated_at: typeof candidate.updated_at === "string" ? candidate.updated_at : nowIso(),
|
|
237
|
-
|
|
238
|
-
? Math.floor(candidate.horizon_steps)
|
|
239
|
-
: 5,
|
|
240
|
-
horizon_minutes: typeof candidate.horizon_minutes === "number" && candidate.horizon_minutes > 0
|
|
241
|
-
? Math.floor(candidate.horizon_minutes)
|
|
242
|
-
: 30,
|
|
243
|
-
time_slice_seconds: typeof candidate.time_slice_seconds === "number" && candidate.time_slice_seconds > 0
|
|
244
|
-
? Math.floor(candidate.time_slice_seconds)
|
|
245
|
-
: 300,
|
|
246
|
-
reservations,
|
|
229
|
+
locks,
|
|
247
230
|
};
|
|
248
231
|
}
|
|
249
232
|
catch {
|
|
@@ -280,8 +263,8 @@ function parseLease(raw) {
|
|
|
280
263
|
export function getJobQueuePath() {
|
|
281
264
|
return wsPath(JOB_QUEUE_REL);
|
|
282
265
|
}
|
|
283
|
-
export function
|
|
284
|
-
return wsPath(
|
|
266
|
+
export function getJobLockTablePath() {
|
|
267
|
+
return wsPath(JOB_LOCK_TABLE_REL);
|
|
285
268
|
}
|
|
286
269
|
export function getSchedulerLeasePath() {
|
|
287
270
|
return wsPath(SCHEDULER_LEASE_REL);
|
|
@@ -292,11 +275,11 @@ export function readJobQueue() {
|
|
|
292
275
|
return defaultQueueFile();
|
|
293
276
|
return parseQueueFile(raw) ?? defaultQueueFile();
|
|
294
277
|
}
|
|
295
|
-
export function
|
|
296
|
-
const raw = safeRead(
|
|
278
|
+
export function readJobLockTable() {
|
|
279
|
+
const raw = safeRead(JOB_LOCK_TABLE_REL);
|
|
297
280
|
if (isReadError(raw))
|
|
298
|
-
return
|
|
299
|
-
return
|
|
281
|
+
return defaultJobLockFile();
|
|
282
|
+
return parseJobLockFile(raw) ?? defaultJobLockFile();
|
|
300
283
|
}
|
|
301
284
|
export function readSchedulerLease() {
|
|
302
285
|
const raw = safeRead(SCHEDULER_LEASE_REL);
|
|
@@ -308,21 +291,13 @@ function writeQueue(queue) {
|
|
|
308
291
|
queue.updated_at = nowIso();
|
|
309
292
|
return safeWrite(JOB_QUEUE_REL, JSON.stringify(queue, null, 2));
|
|
310
293
|
}
|
|
311
|
-
function
|
|
294
|
+
function writeJobLocks(table) {
|
|
312
295
|
table.updated_at = nowIso();
|
|
313
|
-
return safeWrite(
|
|
296
|
+
return safeWrite(JOB_LOCK_TABLE_REL, JSON.stringify(table, null, 2));
|
|
314
297
|
}
|
|
315
298
|
function writeLease(lease) {
|
|
316
299
|
return safeWrite(SCHEDULER_LEASE_REL, JSON.stringify(lease, null, 2));
|
|
317
300
|
}
|
|
318
|
-
function roundUpToSlice(nowMs, sliceMs) {
|
|
319
|
-
if (sliceMs <= 0)
|
|
320
|
-
return nowMs;
|
|
321
|
-
const remainder = nowMs % sliceMs;
|
|
322
|
-
if (remainder === 0)
|
|
323
|
-
return nowMs;
|
|
324
|
-
return nowMs + (sliceMs - remainder);
|
|
325
|
-
}
|
|
326
301
|
function compareJobs(a, b) {
|
|
327
302
|
const priorityCmp = PRIORITY_WEIGHT[a.priority] - PRIORITY_WEIGHT[b.priority];
|
|
328
303
|
if (priorityCmp !== 0)
|
|
@@ -481,59 +456,15 @@ function computeBackoffMs(job) {
|
|
|
481
456
|
const factor = Math.min(32, 2 ** exponent);
|
|
482
457
|
return base * factor;
|
|
483
458
|
}
|
|
484
|
-
function
|
|
485
|
-
|
|
486
|
-
if (!end)
|
|
487
|
-
return false;
|
|
488
|
-
if (record.status === "running")
|
|
489
|
-
return true;
|
|
490
|
-
if (record.status !== "reserved")
|
|
491
|
-
return false;
|
|
492
|
-
return end.getTime() > now.getTime();
|
|
459
|
+
function isJobLockActive(record) {
|
|
460
|
+
return record.status === "active";
|
|
493
461
|
}
|
|
494
|
-
function
|
|
495
|
-
let released = 0;
|
|
496
|
-
const byJob = new Map(queue.jobs.map((job) => [job.job_id, job]));
|
|
497
|
-
for (const reservation of table.reservations) {
|
|
498
|
-
if (reservation.status !== "running")
|
|
499
|
-
continue;
|
|
500
|
-
const end = parseDate(reservation.time_slice_end);
|
|
501
|
-
if (!end || end.getTime() >= now.getTime())
|
|
502
|
-
continue;
|
|
503
|
-
reservation.status = "released";
|
|
504
|
-
reservation.overrun = true;
|
|
505
|
-
reservation.reason_code = "overrun";
|
|
506
|
-
released += 1;
|
|
507
|
-
const job = byJob.get(reservation.job_id);
|
|
508
|
-
if (!job)
|
|
509
|
-
continue;
|
|
510
|
-
if (job.status === "running") {
|
|
511
|
-
job.status = "blocked";
|
|
512
|
-
job.reason_code = "overrun";
|
|
513
|
-
job.error = `Time-slice overrun at ${reservation.time_slice_end}`;
|
|
514
|
-
const nextAttempt = new Date(now.getTime() + computeBackoffMs(job));
|
|
515
|
-
job.next_attempt_at = nowIso(nextAttempt);
|
|
516
|
-
buf.status.push({
|
|
517
|
-
event_type: "SCHEDULER_OVERRUN",
|
|
518
|
-
status: "blocked",
|
|
519
|
-
summary: `Job ${job.job_id} overran slice ending ${reservation.time_slice_end}`,
|
|
520
|
-
payload: {
|
|
521
|
-
job_id: job.job_id,
|
|
522
|
-
reservation_id: reservation.reservation_id,
|
|
523
|
-
slice_end: reservation.time_slice_end,
|
|
524
|
-
next_attempt_at: job.next_attempt_at,
|
|
525
|
-
},
|
|
526
|
-
});
|
|
527
|
-
}
|
|
528
|
-
}
|
|
529
|
-
return released;
|
|
530
|
-
}
|
|
531
|
-
function buildOccupancy(reservations, now) {
|
|
462
|
+
function buildOccupancy(locks) {
|
|
532
463
|
const occupied = new Set();
|
|
533
|
-
for (const
|
|
534
|
-
if (!
|
|
464
|
+
for (const lock of locks) {
|
|
465
|
+
if (!isJobLockActive(lock))
|
|
535
466
|
continue;
|
|
536
|
-
occupied.add(
|
|
467
|
+
occupied.add(lock.resource);
|
|
537
468
|
}
|
|
538
469
|
return occupied;
|
|
539
470
|
}
|
|
@@ -543,14 +474,11 @@ function deriveOwner(input) {
|
|
|
543
474
|
return trimmed;
|
|
544
475
|
return "capability-ops";
|
|
545
476
|
}
|
|
546
|
-
function defaultLeaseTtlSeconds(
|
|
547
|
-
return
|
|
548
|
-
}
|
|
549
|
-
function defaultRunningLivenessSeconds(timeSliceSeconds) {
|
|
550
|
-
return Math.max(300, timeSliceSeconds * 2);
|
|
477
|
+
function defaultLeaseTtlSeconds() {
|
|
478
|
+
return 1800;
|
|
551
479
|
}
|
|
552
|
-
function
|
|
553
|
-
return
|
|
480
|
+
function hasActiveLockForJob(locks, jobId) {
|
|
481
|
+
return locks.some((lock) => lock.job_id === jobId && isJobLockActive(lock));
|
|
554
482
|
}
|
|
555
483
|
function acquireLease(owner, ttlSeconds, now) {
|
|
556
484
|
const existing = readSchedulerLease();
|
|
@@ -593,47 +521,124 @@ function hasActiveLeaseForOwner(lease, owner, now) {
|
|
|
593
521
|
return false;
|
|
594
522
|
return expires.getTime() > now.getTime();
|
|
595
523
|
}
|
|
596
|
-
function
|
|
597
|
-
|
|
598
|
-
|
|
524
|
+
function ensureLeaseForOwner(owner, now, ttlSeconds = defaultLeaseTtlSeconds()) {
|
|
525
|
+
const leaseResult = acquireLease(owner, ttlSeconds, now);
|
|
526
|
+
if (!leaseResult.acquired) {
|
|
527
|
+
return leaseResult;
|
|
528
|
+
}
|
|
529
|
+
const refreshedLease = {
|
|
530
|
+
...leaseResult.lease,
|
|
531
|
+
owner,
|
|
532
|
+
heartbeat_at: nowIso(now),
|
|
533
|
+
expires_at: nowIso(new Date(now.getTime() + ttlSeconds * 1000)),
|
|
534
|
+
};
|
|
535
|
+
writeLease(refreshedLease);
|
|
536
|
+
return {
|
|
537
|
+
...leaseResult,
|
|
538
|
+
lease: refreshedLease,
|
|
539
|
+
};
|
|
540
|
+
}
|
|
541
|
+
function releaseLocksForRecoveredJob(table, jobId, reasonCode, now) {
|
|
542
|
+
for (const lock of table.locks) {
|
|
543
|
+
if (lock.job_id !== jobId)
|
|
599
544
|
continue;
|
|
600
|
-
if (
|
|
545
|
+
if (lock.status !== "active")
|
|
601
546
|
continue;
|
|
602
|
-
|
|
603
|
-
|
|
547
|
+
lock.status = "released";
|
|
548
|
+
lock.reason_code = reasonCode;
|
|
549
|
+
lock.released_at = nowIso(now);
|
|
550
|
+
lock.heartbeat_at = nowIso(now);
|
|
604
551
|
}
|
|
605
552
|
}
|
|
606
|
-
function
|
|
553
|
+
function heartbeatRunningLocks(table, leaseId, now) {
|
|
554
|
+
for (const lock of table.locks) {
|
|
555
|
+
if (lock.status !== "active")
|
|
556
|
+
continue;
|
|
557
|
+
lock.lease_id = leaseId;
|
|
558
|
+
lock.heartbeat_at = nowIso(now);
|
|
559
|
+
}
|
|
560
|
+
}
|
|
561
|
+
function startJobNow(job, table, lease, now, buf, sourceTool) {
|
|
562
|
+
const acquiredAt = nowIso(now);
|
|
563
|
+
const resources = normalizeResources(job.resource_requirements);
|
|
564
|
+
for (const lock of table.locks) {
|
|
565
|
+
if (lock.job_id !== job.job_id)
|
|
566
|
+
continue;
|
|
567
|
+
if (lock.status === "active") {
|
|
568
|
+
lock.heartbeat_at = acquiredAt;
|
|
569
|
+
lock.lease_id = lease.lease_id;
|
|
570
|
+
}
|
|
571
|
+
}
|
|
572
|
+
const existingResources = new Set(table.locks
|
|
573
|
+
.filter((row) => row.job_id === job.job_id && row.status === "active")
|
|
574
|
+
.map((row) => row.resource));
|
|
575
|
+
for (const resource of resources) {
|
|
576
|
+
if (existingResources.has(resource))
|
|
577
|
+
continue;
|
|
578
|
+
table.locks.push({
|
|
579
|
+
lock_id: `LOCK-${randomUUID().slice(0, 12)}`,
|
|
580
|
+
job_id: job.job_id,
|
|
581
|
+
phase: "dispatch",
|
|
582
|
+
resource,
|
|
583
|
+
status: "active",
|
|
584
|
+
priority: job.priority,
|
|
585
|
+
lease_id: lease.lease_id,
|
|
586
|
+
acquired_at: acquiredAt,
|
|
587
|
+
heartbeat_at: acquiredAt,
|
|
588
|
+
created_at: acquiredAt,
|
|
589
|
+
});
|
|
590
|
+
}
|
|
591
|
+
job.status = "running";
|
|
592
|
+
job.started_at = acquiredAt;
|
|
593
|
+
job.error = undefined;
|
|
594
|
+
job.reason_code = undefined;
|
|
595
|
+
buf.status.push({
|
|
596
|
+
event_type: "SCHEDULER_JOB_STARTED",
|
|
597
|
+
status: "in_progress",
|
|
598
|
+
summary: `Started job ${job.job_id}`,
|
|
599
|
+
payload: {
|
|
600
|
+
job_id: job.job_id,
|
|
601
|
+
acquired_at: acquiredAt,
|
|
602
|
+
resources,
|
|
603
|
+
source_tool: sourceTool,
|
|
604
|
+
},
|
|
605
|
+
});
|
|
606
|
+
buf.ledger.push({
|
|
607
|
+
tool: sourceTool,
|
|
608
|
+
category: "major_update",
|
|
609
|
+
message: `Started scheduler job ${job.job_id}`,
|
|
610
|
+
artifacts: ["agent-state/job-queue.json", "agent-state/job-locks.json"],
|
|
611
|
+
metadata: {
|
|
612
|
+
job_id: job.job_id,
|
|
613
|
+
acquired_at: acquiredAt,
|
|
614
|
+
resources,
|
|
615
|
+
},
|
|
616
|
+
});
|
|
617
|
+
return { acquired_at: acquiredAt, resources };
|
|
618
|
+
}
|
|
619
|
+
function reconcileRecoveryState(queue, table, now, recoveryMode, buf) {
|
|
607
620
|
let recovered = 0;
|
|
608
|
-
const livenessCutoffMs = now.getTime() - defaultRunningLivenessSeconds(timeSliceSeconds) * 1000;
|
|
609
621
|
for (const job of queue.jobs) {
|
|
610
622
|
if (job.status === "unknown_recovery") {
|
|
611
|
-
|
|
623
|
+
releaseLocksForRecoveredJob(table, job.job_id, "recovery_hold", now);
|
|
612
624
|
continue;
|
|
613
625
|
}
|
|
614
626
|
if (job.status !== "running")
|
|
615
627
|
continue;
|
|
616
|
-
const
|
|
617
|
-
const
|
|
618
|
-
|
|
619
|
-
runningReservations.every((reservation) => {
|
|
620
|
-
const end = parseDate(reservation.time_slice_end);
|
|
621
|
-
return !end || end.getTime() < now.getTime();
|
|
622
|
-
});
|
|
623
|
-
const started = parseDate(job.started_at);
|
|
624
|
-
const livenessExceeded = started ? started.getTime() <= livenessCutoffMs : false;
|
|
625
|
-
if (!missingReservationState && !(recoveryMode && (allRunningSlicesExpired || livenessExceeded))) {
|
|
628
|
+
const runningLocks = table.locks.filter((lock) => lock.job_id === job.job_id && lock.status === "active");
|
|
629
|
+
const missingLockState = runningLocks.length === 0;
|
|
630
|
+
if (!missingLockState && !recoveryMode) {
|
|
626
631
|
continue;
|
|
627
632
|
}
|
|
628
633
|
const previousStatus = job.status;
|
|
629
|
-
const reasonCode =
|
|
634
|
+
const reasonCode = missingLockState ? "recovery_missing_lock" : "lease_recovered";
|
|
630
635
|
job.status = "unknown_recovery";
|
|
631
636
|
job.reason_code = reasonCode;
|
|
632
|
-
job.error =
|
|
633
|
-
? "Running job lost
|
|
634
|
-
: `Running job
|
|
637
|
+
job.error = missingLockState
|
|
638
|
+
? "Running job lost its resource lock state and requires manual recovery."
|
|
639
|
+
: `Running job lease was recovered by another owner at ${nowIso(now)}`;
|
|
635
640
|
job.next_attempt_at = undefined;
|
|
636
|
-
|
|
641
|
+
releaseLocksForRecoveredJob(table, job.job_id, "crash_recovery", now);
|
|
637
642
|
recovered += 1;
|
|
638
643
|
buf.status.push({
|
|
639
644
|
event_type: "SCHEDULER_RECOVERY",
|
|
@@ -786,7 +791,6 @@ export async function acknowledgeJob(input) {
|
|
|
786
791
|
job.next_attempt_at = undefined;
|
|
787
792
|
if (wasUnknownRecovery) {
|
|
788
793
|
job.started_at = undefined;
|
|
789
|
-
job.reserved_at = undefined;
|
|
790
794
|
}
|
|
791
795
|
}
|
|
792
796
|
}
|
|
@@ -826,24 +830,15 @@ export async function acknowledgeJob(input) {
|
|
|
826
830
|
await flushTransitionEvents(buf);
|
|
827
831
|
return result;
|
|
828
832
|
}
|
|
829
|
-
export async function
|
|
833
|
+
export async function dispatchJobs(input = {}) {
|
|
830
834
|
const buf = newEventBuffer();
|
|
831
835
|
const result = await withFileLock(SCHEDULER_LOCK_REL, () => {
|
|
832
836
|
const owner = deriveOwner(input.owner);
|
|
833
837
|
const now = parseDate(input.now_iso) ?? new Date();
|
|
834
|
-
const table =
|
|
835
|
-
const horizonSteps = typeof input.horizon_steps === "number" && input.horizon_steps > 0
|
|
836
|
-
? Math.floor(input.horizon_steps)
|
|
837
|
-
: table.horizon_steps;
|
|
838
|
-
const horizonMinutes = typeof input.horizon_minutes === "number" && input.horizon_minutes > 0
|
|
839
|
-
? Math.floor(input.horizon_minutes)
|
|
840
|
-
: table.horizon_minutes;
|
|
841
|
-
const timeSliceSeconds = typeof input.time_slice_seconds === "number" && input.time_slice_seconds > 0
|
|
842
|
-
? Math.floor(input.time_slice_seconds)
|
|
843
|
-
: table.time_slice_seconds;
|
|
838
|
+
const table = readJobLockTable();
|
|
844
839
|
const leaseTtl = typeof input.lease_ttl_seconds === "number" && input.lease_ttl_seconds > 0
|
|
845
840
|
? Math.floor(input.lease_ttl_seconds)
|
|
846
|
-
: defaultLeaseTtlSeconds(
|
|
841
|
+
: defaultLeaseTtlSeconds();
|
|
847
842
|
const leaseResult = acquireLease(owner, leaseTtl, now);
|
|
848
843
|
if (!leaseResult.acquired) {
|
|
849
844
|
const queue = readJobQueue();
|
|
@@ -853,18 +848,16 @@ export async function runSchedulerTick(input = {}) {
|
|
|
853
848
|
lease_acquired: false,
|
|
854
849
|
lease: leaseResult.lease,
|
|
855
850
|
queue_path: getJobQueuePath(),
|
|
856
|
-
|
|
851
|
+
lock_path: getJobLockTablePath(),
|
|
857
852
|
queue,
|
|
858
|
-
|
|
853
|
+
locks: table,
|
|
859
854
|
summary: {
|
|
860
855
|
total_jobs: queue.jobs.length,
|
|
861
856
|
blocked_jobs: queue.jobs.filter((job) => job.status === "blocked").length,
|
|
862
857
|
ready_jobs: queue.jobs.filter((job) => job.status === "ready").length,
|
|
863
|
-
reserved_jobs: queue.jobs.filter((job) => job.status === "reserved").length,
|
|
864
858
|
running_jobs: queue.jobs.filter((job) => job.status === "running").length,
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
released_overruns: 0,
|
|
859
|
+
started_jobs: 0,
|
|
860
|
+
recovered_jobs: 0,
|
|
868
861
|
},
|
|
869
862
|
};
|
|
870
863
|
}
|
|
@@ -874,13 +867,8 @@ export async function runSchedulerTick(input = {}) {
|
|
|
874
867
|
handoffState: readHandoffRegistry(),
|
|
875
868
|
statusEvents: readStatusEvents(500),
|
|
876
869
|
};
|
|
877
|
-
table.
|
|
878
|
-
table.
|
|
879
|
-
table.time_slice_seconds = timeSliceSeconds;
|
|
880
|
-
const recoveredJobs = reconcileRecoveryState(queue, table, now, timeSliceSeconds, leaseResult.recovery_mode, buf);
|
|
881
|
-
const releasedOverruns = leaseResult.recovery_mode
|
|
882
|
-
? 0
|
|
883
|
-
: markRunningOverruns(queue, table, now, buf);
|
|
870
|
+
const recoveredJobs = reconcileRecoveryState(queue, table, now, leaseResult.recovery_mode, buf);
|
|
871
|
+
heartbeatRunningLocks(table, leaseResult.lease.lease_id, now);
|
|
884
872
|
for (const job of queue.jobs) {
|
|
885
873
|
if (TERMINAL_STATUSES.has(job.status))
|
|
886
874
|
continue;
|
|
@@ -896,75 +884,28 @@ export async function runSchedulerTick(input = {}) {
|
|
|
896
884
|
job.reason_code = blocks[0]?.reason_code;
|
|
897
885
|
job.error = blocks[0]?.detail;
|
|
898
886
|
}
|
|
899
|
-
else
|
|
887
|
+
else {
|
|
900
888
|
job.status = "ready";
|
|
901
889
|
job.reason_code = undefined;
|
|
902
890
|
job.error = undefined;
|
|
903
891
|
}
|
|
904
892
|
}
|
|
905
|
-
|
|
906
|
-
for (const reservation of table.reservations) {
|
|
907
|
-
if (reservation.status !== "reserved")
|
|
908
|
-
continue;
|
|
909
|
-
const start = parseDate(reservation.time_slice_start);
|
|
910
|
-
if (!start || start.getTime() <= now.getTime())
|
|
911
|
-
continue;
|
|
912
|
-
reservation.status = "released";
|
|
913
|
-
reservation.reason_code = "replanned";
|
|
914
|
-
replannedReservations += 1;
|
|
915
|
-
}
|
|
916
|
-
for (const job of queue.jobs) {
|
|
917
|
-
if (job.status !== "reserved")
|
|
918
|
-
continue;
|
|
919
|
-
if (!hasActiveReservationForJob(table.reservations, job.job_id, now)) {
|
|
920
|
-
job.status = "ready";
|
|
921
|
-
}
|
|
922
|
-
}
|
|
923
|
-
const occupancy = buildOccupancy(table.reservations, now);
|
|
893
|
+
const occupancy = buildOccupancy(table.locks);
|
|
924
894
|
const readyJobs = queue.jobs
|
|
925
895
|
.filter((job) => job.status === "ready")
|
|
926
896
|
.sort(compareJobs);
|
|
927
|
-
|
|
928
|
-
const sliceMs = timeSliceSeconds * 1000;
|
|
929
|
-
const startMs = roundUpToSlice(now.getTime(), sliceMs);
|
|
930
|
-
const horizonEndMs = Math.min(startMs + horizonMinutes * 60_000, startMs + Math.max(1, horizonSteps) * sliceMs);
|
|
897
|
+
let startedJobs = 0;
|
|
931
898
|
for (const job of readyJobs) {
|
|
932
|
-
if (plannedJobs.size >= horizonSteps)
|
|
933
|
-
break;
|
|
934
899
|
const resources = normalizeResources(job.resource_requirements);
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
const startIso = new Date(cursor).toISOString();
|
|
938
|
-
const free = resources.every((resource) => !occupancy.has(`${resource}::${startIso}`));
|
|
939
|
-
if (free) {
|
|
940
|
-
selectedMs = cursor;
|
|
941
|
-
break;
|
|
942
|
-
}
|
|
943
|
-
}
|
|
944
|
-
if (selectedMs === undefined) {
|
|
945
|
-
job.status = "ready";
|
|
900
|
+
const free = resources.every((resource) => !occupancy.has(resource));
|
|
901
|
+
if (!free) {
|
|
946
902
|
continue;
|
|
947
903
|
}
|
|
948
|
-
const startIso = new Date(selectedMs).toISOString();
|
|
949
|
-
const endIso = new Date(selectedMs + sliceMs).toISOString();
|
|
950
904
|
for (const resource of resources) {
|
|
951
|
-
|
|
952
|
-
reservation_id: `RSV-${randomUUID().slice(0, 12)}`,
|
|
953
|
-
job_id: job.job_id,
|
|
954
|
-
phase: "dispatch",
|
|
955
|
-
resource,
|
|
956
|
-
time_slice_start: startIso,
|
|
957
|
-
time_slice_end: endIso,
|
|
958
|
-
status: "reserved",
|
|
959
|
-
priority: job.priority,
|
|
960
|
-
lease_id: leaseResult.lease.lease_id,
|
|
961
|
-
created_at: nowIso(now),
|
|
962
|
-
});
|
|
963
|
-
occupancy.add(`${resource}::${startIso}`);
|
|
905
|
+
occupancy.add(resource);
|
|
964
906
|
}
|
|
965
|
-
job.
|
|
966
|
-
|
|
967
|
-
plannedJobs.add(job.job_id);
|
|
907
|
+
startJobNow(job, table, leaseResult.lease, now, buf, "dispatch_jobs");
|
|
908
|
+
startedJobs += 1;
|
|
968
909
|
}
|
|
969
910
|
const refreshedLease = {
|
|
970
911
|
...leaseResult.lease,
|
|
@@ -974,21 +915,19 @@ export async function runSchedulerTick(input = {}) {
|
|
|
974
915
|
};
|
|
975
916
|
writeLease(refreshedLease);
|
|
976
917
|
const finalQueuePath = writeQueue(queue);
|
|
977
|
-
const
|
|
918
|
+
const finalLockPath = writeJobLocks(table);
|
|
978
919
|
const summary = {
|
|
979
920
|
total_jobs: queue.jobs.length,
|
|
980
921
|
blocked_jobs: queue.jobs.filter((job) => job.status === "blocked").length,
|
|
981
922
|
ready_jobs: queue.jobs.filter((job) => job.status === "ready").length,
|
|
982
|
-
reserved_jobs: queue.jobs.filter((job) => job.status === "reserved").length,
|
|
983
923
|
running_jobs: queue.jobs.filter((job) => job.status === "running").length,
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
released_overruns: releasedOverruns + recoveredJobs,
|
|
924
|
+
started_jobs: startedJobs,
|
|
925
|
+
recovered_jobs: recoveredJobs,
|
|
987
926
|
};
|
|
988
927
|
buf.status.push({
|
|
989
928
|
event_type: "SCHEDULER_TICK",
|
|
990
929
|
status: "pass",
|
|
991
|
-
summary: `
|
|
930
|
+
summary: `Dispatch started=${summary.started_jobs} ready=${summary.ready_jobs} running=${summary.running_jobs} recovered=${recoveredJobs}`,
|
|
992
931
|
payload: {
|
|
993
932
|
owner,
|
|
994
933
|
lease_acquired: true,
|
|
@@ -998,14 +937,10 @@ export async function runSchedulerTick(input = {}) {
|
|
|
998
937
|
},
|
|
999
938
|
});
|
|
1000
939
|
buf.ledger.push({
|
|
1001
|
-
tool: "
|
|
1002
|
-
category: summary.
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
? "major_update"
|
|
1006
|
-
: "info",
|
|
1007
|
-
message: `Scheduler tick complete (planned=${summary.planned_jobs}, reserved=${summary.reserved_jobs})`,
|
|
1008
|
-
artifacts: ["agent-state/job-queue.json", "agent-state/job-reservations.json"],
|
|
940
|
+
tool: "dispatch_jobs",
|
|
941
|
+
category: summary.started_jobs > 0 || summary.recovered_jobs > 0 ? "major_update" : "info",
|
|
942
|
+
message: `Scheduler dispatch complete (started=${summary.started_jobs}, running=${summary.running_jobs})`,
|
|
943
|
+
artifacts: ["agent-state/job-queue.json", "agent-state/job-locks.json"],
|
|
1009
944
|
metadata: {
|
|
1010
945
|
owner,
|
|
1011
946
|
lease_id: refreshedLease.lease_id,
|
|
@@ -1020,117 +955,129 @@ export async function runSchedulerTick(input = {}) {
|
|
|
1020
955
|
lease_acquired: true,
|
|
1021
956
|
lease: refreshedLease,
|
|
1022
957
|
queue_path: finalQueuePath,
|
|
1023
|
-
|
|
958
|
+
lock_path: finalLockPath,
|
|
1024
959
|
queue,
|
|
1025
|
-
|
|
960
|
+
locks: table,
|
|
1026
961
|
summary,
|
|
1027
962
|
};
|
|
1028
963
|
});
|
|
1029
964
|
await flushTransitionEvents(buf);
|
|
1030
965
|
return result;
|
|
1031
966
|
}
|
|
1032
|
-
export async function
|
|
967
|
+
export async function dispatchJobNow(input) {
|
|
1033
968
|
const buf = newEventBuffer();
|
|
1034
969
|
const result = await withFileLock(SCHEDULER_LOCK_REL, () => {
|
|
1035
970
|
const owner = deriveOwner(input.owner);
|
|
1036
971
|
const now = parseDate(input.now_iso) ?? new Date();
|
|
1037
|
-
const
|
|
1038
|
-
if (!
|
|
972
|
+
const leaseResult = ensureLeaseForOwner(owner, now);
|
|
973
|
+
if (!leaseResult.acquired) {
|
|
1039
974
|
return {
|
|
1040
975
|
ok: false,
|
|
1041
976
|
queue_path: getJobQueuePath(),
|
|
1042
|
-
|
|
977
|
+
lock_path: getJobLockTablePath(),
|
|
1043
978
|
queue: readJobQueue(),
|
|
1044
|
-
|
|
979
|
+
locks: readJobLockTable(),
|
|
1045
980
|
error: `Owner ${owner} does not hold an active scheduler lease.`,
|
|
1046
981
|
};
|
|
1047
982
|
}
|
|
1048
983
|
const queue = readJobQueue();
|
|
1049
|
-
const table =
|
|
984
|
+
const table = readJobLockTable();
|
|
1050
985
|
const job = queue.jobs.find((row) => row.job_id === input.job_id);
|
|
1051
986
|
if (!job) {
|
|
1052
987
|
return {
|
|
1053
988
|
ok: false,
|
|
1054
989
|
queue_path: getJobQueuePath(),
|
|
1055
|
-
|
|
990
|
+
lock_path: getJobLockTablePath(),
|
|
1056
991
|
queue,
|
|
1057
|
-
|
|
992
|
+
locks: table,
|
|
1058
993
|
error: `Unknown job_id: ${input.job_id}`,
|
|
1059
994
|
};
|
|
1060
995
|
}
|
|
1061
|
-
|
|
1062
|
-
.filter((row) => row.job_id === input.job_id && row.status === "reserved")
|
|
1063
|
-
.sort((a, b) => Date.parse(a.time_slice_start) - Date.parse(b.time_slice_start))[0];
|
|
1064
|
-
if (!candidate) {
|
|
996
|
+
if (job.status === "unknown_recovery") {
|
|
1065
997
|
return {
|
|
1066
998
|
ok: false,
|
|
1067
999
|
queue_path: getJobQueuePath(),
|
|
1068
|
-
|
|
1000
|
+
lock_path: getJobLockTablePath(),
|
|
1069
1001
|
queue,
|
|
1070
|
-
|
|
1071
|
-
error: `
|
|
1002
|
+
locks: table,
|
|
1003
|
+
error: `Job ${input.job_id} is parked in unknown_recovery; resume it before dispatch.`,
|
|
1072
1004
|
};
|
|
1073
1005
|
}
|
|
1074
|
-
|
|
1075
|
-
const end = parseDate(candidate.time_slice_end);
|
|
1076
|
-
if (!start || !end || now.getTime() < start.getTime() || now.getTime() > end.getTime()) {
|
|
1006
|
+
if (TERMINAL_STATUSES.has(job.status)) {
|
|
1077
1007
|
return {
|
|
1078
1008
|
ok: false,
|
|
1079
1009
|
queue_path: getJobQueuePath(),
|
|
1080
|
-
|
|
1010
|
+
lock_path: getJobLockTablePath(),
|
|
1081
1011
|
queue,
|
|
1082
|
-
|
|
1083
|
-
error: `
|
|
1012
|
+
locks: table,
|
|
1013
|
+
error: `Job ${input.job_id} is ${job.status} and cannot be started.`,
|
|
1084
1014
|
};
|
|
1085
1015
|
}
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
row.status === "
|
|
1089
|
-
row
|
|
1090
|
-
|
|
1091
|
-
|
|
1016
|
+
if (job.status === "running") {
|
|
1017
|
+
const runningResources = table.locks
|
|
1018
|
+
.filter((row) => row.job_id === input.job_id && row.status === "active")
|
|
1019
|
+
.map((row) => row.resource);
|
|
1020
|
+
const acquiredAt = table.locks.find((row) => row.job_id === input.job_id && row.status === "active")
|
|
1021
|
+
?.acquired_at ??
|
|
1022
|
+
job.started_at ??
|
|
1023
|
+
nowIso(now);
|
|
1024
|
+
heartbeatRunningLocks(table, leaseResult.lease.lease_id, now);
|
|
1025
|
+
const queuePath = writeQueue(queue);
|
|
1026
|
+
const lockPath = writeJobLocks(table);
|
|
1027
|
+
return {
|
|
1028
|
+
ok: true,
|
|
1029
|
+
queue_path: queuePath,
|
|
1030
|
+
lock_path: lockPath,
|
|
1031
|
+
queue,
|
|
1032
|
+
locks: table,
|
|
1033
|
+
started_lock: {
|
|
1034
|
+
acquired_at: acquiredAt,
|
|
1035
|
+
resources: runningResources,
|
|
1036
|
+
},
|
|
1037
|
+
};
|
|
1038
|
+
}
|
|
1039
|
+
const dependencyContext = {
|
|
1040
|
+
todoState: readTodoState(),
|
|
1041
|
+
handoffState: readHandoffRegistry(),
|
|
1042
|
+
statusEvents: readStatusEvents(500),
|
|
1043
|
+
};
|
|
1044
|
+
const blocks = evaluateDependencyBlocks(job, now, dependencyContext);
|
|
1045
|
+
if (blocks.length > 0) {
|
|
1046
|
+
return {
|
|
1047
|
+
ok: false,
|
|
1048
|
+
queue_path: getJobQueuePath(),
|
|
1049
|
+
lock_path: getJobLockTablePath(),
|
|
1050
|
+
queue,
|
|
1051
|
+
locks: table,
|
|
1052
|
+
error: blocks[0]?.detail ?? `Job ${input.job_id} is blocked.`,
|
|
1053
|
+
};
|
|
1054
|
+
}
|
|
1055
|
+
const occupancy = buildOccupancy(table.locks);
|
|
1056
|
+
const resources = normalizeResources(job.resource_requirements);
|
|
1057
|
+
const busy = resources.find((resource) => table.locks.some((row) => row.job_id !== input.job_id && row.resource === resource && row.status === "active"));
|
|
1058
|
+
if (busy) {
|
|
1059
|
+
return {
|
|
1060
|
+
ok: false,
|
|
1061
|
+
queue_path: getJobQueuePath(),
|
|
1062
|
+
lock_path: getJobLockTablePath(),
|
|
1063
|
+
queue,
|
|
1064
|
+
locks: table,
|
|
1065
|
+
error: `Resource ${busy} is already held by another running job.`,
|
|
1066
|
+
};
|
|
1067
|
+
}
|
|
1068
|
+
for (const resource of resources) {
|
|
1069
|
+
occupancy.add(resource);
|
|
1092
1070
|
}
|
|
1093
|
-
job.
|
|
1094
|
-
job.started_at = nowIso(now);
|
|
1095
|
-
job.error = undefined;
|
|
1096
|
-
job.reason_code = undefined;
|
|
1071
|
+
const startedLock = startJobNow(job, table, leaseResult.lease, now, buf, "dispatch_job_now");
|
|
1097
1072
|
const queuePath = writeQueue(queue);
|
|
1098
|
-
const
|
|
1099
|
-
buf.status.push({
|
|
1100
|
-
event_type: "SCHEDULER_JOB_STARTED",
|
|
1101
|
-
status: "in_progress",
|
|
1102
|
-
summary: `Started reserved job ${input.job_id}`,
|
|
1103
|
-
payload: {
|
|
1104
|
-
job_id: input.job_id,
|
|
1105
|
-
started_slice: {
|
|
1106
|
-
start: candidate.time_slice_start,
|
|
1107
|
-
end: candidate.time_slice_end,
|
|
1108
|
-
},
|
|
1109
|
-
},
|
|
1110
|
-
});
|
|
1111
|
-
buf.ledger.push({
|
|
1112
|
-
tool: "start_reserved_job",
|
|
1113
|
-
category: "major_update",
|
|
1114
|
-
message: `Started reserved job ${input.job_id}`,
|
|
1115
|
-
artifacts: ["agent-state/job-queue.json", "agent-state/job-reservations.json"],
|
|
1116
|
-
metadata: {
|
|
1117
|
-
job_id: input.job_id,
|
|
1118
|
-
started_slice: {
|
|
1119
|
-
start: candidate.time_slice_start,
|
|
1120
|
-
end: candidate.time_slice_end,
|
|
1121
|
-
},
|
|
1122
|
-
},
|
|
1123
|
-
});
|
|
1073
|
+
const lockPath = writeJobLocks(table);
|
|
1124
1074
|
return {
|
|
1125
1075
|
ok: true,
|
|
1126
1076
|
queue_path: queuePath,
|
|
1127
|
-
|
|
1077
|
+
lock_path: lockPath,
|
|
1128
1078
|
queue,
|
|
1129
|
-
|
|
1130
|
-
|
|
1131
|
-
start: candidate.time_slice_start,
|
|
1132
|
-
end: candidate.time_slice_end,
|
|
1133
|
-
},
|
|
1079
|
+
locks: table,
|
|
1080
|
+
started_lock: startedLock,
|
|
1134
1081
|
};
|
|
1135
1082
|
});
|
|
1136
1083
|
await flushTransitionEvents(buf);
|
|
@@ -1141,27 +1088,27 @@ export async function completeJob(input) {
|
|
|
1141
1088
|
const result = await withFileLock(SCHEDULER_LOCK_REL, () => {
|
|
1142
1089
|
const owner = deriveOwner(input.owner);
|
|
1143
1090
|
const now = parseDate(input.now_iso) ?? new Date();
|
|
1144
|
-
const
|
|
1145
|
-
if (!
|
|
1091
|
+
const leaseResult = ensureLeaseForOwner(owner, now);
|
|
1092
|
+
if (!leaseResult.acquired) {
|
|
1146
1093
|
return {
|
|
1147
1094
|
ok: false,
|
|
1148
1095
|
queue_path: getJobQueuePath(),
|
|
1149
|
-
|
|
1096
|
+
lock_path: getJobLockTablePath(),
|
|
1150
1097
|
queue: readJobQueue(),
|
|
1151
|
-
|
|
1098
|
+
locks: readJobLockTable(),
|
|
1152
1099
|
error: `Owner ${owner} does not hold an active scheduler lease.`,
|
|
1153
1100
|
};
|
|
1154
1101
|
}
|
|
1155
1102
|
const queue = readJobQueue();
|
|
1156
|
-
const table =
|
|
1103
|
+
const table = readJobLockTable();
|
|
1157
1104
|
const job = queue.jobs.find((row) => row.job_id === input.job_id);
|
|
1158
1105
|
if (!job) {
|
|
1159
1106
|
return {
|
|
1160
1107
|
ok: false,
|
|
1161
1108
|
queue_path: getJobQueuePath(),
|
|
1162
|
-
|
|
1109
|
+
lock_path: getJobLockTablePath(),
|
|
1163
1110
|
queue,
|
|
1164
|
-
|
|
1111
|
+
locks: table,
|
|
1165
1112
|
error: `Unknown job_id: ${input.job_id}`,
|
|
1166
1113
|
};
|
|
1167
1114
|
}
|
|
@@ -1169,34 +1116,32 @@ export async function completeJob(input) {
|
|
|
1169
1116
|
return {
|
|
1170
1117
|
ok: false,
|
|
1171
1118
|
queue_path: getJobQueuePath(),
|
|
1172
|
-
|
|
1119
|
+
lock_path: getJobLockTablePath(),
|
|
1173
1120
|
queue,
|
|
1174
|
-
|
|
1121
|
+
locks: table,
|
|
1175
1122
|
error: `Job ${input.job_id} is ${job.status}; only running jobs can be completed.`,
|
|
1176
1123
|
};
|
|
1177
1124
|
}
|
|
1178
|
-
const
|
|
1179
|
-
if (
|
|
1125
|
+
const runningLocks = table.locks.filter((lock) => lock.job_id === input.job_id && lock.status === "active");
|
|
1126
|
+
if (runningLocks.length === 0) {
|
|
1180
1127
|
return {
|
|
1181
1128
|
ok: false,
|
|
1182
1129
|
queue_path: getJobQueuePath(),
|
|
1183
|
-
|
|
1130
|
+
lock_path: getJobLockTablePath(),
|
|
1184
1131
|
queue,
|
|
1185
|
-
|
|
1186
|
-
error: `Job ${input.job_id} has no
|
|
1132
|
+
locks: table,
|
|
1133
|
+
error: `Job ${input.job_id} has no active resource locks.`,
|
|
1187
1134
|
};
|
|
1188
1135
|
}
|
|
1189
1136
|
const success = input.success !== false;
|
|
1190
|
-
for (const
|
|
1191
|
-
if (
|
|
1137
|
+
for (const lock of table.locks) {
|
|
1138
|
+
if (lock.job_id !== input.job_id)
|
|
1192
1139
|
continue;
|
|
1193
|
-
if (
|
|
1194
|
-
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
|
|
1198
|
-
reservation.status = "released";
|
|
1199
|
-
reservation.reason_code = success ? "job_completed" : "retry_scheduled";
|
|
1140
|
+
if (lock.status === "active") {
|
|
1141
|
+
lock.status = success ? "completed" : "failed";
|
|
1142
|
+
lock.reason_code = success ? undefined : "execution_failed";
|
|
1143
|
+
lock.released_at = nowIso(now);
|
|
1144
|
+
lock.heartbeat_at = nowIso(now);
|
|
1200
1145
|
}
|
|
1201
1146
|
}
|
|
1202
1147
|
if (success) {
|
|
@@ -1222,10 +1167,11 @@ export async function completeJob(input) {
|
|
|
1222
1167
|
job.status = "blocked";
|
|
1223
1168
|
job.reason_code = "retry_scheduled";
|
|
1224
1169
|
job.next_attempt_at = nowIso(retryAt);
|
|
1170
|
+
job.started_at = undefined;
|
|
1225
1171
|
}
|
|
1226
1172
|
}
|
|
1227
1173
|
const queuePath = writeQueue(queue);
|
|
1228
|
-
const
|
|
1174
|
+
const lockPath = writeJobLocks(table);
|
|
1229
1175
|
const finalStatus = job.status;
|
|
1230
1176
|
const isSuccess = finalStatus === "done";
|
|
1231
1177
|
buf.status.push({
|
|
@@ -1248,7 +1194,7 @@ export async function completeJob(input) {
|
|
|
1248
1194
|
message: isSuccess
|
|
1249
1195
|
? `Completed scheduler job ${input.job_id}`
|
|
1250
1196
|
: `Scheduler job ${input.job_id} ended with ${finalStatus}`,
|
|
1251
|
-
artifacts: ["agent-state/job-queue.json", "agent-state/job-
|
|
1197
|
+
artifacts: ["agent-state/job-queue.json", "agent-state/job-locks.json"],
|
|
1252
1198
|
metadata: {
|
|
1253
1199
|
job_id: input.job_id,
|
|
1254
1200
|
final_status: finalStatus,
|
|
@@ -1258,9 +1204,9 @@ export async function completeJob(input) {
|
|
|
1258
1204
|
return {
|
|
1259
1205
|
ok: true,
|
|
1260
1206
|
queue_path: queuePath,
|
|
1261
|
-
|
|
1207
|
+
lock_path: lockPath,
|
|
1262
1208
|
queue,
|
|
1263
|
-
|
|
1209
|
+
locks: table,
|
|
1264
1210
|
status: job.status,
|
|
1265
1211
|
retry_scheduled_for: job.next_attempt_at,
|
|
1266
1212
|
};
|