ace-swarm 2.0.3 → 2.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -38,7 +38,7 @@ async function flushTransitionEvents(buf) {
38
38
  }
39
39
  }
40
40
  export const JOB_QUEUE_REL = "agent-state/job-queue.json";
41
- export const RESERVATION_TABLE_REL = "agent-state/job-reservations.json";
41
+ export const JOB_LOCK_TABLE_REL = "agent-state/job-locks.json";
42
42
  export const SCHEDULER_LEASE_REL = "agent-state/scheduler-lease.json";
43
43
  export const SCHEDULER_LOCK_REL = "agent-state/job-scheduler.lock";
44
44
  export const PRIORITY_BANDS = ["P0", "P1", "P2", "P3"];
@@ -47,7 +47,6 @@ export const JOB_STATUS = [
47
47
  "accepted",
48
48
  "blocked",
49
49
  "ready",
50
- "reserved",
51
50
  "running",
52
51
  "done",
53
52
  "failed",
@@ -55,9 +54,8 @@ export const JOB_STATUS = [
55
54
  "canceled",
56
55
  "unknown_recovery",
57
56
  ];
58
- export const RESERVATION_STATUS = [
59
- "reserved",
60
- "running",
57
+ export const JOB_LOCK_STATUS = [
58
+ "active",
61
59
  "completed",
62
60
  "released",
63
61
  "failed",
@@ -104,14 +102,11 @@ function defaultQueueFile() {
104
102
  jobs: [],
105
103
  };
106
104
  }
107
- function defaultReservationFile() {
105
+ function defaultJobLockFile() {
108
106
  return {
109
107
  version: 1,
110
108
  updated_at: nowIso(),
111
- horizon_steps: 5,
112
- horizon_minutes: 30,
113
- time_slice_seconds: 300,
114
- reservations: [],
109
+ locks: [],
115
110
  };
116
111
  }
117
112
  function defaultLease(owner = "scheduler") {
@@ -175,7 +170,6 @@ function parseQueueFile(raw) {
175
170
  next_attempt_at: typeof job.next_attempt_at === "string" ? job.next_attempt_at : undefined,
176
171
  created_at: typeof job.created_at === "string" ? job.created_at : nowIso(),
177
172
  accepted_at: typeof job.accepted_at === "string" ? job.accepted_at : undefined,
178
- reserved_at: typeof job.reserved_at === "string" ? job.reserved_at : undefined,
179
173
  started_at: typeof job.started_at === "string" ? job.started_at : undefined,
180
174
  completed_at: typeof job.completed_at === "string" ? job.completed_at : undefined,
181
175
  evidence_ref: typeof job.evidence_ref === "string" ? job.evidence_ref : undefined,
@@ -193,57 +187,46 @@ function parseQueueFile(raw) {
193
187
  return undefined;
194
188
  }
195
189
  }
196
- function parseReservationFile(raw) {
190
+ function parseJobLockFile(raw) {
197
191
  try {
198
192
  const parsed = JSON.parse(raw);
199
193
  if (!parsed || typeof parsed !== "object" || Array.isArray(parsed))
200
194
  return undefined;
201
195
  const candidate = parsed;
202
- if (candidate.version !== 1 || !Array.isArray(candidate.reservations))
196
+ if (candidate.version !== 1 || !Array.isArray(candidate.locks))
203
197
  return undefined;
204
- const reservations = [];
205
- for (const row of candidate.reservations) {
198
+ const locks = [];
199
+ for (const row of candidate.locks) {
206
200
  if (!row || typeof row !== "object")
207
201
  continue;
208
202
  const record = row;
209
- if (typeof record.reservation_id !== "string" ||
203
+ if (typeof record.lock_id !== "string" ||
210
204
  typeof record.job_id !== "string" ||
211
- typeof record.resource !== "string" ||
212
- typeof record.time_slice_start !== "string" ||
213
- typeof record.time_slice_end !== "string") {
205
+ typeof record.resource !== "string") {
214
206
  continue;
215
207
  }
216
- const status = RESERVATION_STATUS.includes(record.status)
208
+ const status = JOB_LOCK_STATUS.includes(record.status)
217
209
  ? record.status
218
- : "reserved";
219
- reservations.push({
220
- reservation_id: record.reservation_id,
210
+ : "active";
211
+ locks.push({
212
+ lock_id: record.lock_id,
221
213
  job_id: record.job_id,
222
214
  phase: typeof record.phase === "string" ? record.phase : "dispatch",
223
215
  resource: record.resource,
224
- time_slice_start: record.time_slice_start,
225
- time_slice_end: record.time_slice_end,
226
216
  status,
227
217
  priority: toPriority(record.priority),
228
218
  lease_id: typeof record.lease_id === "string" ? record.lease_id : undefined,
229
- overrun: typeof record.overrun === "boolean" ? record.overrun : undefined,
230
219
  reason_code: typeof record.reason_code === "string" ? record.reason_code : undefined,
220
+ acquired_at: typeof record.acquired_at === "string" ? record.acquired_at : undefined,
221
+ heartbeat_at: typeof record.heartbeat_at === "string" ? record.heartbeat_at : undefined,
222
+ released_at: typeof record.released_at === "string" ? record.released_at : undefined,
231
223
  created_at: typeof record.created_at === "string" ? record.created_at : nowIso(),
232
224
  });
233
225
  }
234
226
  return {
235
227
  version: 1,
236
228
  updated_at: typeof candidate.updated_at === "string" ? candidate.updated_at : nowIso(),
237
- horizon_steps: typeof candidate.horizon_steps === "number" && candidate.horizon_steps > 0
238
- ? Math.floor(candidate.horizon_steps)
239
- : 5,
240
- horizon_minutes: typeof candidate.horizon_minutes === "number" && candidate.horizon_minutes > 0
241
- ? Math.floor(candidate.horizon_minutes)
242
- : 30,
243
- time_slice_seconds: typeof candidate.time_slice_seconds === "number" && candidate.time_slice_seconds > 0
244
- ? Math.floor(candidate.time_slice_seconds)
245
- : 300,
246
- reservations,
229
+ locks,
247
230
  };
248
231
  }
249
232
  catch {
@@ -280,8 +263,8 @@ function parseLease(raw) {
280
263
  export function getJobQueuePath() {
281
264
  return wsPath(JOB_QUEUE_REL);
282
265
  }
283
- export function getReservationTablePath() {
284
- return wsPath(RESERVATION_TABLE_REL);
266
+ export function getJobLockTablePath() {
267
+ return wsPath(JOB_LOCK_TABLE_REL);
285
268
  }
286
269
  export function getSchedulerLeasePath() {
287
270
  return wsPath(SCHEDULER_LEASE_REL);
@@ -292,11 +275,11 @@ export function readJobQueue() {
292
275
  return defaultQueueFile();
293
276
  return parseQueueFile(raw) ?? defaultQueueFile();
294
277
  }
295
- export function readReservationTable() {
296
- const raw = safeRead(RESERVATION_TABLE_REL);
278
+ export function readJobLockTable() {
279
+ const raw = safeRead(JOB_LOCK_TABLE_REL);
297
280
  if (isReadError(raw))
298
- return defaultReservationFile();
299
- return parseReservationFile(raw) ?? defaultReservationFile();
281
+ return defaultJobLockFile();
282
+ return parseJobLockFile(raw) ?? defaultJobLockFile();
300
283
  }
301
284
  export function readSchedulerLease() {
302
285
  const raw = safeRead(SCHEDULER_LEASE_REL);
@@ -308,21 +291,13 @@ function writeQueue(queue) {
308
291
  queue.updated_at = nowIso();
309
292
  return safeWrite(JOB_QUEUE_REL, JSON.stringify(queue, null, 2));
310
293
  }
311
- function writeReservations(table) {
294
+ function writeJobLocks(table) {
312
295
  table.updated_at = nowIso();
313
- return safeWrite(RESERVATION_TABLE_REL, JSON.stringify(table, null, 2));
296
+ return safeWrite(JOB_LOCK_TABLE_REL, JSON.stringify(table, null, 2));
314
297
  }
315
298
  function writeLease(lease) {
316
299
  return safeWrite(SCHEDULER_LEASE_REL, JSON.stringify(lease, null, 2));
317
300
  }
318
- function roundUpToSlice(nowMs, sliceMs) {
319
- if (sliceMs <= 0)
320
- return nowMs;
321
- const remainder = nowMs % sliceMs;
322
- if (remainder === 0)
323
- return nowMs;
324
- return nowMs + (sliceMs - remainder);
325
- }
326
301
  function compareJobs(a, b) {
327
302
  const priorityCmp = PRIORITY_WEIGHT[a.priority] - PRIORITY_WEIGHT[b.priority];
328
303
  if (priorityCmp !== 0)
@@ -481,59 +456,15 @@ function computeBackoffMs(job) {
481
456
  const factor = Math.min(32, 2 ** exponent);
482
457
  return base * factor;
483
458
  }
484
- function isReservationActive(record, now) {
485
- const end = parseDate(record.time_slice_end);
486
- if (!end)
487
- return false;
488
- if (record.status === "running")
489
- return true;
490
- if (record.status !== "reserved")
491
- return false;
492
- return end.getTime() > now.getTime();
459
+ function isJobLockActive(record) {
460
+ return record.status === "active";
493
461
  }
494
- function markRunningOverruns(queue, table, now, buf) {
495
- let released = 0;
496
- const byJob = new Map(queue.jobs.map((job) => [job.job_id, job]));
497
- for (const reservation of table.reservations) {
498
- if (reservation.status !== "running")
499
- continue;
500
- const end = parseDate(reservation.time_slice_end);
501
- if (!end || end.getTime() >= now.getTime())
502
- continue;
503
- reservation.status = "released";
504
- reservation.overrun = true;
505
- reservation.reason_code = "overrun";
506
- released += 1;
507
- const job = byJob.get(reservation.job_id);
508
- if (!job)
509
- continue;
510
- if (job.status === "running") {
511
- job.status = "blocked";
512
- job.reason_code = "overrun";
513
- job.error = `Time-slice overrun at ${reservation.time_slice_end}`;
514
- const nextAttempt = new Date(now.getTime() + computeBackoffMs(job));
515
- job.next_attempt_at = nowIso(nextAttempt);
516
- buf.status.push({
517
- event_type: "SCHEDULER_OVERRUN",
518
- status: "blocked",
519
- summary: `Job ${job.job_id} overran slice ending ${reservation.time_slice_end}`,
520
- payload: {
521
- job_id: job.job_id,
522
- reservation_id: reservation.reservation_id,
523
- slice_end: reservation.time_slice_end,
524
- next_attempt_at: job.next_attempt_at,
525
- },
526
- });
527
- }
528
- }
529
- return released;
530
- }
531
- function buildOccupancy(reservations, now) {
462
+ function buildOccupancy(locks) {
532
463
  const occupied = new Set();
533
- for (const reservation of reservations) {
534
- if (!isReservationActive(reservation, now))
464
+ for (const lock of locks) {
465
+ if (!isJobLockActive(lock))
535
466
  continue;
536
- occupied.add(`${reservation.resource}::${reservation.time_slice_start}`);
467
+ occupied.add(lock.resource);
537
468
  }
538
469
  return occupied;
539
470
  }
@@ -543,14 +474,11 @@ function deriveOwner(input) {
543
474
  return trimmed;
544
475
  return "capability-ops";
545
476
  }
546
- function defaultLeaseTtlSeconds(horizonSteps, timeSliceSeconds) {
547
- return Math.min(3600, Math.max(60, Math.max(2, horizonSteps) * timeSliceSeconds));
548
- }
549
- function defaultRunningLivenessSeconds(timeSliceSeconds) {
550
- return Math.max(300, timeSliceSeconds * 2);
477
+ function defaultLeaseTtlSeconds() {
478
+ return 1800;
551
479
  }
552
- function hasActiveReservationForJob(reservations, jobId, now) {
553
- return reservations.some((reservation) => reservation.job_id === jobId && isReservationActive(reservation, now));
480
+ function hasActiveLockForJob(locks, jobId) {
481
+ return locks.some((lock) => lock.job_id === jobId && isJobLockActive(lock));
554
482
  }
555
483
  function acquireLease(owner, ttlSeconds, now) {
556
484
  const existing = readSchedulerLease();
@@ -593,47 +521,124 @@ function hasActiveLeaseForOwner(lease, owner, now) {
593
521
  return false;
594
522
  return expires.getTime() > now.getTime();
595
523
  }
596
- function releaseReservationsForRecoveredJob(table, jobId, reasonCode) {
597
- for (const reservation of table.reservations) {
598
- if (reservation.job_id !== jobId)
524
+ function ensureLeaseForOwner(owner, now, ttlSeconds = defaultLeaseTtlSeconds()) {
525
+ const leaseResult = acquireLease(owner, ttlSeconds, now);
526
+ if (!leaseResult.acquired) {
527
+ return leaseResult;
528
+ }
529
+ const refreshedLease = {
530
+ ...leaseResult.lease,
531
+ owner,
532
+ heartbeat_at: nowIso(now),
533
+ expires_at: nowIso(new Date(now.getTime() + ttlSeconds * 1000)),
534
+ };
535
+ writeLease(refreshedLease);
536
+ return {
537
+ ...leaseResult,
538
+ lease: refreshedLease,
539
+ };
540
+ }
541
+ function releaseLocksForRecoveredJob(table, jobId, reasonCode, now) {
542
+ for (const lock of table.locks) {
543
+ if (lock.job_id !== jobId)
599
544
  continue;
600
- if (reservation.status !== "running" && reservation.status !== "reserved")
545
+ if (lock.status !== "active")
601
546
  continue;
602
- reservation.status = "released";
603
- reservation.reason_code = reasonCode;
547
+ lock.status = "released";
548
+ lock.reason_code = reasonCode;
549
+ lock.released_at = nowIso(now);
550
+ lock.heartbeat_at = nowIso(now);
604
551
  }
605
552
  }
606
- function reconcileRecoveryState(queue, table, now, timeSliceSeconds, recoveryMode, buf) {
553
+ function heartbeatRunningLocks(table, leaseId, now) {
554
+ for (const lock of table.locks) {
555
+ if (lock.status !== "active")
556
+ continue;
557
+ lock.lease_id = leaseId;
558
+ lock.heartbeat_at = nowIso(now);
559
+ }
560
+ }
561
+ function startJobNow(job, table, lease, now, buf, sourceTool) {
562
+ const acquiredAt = nowIso(now);
563
+ const resources = normalizeResources(job.resource_requirements);
564
+ for (const lock of table.locks) {
565
+ if (lock.job_id !== job.job_id)
566
+ continue;
567
+ if (lock.status === "active") {
568
+ lock.heartbeat_at = acquiredAt;
569
+ lock.lease_id = lease.lease_id;
570
+ }
571
+ }
572
+ const existingResources = new Set(table.locks
573
+ .filter((row) => row.job_id === job.job_id && row.status === "active")
574
+ .map((row) => row.resource));
575
+ for (const resource of resources) {
576
+ if (existingResources.has(resource))
577
+ continue;
578
+ table.locks.push({
579
+ lock_id: `LOCK-${randomUUID().slice(0, 12)}`,
580
+ job_id: job.job_id,
581
+ phase: "dispatch",
582
+ resource,
583
+ status: "active",
584
+ priority: job.priority,
585
+ lease_id: lease.lease_id,
586
+ acquired_at: acquiredAt,
587
+ heartbeat_at: acquiredAt,
588
+ created_at: acquiredAt,
589
+ });
590
+ }
591
+ job.status = "running";
592
+ job.started_at = acquiredAt;
593
+ job.error = undefined;
594
+ job.reason_code = undefined;
595
+ buf.status.push({
596
+ event_type: "SCHEDULER_JOB_STARTED",
597
+ status: "in_progress",
598
+ summary: `Started job ${job.job_id}`,
599
+ payload: {
600
+ job_id: job.job_id,
601
+ acquired_at: acquiredAt,
602
+ resources,
603
+ source_tool: sourceTool,
604
+ },
605
+ });
606
+ buf.ledger.push({
607
+ tool: sourceTool,
608
+ category: "major_update",
609
+ message: `Started scheduler job ${job.job_id}`,
610
+ artifacts: ["agent-state/job-queue.json", "agent-state/job-locks.json"],
611
+ metadata: {
612
+ job_id: job.job_id,
613
+ acquired_at: acquiredAt,
614
+ resources,
615
+ },
616
+ });
617
+ return { acquired_at: acquiredAt, resources };
618
+ }
619
+ function reconcileRecoveryState(queue, table, now, recoveryMode, buf) {
607
620
  let recovered = 0;
608
- const livenessCutoffMs = now.getTime() - defaultRunningLivenessSeconds(timeSliceSeconds) * 1000;
609
621
  for (const job of queue.jobs) {
610
622
  if (job.status === "unknown_recovery") {
611
- releaseReservationsForRecoveredJob(table, job.job_id, "recovery_hold");
623
+ releaseLocksForRecoveredJob(table, job.job_id, "recovery_hold", now);
612
624
  continue;
613
625
  }
614
626
  if (job.status !== "running")
615
627
  continue;
616
- const runningReservations = table.reservations.filter((reservation) => reservation.job_id === job.job_id && reservation.status === "running");
617
- const missingReservationState = runningReservations.length === 0;
618
- const allRunningSlicesExpired = runningReservations.length > 0 &&
619
- runningReservations.every((reservation) => {
620
- const end = parseDate(reservation.time_slice_end);
621
- return !end || end.getTime() < now.getTime();
622
- });
623
- const started = parseDate(job.started_at);
624
- const livenessExceeded = started ? started.getTime() <= livenessCutoffMs : false;
625
- if (!missingReservationState && !(recoveryMode && (allRunningSlicesExpired || livenessExceeded))) {
628
+ const runningLocks = table.locks.filter((lock) => lock.job_id === job.job_id && lock.status === "active");
629
+ const missingLockState = runningLocks.length === 0;
630
+ if (!missingLockState && !recoveryMode) {
626
631
  continue;
627
632
  }
628
633
  const previousStatus = job.status;
629
- const reasonCode = missingReservationState ? "recovery_missing_reservation" : "crash_recovery";
634
+ const reasonCode = missingLockState ? "recovery_missing_lock" : "lease_recovered";
630
635
  job.status = "unknown_recovery";
631
636
  job.reason_code = reasonCode;
632
- job.error = missingReservationState
633
- ? "Running job lost reservation state and requires manual recovery."
634
- : `Running job exceeded recovery threshold at ${nowIso(now)}`;
637
+ job.error = missingLockState
638
+ ? "Running job lost its resource lock state and requires manual recovery."
639
+ : `Running job lease was recovered by another owner at ${nowIso(now)}`;
635
640
  job.next_attempt_at = undefined;
636
- releaseReservationsForRecoveredJob(table, job.job_id, "crash_recovery");
641
+ releaseLocksForRecoveredJob(table, job.job_id, "crash_recovery", now);
637
642
  recovered += 1;
638
643
  buf.status.push({
639
644
  event_type: "SCHEDULER_RECOVERY",
@@ -786,7 +791,6 @@ export async function acknowledgeJob(input) {
786
791
  job.next_attempt_at = undefined;
787
792
  if (wasUnknownRecovery) {
788
793
  job.started_at = undefined;
789
- job.reserved_at = undefined;
790
794
  }
791
795
  }
792
796
  }
@@ -826,24 +830,15 @@ export async function acknowledgeJob(input) {
826
830
  await flushTransitionEvents(buf);
827
831
  return result;
828
832
  }
829
- export async function runSchedulerTick(input = {}) {
833
+ export async function dispatchJobs(input = {}) {
830
834
  const buf = newEventBuffer();
831
835
  const result = await withFileLock(SCHEDULER_LOCK_REL, () => {
832
836
  const owner = deriveOwner(input.owner);
833
837
  const now = parseDate(input.now_iso) ?? new Date();
834
- const table = readReservationTable();
835
- const horizonSteps = typeof input.horizon_steps === "number" && input.horizon_steps > 0
836
- ? Math.floor(input.horizon_steps)
837
- : table.horizon_steps;
838
- const horizonMinutes = typeof input.horizon_minutes === "number" && input.horizon_minutes > 0
839
- ? Math.floor(input.horizon_minutes)
840
- : table.horizon_minutes;
841
- const timeSliceSeconds = typeof input.time_slice_seconds === "number" && input.time_slice_seconds > 0
842
- ? Math.floor(input.time_slice_seconds)
843
- : table.time_slice_seconds;
838
+ const table = readJobLockTable();
844
839
  const leaseTtl = typeof input.lease_ttl_seconds === "number" && input.lease_ttl_seconds > 0
845
840
  ? Math.floor(input.lease_ttl_seconds)
846
- : defaultLeaseTtlSeconds(horizonSteps, timeSliceSeconds);
841
+ : defaultLeaseTtlSeconds();
847
842
  const leaseResult = acquireLease(owner, leaseTtl, now);
848
843
  if (!leaseResult.acquired) {
849
844
  const queue = readJobQueue();
@@ -853,18 +848,16 @@ export async function runSchedulerTick(input = {}) {
853
848
  lease_acquired: false,
854
849
  lease: leaseResult.lease,
855
850
  queue_path: getJobQueuePath(),
856
- reservation_path: getReservationTablePath(),
851
+ lock_path: getJobLockTablePath(),
857
852
  queue,
858
- reservations: table,
853
+ locks: table,
859
854
  summary: {
860
855
  total_jobs: queue.jobs.length,
861
856
  blocked_jobs: queue.jobs.filter((job) => job.status === "blocked").length,
862
857
  ready_jobs: queue.jobs.filter((job) => job.status === "ready").length,
863
- reserved_jobs: queue.jobs.filter((job) => job.status === "reserved").length,
864
858
  running_jobs: queue.jobs.filter((job) => job.status === "running").length,
865
- planned_jobs: 0,
866
- replanned_reservations: 0,
867
- released_overruns: 0,
859
+ started_jobs: 0,
860
+ recovered_jobs: 0,
868
861
  },
869
862
  };
870
863
  }
@@ -874,13 +867,8 @@ export async function runSchedulerTick(input = {}) {
874
867
  handoffState: readHandoffRegistry(),
875
868
  statusEvents: readStatusEvents(500),
876
869
  };
877
- table.horizon_steps = horizonSteps;
878
- table.horizon_minutes = horizonMinutes;
879
- table.time_slice_seconds = timeSliceSeconds;
880
- const recoveredJobs = reconcileRecoveryState(queue, table, now, timeSliceSeconds, leaseResult.recovery_mode, buf);
881
- const releasedOverruns = leaseResult.recovery_mode
882
- ? 0
883
- : markRunningOverruns(queue, table, now, buf);
870
+ const recoveredJobs = reconcileRecoveryState(queue, table, now, leaseResult.recovery_mode, buf);
871
+ heartbeatRunningLocks(table, leaseResult.lease.lease_id, now);
884
872
  for (const job of queue.jobs) {
885
873
  if (TERMINAL_STATUSES.has(job.status))
886
874
  continue;
@@ -896,75 +884,28 @@ export async function runSchedulerTick(input = {}) {
896
884
  job.reason_code = blocks[0]?.reason_code;
897
885
  job.error = blocks[0]?.detail;
898
886
  }
899
- else if (job.status !== "reserved") {
887
+ else {
900
888
  job.status = "ready";
901
889
  job.reason_code = undefined;
902
890
  job.error = undefined;
903
891
  }
904
892
  }
905
- let replannedReservations = 0;
906
- for (const reservation of table.reservations) {
907
- if (reservation.status !== "reserved")
908
- continue;
909
- const start = parseDate(reservation.time_slice_start);
910
- if (!start || start.getTime() <= now.getTime())
911
- continue;
912
- reservation.status = "released";
913
- reservation.reason_code = "replanned";
914
- replannedReservations += 1;
915
- }
916
- for (const job of queue.jobs) {
917
- if (job.status !== "reserved")
918
- continue;
919
- if (!hasActiveReservationForJob(table.reservations, job.job_id, now)) {
920
- job.status = "ready";
921
- }
922
- }
923
- const occupancy = buildOccupancy(table.reservations, now);
893
+ const occupancy = buildOccupancy(table.locks);
924
894
  const readyJobs = queue.jobs
925
895
  .filter((job) => job.status === "ready")
926
896
  .sort(compareJobs);
927
- const plannedJobs = new Set();
928
- const sliceMs = timeSliceSeconds * 1000;
929
- const startMs = roundUpToSlice(now.getTime(), sliceMs);
930
- const horizonEndMs = Math.min(startMs + horizonMinutes * 60_000, startMs + Math.max(1, horizonSteps) * sliceMs);
897
+ let startedJobs = 0;
931
898
  for (const job of readyJobs) {
932
- if (plannedJobs.size >= horizonSteps)
933
- break;
934
899
  const resources = normalizeResources(job.resource_requirements);
935
- let selectedMs;
936
- for (let cursor = startMs; cursor < horizonEndMs; cursor += sliceMs) {
937
- const startIso = new Date(cursor).toISOString();
938
- const free = resources.every((resource) => !occupancy.has(`${resource}::${startIso}`));
939
- if (free) {
940
- selectedMs = cursor;
941
- break;
942
- }
943
- }
944
- if (selectedMs === undefined) {
945
- job.status = "ready";
900
+ const free = resources.every((resource) => !occupancy.has(resource));
901
+ if (!free) {
946
902
  continue;
947
903
  }
948
- const startIso = new Date(selectedMs).toISOString();
949
- const endIso = new Date(selectedMs + sliceMs).toISOString();
950
904
  for (const resource of resources) {
951
- table.reservations.push({
952
- reservation_id: `RSV-${randomUUID().slice(0, 12)}`,
953
- job_id: job.job_id,
954
- phase: "dispatch",
955
- resource,
956
- time_slice_start: startIso,
957
- time_slice_end: endIso,
958
- status: "reserved",
959
- priority: job.priority,
960
- lease_id: leaseResult.lease.lease_id,
961
- created_at: nowIso(now),
962
- });
963
- occupancy.add(`${resource}::${startIso}`);
905
+ occupancy.add(resource);
964
906
  }
965
- job.status = "reserved";
966
- job.reserved_at = nowIso(now);
967
- plannedJobs.add(job.job_id);
907
+ startJobNow(job, table, leaseResult.lease, now, buf, "dispatch_jobs");
908
+ startedJobs += 1;
968
909
  }
969
910
  const refreshedLease = {
970
911
  ...leaseResult.lease,
@@ -974,21 +915,19 @@ export async function runSchedulerTick(input = {}) {
974
915
  };
975
916
  writeLease(refreshedLease);
976
917
  const finalQueuePath = writeQueue(queue);
977
- const finalReservationPath = writeReservations(table);
918
+ const finalLockPath = writeJobLocks(table);
978
919
  const summary = {
979
920
  total_jobs: queue.jobs.length,
980
921
  blocked_jobs: queue.jobs.filter((job) => job.status === "blocked").length,
981
922
  ready_jobs: queue.jobs.filter((job) => job.status === "ready").length,
982
- reserved_jobs: queue.jobs.filter((job) => job.status === "reserved").length,
983
923
  running_jobs: queue.jobs.filter((job) => job.status === "running").length,
984
- planned_jobs: plannedJobs.size,
985
- replanned_reservations: replannedReservations,
986
- released_overruns: releasedOverruns + recoveredJobs,
924
+ started_jobs: startedJobs,
925
+ recovered_jobs: recoveredJobs,
987
926
  };
988
927
  buf.status.push({
989
928
  event_type: "SCHEDULER_TICK",
990
929
  status: "pass",
991
- summary: `Tick planned=${summary.planned_jobs} ready=${summary.ready_jobs} reserved=${summary.reserved_jobs} recovered=${recoveredJobs} overruns=${releasedOverruns}`,
930
+ summary: `Dispatch started=${summary.started_jobs} ready=${summary.ready_jobs} running=${summary.running_jobs} recovered=${recoveredJobs}`,
992
931
  payload: {
993
932
  owner,
994
933
  lease_acquired: true,
@@ -998,14 +937,10 @@ export async function runSchedulerTick(input = {}) {
998
937
  },
999
938
  });
1000
939
  buf.ledger.push({
1001
- tool: "run_scheduler_tick",
1002
- category: summary.planned_jobs > 0 ||
1003
- summary.replanned_reservations > 0 ||
1004
- summary.released_overruns > 0
1005
- ? "major_update"
1006
- : "info",
1007
- message: `Scheduler tick complete (planned=${summary.planned_jobs}, reserved=${summary.reserved_jobs})`,
1008
- artifacts: ["agent-state/job-queue.json", "agent-state/job-reservations.json"],
940
+ tool: "dispatch_jobs",
941
+ category: summary.started_jobs > 0 || summary.recovered_jobs > 0 ? "major_update" : "info",
942
+ message: `Scheduler dispatch complete (started=${summary.started_jobs}, running=${summary.running_jobs})`,
943
+ artifacts: ["agent-state/job-queue.json", "agent-state/job-locks.json"],
1009
944
  metadata: {
1010
945
  owner,
1011
946
  lease_id: refreshedLease.lease_id,
@@ -1020,117 +955,129 @@ export async function runSchedulerTick(input = {}) {
1020
955
  lease_acquired: true,
1021
956
  lease: refreshedLease,
1022
957
  queue_path: finalQueuePath,
1023
- reservation_path: finalReservationPath,
958
+ lock_path: finalLockPath,
1024
959
  queue,
1025
- reservations: table,
960
+ locks: table,
1026
961
  summary,
1027
962
  };
1028
963
  });
1029
964
  await flushTransitionEvents(buf);
1030
965
  return result;
1031
966
  }
1032
- export async function startReservedJob(input) {
967
+ export async function dispatchJobNow(input) {
1033
968
  const buf = newEventBuffer();
1034
969
  const result = await withFileLock(SCHEDULER_LOCK_REL, () => {
1035
970
  const owner = deriveOwner(input.owner);
1036
971
  const now = parseDate(input.now_iso) ?? new Date();
1037
- const lease = readSchedulerLease();
1038
- if (!hasActiveLeaseForOwner(lease, owner, now)) {
972
+ const leaseResult = ensureLeaseForOwner(owner, now);
973
+ if (!leaseResult.acquired) {
1039
974
  return {
1040
975
  ok: false,
1041
976
  queue_path: getJobQueuePath(),
1042
- reservation_path: getReservationTablePath(),
977
+ lock_path: getJobLockTablePath(),
1043
978
  queue: readJobQueue(),
1044
- reservations: readReservationTable(),
979
+ locks: readJobLockTable(),
1045
980
  error: `Owner ${owner} does not hold an active scheduler lease.`,
1046
981
  };
1047
982
  }
1048
983
  const queue = readJobQueue();
1049
- const table = readReservationTable();
984
+ const table = readJobLockTable();
1050
985
  const job = queue.jobs.find((row) => row.job_id === input.job_id);
1051
986
  if (!job) {
1052
987
  return {
1053
988
  ok: false,
1054
989
  queue_path: getJobQueuePath(),
1055
- reservation_path: getReservationTablePath(),
990
+ lock_path: getJobLockTablePath(),
1056
991
  queue,
1057
- reservations: table,
992
+ locks: table,
1058
993
  error: `Unknown job_id: ${input.job_id}`,
1059
994
  };
1060
995
  }
1061
- const candidate = table.reservations
1062
- .filter((row) => row.job_id === input.job_id && row.status === "reserved")
1063
- .sort((a, b) => Date.parse(a.time_slice_start) - Date.parse(b.time_slice_start))[0];
1064
- if (!candidate) {
996
+ if (job.status === "unknown_recovery") {
1065
997
  return {
1066
998
  ok: false,
1067
999
  queue_path: getJobQueuePath(),
1068
- reservation_path: getReservationTablePath(),
1000
+ lock_path: getJobLockTablePath(),
1069
1001
  queue,
1070
- reservations: table,
1071
- error: `No reserved slice found for ${input.job_id}`,
1002
+ locks: table,
1003
+ error: `Job ${input.job_id} is parked in unknown_recovery; resume it before dispatch.`,
1072
1004
  };
1073
1005
  }
1074
- const start = parseDate(candidate.time_slice_start);
1075
- const end = parseDate(candidate.time_slice_end);
1076
- if (!start || !end || now.getTime() < start.getTime() || now.getTime() > end.getTime()) {
1006
+ if (TERMINAL_STATUSES.has(job.status)) {
1077
1007
  return {
1078
1008
  ok: false,
1079
1009
  queue_path: getJobQueuePath(),
1080
- reservation_path: getReservationTablePath(),
1010
+ lock_path: getJobLockTablePath(),
1081
1011
  queue,
1082
- reservations: table,
1083
- error: `Current time is outside reserved slice ${candidate.time_slice_start} - ${candidate.time_slice_end}`,
1012
+ locks: table,
1013
+ error: `Job ${input.job_id} is ${job.status} and cannot be started.`,
1084
1014
  };
1085
1015
  }
1086
- for (const row of table.reservations) {
1087
- if (row.job_id === input.job_id &&
1088
- row.status === "reserved" &&
1089
- row.time_slice_start === candidate.time_slice_start) {
1090
- row.status = "running";
1091
- }
1016
+ if (job.status === "running") {
1017
+ const runningResources = table.locks
1018
+ .filter((row) => row.job_id === input.job_id && row.status === "active")
1019
+ .map((row) => row.resource);
1020
+ const acquiredAt = table.locks.find((row) => row.job_id === input.job_id && row.status === "active")
1021
+ ?.acquired_at ??
1022
+ job.started_at ??
1023
+ nowIso(now);
1024
+ heartbeatRunningLocks(table, leaseResult.lease.lease_id, now);
1025
+ const queuePath = writeQueue(queue);
1026
+ const lockPath = writeJobLocks(table);
1027
+ return {
1028
+ ok: true,
1029
+ queue_path: queuePath,
1030
+ lock_path: lockPath,
1031
+ queue,
1032
+ locks: table,
1033
+ started_lock: {
1034
+ acquired_at: acquiredAt,
1035
+ resources: runningResources,
1036
+ },
1037
+ };
1038
+ }
1039
+ const dependencyContext = {
1040
+ todoState: readTodoState(),
1041
+ handoffState: readHandoffRegistry(),
1042
+ statusEvents: readStatusEvents(500),
1043
+ };
1044
+ const blocks = evaluateDependencyBlocks(job, now, dependencyContext);
1045
+ if (blocks.length > 0) {
1046
+ return {
1047
+ ok: false,
1048
+ queue_path: getJobQueuePath(),
1049
+ lock_path: getJobLockTablePath(),
1050
+ queue,
1051
+ locks: table,
1052
+ error: blocks[0]?.detail ?? `Job ${input.job_id} is blocked.`,
1053
+ };
1054
+ }
1055
+ const occupancy = buildOccupancy(table.locks);
1056
+ const resources = normalizeResources(job.resource_requirements);
1057
+ const busy = resources.find((resource) => table.locks.some((row) => row.job_id !== input.job_id && row.resource === resource && row.status === "active"));
1058
+ if (busy) {
1059
+ return {
1060
+ ok: false,
1061
+ queue_path: getJobQueuePath(),
1062
+ lock_path: getJobLockTablePath(),
1063
+ queue,
1064
+ locks: table,
1065
+ error: `Resource ${busy} is already held by another running job.`,
1066
+ };
1067
+ }
1068
+ for (const resource of resources) {
1069
+ occupancy.add(resource);
1092
1070
  }
1093
- job.status = "running";
1094
- job.started_at = nowIso(now);
1095
- job.error = undefined;
1096
- job.reason_code = undefined;
1071
+ const startedLock = startJobNow(job, table, leaseResult.lease, now, buf, "dispatch_job_now");
1097
1072
  const queuePath = writeQueue(queue);
1098
- const reservationPath = writeReservations(table);
1099
- buf.status.push({
1100
- event_type: "SCHEDULER_JOB_STARTED",
1101
- status: "in_progress",
1102
- summary: `Started reserved job ${input.job_id}`,
1103
- payload: {
1104
- job_id: input.job_id,
1105
- started_slice: {
1106
- start: candidate.time_slice_start,
1107
- end: candidate.time_slice_end,
1108
- },
1109
- },
1110
- });
1111
- buf.ledger.push({
1112
- tool: "start_reserved_job",
1113
- category: "major_update",
1114
- message: `Started reserved job ${input.job_id}`,
1115
- artifacts: ["agent-state/job-queue.json", "agent-state/job-reservations.json"],
1116
- metadata: {
1117
- job_id: input.job_id,
1118
- started_slice: {
1119
- start: candidate.time_slice_start,
1120
- end: candidate.time_slice_end,
1121
- },
1122
- },
1123
- });
1073
+ const lockPath = writeJobLocks(table);
1124
1074
  return {
1125
1075
  ok: true,
1126
1076
  queue_path: queuePath,
1127
- reservation_path: reservationPath,
1077
+ lock_path: lockPath,
1128
1078
  queue,
1129
- reservations: table,
1130
- started_slice: {
1131
- start: candidate.time_slice_start,
1132
- end: candidate.time_slice_end,
1133
- },
1079
+ locks: table,
1080
+ started_lock: startedLock,
1134
1081
  };
1135
1082
  });
1136
1083
  await flushTransitionEvents(buf);
@@ -1141,27 +1088,27 @@ export async function completeJob(input) {
1141
1088
  const result = await withFileLock(SCHEDULER_LOCK_REL, () => {
1142
1089
  const owner = deriveOwner(input.owner);
1143
1090
  const now = parseDate(input.now_iso) ?? new Date();
1144
- const lease = readSchedulerLease();
1145
- if (!hasActiveLeaseForOwner(lease, owner, now)) {
1091
+ const leaseResult = ensureLeaseForOwner(owner, now);
1092
+ if (!leaseResult.acquired) {
1146
1093
  return {
1147
1094
  ok: false,
1148
1095
  queue_path: getJobQueuePath(),
1149
- reservation_path: getReservationTablePath(),
1096
+ lock_path: getJobLockTablePath(),
1150
1097
  queue: readJobQueue(),
1151
- reservations: readReservationTable(),
1098
+ locks: readJobLockTable(),
1152
1099
  error: `Owner ${owner} does not hold an active scheduler lease.`,
1153
1100
  };
1154
1101
  }
1155
1102
  const queue = readJobQueue();
1156
- const table = readReservationTable();
1103
+ const table = readJobLockTable();
1157
1104
  const job = queue.jobs.find((row) => row.job_id === input.job_id);
1158
1105
  if (!job) {
1159
1106
  return {
1160
1107
  ok: false,
1161
1108
  queue_path: getJobQueuePath(),
1162
- reservation_path: getReservationTablePath(),
1109
+ lock_path: getJobLockTablePath(),
1163
1110
  queue,
1164
- reservations: table,
1111
+ locks: table,
1165
1112
  error: `Unknown job_id: ${input.job_id}`,
1166
1113
  };
1167
1114
  }
@@ -1169,34 +1116,32 @@ export async function completeJob(input) {
1169
1116
  return {
1170
1117
  ok: false,
1171
1118
  queue_path: getJobQueuePath(),
1172
- reservation_path: getReservationTablePath(),
1119
+ lock_path: getJobLockTablePath(),
1173
1120
  queue,
1174
- reservations: table,
1121
+ locks: table,
1175
1122
  error: `Job ${input.job_id} is ${job.status}; only running jobs can be completed.`,
1176
1123
  };
1177
1124
  }
1178
- const runningReservations = table.reservations.filter((reservation) => reservation.job_id === input.job_id && reservation.status === "running");
1179
- if (runningReservations.length === 0) {
1125
+ const runningLocks = table.locks.filter((lock) => lock.job_id === input.job_id && lock.status === "active");
1126
+ if (runningLocks.length === 0) {
1180
1127
  return {
1181
1128
  ok: false,
1182
1129
  queue_path: getJobQueuePath(),
1183
- reservation_path: getReservationTablePath(),
1130
+ lock_path: getJobLockTablePath(),
1184
1131
  queue,
1185
- reservations: table,
1186
- error: `Job ${input.job_id} has no running reservation slices.`,
1132
+ locks: table,
1133
+ error: `Job ${input.job_id} has no active resource locks.`,
1187
1134
  };
1188
1135
  }
1189
1136
  const success = input.success !== false;
1190
- for (const reservation of table.reservations) {
1191
- if (reservation.job_id !== input.job_id)
1137
+ for (const lock of table.locks) {
1138
+ if (lock.job_id !== input.job_id)
1192
1139
  continue;
1193
- if (reservation.status === "running") {
1194
- reservation.status = success ? "completed" : "failed";
1195
- reservation.reason_code = success ? undefined : "execution_failed";
1196
- }
1197
- else if (reservation.status === "reserved") {
1198
- reservation.status = "released";
1199
- reservation.reason_code = success ? "job_completed" : "retry_scheduled";
1140
+ if (lock.status === "active") {
1141
+ lock.status = success ? "completed" : "failed";
1142
+ lock.reason_code = success ? undefined : "execution_failed";
1143
+ lock.released_at = nowIso(now);
1144
+ lock.heartbeat_at = nowIso(now);
1200
1145
  }
1201
1146
  }
1202
1147
  if (success) {
@@ -1222,10 +1167,11 @@ export async function completeJob(input) {
1222
1167
  job.status = "blocked";
1223
1168
  job.reason_code = "retry_scheduled";
1224
1169
  job.next_attempt_at = nowIso(retryAt);
1170
+ job.started_at = undefined;
1225
1171
  }
1226
1172
  }
1227
1173
  const queuePath = writeQueue(queue);
1228
- const reservationPath = writeReservations(table);
1174
+ const lockPath = writeJobLocks(table);
1229
1175
  const finalStatus = job.status;
1230
1176
  const isSuccess = finalStatus === "done";
1231
1177
  buf.status.push({
@@ -1248,7 +1194,7 @@ export async function completeJob(input) {
1248
1194
  message: isSuccess
1249
1195
  ? `Completed scheduler job ${input.job_id}`
1250
1196
  : `Scheduler job ${input.job_id} ended with ${finalStatus}`,
1251
- artifacts: ["agent-state/job-queue.json", "agent-state/job-reservations.json"],
1197
+ artifacts: ["agent-state/job-queue.json", "agent-state/job-locks.json"],
1252
1198
  metadata: {
1253
1199
  job_id: input.job_id,
1254
1200
  final_status: finalStatus,
@@ -1258,9 +1204,9 @@ export async function completeJob(input) {
1258
1204
  return {
1259
1205
  ok: true,
1260
1206
  queue_path: queuePath,
1261
- reservation_path: reservationPath,
1207
+ lock_path: lockPath,
1262
1208
  queue,
1263
- reservations: table,
1209
+ locks: table,
1264
1210
  status: job.status,
1265
1211
  retry_scheduled_for: job.next_attempt_at,
1266
1212
  };