@voyant-travel/workflows-orchestrator 0.107.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. package/LICENSE +201 -0
  2. package/NOTICE +52 -0
  3. package/README.md +76 -0
  4. package/dist/abort-registry.d.ts +6 -0
  5. package/dist/abort-registry.d.ts.map +1 -0
  6. package/dist/abort-registry.js +37 -0
  7. package/dist/concurrency.d.ts +31 -0
  8. package/dist/concurrency.d.ts.map +1 -0
  9. package/dist/concurrency.js +145 -0
  10. package/dist/drive.d.ts +67 -0
  11. package/dist/drive.d.ts.map +1 -0
  12. package/dist/drive.js +373 -0
  13. package/dist/driver-inmemory.d.ts +30 -0
  14. package/dist/driver-inmemory.d.ts.map +1 -0
  15. package/dist/driver-inmemory.js +394 -0
  16. package/dist/event-router.d.ts +51 -0
  17. package/dist/event-router.d.ts.map +1 -0
  18. package/dist/event-router.js +68 -0
  19. package/dist/http-step-handler.d.ts +25 -0
  20. package/dist/http-step-handler.d.ts.map +1 -0
  21. package/dist/http-step-handler.js +78 -0
  22. package/dist/in-memory-store.d.ts +5 -0
  23. package/dist/in-memory-store.d.ts.map +1 -0
  24. package/dist/in-memory-store.js +41 -0
  25. package/dist/index.d.ts +13 -0
  26. package/dist/index.d.ts.map +1 -0
  27. package/dist/index.js +22 -0
  28. package/dist/journal-helpers.d.ts +3 -0
  29. package/dist/journal-helpers.d.ts.map +1 -0
  30. package/dist/journal-helpers.js +9 -0
  31. package/dist/orchestrator.d.ts +116 -0
  32. package/dist/orchestrator.d.ts.map +1 -0
  33. package/dist/orchestrator.js +411 -0
  34. package/dist/resume-run.d.ts +40 -0
  35. package/dist/resume-run.d.ts.map +1 -0
  36. package/dist/resume-run.js +119 -0
  37. package/dist/schedule.d.ts +51 -0
  38. package/dist/schedule.d.ts.map +1 -0
  39. package/dist/schedule.js +243 -0
  40. package/dist/testing/driver-compliance.d.ts +58 -0
  41. package/dist/testing/driver-compliance.d.ts.map +1 -0
  42. package/dist/testing/driver-compliance.js +667 -0
  43. package/dist/types.d.ts +182 -0
  44. package/dist/types.d.ts.map +1 -0
  45. package/dist/types.js +4 -0
  46. package/package.json +51 -0
  47. package/src/__tests__/orchestrator-test-support.ts +18 -0
  48. package/src/abort-registry.ts +41 -0
  49. package/src/concurrency.ts +217 -0
  50. package/src/drive.ts +477 -0
  51. package/src/driver-inmemory.ts +511 -0
  52. package/src/event-router.ts +120 -0
  53. package/src/http-step-handler.ts +112 -0
  54. package/src/in-memory-store.ts +44 -0
  55. package/src/index.ts +73 -0
  56. package/src/journal-helpers.ts +11 -0
  57. package/src/orchestrator.ts +527 -0
  58. package/src/resume-run.ts +162 -0
  59. package/src/schedule.ts +310 -0
  60. package/src/testing/driver-compliance.ts +800 -0
  61. package/src/types.ts +201 -0
@@ -0,0 +1,527 @@
1
+ // agent-quality: file-size exception -- owner: workflows-orchestrator; existing module stays co-located until a dedicated split preserves behavior and tests.
2
+ // Public entry points for the reference orchestrator.
3
+ //
4
+ // `trigger()` creates a RunRecord, drives it forward through the
5
+ // tenant handler, and persists the resulting record.
6
+ // `resume()` accepts a waitpoint injection for a parked run, applies
7
+ // it, drives forward, and persists.
8
+ // `cancel()` closes out a run without running compensations (they
9
+ // must come from the tenant handler, not the orchestrator).
10
+
11
+ import type { Duration } from "@voyant-travel/workflows"
12
+
13
+ import { registerRunAbort, signalRunAbort, unregisterRunAbort } from "./abort-registry.js"
14
+ import { applyWaitpointInjection, type DriveOptions, driveUntilPaused } from "./drive.js"
15
+ import { emptyJournal } from "./journal-helpers.js"
16
+ import type {
17
+ JournalSlice,
18
+ RunRecord,
19
+ RunRecordStore,
20
+ RunTrigger,
21
+ StepHandler,
22
+ WaitpointInjection,
23
+ } from "./types.js"
24
+
25
/** Everything `trigger()` needs in order to create (and possibly start) a run. */
export interface TriggerArgs {
  workflowId: string
  workflowVersion: string
  input: unknown
  tenantMeta: RunRecord["tenantMeta"]
  /** Falls back to `"development"` when omitted. */
  environment?: RunRecord["environment"]
  /** Falls back to `{ kind: "api" }` when omitted. */
  triggeredBy?: RunTrigger
  /** Falls back to an empty list when omitted. */
  tags?: string[]
  /** Stored as `runMeta.number`; falls back to 1 when omitted. */
  runNumber?: number
  /**
   * Explicit run id. When omitted, the id is derived from
   * `idempotencyKey` if one is given, otherwise taken from
   * `deps.idGenerator`, otherwise generated as
   * `run_<timestamp>_<random>` (non-cryptographic randomness).
   */
  runId?: string
  /**
   * Caller-supplied idempotency token. If present and `runId` was not
   * passed explicitly, a deterministic runId is derived from
   * `(workflowId, idempotencyKey)`, so a retried trigger gets back the
   * already-existing run record and is not driven a second time. The
   * key is also copied onto `RunRecord.idempotencyKey` so persistent
   * stores can fill dedup columns / unique indexes natively.
   *
   * See architecture doc §15.2 for the full ingest-side derivation
   * (eventId → idempotencyKey via `${filterId}:${eventId}`).
   */
  idempotencyKey?: string
  /**
   * Optional trigger-time delay. If it resolves to a future instant,
   * the run is persisted in `waiting` on a synthetic DATETIME
   * waitpoint and execution is left to the normal wakeup/time-wheel
   * path. A delay that is not in the future is ignored.
   */
  delay?: Duration | Date
  /** Higher values are claimed first by scheduler/time-wheel stores. */
  priority?: number
  /**
   * Optional journal seed, for external replay/resume callers that
   * want a new run to skip steps a parent run already completed. The
   * seed is deep-cloned before it is stored on the record.
   */
  initialJournal?: JournalSlice
  /**
   * Metadata cursor that goes with `initialJournal`. A caller seeding
   * a journal that already carries `metadataState` must seed this
   * positional cursor too, so replayed metadata mutations are not
   * applied twice. Falls back to 0 when omitted.
   */
  initialMetadataAppliedCount?: number
  /**
   * Compute-time budget in ms, typically from `WorkflowConfig.timeout`.
   * Time spent parked on waitpoints does not count. Once cumulative
   * invocation time exceeds the budget the run ends `failed` with
   * `code: "WORKFLOW_TIMEOUT"`. Undefined / 0 = no limit.
   */
  timeoutMs?: number
  /**
   * For child runs spawned by `ctx.invoke` on a parent that may park.
   * When set, it is recorded on the child's record so the child's
   * terminal status cascade-resumes the parent.
   */
  parent?: { runId: string; waitpointId: string }
  /**
   * Internal lifecycle hook for driver-level coordinators that need
   * the persisted run id before the first invocation starts. Invoked
   * once, right after the record has been inserted/saved.
   */
  onRunRecordCreated?: (record: RunRecord) => void
}
88
+
89
/**
 * Dependencies shared by every orchestrator entry point. Extends
 * `DriveOptions`, and the whole object is spread into the options
 * handed to the drive loop.
 */
export interface OrchestratorDeps extends DriveOptions {
  /** Persistence for run records (`get` / `save` / `tryInsert`). */
  store: RunRecordStore
  /** Step handler forwarded to the drive loop. */
  handler: StepHandler
  /**
   * Run id generator. When absent, ids take the form
   * `run_<timestamp>_<random>`.
   */
  idGenerator?: () => string
}
95
+
96
/**
 * Create a run record, persist it, and — unless a future `delay` was
 * requested — drive it forward until it pauses or finishes.
 *
 * Id selection: an explicit `args.runId` wins; otherwise an id is
 * derived from `(workflowId, idempotencyKey)`; otherwise
 * `deps.idGenerator` or the built-in generator is used. Explicit and
 * derived ids go through `store.tryInsert` (atomic check-or-insert) so
 * concurrent triggers with the same id never both proceed to drive
 * (architecture doc §15.2).
 *
 * @returns the saved record, or the pre-existing record when another
 *   caller already inserted the same id.
 */
export async function trigger(args: TriggerArgs, deps: OrchestratorDeps): Promise<RunRecord> {
  const clock = deps.now ?? (() => Date.now())
  const createdAt = clock()

  const { runId: explicitRunId, idempotencyKey } = args
  const derivedRunId =
    idempotencyKey === undefined ? undefined : `idem-${args.workflowId}-${idempotencyKey}`
  const runId = explicitRunId ?? derivedRunId ?? deps.idGenerator?.() ?? defaultRunId(clock)
  const needsAtomicInsert = explicitRunId !== undefined || derivedRunId !== undefined

  // A delay that is not in the future resolves to `undefined` and the
  // run starts immediately.
  const wakeAt = resolveDelayWakeAt(args.delay, createdAt)

  const pendingWaitpoints: RunRecord["pendingWaitpoints"] = []
  if (wakeAt !== undefined) {
    // Synthetic DATETIME waitpoint that parks the run until `wakeAt`.
    pendingWaitpoints.push({
      clientWaitpointId: triggerDelayWaitpointId(runId),
      kind: "DATETIME",
      meta: {
        wakeAt,
        durationMs: Math.max(0, wakeAt - createdAt),
        source: "trigger.delay",
      },
    })
  }

  const record: RunRecord = {
    id: runId,
    workflowId: args.workflowId,
    workflowVersion: args.workflowVersion,
    status: wakeAt === undefined ? "running" : "waiting",
    input: args.input,
    journal: args.initialJournal ? cloneJournal(args.initialJournal) : emptyJournal(),
    invocationCount: 0,
    metadataAppliedCount: args.initialMetadataAppliedCount ?? 0,
    computeTimeMs: 0,
    timeoutMs: args.timeoutMs,
    priority: args.priority,
    parent: args.parent,
    pendingWaitpoints,
    streams: {},
    startedAt: wakeAt ?? createdAt,
    triggeredBy: args.triggeredBy ?? { kind: "api" },
    tags: args.tags ?? [],
    environment: args.environment ?? "development",
    tenantMeta: args.tenantMeta,
    runMeta: { number: args.runNumber ?? 1, attempt: 1 },
    idempotencyKey,
  }

  if (needsAtomicInsert) {
    const inserted = await deps.store.tryInsert(record)
    if (!inserted.created) {
      // Somebody else got there first. Hand back their record without
      // re-driving — driving is what actually fires side effects.
      return inserted.record
    }
  } else {
    // Generated ids skip the atomic insert. Persisting up-front lets a
    // concurrent `cancel(runId)` find the run before any invocation
    // has completed.
    await deps.store.save(record)
  }
  args.onRunRecordCreated?.(record)

  // Delayed runs stay parked; the wakeup path drives them later.
  if (wakeAt !== undefined) {
    return record
  }

  const controller = registerRunAbort(runId)
  try {
    await driveUntilPaused(record, {
      ...driveOptionsFor(deps),
      signal: controller.signal,
    })
  } finally {
    unregisterRunAbort(runId)
  }

  // An external cancel that fired mid-drive may have made the aborted
  // step surface as `failed`. The caller asked for a cancel, so the
  // store's `cancelled` state takes precedence.
  if (controller.signal.aborted) {
    const stored = await deps.store.get(runId)
    if (stored?.status === "cancelled") {
      record.status = "cancelled"
      record.completedAt = stored.completedAt ?? clock()
      record.pendingWaitpoints = []
      record.error = stored.error
    }
  }
  return deps.store.save(record)
}
190
+
191
/**
 * Turn a trigger-time delay into an absolute wake-up instant (epoch ms).
 *
 * @param delay absolute `Date`, relative duration, or `undefined`
 * @param now reference instant in epoch ms
 * @returns the wake-up instant when it lies strictly after `now`,
 *   otherwise `undefined` (no delay, or the instant is not in the future)
 */
function resolveDelayWakeAt(delay: Duration | Date | undefined, now: number): number | undefined {
  if (delay === undefined) return undefined
  let wakeAt: number
  if (delay instanceof Date) {
    wakeAt = delay.getTime()
  } else {
    wakeAt = now + durationToMs(delay)
  }
  // Written as a positive comparison so a NaN instant (e.g. an invalid
  // Date) also falls through to `undefined`.
  if (wakeAt > now) return wakeAt
  return undefined
}
196
+
197
/** Milliseconds per supported duration unit suffix. */
const DURATION_UNIT_MS: Readonly<Record<string, number>> = {
  ms: 1,
  s: 1_000,
  m: 60_000,
  h: 3_600_000,
  d: 86_400_000,
  w: 604_800_000,
}

/**
 * Convert a duration to milliseconds.
 *
 * Numbers are returned unchanged. Strings must be an unsigned integer
 * followed by one of `ms`, `s`, `m`, `h`, `d`, `w` (e.g. `"30s"`).
 *
 * @throws Error when the string does not match that shape.
 */
function durationToMs(duration: Duration): number {
  if (typeof duration === "number") return duration
  const parsed = /^(\d+)(ms|s|m|h|d|w)$/.exec(duration)
  if (!parsed) throw new Error(`invalid duration: ${String(duration)}`)
  const amount = Number(parsed[1])
  const factor = DURATION_UNIT_MS[parsed[2] ?? ""]
  // The regex only admits the units in the table, so this is a
  // defensive guard rather than an expected path.
  if (factor === undefined) throw new Error(`invalid duration unit: ${parsed[2]}`)
  return amount * factor
}
219
+
220
/** Id of the synthetic DATETIME waitpoint that parks a delayed run. */
function triggerDelayWaitpointId(runId: string): string {
  return "trigger-delay:" + runId
}
223
+
224
/** Deep-copy a journal so later mutations do not reach the caller's object. */
function cloneJournal(journal: JournalSlice): JournalSlice {
  const copy: JournalSlice = structuredClone(journal)
  return copy
}
227
+
228
/**
 * Assemble the DriveOptions used for every drive started from `deps`.
 *
 * On top of everything already in `deps` this adds:
 * - `triggerChild`: runs `ctx.invoke(child, ...)` children through the
 *   same orchestrator, store, and handler by calling `trigger` with
 *   the same deps;
 * - `beforeInvocation`: persists mid-flight progress and honors
 *   concurrent cancellations.
 */
function driveOptionsFor(deps: OrchestratorDeps): DriveOptions {
  const clock = deps.now ?? (() => Date.now())

  const triggerChild: NonNullable<DriveOptions["triggerChild"]> = async ({
    parent,
    waitpoint,
  }) => {
    const { meta } = waitpoint
    const workflowId = String(meta.childWorkflowId)
    const input = meta.childInput
    const isDetached = meta.detach === true
    const childArgs: TriggerArgs = {
      workflowId,
      // Children inherit the parent's workflow version slot unless
      // lockToVersion is set; for v1 we always inherit.
      workflowVersion: parent.workflowVersion,
      input,
      tenantMeta: parent.tenantMeta,
      environment: parent.environment,
      // Inherit the parent's trigger kind for run-tree observability.
      triggeredBy: parent.triggeredBy,
      tags: Array.isArray(meta.tags) ? (meta.tags as string[]) : [],
      // Lineage pointer: if this child parks, its terminal status
      // cascade-resumes the parent at this specific waitpoint.
      // Detached children carry no pointer.
      parent: isDetached
        ? undefined
        : { runId: parent.id, waitpointId: waitpoint.clientWaitpointId },
    }
    return trigger(childArgs, deps)
  }

  const beforeInvocation: NonNullable<DriveOptions["beforeInvocation"]> = async (rec) => {
    // Read before writing: a concurrent `cancel()` may already have
    // flipped the stored status, and saving first would clobber it.
    const stored = await deps.store.get(rec.id)
    if (stored && stored.status === "cancelled") {
      rec.status = "cancelled"
      rec.completedAt = stored.completedAt ?? clock()
      rec.pendingWaitpoints = []
      if (stored.error) rec.error = stored.error
      return false
    }
    // Persist mid-flight progress so the dashboard sees updates and
    // the next concurrent cancel() has an up-to-date target.
    await deps.store.save(rec)
    return true
  }

  return { ...deps, triggerChild, beforeInvocation }
}
282
+
283
/** Arguments for `resume()`. */
export interface ResumeArgs {
  /** Id of the parked run to resume. */
  runId: string
  /** Waitpoint injection to apply to the run before it is driven forward. */
  injection: WaitpointInjection
}
287
+
288
/**
 * Apply a waitpoint injection to a parked run, drive it forward, and
 * persist the result.
 *
 * @returns `{ ok: true, record }` with the saved record, or a failure:
 *   - `not_found`: no run with that id in the store
 *   - `not_parked`: the run is not in `waiting`
 *   - `no_match`: `applyWaitpointInjection` rejected the injection
 */
export async function resume(
  args: ResumeArgs,
  deps: OrchestratorDeps,
): Promise<
  | { ok: true; record: RunRecord }
  | { ok: false; status: "not_found" | "not_parked" | "no_match"; message: string }
> {
  const { runId, injection } = args
  const run = await deps.store.get(runId)
  if (!run) {
    return { ok: false, status: "not_found", message: `run ${runId} not found` }
  }
  if (run.status !== "waiting") {
    return {
      ok: false,
      status: "not_parked",
      message: `run ${runId} is not parked (status: ${run.status})`,
    }
  }

  const applied = applyWaitpointInjection(run, injection, deps.now)
  if (!applied.ok) {
    return { ok: false, status: "no_match", message: applied.message }
  }

  const controller = registerRunAbort(run.id)
  try {
    await driveUntilPaused(run, {
      ...driveOptionsFor(deps),
      signal: controller.signal,
    })
  } finally {
    unregisterRunAbort(run.id)
  }

  // A cancel that fired mid-drive takes precedence over whatever
  // status the aborted step left behind.
  if (controller.signal.aborted) {
    const stored = await deps.store.get(run.id)
    if (stored?.status === "cancelled") {
      const clock = deps.now ?? (() => Date.now())
      run.status = "cancelled"
      run.completedAt = stored.completedAt ?? clock()
      run.pendingWaitpoints = []
      run.error = stored.error
    }
  }

  const saved = await deps.store.save(run)
  // When this resume took the run to a terminal state and it is the
  // child of some parent, pass the resolution up.
  if (isTerminalStatus(saved.status)) {
    await cascadeResumeParent(saved, deps)
  }
  return { ok: true, record: saved }
}
337
+
338
/** Arguments for `resumeDueAlarms()`. */
export interface ResumeDueAlarmsArgs {
  /** Id of the run whose due DATETIME waitpoints should be resolved. */
  runId: string
}
341
+
342
/**
 * Resolve every DATETIME waitpoint whose `wakeAt` has passed, drive
 * the run forward, and persist. Returns the saved record, or null
 * when the run does not exist, isn't in `waiting` state (already
 * terminal / running elsewhere), or has no DATETIME waitpoint that is
 * actually due yet — all no-ops the caller can treat as "nothing to do."
 *
 * Callers (local serve loop, CF DO alarm handler) are responsible for
 * scheduling the actual wake-up timer. This function is transport-
 * agnostic: given `now()`, it does the resolve + drive + save.
 *
 * If a `cancel()` aborts the run while it is being driven here, the
 * store's `cancelled` state is adopted before saving, matching what
 * `trigger()` and `resume()` do.
 */
export async function resumeDueAlarms(
  args: ResumeDueAlarmsArgs,
  deps: OrchestratorDeps,
): Promise<RunRecord | null> {
  const record = await deps.store.get(args.runId)
  if (!record) return null
  if (record.status !== "waiting") return null
  const now = deps.now ?? (() => Date.now())
  const at = now()
  const stillPending: typeof record.pendingWaitpoints = []
  let resolvedAny = false
  for (const wp of record.pendingWaitpoints) {
    // A waitpoint without a numeric `wakeAt` is never considered due.
    const wakeAt = typeof wp.meta.wakeAt === "number" ? wp.meta.wakeAt : undefined
    if (wp.kind === "DATETIME" && wakeAt !== undefined && wakeAt <= at) {
      record.journal.waitpointsResolved[wp.clientWaitpointId] = {
        kind: "DATETIME",
        resolvedAt: at,
        source: "replay",
      }
      resolvedAny = true
    } else {
      stillPending.push(wp)
    }
  }
  if (!resolvedAny) return null
  record.pendingWaitpoints = stillPending
  // The run only leaves `waiting` once nothing else is pending.
  if (record.pendingWaitpoints.length === 0) record.status = "running"
  const abortCtrl = registerRunAbort(record.id)
  try {
    await driveUntilPaused(record, {
      ...driveOptionsFor(deps),
      signal: abortCtrl.signal,
    })
  } finally {
    unregisterRunAbort(record.id)
  }
  // If an external cancel fired during drive, the aborted step may
  // have surfaced as `failed`. Without this check the save below would
  // overwrite the stored `cancelled` status.
  if (abortCtrl.signal.aborted) {
    const latest = await deps.store.get(record.id)
    if (latest?.status === "cancelled") {
      record.status = "cancelled"
      record.completedAt = latest.completedAt ?? now()
      record.pendingWaitpoints = []
      record.error = latest.error
    }
  }
  const saved = await deps.store.save(record)
  // A terminal child run hands its result up to a parked parent.
  if (isTerminalStatus(saved.status)) {
    await cascadeResumeParent(saved, deps)
  }
  return saved
}
395
+
396
/** Arguments for `cancel()`. */
export interface CancelArgs {
  /** Id of the run to cancel. */
  runId: string
  /**
   * Optional human-readable reason. When non-empty it is stored as the
   * run's error (`code: "CANCELLED"`) and passed along with the abort
   * signal.
   */
  reason?: string
}
400
+
401
/**
 * Mark a `waiting` or `running` run as cancelled, persist that, and
 * signal any in-process drive of the run to abort.
 *
 * @returns `{ ok: true, record }` with the saved record, or a failure:
 *   - `not_found`: no run with that id in the store
 *   - `already_terminal`: the run is neither `waiting` nor `running`
 */
export async function cancel(
  args: CancelArgs,
  deps: OrchestratorDeps,
): Promise<
  | { ok: true; record: RunRecord }
  | { ok: false; status: "not_found" | "already_terminal"; message: string }
> {
  const { runId, reason } = args
  const run = await deps.store.get(runId)
  if (!run) {
    return { ok: false, status: "not_found", message: `run ${runId} not found` }
  }
  const isActive = run.status === "waiting" || run.status === "running"
  if (!isActive) {
    return {
      ok: false,
      status: "already_terminal",
      message: `run ${runId} is already terminal (status: ${run.status})`,
    }
  }

  const clock = deps.now ?? (() => Date.now())
  run.status = "cancelled"
  run.completedAt = clock()
  run.pendingWaitpoints = []
  if (reason) {
    run.error = {
      category: "USER_ERROR",
      code: "CANCELLED",
      message: reason,
    }
  }
  const saved = await deps.store.save(run)

  // Best-effort mid-step abort: when the run is in flight in this
  // process, firing its AbortSignal stops step bodies that observe
  // `ctx.signal` (fetches, sleeps, etc.) right away. The call returns
  // `false` when no controller is registered (run lives in another
  // process, or the drive has already exited) — that is fine, the
  // status flip plus the between-invocation recheck cover that path.
  signalRunAbort(run.id, reason)

  // A cancelled child with a parked parent surfaces the cancellation
  // to the parent as a RUN-waitpoint error.
  if (isTerminalStatus(saved.status)) {
    await cascadeResumeParent(saved, deps)
  }
  return { ok: true, record: saved }
}
445
+
446
/**
 * When a child run reaches a terminal state, look up its `parent`
 * pointer and resume the parent's matching RUN waitpoint with the
 * child's output / error. Best-effort: if the parent can't be found,
 * is no longer parked, or no longer has the waitpoint pending,
 * silently drop (the parent's own drive will observe the child's
 * state on replay via a subsequent trigger).
 *
 * After the parent has been re-driven it is saved, and if it ended in
 * a terminal state the cascade continues to its own parent.
 */
async function cascadeResumeParent(child: RunRecord, deps: OrchestratorDeps): Promise<void> {
  // Capture the pointer once so the closure below needs no non-null
  // assertion.
  const link = child.parent
  if (!link) return
  const parent = await deps.store.get(link.runId)
  if (!parent) return
  if (parent.status !== "waiting") return
  const wpIdx = parent.pendingWaitpoints.findIndex(
    (w) => w.clientWaitpointId === link.waitpointId,
  )
  if (wpIdx < 0) return

  const now = deps.now ?? (() => Date.now())
  const at = now()
  if (child.status === "completed") {
    parent.journal.waitpointsResolved[link.waitpointId] = {
      kind: "RUN",
      resolvedAt: at,
      payload: child.output,
      source: "replay",
    }
  } else {
    // Any other terminal status resolves the waitpoint with an error,
    // using the child's own error fields where they exist.
    parent.journal.waitpointsResolved[link.waitpointId] = {
      kind: "RUN",
      resolvedAt: at,
      source: "replay",
      error: {
        category:
          (child.error?.category as "USER_ERROR" | "RUNTIME_ERROR" | undefined) ?? "USER_ERROR",
        code: child.error?.code ?? "CHILD_RUN_ENDED",
        message: child.error?.message ?? `child run ended with status ${child.status}`,
      },
    }
  }
  parent.pendingWaitpoints.splice(wpIdx, 1)
  // The parent only leaves `waiting` once nothing else is pending.
  if (parent.pendingWaitpoints.length === 0) {
    parent.status = "running"
  }

  // Re-drive the parent. This nested drive goes through the same
  // handler / store / hooks, so the parent's own parent (if any)
  // will also cascade-resume when appropriate.
  const abortCtrl = registerRunAbort(parent.id)
  try {
    await driveUntilPaused(parent, {
      ...driveOptionsFor(deps),
      signal: abortCtrl.signal,
    })
  } finally {
    unregisterRunAbort(parent.id)
  }
  // If an external cancel hit the parent during this drive, adopt the
  // store's `cancelled` state (as `trigger()` / `resume()` do) so the
  // save below does not overwrite it.
  if (abortCtrl.signal.aborted) {
    const latest = await deps.store.get(parent.id)
    if (latest?.status === "cancelled") {
      parent.status = "cancelled"
      parent.completedAt = latest.completedAt ?? now()
      parent.pendingWaitpoints = []
      parent.error = latest.error
    }
  }
  await deps.store.save(parent)
  // The parent might itself have a parent — recurse.
  if (isTerminalStatus(parent.status)) {
    await cascadeResumeParent(parent, deps)
  }
}
508
+
509
/** True when `s` is one of the run statuses treated as terminal here. */
function isTerminalStatus(s: string): boolean {
  switch (s) {
    case "completed":
    case "failed":
    case "cancelled":
    case "compensated":
    case "compensation_failed":
      return true
    default:
      return false
  }
}
518
+
519
/**
 * Fallback run id of the form `run_<timestamp>_<random>`, both parts
 * in base 36.
 *
 * The random part is non-cryptographic; `idGenerator` exists for
 * callers that want stronger guarantees.
 */
function defaultRunId(now: () => number): string {
  const timestampPart = now().toString(36)
  const randomInt = Math.floor(Math.random() * 1_000_000)
  const randomPart = randomInt.toString(36).padStart(4, "0")
  return ["run", timestampPart, randomPart].join("_")
}
@@ -0,0 +1,162 @@
1
+ import { emptyJournal } from "./journal-helpers.js"
2
+ import type { JournalSlice, RunRecord, StepJournalEntry } from "./types.js"
3
+
4
/** Input to `buildResumeJournal()`. */
export interface BuildResumeJournalInput {
  /** Run whose journal and metadata are the source for the new journal. */
  parent: RunRecord
  /**
   * Step to resume from. When omitted, the first step in the parent's
   * journal with status `"err"` is used.
   */
  resumeFromStep?: string
  /**
   * Step outputs keyed by step id. When given, the new journal's step
   * results are built from these instead of being copied from the parent.
   */
  seedResults?: Record<string, unknown>
  /** Clock used to timestamp seeded entries; defaults to `Date.now`. */
  now?: () => number
}
10
+
11
/** Result of `buildResumeJournal()` / `buildSeededResumeJournal()`. */
export interface BuildResumeJournalResult {
  /** The step the resumed run starts from. */
  resumeFromStep: string
  /** Journal seed for the new run. */
  journal: JournalSlice
  /** Metadata cursor that goes with `journal`. */
  metadataAppliedCount: number
}
16
+
17
/** Input to `buildSeededResumeJournal()`. */
export interface BuildSeededResumeJournalInput {
  /** Id of the run being resumed. */
  parentRunId: string
  /** Step the resumed run starts from; echoed back in the result. */
  resumeFromStep: string
  /** Step outputs keyed by step id; each becomes a successful journal entry. */
  seedResults: Record<string, unknown>
  /** Metadata state to deep-clone into the journal; defaults to `{}`. */
  metadataState?: Record<string, unknown>
  /** Metadata cursor to return; defaults to 0. */
  metadataAppliedCount?: number
  /** Clock used to timestamp seeded entries; defaults to `Date.now`. */
  now?: () => number
}
25
+
26
/**
 * Build the journal seed for a run that resumes `input.parent`.
 *
 * With `seedResults`, the work is delegated to
 * `buildSeededResumeJournal`, passing the parent's metadata state and
 * cursor. Without it, every step result that precedes `resumeFromStep`
 * in the parent's journal is deep-copied into the new journal, and the
 * parent's metadata state is deep-copied alongside.
 *
 * @throws Error when `resumeFromStep` is omitted and the parent has no
 *   step with status `"err"`, or when a step preceding `resumeFromStep`
 *   is not `"ok"`.
 */
export function buildResumeJournal(input: BuildResumeJournalInput): BuildResumeJournalResult {
  const resumeFromStep = input.resumeFromStep ?? findFirstFailedStep(input.parent)
  if (!resumeFromStep) {
    throw new Error(
      `run "${input.parent.id}" has no failed step; pass resumeFromStep explicitly to resume it`,
    )
  }

  if (input.seedResults) {
    // The seeded builder makes its own deep copy of the metadata
    // state, so the parent's state is handed over as-is instead of
    // being cloned here first and then cloned again.
    return buildSeededResumeJournal({
      parentRunId: input.parent.id,
      resumeFromStep,
      seedResults: input.seedResults,
      metadataState: input.parent.journal.metadataState,
      metadataAppliedCount: input.parent.metadataAppliedCount,
      now: input.now,
    })
  }

  const journal = emptyJournal()
  journal.metadataState = structuredClone(input.parent.journal.metadataState) as Record<
    string,
    unknown
  >

  // NOTE(review): "preceding" here is `Object.entries` order. That is
  // insertion order for ordinary string keys, but integer-like step
  // ids (e.g. "2", "10") are enumerated in ascending numeric order
  // first — confirm step ids can never look like integers.
  for (const [stepId, entry] of Object.entries(input.parent.journal.stepResults)) {
    if (stepId === resumeFromStep) break
    if (entry.status !== "ok") {
      throw new Error(
        `step "${stepId}" completed before "${resumeFromStep}" but is not successful; cannot seed resume journal`,
      )
    }
    journal.stepResults[stepId] = structuredClone(entry) as StepJournalEntry
  }

  return {
    resumeFromStep,
    journal,
    metadataAppliedCount: input.parent.metadataAppliedCount,
  }
}
67
+
68
/**
 * Build a journal whose step results come entirely from caller-supplied
 * outputs.
 *
 * Each `seedResults` entry becomes a successful, first-attempt journal
 * entry. Entries are timestamped from `now()`, one millisecond apart in
 * iteration order. `metadataState` is deep-cloned when provided.
 */
export function buildSeededResumeJournal(
  input: BuildSeededResumeJournalInput,
): BuildResumeJournalResult {
  const journal = emptyJournal()
  if (input.metadataState) {
    journal.metadataState = structuredClone(input.metadataState) as Record<string, unknown>
  } else {
    journal.metadataState = {}
  }

  const clock = input.now ?? (() => Date.now())
  let stamp = clock()
  Object.entries(input.seedResults).forEach(([stepId, output]) => {
    journal.stepResults[stepId] = seededStepEntry(output, stamp)
    stamp += 1
  })

  const metadataAppliedCount = input.metadataAppliedCount ?? 0
  return { resumeFromStep: input.resumeFromStep, journal, metadataAppliedCount }
}
87
+
88
/** Outcome of `validateSeedResults`: the accepted record, or a rejection message. */
export type SeedResultsValidation =
  | { ok: true; seedResults: Record<string, unknown> }
  | { ok: false; message: string }

const SEED_RESULTS_MAX_ENTRIES = 256
const SEED_RESULTS_MAX_STEP_ID_LENGTH = 200
/** Upper bound on the JSON-serialized length of a single entry's output. */
const SEED_RESULTS_MAX_SERIALIZED_CHARS = 1_000_000
// biome-ignore lint/suspicious/noControlCharactersInRegex: rejecting control chars is the point -- owner: workflows-orchestrator; existing suppression is intentional pending typed cleanup.
const CONTROL_CHARS = /[\x00-\x1f\x7f]/
/**
 * Step id that must never be accepted: on an ordinary object,
 * `obj["__proto__"] = value` replaces the object's prototype instead
 * of creating an own property, and seeded ids are later used as keys
 * in exactly that way.
 */
const SEED_RESULTS_RESERVED_STEP_ID = "__proto__"

/**
 * Strict structural validation for caller-supplied `seedResults`
 * (`POST /api/runs/:id/resume`). Seeded entries are written verbatim
 * into the new run's journal as already-completed steps, so they let
 * the caller assert "this step ran and produced this output" — they
 * must be gated behind an operator credential AND shape-checked:
 * a plain (non-array) object with at most 256 entries, whose step ids
 * are 1-200 characters, free of control characters and not
 * `__proto__`, and whose values are each JSON-serializable within
 * 1,000,000 serialized characters (a per-entry limit, not a total).
 *
 * @param value untrusted candidate for `seedResults`
 * @returns `{ ok: true, seedResults }` carrying the same object that
 *   was passed in, or `{ ok: false, message }` describing the first
 *   violation found
 */
export function validateSeedResults(value: unknown): SeedResultsValidation {
  if (typeof value !== "object" || value === null || Array.isArray(value)) {
    return { ok: false, message: "seedResults must be an object of stepId → output" }
  }
  const entries = Object.entries(value as Record<string, unknown>)
  if (entries.length > SEED_RESULTS_MAX_ENTRIES) {
    return {
      ok: false,
      message: `seedResults may contain at most ${SEED_RESULTS_MAX_ENTRIES} entries`,
    }
  }
  for (const [stepId, output] of entries) {
    if (stepId.length === 0 || stepId.length > SEED_RESULTS_MAX_STEP_ID_LENGTH) {
      return {
        ok: false,
        message: `seedResults step ids must be 1-${SEED_RESULTS_MAX_STEP_ID_LENGTH} characters`,
      }
    }
    if (CONTROL_CHARS.test(stepId)) {
      return { ok: false, message: "seedResults step ids must not contain control characters" }
    }
    // `JSON.parse` produces "__proto__" as an own key, so it does show
    // up in `Object.entries` and has to be rejected explicitly.
    if (stepId === SEED_RESULTS_RESERVED_STEP_ID) {
      return {
        ok: false,
        message: `seedResults step id "${SEED_RESULTS_RESERVED_STEP_ID}" is reserved`,
      }
    }
    let serialized: string | undefined
    try {
      // Throws for circular structures and BigInt values.
      serialized = JSON.stringify(output)
    } catch {
      return { ok: false, message: `seedResults["${stepId}"] is not JSON-serializable` }
    }
    // `JSON.stringify` yields `undefined` for values with no JSON form
    // (undefined, functions, symbols).
    if (serialized === undefined) {
      return { ok: false, message: `seedResults["${stepId}"] is not JSON-serializable` }
    }
    if (serialized.length > SEED_RESULTS_MAX_SERIALIZED_CHARS) {
      return {
        ok: false,
        message: `seedResults["${stepId}"] exceeds the ${SEED_RESULTS_MAX_SERIALIZED_CHARS}-character serialized limit`,
      }
    }
  }
  return { ok: true, seedResults: value as Record<string, unknown> }
}
146
+
147
/**
 * Id of the first step in the parent's journal whose status is
 * `"err"`, in `Object.entries` order, or `undefined` when there is none.
 */
function findFirstFailedStep(parent: RunRecord): string | undefined {
  const failed = Object.entries(parent.journal.stepResults).find(
    ([, entry]) => entry.status === "err",
  )
  return failed?.[0]
}
153
+
154
/**
 * Journal entry for a seeded step: a successful first attempt that
 * starts and finishes at the same instant `at`.
 */
function seededStepEntry(output: unknown, at: number): StepJournalEntry {
  const entry: StepJournalEntry = {
    attempt: 1,
    status: "ok",
    output,
    startedAt: at,
    finishedAt: at,
  }
  return entry
}