@voyant-travel/workflows-orchestrator 0.107.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/NOTICE +52 -0
- package/README.md +76 -0
- package/dist/abort-registry.d.ts +6 -0
- package/dist/abort-registry.d.ts.map +1 -0
- package/dist/abort-registry.js +37 -0
- package/dist/concurrency.d.ts +31 -0
- package/dist/concurrency.d.ts.map +1 -0
- package/dist/concurrency.js +145 -0
- package/dist/drive.d.ts +67 -0
- package/dist/drive.d.ts.map +1 -0
- package/dist/drive.js +373 -0
- package/dist/driver-inmemory.d.ts +30 -0
- package/dist/driver-inmemory.d.ts.map +1 -0
- package/dist/driver-inmemory.js +394 -0
- package/dist/event-router.d.ts +51 -0
- package/dist/event-router.d.ts.map +1 -0
- package/dist/event-router.js +68 -0
- package/dist/http-step-handler.d.ts +25 -0
- package/dist/http-step-handler.d.ts.map +1 -0
- package/dist/http-step-handler.js +78 -0
- package/dist/in-memory-store.d.ts +5 -0
- package/dist/in-memory-store.d.ts.map +1 -0
- package/dist/in-memory-store.js +41 -0
- package/dist/index.d.ts +13 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +22 -0
- package/dist/journal-helpers.d.ts +3 -0
- package/dist/journal-helpers.d.ts.map +1 -0
- package/dist/journal-helpers.js +9 -0
- package/dist/orchestrator.d.ts +116 -0
- package/dist/orchestrator.d.ts.map +1 -0
- package/dist/orchestrator.js +411 -0
- package/dist/resume-run.d.ts +40 -0
- package/dist/resume-run.d.ts.map +1 -0
- package/dist/resume-run.js +119 -0
- package/dist/schedule.d.ts +51 -0
- package/dist/schedule.d.ts.map +1 -0
- package/dist/schedule.js +243 -0
- package/dist/testing/driver-compliance.d.ts +58 -0
- package/dist/testing/driver-compliance.d.ts.map +1 -0
- package/dist/testing/driver-compliance.js +667 -0
- package/dist/types.d.ts +182 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +4 -0
- package/package.json +51 -0
- package/src/__tests__/orchestrator-test-support.ts +18 -0
- package/src/abort-registry.ts +41 -0
- package/src/concurrency.ts +217 -0
- package/src/drive.ts +477 -0
- package/src/driver-inmemory.ts +511 -0
- package/src/event-router.ts +120 -0
- package/src/http-step-handler.ts +112 -0
- package/src/in-memory-store.ts +44 -0
- package/src/index.ts +73 -0
- package/src/journal-helpers.ts +11 -0
- package/src/orchestrator.ts +527 -0
- package/src/resume-run.ts +162 -0
- package/src/schedule.ts +310 -0
- package/src/testing/driver-compliance.ts +800 -0
- package/src/types.ts +201 -0
|
@@ -0,0 +1,527 @@
|
|
|
1
|
+
// agent-quality: file-size exception -- owner: workflows-orchestrator; existing module stays co-located until a dedicated split preserves behavior and tests.
|
|
2
|
+
// Public entry points for the reference orchestrator.
|
|
3
|
+
//
|
|
4
|
+
// `trigger()` creates a RunRecord, drives it forward through the
|
|
5
|
+
// tenant handler, and persists the resulting record.
|
|
6
|
+
// `resume()` accepts a waitpoint injection for a parked run, applies
|
|
7
|
+
// it, drives forward, and persists.
|
|
8
|
+
// `cancel()` closes out a run without running compensations (they
|
|
9
|
+
// must come from the tenant handler, not the orchestrator).
|
|
10
|
+
|
|
11
|
+
import type { Duration } from "@voyant-travel/workflows"
|
|
12
|
+
|
|
13
|
+
import { registerRunAbort, signalRunAbort, unregisterRunAbort } from "./abort-registry.js"
|
|
14
|
+
import { applyWaitpointInjection, type DriveOptions, driveUntilPaused } from "./drive.js"
|
|
15
|
+
import { emptyJournal } from "./journal-helpers.js"
|
|
16
|
+
import type {
|
|
17
|
+
JournalSlice,
|
|
18
|
+
RunRecord,
|
|
19
|
+
RunRecordStore,
|
|
20
|
+
RunTrigger,
|
|
21
|
+
StepHandler,
|
|
22
|
+
WaitpointInjection,
|
|
23
|
+
} from "./types.js"
|
|
24
|
+
|
|
25
|
+
/** Everything `trigger()` needs to create (and usually start) a new run. */
export interface TriggerArgs {
  /** Workflow to run; stored on the record as `workflowId`. */
  workflowId: string
  /** Workflow version; stored on the record as `workflowVersion`. */
  workflowVersion: string
  /** Run input; stored on the record unchanged. */
  input: unknown
  /** Tenant metadata; stored on the record unchanged. */
  tenantMeta: RunRecord["tenantMeta"]
  /** Target environment. `trigger()` falls back to `"development"` when omitted. */
  environment?: RunRecord["environment"]
  /** What started the run. `trigger()` falls back to `{ kind: "api" }` when omitted. */
  triggeredBy?: RunTrigger
  /** Tags for the run. `trigger()` falls back to an empty list when omitted. */
  tags?: string[]
  /** Stored as `runMeta.number`. `trigger()` falls back to `1` when omitted. */
  runNumber?: number
  /**
   * Explicit run id. When omitted, the id is derived from
   * `idempotencyKey` if one is given, otherwise taken from
   * `deps.idGenerator`, otherwise a non-cryptographic
   * `run_<timestamp>_<random>` default is generated.
   */
  runId?: string
  /**
   * Caller-supplied idempotency token. When set and the caller did not
   * explicitly pass `runId`, the orchestrator derives a deterministic
   * runId (`idem-<workflowId>-<idempotencyKey>`) so retries of the same
   * trigger return the same run record without re-driving. The key is
   * also persisted onto `RunRecord.idempotencyKey` so persistent stores
   * can populate dedup columns / unique indexes natively.
   *
   * See architecture doc §15.2 for the full ingest-side derivation
   * (eventId → idempotencyKey via `${filterId}:${eventId}`).
   */
  idempotencyKey?: string
  /**
   * Optional trigger-time delay. When it resolves to a future instant,
   * the run is persisted in `waiting` on a synthetic DATETIME waitpoint
   * and execution is left to the normal wakeup/time-wheel path. A delay
   * that does not resolve to a future instant is ignored and the run
   * starts immediately.
   */
  delay?: Duration | Date
  /** Higher values are claimed first by scheduler/time-wheel stores. */
  priority?: number
  /**
   * Optional journal seed, deep-copied onto the new record. Used by
   * external replay/resume callers that need a new run to skip steps
   * already completed by a parent run.
   */
  initialJournal?: JournalSlice
  /**
   * Metadata cursor paired with `initialJournal` (defaults to `0`).
   * Resume callers that seed a journal with an existing `metadataState`
   * must also seed the positional cursor so replayed metadata mutations
   * are not applied twice.
   */
  initialMetadataAppliedCount?: number
  /**
   * Compute-time budget in ms, typically from `WorkflowConfig.timeout`.
   * Parked time on waitpoints does not count against this. When the
   * cumulative invocation time exceeds it, the run ends `failed`
   * with `code: "WORKFLOW_TIMEOUT"`. Undefined / 0 = no limit.
   */
  timeoutMs?: number
  /**
   * For child runs spawned by `ctx.invoke` on a parent that may park.
   * When set, the orchestrator records it on the child's record so
   * the child's terminal status cascade-resumes the parent.
   */
  parent?: { runId: string; waitpointId: string }
  /**
   * Internal lifecycle hook used by driver-level coordinators that need
   * the persisted run id before the first invocation starts. Called once
   * the new record has been stored; not called when an idempotent
   * trigger finds an existing record.
   */
  onRunRecordCreated?: (record: RunRecord) => void
}
|
|
88
|
+
|
|
89
|
+
/**
 * Dependencies shared by every orchestrator entry point. Extends
 * `DriveOptions`, so the whole object is spread into the options handed
 * to the drive loop.
 */
export interface OrchestratorDeps extends DriveOptions {
  /** Persistence used to read, insert, and save run records. */
  store: RunRecordStore
  /** Tenant step handler forwarded to the drive loop. */
  handler: StepHandler
  /**
   * Run id generator. Only consulted when the trigger supplies neither
   * `runId` nor `idempotencyKey`; defaults to `run_<timestamp>_<random>`.
   */
  idGenerator?: () => string
}
|
|
95
|
+
|
|
96
|
+
/**
 * Create a run record, persist it, and (unless delayed) drive it forward.
 *
 * Run id precedence: explicit `args.runId`, then an id derived from
 * `(workflowId, idempotencyKey)`, then `deps.idGenerator`, then the
 * built-in default. When the id is caller-controlled (explicit or
 * derived) the record goes through `store.tryInsert` so that concurrent
 * triggers with the same id never both drive; the loser gets the
 * already-stored record back untouched.
 *
 * @param args - Description of the run to create.
 * @param deps - Store, handler, clock, and drive options.
 * @returns The existing record for a duplicate idempotent trigger, the
 *   freshly persisted `waiting` record for a delayed trigger, or the
 *   record as saved after the drive loop paused or finished.
 */
export async function trigger(args: TriggerArgs, deps: OrchestratorDeps): Promise<RunRecord> {
  const clock = deps.now ?? (() => Date.now())
  const createdAt = clock()

  const { runId: explicitRunId, idempotencyKey } = args
  const derivedRunId =
    idempotencyKey === undefined ? undefined : `idem-${args.workflowId}-${idempotencyKey}`
  const id = explicitRunId ?? derivedRunId ?? deps.idGenerator?.() ?? defaultRunId(clock)
  const callerControlsId = explicitRunId !== undefined || derivedRunId !== undefined

  // Only a delay that lands strictly in the future yields a wake time.
  const wakeAt = resolveDelayWakeAt(args.delay, createdAt)

  // A delayed run parks on one synthetic DATETIME waitpoint.
  const pendingWaitpoints: RunRecord["pendingWaitpoints"] =
    wakeAt === undefined
      ? []
      : [
          {
            clientWaitpointId: triggerDelayWaitpointId(id),
            kind: "DATETIME",
            meta: {
              wakeAt,
              durationMs: Math.max(0, wakeAt - createdAt),
              source: "trigger.delay",
            },
          },
        ]

  const record: RunRecord = {
    id,
    workflowId: args.workflowId,
    workflowVersion: args.workflowVersion,
    status: wakeAt === undefined ? "running" : "waiting",
    input: args.input,
    journal: args.initialJournal ? cloneJournal(args.initialJournal) : emptyJournal(),
    invocationCount: 0,
    metadataAppliedCount: args.initialMetadataAppliedCount ?? 0,
    computeTimeMs: 0,
    timeoutMs: args.timeoutMs,
    priority: args.priority,
    parent: args.parent,
    pendingWaitpoints,
    streams: {},
    startedAt: wakeAt ?? createdAt,
    triggeredBy: args.triggeredBy ?? { kind: "api" },
    tags: args.tags ?? [],
    environment: args.environment ?? "development",
    tenantMeta: args.tenantMeta,
    runMeta: { number: args.runNumber ?? 1, attempt: 1 },
    idempotencyKey,
  }

  if (callerControlsId) {
    // Atomic check-or-insert: a duplicate trigger must not re-drive,
    // because driving is what fires side effects.
    const inserted = await deps.store.tryInsert(record)
    if (!inserted.created) {
      return inserted.record
    }
  } else {
    // Persist up-front so a concurrent `cancel(runId)` can find the run
    // before any invocation has completed.
    await deps.store.save(record)
  }
  args.onRunRecordCreated?.(record)

  // Delayed runs stay parked; the wakeup path picks them up later.
  if (wakeAt !== undefined) {
    return record
  }

  const abortCtrl = registerRunAbort(id)
  try {
    await driveUntilPaused(record, {
      ...driveOptionsFor(deps),
      signal: abortCtrl.signal,
    })
  } finally {
    unregisterRunAbort(id)
  }

  // A cancel that fired mid-drive may have made the aborted step surface
  // as `failed`. The caller's intent was cancel, so adopt the stored
  // `cancelled` outcome instead of overwriting it.
  if (abortCtrl.signal.aborted) {
    const stored = await deps.store.get(id)
    if (stored?.status === "cancelled") {
      record.status = "cancelled"
      record.completedAt = stored.completedAt ?? clock()
      record.pendingWaitpoints = []
      record.error = stored.error
    }
  }
  return deps.store.save(record)
}
|
|
190
|
+
|
|
191
|
+
/**
 * Turn a trigger delay into an absolute wake time in epoch ms.
 *
 * @param delay - Absolute `Date`, relative duration, or `undefined`.
 * @param now - Reference instant in epoch ms.
 * @returns The wake time when it is strictly after `now`; otherwise
 *   `undefined` (no delay, a past/present instant, or an invalid date).
 */
function resolveDelayWakeAt(delay: Duration | Date | undefined, now: number): number | undefined {
  if (delay === undefined) return undefined

  let target: number
  if (delay instanceof Date) {
    target = delay.getTime()
  } else {
    target = now + durationToMs(delay)
  }

  if (target > now) return target
  return undefined
}
|
|
196
|
+
|
|
197
|
+
/**
 * Convert a duration to milliseconds.
 *
 * @param duration - A number (returned as-is) or a string made of whole
 *   digits followed by one of `ms`, `s`, `m`, `h`, `d`, `w`.
 * @returns The duration in milliseconds.
 * @throws Error when a string does not match the `<digits><unit>` form.
 */
function durationToMs(duration: Duration): number {
  if (typeof duration === "number") return duration

  const parsed = /^(\d+)(ms|s|m|h|d|w)$/.exec(duration)
  if (parsed === null) throw new Error(`invalid duration: ${String(duration)}`)

  const [, digits, unit] = parsed
  const count = Number(digits)

  // Milliseconds per supported unit suffix.
  const msPerUnit: Record<string, number> = {
    ms: 1,
    s: 1_000,
    m: 60_000,
    h: 3_600_000,
    d: 86_400_000,
    w: 604_800_000,
  }
  const factor = unit === undefined ? undefined : msPerUnit[unit]
  if (factor === undefined) {
    // The pattern above only admits known units; kept as a safety net.
    throw new Error(`invalid duration unit: ${unit}`)
  }
  return count * factor
}
|
|
219
|
+
|
|
220
|
+
/** Id of the synthetic waitpoint that a delayed trigger parks run `runId` on. */
function triggerDelayWaitpointId(runId: string): string {
  const prefix = "trigger-delay:"
  return prefix.concat(runId)
}
|
|
223
|
+
|
|
224
|
+
/** Deep-copy a journal so later mutations never reach the caller's object. */
function cloneJournal(journal: JournalSlice): JournalSlice {
  const copy = structuredClone(journal) as JournalSlice
  return copy
}
|
|
227
|
+
|
|
228
|
+
/**
 * Assemble the `DriveOptions` used for every drive started from this
 * module. Starts from `deps` and adds two hooks:
 *
 * - `triggerChild` starts a child run through `trigger()` with the same
 *   deps, so `ctx.invoke(child, ...)` reuses this orchestrator, store,
 *   and handler.
 * - `beforeInvocation` stops the drive if the stored record has been
 *   cancelled, and otherwise persists mid-flight progress.
 */
function driveOptionsFor(deps: OrchestratorDeps): DriveOptions {
  const clock = deps.now ?? (() => Date.now())
  return {
    ...deps,
    triggerChild: async ({ parent, waitpoint }) => {
      const meta = waitpoint.meta
      const childWorkflowId = String(meta.childWorkflowId)
      const isDetached = meta.detach === true
      const childArgs: TriggerArgs = {
        workflowId: childWorkflowId,
        // Children inherit the parent's workflow version slot unless
        // lockToVersion is set; for v1 we always inherit.
        workflowVersion: parent.workflowVersion,
        input: meta.childInput,
        tenantMeta: parent.tenantMeta,
        environment: parent.environment,
        // Inherit the parent's trigger kind for run-tree observability.
        triggeredBy: parent.triggeredBy,
        tags: Array.isArray(meta.tags) ? (meta.tags as string[]) : [],
        // Lineage pointer: if this child parks, its terminal status
        // cascade-resumes the parent at this waitpoint. Detached
        // children carry no pointer.
        parent: isDetached
          ? undefined
          : { runId: parent.id, waitpointId: waitpoint.clientWaitpointId },
      }
      return trigger(childArgs, deps)
    },
    beforeInvocation: async (rec) => {
      // Read before writing: a concurrent `cancel()` may have flipped the
      // stored status, and saving first would overwrite it.
      const stored = await deps.store.get(rec.id)
      if (!stored || stored.status !== "cancelled") {
        // Persist mid-flight progress so the dashboard sees updates and
        // the next concurrent cancel() has an up-to-date target.
        await deps.store.save(rec)
        return true
      }
      rec.status = "cancelled"
      rec.completedAt = stored.completedAt ?? clock()
      rec.pendingWaitpoints = []
      if (stored.error) rec.error = stored.error
      return false
    },
  }
}
|
|
282
|
+
|
|
283
|
+
/** Arguments for `resume()`. */
export interface ResumeArgs {
  /** Id of the parked run to resume. */
  runId: string
  /** Waitpoint resolution to apply to the run before driving it. */
  injection: WaitpointInjection
}
|
|
287
|
+
|
|
288
|
+
/**
 * Apply a waitpoint injection to a parked run, drive it forward, and
 * persist the result.
 *
 * @param args - Target run id and the injection to apply.
 * @param deps - Store, handler, clock, and drive options.
 * @returns `{ ok: true, record }` with the saved record, or
 *   `{ ok: false, ... }` with status `not_found` (no such run),
 *   `not_parked` (run is not `waiting`), or `no_match` (the injection
 *   was rejected by `applyWaitpointInjection`).
 */
export async function resume(
  args: ResumeArgs,
  deps: OrchestratorDeps,
): Promise<
  | { ok: true; record: RunRecord }
  | { ok: false; status: "not_found" | "not_parked" | "no_match"; message: string }
> {
  const run = await deps.store.get(args.runId)
  if (!run) {
    return { ok: false, status: "not_found", message: `run ${args.runId} not found` }
  }
  if (run.status !== "waiting") {
    const message = `run ${args.runId} is not parked (status: ${run.status})`
    return { ok: false, status: "not_parked", message }
  }

  const applied = applyWaitpointInjection(run, args.injection, deps.now)
  if (!applied.ok) {
    return { ok: false, status: "no_match", message: applied.message }
  }

  const abortCtrl = registerRunAbort(run.id)
  try {
    await driveUntilPaused(run, {
      ...driveOptionsFor(deps),
      signal: abortCtrl.signal,
    })
  } finally {
    unregisterRunAbort(run.id)
  }

  // If a cancel landed while driving, keep the stored `cancelled` outcome
  // rather than whatever status the aborted drive produced.
  if (abortCtrl.signal.aborted) {
    const stored = await deps.store.get(run.id)
    if (stored?.status === "cancelled") {
      const clock = deps.now ?? (() => Date.now())
      run.status = "cancelled"
      run.completedAt = stored.completedAt ?? clock()
      run.pendingWaitpoints = []
      run.error = stored.error
    }
  }

  const saved = await deps.store.save(run)
  // A child run that just reached a terminal state resolves its parent's
  // waitpoint.
  if (isTerminalStatus(saved.status)) {
    await cascadeResumeParent(saved, deps)
  }
  return { ok: true, record: saved }
}
|
|
337
|
+
|
|
338
|
+
/** Arguments for `resumeDueAlarms()`. */
export interface ResumeDueAlarmsArgs {
  /** Id of the run whose due DATETIME waitpoints should be resolved. */
  runId: string
}
|
|
341
|
+
|
|
342
|
+
/**
 * Resolve every DATETIME waitpoint whose `wakeAt` has passed, drive
 * the run forward, and persist. Returns the saved record, or null
 * when the run does not exist, isn't in `waiting` state (already
 * terminal / running elsewhere), or when no DATETIME waitpoints are
 * actually due yet — all are no-ops that the caller can treat as
 * "nothing to do."
 *
 * Callers (local serve loop, CF DO alarm handler) are responsible for
 * scheduling the actual wake-up timer. This function is transport-
 * agnostic: given `now()`, it does the resolve + drive + save.
 *
 * If a concurrent `cancel()` aborts the run while it is being driven,
 * the stored `cancelled` outcome is adopted before the final save, the
 * same way `trigger()` and `resume()` do, so the cancellation is not
 * overwritten by the status the aborted drive produced.
 *
 * @param args - Id of the run to wake.
 * @param deps - Store, handler, clock, and drive options.
 * @returns The saved record, or `null` when there was nothing to do.
 */
export async function resumeDueAlarms(
  args: ResumeDueAlarmsArgs,
  deps: OrchestratorDeps,
): Promise<RunRecord | null> {
  const record = await deps.store.get(args.runId)
  if (!record) return null
  if (record.status !== "waiting") return null

  const now = deps.now ?? (() => Date.now())
  const at = now()

  // Split pending waitpoints into due DATETIME alarms (resolved into the
  // journal) and everything else (left pending).
  const stillPending: typeof record.pendingWaitpoints = []
  let resolvedAny = false
  for (const wp of record.pendingWaitpoints) {
    const wakeAt = typeof wp.meta.wakeAt === "number" ? wp.meta.wakeAt : undefined
    if (wp.kind === "DATETIME" && wakeAt !== undefined && wakeAt <= at) {
      record.journal.waitpointsResolved[wp.clientWaitpointId] = {
        kind: "DATETIME",
        resolvedAt: at,
        source: "replay",
      }
      resolvedAny = true
    } else {
      stillPending.push(wp)
    }
  }
  if (!resolvedAny) return null

  record.pendingWaitpoints = stillPending
  if (record.pendingWaitpoints.length === 0) record.status = "running"

  const abortCtrl = registerRunAbort(record.id)
  try {
    await driveUntilPaused(record, {
      ...driveOptionsFor(deps),
      signal: abortCtrl.signal,
    })
  } finally {
    unregisterRunAbort(record.id)
  }

  // An external cancel during the drive may have made the aborted step
  // surface as `failed`. The caller's intent was cancel, so adopt the
  // store's `cancelled` status instead of overwriting it below.
  if (abortCtrl.signal.aborted) {
    const latest = await deps.store.get(record.id)
    if (latest?.status === "cancelled") {
      record.status = "cancelled"
      record.completedAt = latest.completedAt ?? now()
      record.pendingWaitpoints = []
      record.error = latest.error
    }
  }

  const saved = await deps.store.save(record)
  if (isTerminalStatus(saved.status)) {
    await cascadeResumeParent(saved, deps)
  }
  return saved
}
|
|
395
|
+
|
|
396
|
+
/** Arguments for `cancel()`. */
export interface CancelArgs {
  /** Id of the run to cancel. */
  runId: string
  /**
   * Optional explanation. When non-empty it is stored as the run's
   * `CANCELLED` error message and passed along with the abort signal.
   */
  reason?: string
}
|
|
400
|
+
|
|
401
|
+
/**
 * Mark a `waiting` or `running` run as cancelled, persist that, and
 * signal any in-process drive to abort.
 *
 * @param args - Target run id and optional reason.
 * @param deps - Store, clock, and drive options.
 * @returns `{ ok: true, record }` with the saved record, or
 *   `{ ok: false, ... }` with status `not_found` (no such run) or
 *   `already_terminal` (run is neither `waiting` nor `running`).
 */
export async function cancel(
  args: CancelArgs,
  deps: OrchestratorDeps,
): Promise<
  | { ok: true; record: RunRecord }
  | { ok: false; status: "not_found" | "already_terminal"; message: string }
> {
  const { runId, reason } = args
  const run = await deps.store.get(runId)
  if (!run) {
    return { ok: false, status: "not_found", message: `run ${runId} not found` }
  }

  const isLive = run.status === "waiting" || run.status === "running"
  if (!isLive) {
    const message = `run ${runId} is already terminal (status: ${run.status})`
    return { ok: false, status: "already_terminal", message }
  }

  const clock = deps.now ?? (() => Date.now())
  run.status = "cancelled"
  run.completedAt = clock()
  run.pendingWaitpoints = []
  if (reason) {
    run.error = { category: "USER_ERROR", code: "CANCELLED", message: reason }
  }
  const saved = await deps.store.save(run)

  // Best-effort mid-step abort: if the run is in flight in this process,
  // fire its AbortSignal so step bodies observing `ctx.signal` stop
  // immediately. When no controller is registered (another process, or
  // the drive already exited) the persisted status flip and the
  // between-invocation recheck cover that path.
  signalRunAbort(run.id, reason)

  // A cancelled child with a parked parent surfaces to the parent as a
  // RUN-waitpoint error.
  if (isTerminalStatus(saved.status)) {
    await cascadeResumeParent(saved, deps)
  }
  return { ok: true, record: saved }
}
|
|
445
|
+
|
|
446
|
+
/**
 * When a child run reaches a terminal state, look up its `parent`
 * pointer and resume the parent's matching RUN waitpoint with the
 * child's output / error. Best-effort: if the child has no parent
 * pointer, or the parent can't be found, is no longer parked, or has
 * no matching pending waitpoint, this silently does nothing.
 *
 * The parent is then re-driven and saved; if that takes the parent to a
 * terminal state, the cascade continues to the parent's own parent.
 *
 * If a concurrent `cancel()` aborts the parent while it is being
 * re-driven, the stored `cancelled` outcome is adopted before saving,
 * the same way `trigger()` and `resume()` do, so the cancellation is
 * not overwritten by the status the aborted drive produced.
 *
 * @param child - Child run that has reached a terminal status.
 * @param deps - Store, handler, clock, and drive options.
 */
async function cascadeResumeParent(child: RunRecord, deps: OrchestratorDeps): Promise<void> {
  // Capture the pointer once so it stays narrowed inside the closure below.
  const link = child.parent
  if (!link) return
  const parent = await deps.store.get(link.runId)
  if (!parent) return
  if (parent.status !== "waiting") return
  const wpIdx = parent.pendingWaitpoints.findIndex(
    (w) => w.clientWaitpointId === link.waitpointId,
  )
  if (wpIdx < 0) return

  const now = deps.now ?? (() => Date.now())
  const at = now()
  if (child.status === "completed") {
    parent.journal.waitpointsResolved[link.waitpointId] = {
      kind: "RUN",
      resolvedAt: at,
      payload: child.output,
      source: "replay",
    }
  } else {
    // Any other terminal status reaches the parent as an error, using
    // the child's own error details where present.
    parent.journal.waitpointsResolved[link.waitpointId] = {
      kind: "RUN",
      resolvedAt: at,
      source: "replay",
      error: {
        category:
          (child.error?.category as "USER_ERROR" | "RUNTIME_ERROR" | undefined) ?? "USER_ERROR",
        code: child.error?.code ?? "CHILD_RUN_ENDED",
        message: child.error?.message ?? `child run ended with status ${child.status}`,
      },
    }
  }
  parent.pendingWaitpoints.splice(wpIdx, 1)
  if (parent.pendingWaitpoints.length === 0) {
    parent.status = "running"
  }

  // Re-drive the parent. This nested drive goes through the same
  // handler / store / hooks, so the parent's own parent (if any)
  // will also cascade-resume when appropriate.
  const abortCtrl = registerRunAbort(parent.id)
  try {
    await driveUntilPaused(parent, {
      ...driveOptionsFor(deps),
      signal: abortCtrl.signal,
    })
  } finally {
    unregisterRunAbort(parent.id)
  }

  // An external cancel of the parent during this drive may have made the
  // aborted step surface as `failed`; keep the store's `cancelled`
  // status instead of overwriting it with the save below.
  if (abortCtrl.signal.aborted) {
    const latest = await deps.store.get(parent.id)
    if (latest?.status === "cancelled") {
      parent.status = "cancelled"
      parent.completedAt = latest.completedAt ?? now()
      parent.pendingWaitpoints = []
      parent.error = latest.error
    }
  }

  await deps.store.save(parent)
  // The parent might itself have a parent — recurse.
  if (isTerminalStatus(parent.status)) {
    await cascadeResumeParent(parent, deps)
  }
}
|
|
508
|
+
|
|
509
|
+
/**
 * Whether `s` names a run status treated as final: `completed`,
 * `failed`, `cancelled`, `compensated`, or `compensation_failed`.
 */
function isTerminalStatus(s: string): boolean {
  switch (s) {
    case "completed":
    case "failed":
    case "cancelled":
    case "compensated":
    case "compensation_failed":
      return true
    default:
      return false
  }
}
|
|
518
|
+
|
|
519
|
+
/**
 * Fallback run id of the form `run_<timestamp>_<random>`, with both
 * parts in base 36.
 *
 * Non-cryptographic: the random part comes from `Math.random()`.
 * Callers that want stronger guarantees supply `idGenerator`.
 *
 * @param now - Clock returning the current time in epoch ms.
 */
function defaultRunId(now: () => number): string {
  const timestampPart = now().toString(36)
  const randomInt = Math.floor(Math.random() * 1_000_000)
  const randomPart = randomInt.toString(36).padStart(4, "0")
  return ["run", timestampPart, randomPart].join("_")
}
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
import { emptyJournal } from "./journal-helpers.js"
|
|
2
|
+
import type { JournalSlice, RunRecord, StepJournalEntry } from "./types.js"
|
|
3
|
+
|
|
4
|
+
/** Input to `buildResumeJournal()`. */
export interface BuildResumeJournalInput {
  /** Run whose journal and metadata cursor seed the new journal. */
  parent: RunRecord
  /**
   * Step to resume from. Defaults to the first step in the parent's
   * journal whose status is `"err"`.
   */
  resumeFromStep?: string
  /**
   * Explicit step outputs. When given, the journal is built from these
   * instead of copying the parent's recorded step results.
   */
  seedResults?: Record<string, unknown>
  /** Clock used to timestamp seeded entries; defaults to `Date.now`. */
  now?: () => number
}
|
|
10
|
+
|
|
11
|
+
/** Result shared by `buildResumeJournal()` and `buildSeededResumeJournal()`. */
export interface BuildResumeJournalResult {
  /** The step the new run resumes from. */
  resumeFromStep: string
  /** Journal to seed the new run with. */
  journal: JournalSlice
  /** Metadata cursor to seed alongside `journal`. */
  metadataAppliedCount: number
}
|
|
16
|
+
|
|
17
|
+
/** Input to `buildSeededResumeJournal()`. */
export interface BuildSeededResumeJournalInput {
  /** Id of the run being resumed (not read by `buildSeededResumeJournal` itself). */
  parentRunId: string
  /** Step the new run resumes from; returned unchanged in the result. */
  resumeFromStep: string
  /** Step id → output; each entry is journaled as a successful step. */
  seedResults: Record<string, unknown>
  /** Metadata state to deep-copy into the journal; defaults to `{}`. */
  metadataState?: Record<string, unknown>
  /** Metadata cursor to return; defaults to `0`. */
  metadataAppliedCount?: number
  /** Clock used to timestamp seeded entries; defaults to `Date.now`. */
  now?: () => number
}
|
|
25
|
+
|
|
26
|
+
/**
 * Build the journal for a new run that resumes `input.parent`.
 *
 * With `seedResults`, delegates to `buildSeededResumeJournal` using the
 * parent's metadata state and cursor. Otherwise copies the parent's
 * recorded step results, in journal order, up to (not including)
 * `resumeFromStep`.
 *
 * @param input - Parent run plus optional resume point, seeds, and clock.
 * @returns The resume step, the seeded journal, and the metadata cursor.
 * @throws Error when no resume step is given and the parent has no
 *   failed step, or when a step before the resume point is not `"ok"`.
 */
export function buildResumeJournal(input: BuildResumeJournalInput): BuildResumeJournalResult {
  const { parent } = input
  const resumeFromStep = input.resumeFromStep ?? findFirstFailedStep(parent)
  if (!resumeFromStep) {
    throw new Error(
      `run "${parent.id}" has no failed step; pass resumeFromStep explicitly to resume it`,
    )
  }

  const journal = emptyJournal()
  journal.metadataState = structuredClone(parent.journal.metadataState) as Record<
    string,
    unknown
  >

  if (input.seedResults) {
    return buildSeededResumeJournal({
      parentRunId: parent.id,
      resumeFromStep,
      seedResults: input.seedResults,
      metadataState: journal.metadataState,
      metadataAppliedCount: parent.metadataAppliedCount,
      now: input.now,
    })
  }

  // Everything recorded ahead of the resume point is carried over; when
  // the resume step is not in the journal, every recorded step is.
  const recorded = Object.entries(parent.journal.stepResults)
  const cut = recorded.findIndex(([stepId]) => stepId === resumeFromStep)
  const carried = cut < 0 ? recorded : recorded.slice(0, cut)
  for (const [stepId, entry] of carried) {
    if (entry.status !== "ok") {
      throw new Error(
        `step "${stepId}" completed before "${resumeFromStep}" but is not successful; cannot seed resume journal`,
      )
    }
    journal.stepResults[stepId] = structuredClone(entry) as StepJournalEntry
  }

  return {
    resumeFromStep,
    journal,
    metadataAppliedCount: parent.metadataAppliedCount,
  }
}
|
|
67
|
+
|
|
68
|
+
/**
 * Build a journal in which every entry of `input.seedResults` appears as
 * an already-successful step.
 *
 * Seeded entries are timestamped from the clock reading taken once at
 * the start, incremented by one per entry in iteration order.
 *
 * @param input - Seed outputs plus optional metadata state, cursor, and clock.
 * @returns The resume step (unchanged), the seeded journal, and the
 *   metadata cursor (`0` when not supplied).
 */
export function buildSeededResumeJournal(
  input: BuildSeededResumeJournalInput,
): BuildResumeJournalResult {
  const journal = emptyJournal()
  if (input.metadataState) {
    journal.metadataState = structuredClone(input.metadataState) as Record<string, unknown>
  } else {
    journal.metadataState = {}
  }

  const clock = input.now ?? (() => Date.now())
  let stamp = clock()
  for (const [stepId, output] of Object.entries(input.seedResults)) {
    // Post-increment so each entry gets the next timestamp.
    journal.stepResults[stepId] = seededStepEntry(output, stamp++)
  }

  return {
    resumeFromStep: input.resumeFromStep,
    journal,
    metadataAppliedCount: input.metadataAppliedCount ?? 0,
  }
}
|
|
87
|
+
|
|
88
|
+
/** Outcome of `validateSeedResults`: the accepted map, or a rejection message. */
export type SeedResultsValidation =
  | { ok: true; seedResults: Record<string, unknown> }
  | { ok: false; message: string }

/** Maximum number of entries accepted in one `seedResults` map. */
const SEED_RESULTS_MAX_ENTRIES = 256
/** Maximum length of a single step id. */
const SEED_RESULTS_MAX_STEP_ID_LENGTH = 200
/** Maximum length of the JSON serialization of a single entry's output. */
const SEED_RESULTS_MAX_SERIALIZED_CHARS = 1_000_000
/** Matches any ASCII control character (0x00-0x1f, 0x7f). */
// biome-ignore lint/suspicious/noControlCharactersInRegex: rejecting control chars is the point -- owner: workflows-orchestrator; existing suppression is intentional pending typed cleanup.
const CONTROL_CHARS = /[\x00-\x1f\x7f]/

/**
 * Strict structural validation for caller-supplied `seedResults`
 * (`POST /api/runs/:id/resume`). Seeded entries are written verbatim
 * into the new run's journal as already-completed steps, so they let
 * the caller assert "this step ran and produced this output" — they
 * must be gated behind an operator credential AND shape-checked.
 *
 * Accepted shape: a non-array object with at most
 * `SEED_RESULTS_MAX_ENTRIES` entries, whose keys are step ids of
 * 1-`SEED_RESULTS_MAX_STEP_ID_LENGTH` characters with no control
 * characters, and whose values are JSON-serializable with a serialized
 * length of at most `SEED_RESULTS_MAX_SERIALIZED_CHARS` each. The size
 * limit is per entry, not a total.
 *
 * The step id `"__proto__"` is rejected: `JSON.parse` yields it as an
 * ordinary own key, but assigning `map["__proto__"] = entry` on a plain
 * object replaces the map's prototype instead of adding a step.
 *
 * @param value - Untrusted value to validate.
 * @returns `{ ok: true, seedResults }` with the same object when valid,
 *   otherwise `{ ok: false, message }` for the first violation found.
 */
export function validateSeedResults(value: unknown): SeedResultsValidation {
  if (typeof value !== "object" || value === null || Array.isArray(value)) {
    return { ok: false, message: "seedResults must be an object of stepId → output" }
  }
  const entries = Object.entries(value as Record<string, unknown>)
  if (entries.length > SEED_RESULTS_MAX_ENTRIES) {
    return {
      ok: false,
      message: `seedResults may contain at most ${SEED_RESULTS_MAX_ENTRIES} entries`,
    }
  }
  for (const [stepId, output] of entries) {
    if (stepId.length === 0 || stepId.length > SEED_RESULTS_MAX_STEP_ID_LENGTH) {
      return {
        ok: false,
        message: `seedResults step ids must be 1-${SEED_RESULTS_MAX_STEP_ID_LENGTH} characters`,
      }
    }
    if (CONTROL_CHARS.test(stepId)) {
      return { ok: false, message: "seedResults step ids must not contain control characters" }
    }
    if (stepId === "__proto__") {
      // Would replace the journal map's prototype instead of adding a key.
      return { ok: false, message: 'seedResults step ids must not be "__proto__"' }
    }
    let serialized: string | undefined
    try {
      // Throws for BigInt and circular structures.
      serialized = JSON.stringify(output)
    } catch {
      return { ok: false, message: `seedResults["${stepId}"] is not JSON-serializable` }
    }
    // `JSON.stringify` returns undefined for undefined, functions, and symbols.
    if (serialized === undefined) {
      return { ok: false, message: `seedResults["${stepId}"] is not JSON-serializable` }
    }
    if (serialized.length > SEED_RESULTS_MAX_SERIALIZED_CHARS) {
      return {
        ok: false,
        message: `seedResults["${stepId}"] exceeds the ${SEED_RESULTS_MAX_SERIALIZED_CHARS}-character serialized limit`,
      }
    }
  }
  return { ok: true, seedResults: value as Record<string, unknown> }
}
|
|
146
|
+
|
|
147
|
+
/**
 * Id of the first step in `parent`'s journal (in `Object.entries`
 * order) whose status is `"err"`, or `undefined` when there is none.
 */
function findFirstFailedStep(parent: RunRecord): string | undefined {
  const failed = Object.entries(parent.journal.stepResults).find(
    ([, entry]) => entry.status === "err",
  )
  return failed === undefined ? undefined : failed[0]
}
|
|
153
|
+
|
|
154
|
+
/**
 * Journal entry for a seeded step: a first-attempt success carrying
 * `output`, with `at` as both its start and finish time.
 */
function seededStepEntry(output: unknown, at: number): StepJournalEntry {
  const entry: StepJournalEntry = {
    attempt: 1,
    status: "ok",
    output,
    startedAt: at,
    finishedAt: at,
  }
  return entry
}
|