@strav/durable 1.0.0-alpha.9 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +4 -4
- package/src/durable_runner.ts +446 -122
- package/src/durable_workflow.ts +190 -30
- package/src/index.ts +19 -4
- package/src/types.ts +146 -7
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@strav/durable",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.1",
|
|
4
4
|
"description": "Strav durable execution — crash-resumable sequential workflows on top of @strav/queue + Postgres. V1: sequential .step() with retries + saga compensation. V2 adds parallel/route/loop/sleep/waitForSignal.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./src/index.ts",
|
|
@@ -19,9 +19,9 @@
|
|
|
19
19
|
"access": "public"
|
|
20
20
|
},
|
|
21
21
|
"dependencies": {
|
|
22
|
-
"@strav/kernel": "1.0.
|
|
23
|
-
"@strav/database": "1.0.
|
|
24
|
-
"@strav/queue": "1.0.
|
|
22
|
+
"@strav/kernel": "1.0.1",
|
|
23
|
+
"@strav/database": "1.0.1",
|
|
24
|
+
"@strav/queue": "1.0.1"
|
|
25
25
|
},
|
|
26
26
|
"peerDependencies": {
|
|
27
27
|
"@types/bun": ">=1.3.14"
|
package/src/durable_runner.ts
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
* `DurableRunner` — the engine that owns the durable execution state
|
|
3
3
|
* machine.
|
|
4
4
|
*
|
|
5
|
-
*
|
|
5
|
+
* Four load-bearing methods:
|
|
6
6
|
*
|
|
7
7
|
* 1. `start(name, input)` — INSERTs a new run row, dispatches the
|
|
8
8
|
* first `DurableAdvanceJob` for it inside the same transaction
|
|
@@ -11,12 +11,11 @@
|
|
|
11
11
|
* queue.
|
|
12
12
|
*
|
|
13
13
|
* 2. `advance(runId)` — the job handler. Acquires a row lock,
|
|
14
|
-
*
|
|
15
|
-
*
|
|
16
|
-
*
|
|
17
|
-
*
|
|
18
|
-
* kicks off compensation.
|
|
19
|
-
* DB transaction so partial writes can't escape.
|
|
14
|
+
* resolves the node at `current_step`, dispatches by node-type
|
|
15
|
+
* (`step` / `sleep` / `waitForSignal` / `parallel` / `route` /
|
|
16
|
+
* `loop` / `childWorkflow`), journals the result, and either
|
|
17
|
+
* re-enqueues itself, parks the run as `waiting`, or — on
|
|
18
|
+
* terminal failure — kicks off compensation.
|
|
20
19
|
*
|
|
21
20
|
* 3. `compensate(runId)` — walks the journal in reverse order
|
|
22
21
|
* running each step's `compensate` callback. On clean
|
|
@@ -24,27 +23,44 @@
|
|
|
24
23
|
* compensation are logged but don't block the rest of the
|
|
25
24
|
* rollback (compensators must be idempotent).
|
|
26
25
|
*
|
|
26
|
+
* 4. `signal(runId, signalName, payload?)` — wakes a run parked on
|
|
27
|
+
* a `waitForSignal` node. Writes the journal entry, clears the
|
|
28
|
+
* awaiting marker, dispatches an advance.
|
|
29
|
+
*
|
|
27
30
|
* Apps don't usually call `advance` / `compensate` directly — the
|
|
28
31
|
* `DurableAdvanceJob` and `DurableCompensateJob` classes wrap them.
|
|
29
32
|
*/
|
|
30
33
|
|
|
31
|
-
import {
|
|
32
|
-
type Database,
|
|
33
|
-
PostgresDatabase,
|
|
34
|
-
type SchemaRegistry,
|
|
35
|
-
} from '@strav/database'
|
|
34
|
+
import { type Database, PostgresDatabase, type SchemaRegistry } from '@strav/database'
|
|
36
35
|
import { type Logger, ulid } from '@strav/kernel'
|
|
37
36
|
import type { JobClass, Queue } from '@strav/queue'
|
|
38
|
-
import { RunNotFoundError } from './durable_error.ts'
|
|
39
|
-
import type {
|
|
37
|
+
import { DurableError, RunNotFoundError } from './durable_error.ts'
|
|
38
|
+
import type {
|
|
39
|
+
DurableContext,
|
|
40
|
+
DurableNode,
|
|
41
|
+
DurableStep,
|
|
42
|
+
RunSnapshot,
|
|
43
|
+
RunStatus,
|
|
44
|
+
} from './types.ts'
|
|
40
45
|
import type { WorkflowRegistry } from './workflow_registry.ts'
|
|
41
46
|
|
|
47
|
+
interface RunState {
|
|
48
|
+
results: Record<string, unknown>
|
|
49
|
+
stepAttempts: Record<string, number>
|
|
50
|
+
/** `waitForSignal` markers — `{ [nodeName]: signalName }`. */
|
|
51
|
+
awaitingSignals?: Record<string, string>
|
|
52
|
+
/** Per-loop iteration state — `{ [nodeName]: { iteration, results[] } }`. */
|
|
53
|
+
loopState?: Record<string, { iteration: number; results: unknown[] }>
|
|
54
|
+
/** Per-child-workflow link — `{ [nodeName]: childRunId }`. */
|
|
55
|
+
childRunIds?: Record<string, string>
|
|
56
|
+
}
|
|
57
|
+
|
|
42
58
|
interface RunRow {
|
|
43
59
|
id: string
|
|
44
60
|
workflow_name: string
|
|
45
61
|
input: Record<string, unknown> | string
|
|
46
62
|
status: RunStatus
|
|
47
|
-
state:
|
|
63
|
+
state: RunState | string
|
|
48
64
|
current_step: number
|
|
49
65
|
result: Record<string, unknown> | string | null
|
|
50
66
|
error: string | null
|
|
@@ -63,6 +79,22 @@ interface JournalRow {
|
|
|
63
79
|
completed_at: Date
|
|
64
80
|
}
|
|
65
81
|
|
|
82
|
+
type Tx = { query: Database['query']; queryOne: Database['queryOne']; execute: Database['execute'] }
|
|
83
|
+
|
|
84
|
+
type Outcome =
|
|
85
|
+
/** Node completed; advance cursor + re-dispatch. */
|
|
86
|
+
| { kind: 'completed'; value: unknown; attempt: number }
|
|
87
|
+
/** Node has retries left; re-dispatch with delay. */
|
|
88
|
+
| { kind: 'retry'; attempt: number; delaySec: number }
|
|
89
|
+
/** Node exhausted retries; journal + compensate. */
|
|
90
|
+
| { kind: 'failed'; attempt: number; error: string }
|
|
91
|
+
/**
|
|
92
|
+
* Node parked itself. `delaySec`, when set, schedules a wake-up
|
|
93
|
+
* advance — for sleep, loop re-entry, and child-workflow polling. Undefined for
|
|
94
|
+
* waitForSignal (an external `signal()` call resumes it).
|
|
95
|
+
*/
|
|
96
|
+
| { kind: 'waiting'; delaySec?: number }
|
|
97
|
+
|
|
66
98
|
export interface DurableRunnerOptions {
|
|
67
99
|
db: PostgresDatabase
|
|
68
100
|
queue: Queue
|
|
@@ -120,8 +152,6 @@ export class DurableRunner {
|
|
|
120
152
|
* orphan either.
|
|
121
153
|
*/
|
|
122
154
|
async start(workflowName: string, input: Record<string, unknown> = {}): Promise<string> {
|
|
123
|
-
// Validate workflow registration up-front so the caller sees a
|
|
124
|
-
// synchronous error rather than a never-advancing run row.
|
|
125
155
|
this.registry.get(workflowName)
|
|
126
156
|
const runId = ulid()
|
|
127
157
|
await this.db.transaction(async (tx) => {
|
|
@@ -129,7 +159,7 @@ export class DurableRunner {
|
|
|
129
159
|
`INSERT INTO "strav_workflow_runs"
|
|
130
160
|
(id, workflow_name, input, status, state, current_step, result, error, created_at, updated_at)
|
|
131
161
|
VALUES ($1, $2, $3::jsonb, 'pending', $4::jsonb, 0, NULL, NULL, now(), now())`,
|
|
132
|
-
[runId, workflowName, JSON.stringify(input), JSON.stringify(
|
|
162
|
+
[runId, workflowName, JSON.stringify(input), JSON.stringify(emptyState())],
|
|
133
163
|
)
|
|
134
164
|
await this.queue.dispatch(this.advanceJob, { runId })
|
|
135
165
|
})
|
|
@@ -148,133 +178,108 @@ export class DurableRunner {
|
|
|
148
178
|
}
|
|
149
179
|
|
|
150
180
|
/**
|
|
151
|
-
* Advance handler.
|
|
152
|
-
*
|
|
153
|
-
*
|
|
154
|
-
* 2. Resolve the workflow + the step at `current_step`.
|
|
155
|
-
* 3. If a completed journal row already exists for this step,
|
|
156
|
-
* treat the run as if the step just succeeded — bump
|
|
157
|
-
* `current_step` and either enqueue the next or mark
|
|
158
|
-
* `completed`.
|
|
159
|
-
* 4. Otherwise call the handler. On success: journal +
|
|
160
|
-
* bump cursor + enqueue next (or mark `completed`). On
|
|
161
|
-
* throw: track the attempt; if there are retries left,
|
|
162
|
-
* enqueue a delayed advance; otherwise journal the failure
|
|
163
|
-
* and kick off compensation.
|
|
181
|
+
* Advance handler. Loads the run, dispatches the current node by
|
|
182
|
+
* type, and either re-enqueues (`continue`), parks (`waiting`),
|
|
183
|
+
* retries with backoff, or kicks off compensation.
|
|
164
184
|
*/
|
|
165
185
|
async advance(runId: string): Promise<void> {
|
|
166
|
-
const
|
|
186
|
+
const shouldContinue = await this.db.transaction(async (tx) => {
|
|
167
187
|
const row = await tx.queryOne<RunRow>(
|
|
168
188
|
`SELECT id, workflow_name, input, status, state, current_step, result, error, created_at, updated_at
|
|
169
189
|
FROM "strav_workflow_runs" WHERE id = $1 FOR UPDATE`,
|
|
170
190
|
[runId],
|
|
171
191
|
)
|
|
172
192
|
if (!row) throw new RunNotFoundError(runId)
|
|
173
|
-
if (row.status === 'completed' || row.status === 'failed') return
|
|
193
|
+
if (row.status === 'completed' || row.status === 'failed') return false
|
|
174
194
|
|
|
175
195
|
const wf = this.registry.get(row.workflow_name)
|
|
176
|
-
const state = parseJson(row.state) as
|
|
177
|
-
|
|
178
|
-
stepAttempts: Record<string, number>
|
|
179
|
-
}
|
|
196
|
+
const state = parseJson(row.state) as RunState
|
|
197
|
+
ensureStateShape(state)
|
|
180
198
|
const input = parseJson(row.input) as Record<string, unknown>
|
|
181
199
|
|
|
182
200
|
if (row.current_step >= wf.steps.length) {
|
|
183
201
|
await this.markCompleted(tx, runId, state.results)
|
|
184
|
-
return
|
|
202
|
+
return false
|
|
185
203
|
}
|
|
186
204
|
|
|
187
|
-
const
|
|
205
|
+
const node = wf.steps[row.current_step] as DurableNode
|
|
188
206
|
|
|
189
|
-
// Idempotent replay — if
|
|
190
|
-
//
|
|
207
|
+
// Idempotent replay — if the node was already journaled
|
|
208
|
+
// completed, skip the handler.
|
|
191
209
|
const journaled = await tx.queryOne<JournalRow>(
|
|
192
210
|
`SELECT id, run_id, step_name, status, result, error, attempts, completed_at
|
|
193
211
|
FROM "strav_workflow_journal" WHERE run_id = $1 AND step_name = $2`,
|
|
194
|
-
[runId,
|
|
212
|
+
[runId, node.name],
|
|
195
213
|
)
|
|
196
214
|
if (journaled?.status === 'completed') {
|
|
197
|
-
state.results[
|
|
215
|
+
state.results[node.name] = parseJson(journaled.result)
|
|
198
216
|
await this.advanceCursor(tx, runId, row.current_step + 1, state)
|
|
199
|
-
|
|
200
|
-
// lock across the next handler invocation.
|
|
201
|
-
return { wf, runId, status: 'continue' as const }
|
|
217
|
+
return true
|
|
202
218
|
}
|
|
203
219
|
|
|
204
|
-
const attempt = (state.stepAttempts[
|
|
220
|
+
const attempt = (state.stepAttempts[node.name] ?? 0) + 1
|
|
205
221
|
const ctx: DurableContext = {
|
|
206
222
|
input,
|
|
207
223
|
results: state.results,
|
|
208
224
|
runId,
|
|
209
225
|
attempt,
|
|
210
226
|
}
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
await tx.execute(
|
|
214
|
-
`INSERT INTO "strav_workflow_journal"
|
|
215
|
-
(id, run_id, step_name, status, result, error, attempts, completed_at, created_at, updated_at)
|
|
216
|
-
VALUES ($1, $2, $3, 'completed', $4::jsonb, NULL, $5, now(), now(), now())`,
|
|
217
|
-
[ulid(), runId, step.name, JSON.stringify(result ?? null), attempt],
|
|
218
|
-
)
|
|
219
|
-
state.results[step.name] = result
|
|
220
|
-
delete state.stepAttempts[step.name]
|
|
221
|
-
await this.advanceCursor(tx, runId, row.current_step + 1, state)
|
|
222
|
-
return { wf, runId, status: 'continue' as const }
|
|
223
|
-
} catch (err) {
|
|
224
|
-
const message = err instanceof Error ? err.message : String(err)
|
|
225
|
-
this.logger?.warn('Durable step failed', {
|
|
226
|
-
runId,
|
|
227
|
-
step: step.name,
|
|
228
|
-
attempt,
|
|
229
|
-
error: message,
|
|
230
|
-
})
|
|
231
|
-
if (attempt < step.maxAttempts) {
|
|
232
|
-
state.stepAttempts[step.name] = attempt
|
|
233
|
-
await tx.execute(
|
|
234
|
-
`UPDATE "strav_workflow_runs" SET state = $1::jsonb, updated_at = now() WHERE id = $2`,
|
|
235
|
-
[JSON.stringify(state), runId],
|
|
236
|
-
)
|
|
237
|
-
const delaySec = Math.max(0, step.backoff(attempt))
|
|
238
|
-
await this.queue.dispatchLater(delaySec, this.advanceJob, { runId })
|
|
239
|
-
return null
|
|
240
|
-
}
|
|
241
|
-
// Terminal — journal the failure, mark compensating, kick off
|
|
242
|
-
// compensation. The compensate handler walks back from the
|
|
243
|
-
// step BEFORE this one (no compensator for the step that
|
|
244
|
-
// just failed; there's nothing to roll back since the work
|
|
245
|
-
// didn't commit).
|
|
246
|
-
await tx.execute(
|
|
247
|
-
`INSERT INTO "strav_workflow_journal"
|
|
248
|
-
(id, run_id, step_name, status, result, error, attempts, completed_at, created_at, updated_at)
|
|
249
|
-
VALUES ($1, $2, $3, 'failed', NULL, $4, $5, now(), now(), now())`,
|
|
250
|
-
[ulid(), runId, step.name, message, attempt],
|
|
251
|
-
)
|
|
252
|
-
await tx.execute(
|
|
253
|
-
`UPDATE "strav_workflow_runs"
|
|
254
|
-
SET status = 'compensating', state = $1::jsonb, error = $2, updated_at = now()
|
|
255
|
-
WHERE id = $3`,
|
|
256
|
-
[JSON.stringify(state), message, runId],
|
|
257
|
-
)
|
|
258
|
-
await this.queue.dispatch(this.compensateJob, { runId })
|
|
259
|
-
return null
|
|
260
|
-
}
|
|
227
|
+
const outcome = await this.runNode(tx, node, ctx, state, runId, attempt)
|
|
228
|
+
return this.applyOutcome(tx, runId, row.current_step, node, state, outcome)
|
|
261
229
|
})
|
|
262
230
|
|
|
263
|
-
|
|
264
|
-
// advance the next one. We do this OUTSIDE the original
|
|
265
|
-
// transaction so each step holds the row lock for the minimum
|
|
266
|
-
// necessary window — important when steps make external API
|
|
267
|
-
// calls that can be slow.
|
|
268
|
-
if (workflow?.status === 'continue') {
|
|
231
|
+
if (shouldContinue) {
|
|
269
232
|
await this.queue.dispatch(this.advanceJob, { runId })
|
|
270
233
|
}
|
|
271
234
|
}
|
|
272
235
|
|
|
236
|
+
/**
|
|
237
|
+
* Wake a run parked on a `waitForSignal` node. Writes the journal
|
|
238
|
+
* entry with `payload` as the node's result, clears the awaiting
|
|
239
|
+
* marker, and dispatches a fresh advance job to resume the next
|
|
240
|
+
* node. Returns `false` (no-op) when the run is not `waiting` or is not awaiting `signalName`.
|
|
241
|
+
*/
|
|
242
|
+
async signal(runId: string, signalName: string, payload?: unknown): Promise<boolean> {
|
|
243
|
+
return this.db.transaction(async (tx) => {
|
|
244
|
+
const row = await tx.queryOne<RunRow>(
|
|
245
|
+
`SELECT id, workflow_name, input, status, state, current_step, result, error, created_at, updated_at
|
|
246
|
+
FROM "strav_workflow_runs" WHERE id = $1 FOR UPDATE`,
|
|
247
|
+
[runId],
|
|
248
|
+
)
|
|
249
|
+
if (!row) throw new RunNotFoundError(runId)
|
|
250
|
+
if (row.status !== 'waiting') return false
|
|
251
|
+
const state = parseJson(row.state) as RunState
|
|
252
|
+
ensureStateShape(state)
|
|
253
|
+
const awaiting = state.awaitingSignals ?? {}
|
|
254
|
+
const matchEntry = Object.entries(awaiting).find(([, name]) => name === signalName)
|
|
255
|
+
if (matchEntry === undefined) return false
|
|
256
|
+
const [nodeName] = matchEntry
|
|
257
|
+
// Journal the wake-up so replay sees the signal as already received.
|
|
258
|
+
await tx.execute(
|
|
259
|
+
`INSERT INTO "strav_workflow_journal"
|
|
260
|
+
(id, run_id, step_name, status, result, error, attempts, completed_at, created_at, updated_at)
|
|
261
|
+
VALUES ($1, $2, $3, 'completed', $4::jsonb, NULL, 1, now(), now(), now())`,
|
|
262
|
+
[ulid(), runId, nodeName, JSON.stringify(payload ?? null)],
|
|
263
|
+
)
|
|
264
|
+
delete awaiting[nodeName]
|
|
265
|
+
state.awaitingSignals = awaiting
|
|
266
|
+
state.results[nodeName] = payload ?? null
|
|
267
|
+
await tx.execute(
|
|
268
|
+
`UPDATE "strav_workflow_runs"
|
|
269
|
+
SET status = 'running', state = $1::jsonb, current_step = current_step + 1, updated_at = now()
|
|
270
|
+
WHERE id = $2`,
|
|
271
|
+
[JSON.stringify(state), runId],
|
|
272
|
+
)
|
|
273
|
+
await this.queue.dispatch(this.advanceJob, { runId })
|
|
274
|
+
return true
|
|
275
|
+
})
|
|
276
|
+
}
|
|
277
|
+
|
|
273
278
|
/**
|
|
274
279
|
* Compensate handler. Walks the journal in reverse, calling each
|
|
275
280
|
* registered compensator. Compensators that throw are logged but
|
|
276
|
-
* don't halt the rollback
|
|
277
|
-
*
|
|
281
|
+
* don't halt the rollback. Only `step` nodes carry compensators in
|
|
282
|
+
* V2 — other node types are skipped.
|
|
278
283
|
*/
|
|
279
284
|
async compensate(runId: string): Promise<void> {
|
|
280
285
|
await this.db.transaction(async (tx) => {
|
|
@@ -287,7 +292,8 @@ export class DurableRunner {
|
|
|
287
292
|
if (row.status !== 'compensating') return
|
|
288
293
|
|
|
289
294
|
const wf = this.registry.get(row.workflow_name)
|
|
290
|
-
const state = parseJson(row.state) as
|
|
295
|
+
const state = parseJson(row.state) as RunState
|
|
296
|
+
ensureStateShape(state)
|
|
291
297
|
const input = parseJson(row.input) as Record<string, unknown>
|
|
292
298
|
|
|
293
299
|
const journal = await tx.query<JournalRow>(
|
|
@@ -295,20 +301,16 @@ export class DurableRunner {
|
|
|
295
301
|
FROM "strav_workflow_journal" WHERE run_id = $1 ORDER BY completed_at ASC`,
|
|
296
302
|
[runId],
|
|
297
303
|
)
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
const completedNames = new Set(
|
|
303
|
-
journal.filter((j) => j.status === 'completed').map((j) => j.step_name),
|
|
304
|
-
)
|
|
305
|
-
const stepsByName = new Map<string, DurableStep>(wf.steps.map((s) => [s.name, s]))
|
|
304
|
+
const completedNames = journal
|
|
305
|
+
.filter((j) => j.status === 'completed')
|
|
306
|
+
.map((j) => j.step_name)
|
|
307
|
+
const stepsByName = new Map<string, DurableNode>(wf.steps.map((s) => [s.name, s]))
|
|
306
308
|
|
|
307
309
|
for (const name of [...completedNames].reverse()) {
|
|
308
|
-
const
|
|
309
|
-
if (!
|
|
310
|
+
const node = stepsByName.get(name)
|
|
311
|
+
if (node?.type !== 'step' || !node.compensate) continue
|
|
310
312
|
try {
|
|
311
|
-
await
|
|
313
|
+
await (node as DurableStep).compensate?.({
|
|
312
314
|
input,
|
|
313
315
|
results: state.results,
|
|
314
316
|
runId,
|
|
@@ -332,10 +334,316 @@ export class DurableRunner {
|
|
|
332
334
|
})
|
|
333
335
|
}
|
|
334
336
|
|
|
337
|
+
// ─── Node-type dispatch ──────────────────────────────────────────────────
|
|
338
|
+
|
|
339
|
+
private async runNode(
|
|
340
|
+
tx: Tx,
|
|
341
|
+
node: DurableNode,
|
|
342
|
+
ctx: DurableContext,
|
|
343
|
+
state: RunState,
|
|
344
|
+
runId: string,
|
|
345
|
+
attempt: number,
|
|
346
|
+
): Promise<Outcome> {
|
|
347
|
+
switch (node.type) {
|
|
348
|
+
case 'step':
|
|
349
|
+
return this.runStepLike(node, ctx, attempt, () => node.handler(ctx))
|
|
350
|
+
case 'sleep':
|
|
351
|
+
return this.runSleep(node, ctx, state, attempt)
|
|
352
|
+
case 'waitForSignal':
|
|
353
|
+
return this.runWaitForSignal(node, ctx, state)
|
|
354
|
+
case 'parallel':
|
|
355
|
+
return this.runStepLike(node, ctx, attempt, async () => {
|
|
356
|
+
const entries = Object.entries(node.branches)
|
|
357
|
+
const results = await Promise.all(
|
|
358
|
+
entries.map(async ([key, handler]) => [key, await handler(ctx)] as const),
|
|
359
|
+
)
|
|
360
|
+
return Object.fromEntries(results)
|
|
361
|
+
})
|
|
362
|
+
case 'route':
|
|
363
|
+
return this.runStepLike(node, ctx, attempt, async () => {
|
|
364
|
+
const key = await node.select(ctx)
|
|
365
|
+
const handler = node.branches[key]
|
|
366
|
+
if (handler === undefined) {
|
|
367
|
+
throw new DurableError(
|
|
368
|
+
`DurableRunner: route "${node.name}" returned unknown branch "${key}". Branches: ${Object.keys(node.branches).join(', ')}`,
|
|
369
|
+
)
|
|
370
|
+
}
|
|
371
|
+
const result = await handler(ctx)
|
|
372
|
+
return { branch: key, result }
|
|
373
|
+
})
|
|
374
|
+
case 'loop':
|
|
375
|
+
return this.runLoop(tx, node, ctx, state, runId, attempt)
|
|
376
|
+
case 'childWorkflow':
|
|
377
|
+
return this.runChildWorkflow(tx, node, ctx, state, runId, attempt)
|
|
378
|
+
}
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
/** Common retry/failure envelope for nodes that look like one handler. */
|
|
382
|
+
private async runStepLike(
|
|
383
|
+
node: { name: string; maxAttempts: number; backoff: (n: number) => number },
|
|
384
|
+
ctx: DurableContext,
|
|
385
|
+
attempt: number,
|
|
386
|
+
fn: () => Promise<unknown>,
|
|
387
|
+
): Promise<Outcome> {
|
|
388
|
+
try {
|
|
389
|
+
const value = await fn()
|
|
390
|
+
return { kind: 'completed', value, attempt }
|
|
391
|
+
} catch (err) {
|
|
392
|
+
const error = err instanceof Error ? err.message : String(err)
|
|
393
|
+
this.logger?.warn('Durable node failed', {
|
|
394
|
+
runId: ctx.runId,
|
|
395
|
+
node: node.name,
|
|
396
|
+
attempt,
|
|
397
|
+
error,
|
|
398
|
+
})
|
|
399
|
+
if (attempt < node.maxAttempts) {
|
|
400
|
+
return { kind: 'retry', attempt, delaySec: Math.max(0, node.backoff(attempt)) }
|
|
401
|
+
}
|
|
402
|
+
return { kind: 'failed', attempt, error }
|
|
403
|
+
}
|
|
404
|
+
}
|
|
405
|
+
|
|
406
|
+
private async runSleep(
|
|
407
|
+
node: import('./types.ts').DurableSleep,
|
|
408
|
+
ctx: DurableContext,
|
|
409
|
+
state: RunState,
|
|
410
|
+
attempt: number,
|
|
411
|
+
): Promise<Outcome> {
|
|
412
|
+
const requested =
|
|
413
|
+
typeof node.delay === 'number' ? node.delay : await node.delay(ctx)
|
|
414
|
+
const delaySec = Math.max(0, Math.floor(requested))
|
|
415
|
+
const sleepKey = `__sleep__${node.name}`
|
|
416
|
+
const previouslyDispatched = (state as unknown as Record<string, unknown>)[sleepKey] as
|
|
417
|
+
| { dispatchedAt: number }
|
|
418
|
+
| undefined
|
|
419
|
+
if (previouslyDispatched !== undefined) {
|
|
420
|
+
const elapsedSec = (Date.now() - previouslyDispatched.dispatchedAt) / 1000
|
|
421
|
+
if (elapsedSec >= delaySec) {
|
|
422
|
+
return { kind: 'completed', value: { sleptSec: delaySec }, attempt }
|
|
423
|
+
}
|
|
424
|
+
// Spurious early wake-up — re-park.
|
|
425
|
+
return { kind: 'waiting', delaySec: Math.max(1, delaySec - elapsedSec) }
|
|
426
|
+
}
|
|
427
|
+
;(state as unknown as Record<string, unknown>)[sleepKey] = { dispatchedAt: Date.now() }
|
|
428
|
+
return { kind: 'waiting', delaySec }
|
|
429
|
+
}
|
|
430
|
+
|
|
431
|
+
private async runWaitForSignal(
|
|
432
|
+
node: import('./types.ts').DurableWaitForSignal,
|
|
433
|
+
ctx: DurableContext,
|
|
434
|
+
state: RunState,
|
|
435
|
+
): Promise<Outcome> {
|
|
436
|
+
const name = typeof node.signalName === 'string' ? node.signalName : node.signalName(ctx)
|
|
437
|
+
const awaiting = state.awaitingSignals ?? {}
|
|
438
|
+
awaiting[node.name] = name
|
|
439
|
+
state.awaitingSignals = awaiting
|
|
440
|
+
return { kind: 'waiting' }
|
|
441
|
+
}
|
|
442
|
+
|
|
443
|
+
private async runLoop(
|
|
444
|
+
tx: Tx,
|
|
445
|
+
node: import('./types.ts').DurableLoop,
|
|
446
|
+
ctx: DurableContext,
|
|
447
|
+
state: RunState,
|
|
448
|
+
runId: string,
|
|
449
|
+
attempt: number,
|
|
450
|
+
): Promise<Outcome> {
|
|
451
|
+
const loops = state.loopState ?? {}
|
|
452
|
+
const slot = loops[node.name] ?? { iteration: 0, results: [] }
|
|
453
|
+
loops[node.name] = slot
|
|
454
|
+
state.loopState = loops
|
|
455
|
+
|
|
456
|
+
// Idempotent replay for this iteration — if the per-iteration
|
|
457
|
+
// journal row already exists, treat the iteration as done.
|
|
458
|
+
const iterName = `${node.name}#${slot.iteration}`
|
|
459
|
+
const iterJournal = await tx.queryOne<JournalRow>(
|
|
460
|
+
`SELECT id, run_id, step_name, status, result, error, attempts, completed_at
|
|
461
|
+
FROM "strav_workflow_journal" WHERE run_id = $1 AND step_name = $2`,
|
|
462
|
+
[runId, iterName],
|
|
463
|
+
)
|
|
464
|
+
if (iterJournal?.status === 'completed') {
|
|
465
|
+
slot.results.push(parseJson(iterJournal.result))
|
|
466
|
+
slot.iteration += 1
|
|
467
|
+
}
|
|
468
|
+
|
|
469
|
+
if (slot.iteration >= node.maxIterations) {
|
|
470
|
+
return { kind: 'failed', attempt, error: `loop exceeded maxIterations (${node.maxIterations})` }
|
|
471
|
+
}
|
|
472
|
+
|
|
473
|
+
let keepGoing: boolean
|
|
474
|
+
try {
|
|
475
|
+
keepGoing = await node.condition(ctx, slot.iteration)
|
|
476
|
+
} catch (err) {
|
|
477
|
+
return {
|
|
478
|
+
kind: 'failed',
|
|
479
|
+
attempt,
|
|
480
|
+
error: err instanceof Error ? err.message : String(err),
|
|
481
|
+
}
|
|
482
|
+
}
|
|
483
|
+
if (!keepGoing) {
|
|
484
|
+
return { kind: 'completed', value: [...slot.results], attempt }
|
|
485
|
+
}
|
|
486
|
+
|
|
487
|
+
try {
|
|
488
|
+
const value = await node.body({ ...ctx, iteration: slot.iteration })
|
|
489
|
+
// Journal this iteration before bumping; failure mid-write
|
|
490
|
+
// will replay this same iteration on resume (journal lookup
|
|
491
|
+
// above short-circuits).
|
|
492
|
+
await tx.execute(
|
|
493
|
+
`INSERT INTO "strav_workflow_journal"
|
|
494
|
+
(id, run_id, step_name, status, result, error, attempts, completed_at, created_at, updated_at)
|
|
495
|
+
VALUES ($1, $2, $3, 'completed', $4::jsonb, NULL, $5, now(), now(), now())`,
|
|
496
|
+
[ulid(), runId, iterName, JSON.stringify(value ?? null), attempt],
|
|
497
|
+
)
|
|
498
|
+
slot.results.push(value)
|
|
499
|
+
slot.iteration += 1
|
|
500
|
+
// Keep current_step pinned; re-dispatch advance to evaluate
|
|
501
|
+
// the next iteration in its own transaction.
|
|
502
|
+
await tx.execute(
|
|
503
|
+
`UPDATE "strav_workflow_runs" SET state = $1::jsonb, updated_at = now() WHERE id = $2`,
|
|
504
|
+
[JSON.stringify(state), runId],
|
|
505
|
+
)
|
|
506
|
+
// 'continue' via a sentinel — applyOutcome's `completed` path
|
|
507
|
+
// is reserved for cursor-advancing nodes; here we want to
|
|
508
|
+
// re-enter advance without moving the cursor.
|
|
509
|
+
return { kind: 'waiting', delaySec: 0 }
|
|
510
|
+
} catch (err) {
|
|
511
|
+
const error = err instanceof Error ? err.message : String(err)
|
|
512
|
+
if (attempt < node.maxAttempts) {
|
|
513
|
+
return { kind: 'retry', attempt, delaySec: Math.max(0, node.backoff(attempt)) }
|
|
514
|
+
}
|
|
515
|
+
return { kind: 'failed', attempt, error }
|
|
516
|
+
}
|
|
517
|
+
}
|
|
518
|
+
|
|
519
|
+
private async runChildWorkflow(
|
|
520
|
+
tx: Tx,
|
|
521
|
+
node: import('./types.ts').DurableChildWorkflow,
|
|
522
|
+
ctx: DurableContext,
|
|
523
|
+
state: RunState,
|
|
524
|
+
runId: string,
|
|
525
|
+
attempt: number,
|
|
526
|
+
): Promise<Outcome> {
|
|
527
|
+
const children = state.childRunIds ?? {}
|
|
528
|
+
state.childRunIds = children
|
|
529
|
+
let childId = children[node.name]
|
|
530
|
+
|
|
531
|
+
if (childId === undefined) {
|
|
532
|
+
let spec: { name: string; input?: Record<string, unknown> }
|
|
533
|
+
try {
|
|
534
|
+
spec = await node.start(ctx)
|
|
535
|
+
} catch (err) {
|
|
536
|
+
const error = err instanceof Error ? err.message : String(err)
|
|
537
|
+
if (attempt < 1) {
|
|
538
|
+
return { kind: 'retry', attempt, delaySec: 0 }
|
|
539
|
+
}
|
|
540
|
+
return { kind: 'failed', attempt, error }
|
|
541
|
+
}
|
|
542
|
+
childId = await this.start(spec.name, spec.input ?? {})
|
|
543
|
+
children[node.name] = childId
|
|
544
|
+
await tx.execute(
|
|
545
|
+
`UPDATE "strav_workflow_runs" SET state = $1::jsonb, updated_at = now() WHERE id = $2`,
|
|
546
|
+
[JSON.stringify(state), runId],
|
|
547
|
+
)
|
|
548
|
+
return { kind: 'waiting', delaySec: node.pollIntervalSec }
|
|
549
|
+
}
|
|
550
|
+
|
|
551
|
+
const child = await tx.queryOne<RunRow>(
|
|
552
|
+
`SELECT id, workflow_name, input, status, state, current_step, result, error, created_at, updated_at
|
|
553
|
+
FROM "strav_workflow_runs" WHERE id = $1`,
|
|
554
|
+
[childId],
|
|
555
|
+
)
|
|
556
|
+
if (!child) {
|
|
557
|
+
return {
|
|
558
|
+
kind: 'failed',
|
|
559
|
+
attempt,
|
|
560
|
+
error: `child workflow run "${childId}" disappeared`,
|
|
561
|
+
}
|
|
562
|
+
}
|
|
563
|
+
if (child.status === 'completed') {
|
|
564
|
+
return { kind: 'completed', value: parseJson(child.result), attempt }
|
|
565
|
+
}
|
|
566
|
+
if (child.status === 'failed') {
|
|
567
|
+
return {
|
|
568
|
+
kind: 'failed',
|
|
569
|
+
attempt,
|
|
570
|
+
error: child.error ?? 'child workflow failed without error message',
|
|
571
|
+
}
|
|
572
|
+
}
|
|
573
|
+
// pending / running / waiting / compensating → keep polling.
|
|
574
|
+
return { kind: 'waiting', delaySec: node.pollIntervalSec }
|
|
575
|
+
}
|
|
576
|
+
|
|
577
|
+
// ─── Outcome → state mutation ───────────────────────────────────────────
|
|
578
|
+
|
|
579
|
+
private async applyOutcome(
|
|
580
|
+
tx: Tx,
|
|
581
|
+
runId: string,
|
|
582
|
+
currentStep: number,
|
|
583
|
+
node: DurableNode,
|
|
584
|
+
state: RunState,
|
|
585
|
+
outcome: Outcome,
|
|
586
|
+
): Promise<boolean> {
|
|
587
|
+
switch (outcome.kind) {
|
|
588
|
+
case 'completed':
|
|
589
|
+
await tx.execute(
|
|
590
|
+
`INSERT INTO "strav_workflow_journal"
|
|
591
|
+
(id, run_id, step_name, status, result, error, attempts, completed_at, created_at, updated_at)
|
|
592
|
+
VALUES ($1, $2, $3, 'completed', $4::jsonb, NULL, $5, now(), now(), now())`,
|
|
593
|
+
[ulid(), runId, node.name, JSON.stringify(outcome.value ?? null), outcome.attempt],
|
|
594
|
+
)
|
|
595
|
+
state.results[node.name] = outcome.value
|
|
596
|
+
delete state.stepAttempts[node.name]
|
|
597
|
+
if (node.type === 'loop' && state.loopState !== undefined) {
|
|
598
|
+
delete state.loopState[node.name]
|
|
599
|
+
}
|
|
600
|
+
if (node.type === 'childWorkflow' && state.childRunIds !== undefined) {
|
|
601
|
+
delete state.childRunIds[node.name]
|
|
602
|
+
}
|
|
603
|
+
clearSleepKey(state, node)
|
|
604
|
+
await this.advanceCursor(tx, runId, currentStep + 1, state)
|
|
605
|
+
return true
|
|
606
|
+
case 'retry':
|
|
607
|
+
state.stepAttempts[node.name] = outcome.attempt
|
|
608
|
+
await tx.execute(
|
|
609
|
+
`UPDATE "strav_workflow_runs" SET state = $1::jsonb, updated_at = now() WHERE id = $2`,
|
|
610
|
+
[JSON.stringify(state), runId],
|
|
611
|
+
)
|
|
612
|
+
await this.queue.dispatchLater(outcome.delaySec, this.advanceJob, { runId })
|
|
613
|
+
return false
|
|
614
|
+
case 'failed':
|
|
615
|
+
await tx.execute(
|
|
616
|
+
`INSERT INTO "strav_workflow_journal"
|
|
617
|
+
(id, run_id, step_name, status, result, error, attempts, completed_at, created_at, updated_at)
|
|
618
|
+
VALUES ($1, $2, $3, 'failed', NULL, $4, $5, now(), now(), now())`,
|
|
619
|
+
[ulid(), runId, node.name, outcome.error, outcome.attempt],
|
|
620
|
+
)
|
|
621
|
+
await tx.execute(
|
|
622
|
+
`UPDATE "strav_workflow_runs"
|
|
623
|
+
SET status = 'compensating', state = $1::jsonb, error = $2, updated_at = now()
|
|
624
|
+
WHERE id = $3`,
|
|
625
|
+
[JSON.stringify(state), outcome.error, runId],
|
|
626
|
+
)
|
|
627
|
+
await this.queue.dispatch(this.compensateJob, { runId })
|
|
628
|
+
return false
|
|
629
|
+
case 'waiting':
|
|
630
|
+
await tx.execute(
|
|
631
|
+
`UPDATE "strav_workflow_runs"
|
|
632
|
+
SET status = 'waiting', state = $1::jsonb, updated_at = now()
|
|
633
|
+
WHERE id = $2`,
|
|
634
|
+
[JSON.stringify(state), runId],
|
|
635
|
+
)
|
|
636
|
+
if (outcome.delaySec !== undefined) {
|
|
637
|
+
await this.queue.dispatchLater(outcome.delaySec, this.advanceJob, { runId })
|
|
638
|
+
}
|
|
639
|
+
return false
|
|
640
|
+
}
|
|
641
|
+
}
|
|
642
|
+
|
|
335
643
|
// ─── Internal helpers ────────────────────────────────────────────────────
|
|
336
644
|
|
|
337
645
|
private async markCompleted(
|
|
338
|
-
tx:
|
|
646
|
+
tx: Tx,
|
|
339
647
|
runId: string,
|
|
340
648
|
results: Record<string, unknown>,
|
|
341
649
|
): Promise<void> {
|
|
@@ -344,7 +652,7 @@ export class DurableRunner {
|
|
|
344
652
|
SET status = 'completed', state = $1::jsonb, result = $2::jsonb, updated_at = now()
|
|
345
653
|
WHERE id = $3`,
|
|
346
654
|
[
|
|
347
|
-
JSON.stringify({
|
|
655
|
+
JSON.stringify({ ...emptyState(), results }),
|
|
348
656
|
JSON.stringify(results),
|
|
349
657
|
runId,
|
|
350
658
|
],
|
|
@@ -352,10 +660,10 @@ export class DurableRunner {
|
|
|
352
660
|
}
|
|
353
661
|
|
|
354
662
|
private async advanceCursor(
|
|
355
|
-
tx:
|
|
663
|
+
tx: Tx,
|
|
356
664
|
runId: string,
|
|
357
665
|
nextStep: number,
|
|
358
|
-
state:
|
|
666
|
+
state: RunState,
|
|
359
667
|
): Promise<void> {
|
|
360
668
|
await tx.execute(
|
|
361
669
|
`UPDATE "strav_workflow_runs"
|
|
@@ -368,6 +676,22 @@ export class DurableRunner {
|
|
|
368
676
|
|
|
369
677
|
// ─── Pure helpers ────────────────────────────────────────────────────────
|
|
370
678
|
|
|
679
|
+
function emptyState(): RunState {
|
|
680
|
+
return { results: {}, stepAttempts: {} }
|
|
681
|
+
}
|
|
682
|
+
|
|
683
|
+
function ensureStateShape(state: RunState): void {
|
|
684
|
+
if (state.results === undefined) state.results = {}
|
|
685
|
+
if (state.stepAttempts === undefined) state.stepAttempts = {}
|
|
686
|
+
}
|
|
687
|
+
|
|
688
|
+
function clearSleepKey(state: RunState, node: DurableNode): void {
|
|
689
|
+
const key = `__sleep__${node.name}`
|
|
690
|
+
if (key in state) {
|
|
691
|
+
delete (state as unknown as Record<string, unknown>)[key]
|
|
692
|
+
}
|
|
693
|
+
}
|
|
694
|
+
|
|
371
695
|
function parseJson(value: unknown): unknown {
|
|
372
696
|
if (value === null || value === undefined) return value
|
|
373
697
|
if (typeof value === 'string') return JSON.parse(value)
|
|
@@ -375,7 +699,7 @@ function parseJson(value: unknown): unknown {
|
|
|
375
699
|
}
|
|
376
700
|
|
|
377
701
|
function toSnapshot(row: RunRow): RunSnapshot {
|
|
378
|
-
const state = parseJson(row.state) as
|
|
702
|
+
const state = parseJson(row.state) as RunState | null
|
|
379
703
|
return {
|
|
380
704
|
id: row.id,
|
|
381
705
|
workflowName: row.workflow_name,
|
package/src/durable_workflow.ts
CHANGED
|
@@ -2,46 +2,58 @@
|
|
|
2
2
|
* `DurableWorkflow` — the builder apps use to declare a named,
|
|
3
3
|
* registered, crash-resumable workflow.
|
|
4
4
|
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
* copy-paste, but the semantics differ in three important ways:
|
|
5
|
+
* V1 surface: `.step(name, handler, options?)` — sequential, named,
|
|
6
|
+
* journaled, retried, optionally saga-compensated.
|
|
8
7
|
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
* apps don't pass closures to `runner.start()`.
|
|
8
|
+
* V2 surface adds six composite primitives that still occupy one
|
|
9
|
+
* cursor slot each:
|
|
12
10
|
*
|
|
13
|
-
*
|
|
14
|
-
*
|
|
15
|
-
*
|
|
16
|
-
*
|
|
17
|
-
*
|
|
11
|
+
* - `.sleep(name, delay)` — park for N seconds or a context-aware
|
|
12
|
+
* deadline.
|
|
13
|
+
* - `.waitForSignal(name, signalName)` — pause until
|
|
14
|
+
* `runner.signal(runId, signalName, payload?)` fires.
|
|
15
|
+
* - `.parallel(name, { branchA: fn, branchB: fn, ... })` — run
|
|
16
|
+
* every branch in `Promise.all`; whole-or-nothing failure.
|
|
17
|
+
* - `.route(name, select, branches)` — pick one branch by
|
|
18
|
+
* predicate.
|
|
19
|
+
* - `.loop(name, condition, body)` — iterate while `condition()`
|
|
20
|
+
* holds; each iteration is its own journal row.
|
|
21
|
+
* - `.childWorkflow(name, start)` — spawn another registered
|
|
22
|
+
* workflow and wait on it.
|
|
18
23
|
*
|
|
19
|
-
*
|
|
20
|
-
*
|
|
21
|
-
*
|
|
22
|
-
*
|
|
23
|
-
* `advance` job may run in a worker that never saw the request
|
|
24
|
-
* that started the workflow.
|
|
25
|
-
*
|
|
26
|
-
* V1 ships sequential `.step()` only. V2 adds `.parallel` / `.route`
|
|
27
|
-
* / `.loop` / `.sleep` / `.waitForSignal` / `.childWorkflow`.
|
|
24
|
+
* Cursor model stays a flat integer (`current_step`) — every node,
|
|
25
|
+
* primitive or composite, occupies one slot. Internal sub-state
|
|
26
|
+
* (loop iteration counters, awaiting-signal names, child run ids)
|
|
27
|
+
* lives in the run row's `state` JSONB.
|
|
28
28
|
*/
|
|
29
29
|
|
|
30
30
|
import { DurableError } from './durable_error.ts'
|
|
31
31
|
import type {
|
|
32
|
+
DurableChildWorkflow,
|
|
33
|
+
DurableCompensator,
|
|
34
|
+
DurableContext,
|
|
35
|
+
DurableLoop,
|
|
36
|
+
DurableLoopContext,
|
|
37
|
+
DurableNode,
|
|
38
|
+
DurableParallel,
|
|
39
|
+
DurableRoute,
|
|
40
|
+
DurableSleep,
|
|
32
41
|
DurableStep,
|
|
33
42
|
DurableStepHandler,
|
|
34
43
|
DurableStepOptions,
|
|
44
|
+
DurableWaitForSignal,
|
|
35
45
|
} from './types.ts'
|
|
36
46
|
|
|
37
47
|
const DEFAULT_MAX_ATTEMPTS = 3
|
|
48
|
+
const DEFAULT_MAX_ITERATIONS = 1000
|
|
49
|
+
const DEFAULT_CHILD_POLL_SEC = 2
|
|
38
50
|
const MAX_BACKOFF_SECONDS = 60
|
|
39
51
|
const defaultBackoff = (failedAttempt: number): number =>
|
|
40
52
|
Math.min(2 ** failedAttempt, MAX_BACKOFF_SECONDS)
|
|
41
53
|
|
|
42
54
|
export class DurableWorkflow {
|
|
43
55
|
readonly name: string
|
|
44
|
-
private readonly
|
|
56
|
+
private readonly _nodes: DurableNode[] = []
|
|
45
57
|
private readonly _names = new Set<string>()
|
|
46
58
|
|
|
47
59
|
constructor(name: string) {
|
|
@@ -51,9 +63,15 @@ export class DurableWorkflow {
|
|
|
51
63
|
this.name = name
|
|
52
64
|
}
|
|
53
65
|
|
|
54
|
-
/**
|
|
55
|
-
|
|
56
|
-
|
|
66
|
+
/**
|
|
67
|
+
* Read-only snapshot of the declared nodes.
|
|
68
|
+
*
|
|
69
|
+
* Field is named `steps` for back-compat with V1 — every node
|
|
70
|
+
* (`step`, `sleep`, `parallel`, …) carries a `type` discriminator
|
|
71
|
+
* that callers branch on.
|
|
72
|
+
*/
|
|
73
|
+
get steps(): readonly DurableNode[] {
|
|
74
|
+
return this._nodes
|
|
57
75
|
}
|
|
58
76
|
|
|
59
77
|
/**
|
|
@@ -70,28 +88,170 @@ export class DurableWorkflow {
|
|
|
70
88
|
*/
|
|
71
89
|
step(name: string, handler: DurableStepHandler, options?: DurableStepOptions): this {
|
|
72
90
|
this.claim(name)
|
|
73
|
-
const
|
|
91
|
+
const node: DurableStep = {
|
|
74
92
|
type: 'step',
|
|
75
93
|
name,
|
|
76
94
|
handler,
|
|
77
95
|
maxAttempts: options?.maxAttempts ?? DEFAULT_MAX_ATTEMPTS,
|
|
78
96
|
backoff: options?.backoff ?? defaultBackoff,
|
|
79
97
|
}
|
|
80
|
-
if (options?.compensate)
|
|
81
|
-
this.
|
|
98
|
+
if (options?.compensate) node.compensate = options.compensate
|
|
99
|
+
this._nodes.push(node)
|
|
100
|
+
return this
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
/**
|
|
104
|
+
* Park the run for `delay` seconds (or a context-aware function
|
|
105
|
+
* returning seconds). Marks the run `waiting`; the cursor advances
|
|
106
|
+
* once the delayed advance fires.
|
|
107
|
+
*/
|
|
108
|
+
sleep(
|
|
109
|
+
name: string,
|
|
110
|
+
delay: number | ((ctx: DurableContext) => number | Promise<number>),
|
|
111
|
+
): this {
|
|
112
|
+
this.claim(name)
|
|
113
|
+
const node: DurableSleep = { type: 'sleep', name, delay }
|
|
114
|
+
this._nodes.push(node)
|
|
115
|
+
return this
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
/**
|
|
119
|
+
* Pause until `runner.signal(runId, signalName, payload?)` fires.
|
|
120
|
+
* The payload becomes this node's result.
|
|
121
|
+
*/
|
|
122
|
+
waitForSignal(
|
|
123
|
+
name: string,
|
|
124
|
+
signalName: string | ((ctx: DurableContext) => string),
|
|
125
|
+
): this {
|
|
126
|
+
this.claim(name)
|
|
127
|
+
const node: DurableWaitForSignal = { type: 'waitForSignal', name, signalName }
|
|
128
|
+
this._nodes.push(node)
|
|
129
|
+
return this
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
/**
|
|
133
|
+
* Run every branch concurrently within a single advance. The node's result is
|
|
134
|
+
* a `{ [branchName]: result }` object. Any branch throw fails the
|
|
135
|
+
* whole node (retried + compensated together).
|
|
136
|
+
*/
|
|
137
|
+
parallel(
|
|
138
|
+
name: string,
|
|
139
|
+
branches: Record<string, DurableStepHandler>,
|
|
140
|
+
options?: { maxAttempts?: number; backoff?: (failedAttempt: number) => number },
|
|
141
|
+
): this {
|
|
142
|
+
this.claim(name)
|
|
143
|
+
if (Object.keys(branches).length === 0) {
|
|
144
|
+
throw new DurableError(
|
|
145
|
+
`DurableWorkflow("${this.name}").parallel("${name}"): at least one branch required.`,
|
|
146
|
+
)
|
|
147
|
+
}
|
|
148
|
+
const node: DurableParallel = {
|
|
149
|
+
type: 'parallel',
|
|
150
|
+
name,
|
|
151
|
+
branches,
|
|
152
|
+
maxAttempts: options?.maxAttempts ?? DEFAULT_MAX_ATTEMPTS,
|
|
153
|
+
backoff: options?.backoff ?? defaultBackoff,
|
|
154
|
+
}
|
|
155
|
+
this._nodes.push(node)
|
|
156
|
+
return this
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
/**
|
|
160
|
+
* Pick one branch by the key that `select(ctx)` returns. The chosen handler's
|
|
161
|
+
* return is the node's result; the chosen branch key is recorded
|
|
162
|
+
* in `results[name].branch`.
|
|
163
|
+
*/
|
|
164
|
+
route(
|
|
165
|
+
name: string,
|
|
166
|
+
select: (ctx: DurableContext) => string | Promise<string>,
|
|
167
|
+
branches: Record<string, DurableStepHandler>,
|
|
168
|
+
options?: { maxAttempts?: number; backoff?: (failedAttempt: number) => number },
|
|
169
|
+
): this {
|
|
170
|
+
this.claim(name)
|
|
171
|
+
if (Object.keys(branches).length === 0) {
|
|
172
|
+
throw new DurableError(
|
|
173
|
+
`DurableWorkflow("${this.name}").route("${name}"): at least one branch required.`,
|
|
174
|
+
)
|
|
175
|
+
}
|
|
176
|
+
const node: DurableRoute = {
|
|
177
|
+
type: 'route',
|
|
178
|
+
name,
|
|
179
|
+
select,
|
|
180
|
+
branches,
|
|
181
|
+
maxAttempts: options?.maxAttempts ?? DEFAULT_MAX_ATTEMPTS,
|
|
182
|
+
backoff: options?.backoff ?? defaultBackoff,
|
|
183
|
+
}
|
|
184
|
+
this._nodes.push(node)
|
|
185
|
+
return this
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
/**
|
|
189
|
+
* Repeat `body(ctx)` while `condition(ctx, iter)` returns true,
|
|
190
|
+
* up to `maxIterations` (default 1000). Each iteration is its own
|
|
191
|
+
* journal row keyed `<name>#<iter>` so a crash mid-loop resumes
|
|
192
|
+
* from the next un-journaled iteration. The node's result is the
|
|
193
|
+
* array of per-iteration returns.
|
|
194
|
+
*/
|
|
195
|
+
loop(
|
|
196
|
+
name: string,
|
|
197
|
+
condition: (ctx: DurableContext, iter: number) => boolean | Promise<boolean>,
|
|
198
|
+
body: (ctx: DurableLoopContext) => Promise<unknown>,
|
|
199
|
+
options?: {
|
|
200
|
+
maxIterations?: number
|
|
201
|
+
maxAttempts?: number
|
|
202
|
+
backoff?: (failedAttempt: number) => number
|
|
203
|
+
},
|
|
204
|
+
): this {
|
|
205
|
+
this.claim(name)
|
|
206
|
+
const node: DurableLoop = {
|
|
207
|
+
type: 'loop',
|
|
208
|
+
name,
|
|
209
|
+
condition,
|
|
210
|
+
body,
|
|
211
|
+
maxIterations: options?.maxIterations ?? DEFAULT_MAX_ITERATIONS,
|
|
212
|
+
maxAttempts: options?.maxAttempts ?? DEFAULT_MAX_ATTEMPTS,
|
|
213
|
+
backoff: options?.backoff ?? defaultBackoff,
|
|
214
|
+
}
|
|
215
|
+
this._nodes.push(node)
|
|
216
|
+
return this
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
/**
|
|
220
|
+
* Spawn a registered child workflow and wait for it to complete.
|
|
221
|
+
* The parent re-polls the child's status via a delayed advance —
|
|
222
|
+
* no parent_run_id column needed. Child `failed` propagates as a
|
|
223
|
+
* failure on this node.
|
|
224
|
+
*/
|
|
225
|
+
childWorkflow(
|
|
226
|
+
name: string,
|
|
227
|
+
start: DurableChildWorkflow['start'],
|
|
228
|
+
options?: { pollIntervalSec?: number },
|
|
229
|
+
): this {
|
|
230
|
+
this.claim(name)
|
|
231
|
+
const node: DurableChildWorkflow = {
|
|
232
|
+
type: 'childWorkflow',
|
|
233
|
+
name,
|
|
234
|
+
start,
|
|
235
|
+
pollIntervalSec: options?.pollIntervalSec ?? DEFAULT_CHILD_POLL_SEC,
|
|
236
|
+
}
|
|
237
|
+
this._nodes.push(node)
|
|
82
238
|
return this
|
|
83
239
|
}
|
|
84
240
|
|
|
85
|
-
/** Throw if the step name has already been used in this workflow. */
|
|
86
241
|
private claim(name: string): void {
|
|
87
242
|
if (!name) {
|
|
88
|
-
throw new DurableError(`DurableWorkflow("${this.name}"):
|
|
243
|
+
throw new DurableError(`DurableWorkflow("${this.name}"): node name must be non-empty.`)
|
|
89
244
|
}
|
|
90
245
|
if (this._names.has(name)) {
|
|
91
246
|
throw new DurableError(
|
|
92
|
-
`DurableWorkflow("${this.name}"): duplicate
|
|
247
|
+
`DurableWorkflow("${this.name}"): duplicate node name "${name}". Nodes are journaled by name; collisions would break replay.`,
|
|
93
248
|
)
|
|
94
249
|
}
|
|
95
250
|
this._names.add(name)
|
|
96
251
|
}
|
|
97
252
|
}
|
|
253
|
+
|
|
254
|
+
// Re-export `DurableCompensator` so the index barrel doesn't need to
|
|
255
|
+
// list it twice — it's part of `DurableStepOptions` and the
|
|
256
|
+
// step-builder signature.
|
|
257
|
+
export type { DurableCompensator }
|
package/src/index.ts
CHANGED
|
@@ -1,9 +1,16 @@
|
|
|
1
1
|
// Public API of @strav/durable.
|
|
2
2
|
//
|
|
3
|
-
// Crash-resumable workflows on top of @strav/queue + Postgres.
|
|
4
|
-
//
|
|
5
|
-
//
|
|
6
|
-
//
|
|
3
|
+
// Crash-resumable workflows on top of @strav/queue + Postgres.
|
|
4
|
+
// Builder surface on `DurableWorkflow`:
|
|
5
|
+
// - `.step(name, handler, opts?)` — sequential, retried, saga-compensated.
|
|
6
|
+
// - `.sleep(name, delay)` — park for a duration.
|
|
7
|
+
// - `.waitForSignal(name, signalName)` — pause until
|
|
8
|
+
// `runner.signal(runId, signalName, payload?)`.
|
|
9
|
+
// - `.parallel(name, branches)` — Promise.all-style fan-out.
|
|
10
|
+
// - `.route(name, select, branches)` — single-branch routing.
|
|
11
|
+
// - `.loop(name, condition, body)` — per-iteration journaled loop.
|
|
12
|
+
// - `.childWorkflow(name, start)` — spawn a registered workflow
|
|
13
|
+
// and wait for completion.
|
|
7
14
|
|
|
8
15
|
export { defineDurable } from './define_durable.ts'
|
|
9
16
|
export {
|
|
@@ -25,11 +32,19 @@ export { DurableWorkflow } from './durable_workflow.ts'
|
|
|
25
32
|
export { JOURNAL_UNIQUE_INDEX, workflowJournalSchema } from './journal_schema.ts'
|
|
26
33
|
export { workflowRunsSchema } from './runs_schema.ts'
|
|
27
34
|
export type {
|
|
35
|
+
DurableChildWorkflow,
|
|
28
36
|
DurableCompensator,
|
|
29
37
|
DurableContext,
|
|
38
|
+
DurableLoop,
|
|
39
|
+
DurableLoopContext,
|
|
40
|
+
DurableNode,
|
|
41
|
+
DurableParallel,
|
|
42
|
+
DurableRoute,
|
|
43
|
+
DurableSleep,
|
|
30
44
|
DurableStep,
|
|
31
45
|
DurableStepHandler,
|
|
32
46
|
DurableStepOptions,
|
|
47
|
+
DurableWaitForSignal,
|
|
33
48
|
RunSnapshot,
|
|
34
49
|
RunStatus,
|
|
35
50
|
} from './types.ts'
|
package/src/types.ts
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
* Public types for durable execution.
|
|
3
3
|
*
|
|
4
4
|
* A durable workflow is a *named*, registered definition: handlers are
|
|
5
|
-
* keyed by
|
|
5
|
+
* keyed by node name so the runner can re-enter them across processes
|
|
6
6
|
* after a crash. Apps don't pass closures into `start()` — they pass
|
|
7
7
|
* a workflow name + input.
|
|
8
8
|
*
|
|
@@ -16,12 +16,31 @@
|
|
|
16
16
|
* `durable:status` CLI command in a later slice).
|
|
17
17
|
*/
|
|
18
18
|
|
|
19
|
-
|
|
19
|
+
/**
|
|
20
|
+
* Run lifecycle states:
|
|
21
|
+
*
|
|
22
|
+
* - `pending` — row INSERTed; no advance has run yet.
|
|
23
|
+
* - `running` — a worker is mid-step or in-flight.
|
|
24
|
+
* - `waiting` — node parked itself (sleep, waitForSignal,
|
|
25
|
+
* childWorkflow). The cursor doesn't move until
|
|
26
|
+
* the wakeup condition fires.
|
|
27
|
+
* - `compensating`— terminal failure; the saga is rolling back.
|
|
28
|
+
* - `completed` — every node finished; `result` populated.
|
|
29
|
+
* - `failed` — compensation done (or no compensation needed);
|
|
30
|
+
* `error` populated.
|
|
31
|
+
*/
|
|
32
|
+
export type RunStatus =
|
|
33
|
+
| 'pending'
|
|
34
|
+
| 'running'
|
|
35
|
+
| 'waiting'
|
|
36
|
+
| 'compensating'
|
|
37
|
+
| 'completed'
|
|
38
|
+
| 'failed'
|
|
20
39
|
|
|
21
40
|
export interface DurableContext {
|
|
22
41
|
/** Workflow input — the object passed to `DurableRunner.start(name, input)`. */
|
|
23
42
|
readonly input: Record<string, unknown>
|
|
24
|
-
/** Results from every prior
|
|
43
|
+
/** Results from every prior node, keyed by node name. */
|
|
25
44
|
readonly results: Record<string, unknown>
|
|
26
45
|
/** Durable run id (the row PK). Useful for logging / correlation. */
|
|
27
46
|
readonly runId: string
|
|
@@ -29,6 +48,12 @@ export interface DurableContext {
|
|
|
29
48
|
readonly attempt: number
|
|
30
49
|
}
|
|
31
50
|
|
|
51
|
+
/** Context handed to a `.loop(...)` body — same as `DurableContext` plus the iteration counter. */
|
|
52
|
+
export interface DurableLoopContext extends DurableContext {
|
|
53
|
+
/** 0-based iteration number. */
|
|
54
|
+
readonly iteration: number
|
|
55
|
+
}
|
|
56
|
+
|
|
32
57
|
export interface RunSnapshot {
|
|
33
58
|
id: string
|
|
34
59
|
workflowName: string
|
|
@@ -60,11 +85,12 @@ export interface DurableStepOptions {
|
|
|
60
85
|
backoff?: (failedAttempt: number) => number
|
|
61
86
|
}
|
|
62
87
|
|
|
88
|
+
// ─── Node variants (V2) ──────────────────────────────────────────────────
|
|
89
|
+
|
|
63
90
|
/**
|
|
64
|
-
*
|
|
65
|
-
*
|
|
66
|
-
*
|
|
67
|
-
* `workflow.steps`.
|
|
91
|
+
* One sequential step. The cursor advances by 1 once the handler
|
|
92
|
+
* succeeds. Failures retry up to `maxAttempts`; exhaustion triggers
|
|
93
|
+
* reverse-order saga compensation.
|
|
68
94
|
*/
|
|
69
95
|
export interface DurableStep {
|
|
70
96
|
type: 'step'
|
|
@@ -74,3 +100,116 @@ export interface DurableStep {
|
|
|
74
100
|
maxAttempts: number
|
|
75
101
|
backoff: (failedAttempt: number) => number
|
|
76
102
|
}
|
|
103
|
+
|
|
104
|
+
/**
|
|
105
|
+
* Park the run for a fixed duration. The runner schedules a delayed
|
|
106
|
+
* advance via `queue.dispatchLater(delaySec)` and marks the run as
|
|
107
|
+
* `waiting`. On wake-up the node is journaled and the cursor moves
|
|
108
|
+
* on.
|
|
109
|
+
*
|
|
110
|
+
* `delay` is either a number of seconds or a context-aware function
|
|
111
|
+
* that returns one (so apps can sleep until a wall-clock target
|
|
112
|
+
* encoded in `ctx.input` / `ctx.results`).
|
|
113
|
+
*/
|
|
114
|
+
export interface DurableSleep {
|
|
115
|
+
type: 'sleep'
|
|
116
|
+
name: string
|
|
117
|
+
delay: number | ((ctx: DurableContext) => number | Promise<number>)
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
/**
|
|
121
|
+
* Pause the run until an external `runner.signal(runId, signalName,
|
|
122
|
+
* payload?)` call fires. The signal's `payload` lands as the node's
|
|
123
|
+
* result. Useful for human-in-the-loop approvals, third-party
|
|
124
|
+
* webhooks, async-out / async-in handshakes.
|
|
125
|
+
*
|
|
126
|
+
* `signalName` is either a literal or a context-aware function (so
|
|
127
|
+
* the listener name can depend on `ctx.input`).
|
|
128
|
+
*/
|
|
129
|
+
export interface DurableWaitForSignal {
|
|
130
|
+
type: 'waitForSignal'
|
|
131
|
+
name: string
|
|
132
|
+
signalName: string | ((ctx: DurableContext) => string)
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
/**
|
|
136
|
+
* Run a set of named branches concurrently. Each branch is a single
|
|
137
|
+
* handler; the parallel node completes when every branch has — its
|
|
138
|
+
* result is `{ [branch]: result }`. If any branch throws, the WHOLE
|
|
139
|
+
* node fails and the failure path follows the same retry +
|
|
140
|
+
* compensation rules as `step`.
|
|
141
|
+
*
|
|
142
|
+
* V2 scope — no per-branch retries, no per-branch journaling. The
|
|
143
|
+
* whole `Promise.all(...)` runs inside one advance.
|
|
144
|
+
*/
|
|
145
|
+
export interface DurableParallel {
|
|
146
|
+
type: 'parallel'
|
|
147
|
+
name: string
|
|
148
|
+
branches: Record<string, DurableStepHandler>
|
|
149
|
+
maxAttempts: number
|
|
150
|
+
backoff: (failedAttempt: number) => number
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
/**
|
|
154
|
+
* Pick one of N named branches based on the key returned by `select(ctx)`.
|
|
155
|
+
* The chosen branch's handler runs; its return lands as the node's
|
|
156
|
+
* result alongside the chosen key. Unknown selection keys throw.
|
|
157
|
+
*/
|
|
158
|
+
export interface DurableRoute {
|
|
159
|
+
type: 'route'
|
|
160
|
+
name: string
|
|
161
|
+
select: (ctx: DurableContext) => string | Promise<string>
|
|
162
|
+
branches: Record<string, DurableStepHandler>
|
|
163
|
+
maxAttempts: number
|
|
164
|
+
backoff: (failedAttempt: number) => number
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
/**
|
|
168
|
+
* Repeat `body(ctx)` while `condition(ctx, iter)` returns true,
|
|
169
|
+
* up to `maxIterations`. Each iteration is journaled separately
|
|
170
|
+
* (`<name>#<iter>`) so a crash mid-loop resumes from the next
|
|
171
|
+
* un-journaled iteration. The node's final result is the array of
|
|
172
|
+
* per-iteration returns.
|
|
173
|
+
*/
|
|
174
|
+
export interface DurableLoop {
|
|
175
|
+
type: 'loop'
|
|
176
|
+
name: string
|
|
177
|
+
condition: (ctx: DurableContext, iter: number) => boolean | Promise<boolean>
|
|
178
|
+
body: (ctx: DurableLoopContext) => Promise<unknown>
|
|
179
|
+
/** Safety ceiling on iterations. Default `1000`. */
|
|
180
|
+
maxIterations: number
|
|
181
|
+
maxAttempts: number
|
|
182
|
+
backoff: (failedAttempt: number) => number
|
|
183
|
+
}
|
|
184
|
+
|
|
185
|
+
/**
|
|
186
|
+
* Spawn a child workflow (by registered name) and wait for it to
|
|
187
|
+
* complete. The parent re-polls the child's status via a delayed
|
|
188
|
+
* advance — no parent_run_id column needed, no cross-row push. Child
|
|
189
|
+
* `failed` propagates as a failure on this node (which retries +
|
|
190
|
+
* compensates like any other).
|
|
191
|
+
*
|
|
192
|
+
* `start(ctx)` returns `{ name, input }` — the child workflow name
|
|
193
|
+
* (must be registered) and its input object.
|
|
194
|
+
*/
|
|
195
|
+
export interface DurableChildWorkflow {
|
|
196
|
+
type: 'childWorkflow'
|
|
197
|
+
name: string
|
|
198
|
+
start: (
|
|
199
|
+
ctx: DurableContext,
|
|
200
|
+
) => Promise<{ name: string; input?: Record<string, unknown> }> | {
|
|
201
|
+
name: string
|
|
202
|
+
input?: Record<string, unknown>
|
|
203
|
+
}
|
|
204
|
+
/** How often the runner re-polls the child's status (seconds). Default `2`. */
|
|
205
|
+
pollIntervalSec: number
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
export type DurableNode =
|
|
209
|
+
| DurableStep
|
|
210
|
+
| DurableSleep
|
|
211
|
+
| DurableWaitForSignal
|
|
212
|
+
| DurableParallel
|
|
213
|
+
| DurableRoute
|
|
214
|
+
| DurableLoop
|
|
215
|
+
| DurableChildWorkflow
|