@strav/durable 0.4.31 → 1.0.0-alpha.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +18 -37
- package/src/define_durable.ts +29 -0
- package/src/durable_advance_job.ts +38 -0
- package/src/durable_compensate_job.ts +33 -0
- package/src/durable_error.ts +38 -0
- package/src/durable_provider.ts +91 -0
- package/src/durable_runner.ts +395 -0
- package/src/durable_workflow.ts +97 -0
- package/src/index.ts +25 -26
- package/src/journal_schema.ts +53 -0
- package/src/runs_schema.ts +38 -0
- package/src/types.ts +58 -198
- package/src/workflow_registry.ts +49 -0
- package/CHANGELOG.md +0 -26
- package/src/builder.ts +0 -158
- package/src/config.ts +0 -36
- package/src/durable.ts +0 -268
- package/src/engine/advance_handler.ts +0 -154
- package/src/engine/compensate_handler.ts +0 -70
- package/src/engine/compensation_driver.ts +0 -61
- package/src/engine/context.ts +0 -36
- package/src/engine/enqueue.ts +0 -62
- package/src/engine/finalize.ts +0 -111
- package/src/engine/index.ts +0 -20
- package/src/engine/run_store.ts +0 -42
- package/src/engine/step_driver.ts +0 -291
- package/src/engine/suspended_run.ts +0 -24
- package/src/errors.ts +0 -21
- package/src/helpers.ts +0 -16
- package/src/models/index.ts +0 -3
- package/src/models/journal.ts +0 -54
- package/src/models/run_machine.ts +0 -39
- package/src/models/workflow_run.ts +0 -36
- package/src/providers/durable_provider.ts +0 -31
- package/src/providers/index.ts +0 -2
- package/src/registry.ts +0 -35
- package/src/schema.ts +0 -70
- package/src/util.ts +0 -25
- package/tsconfig.json +0 -5
|
@@ -0,0 +1,395 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `DurableRunner` — the engine that owns the durable execution state
|
|
3
|
+
* machine.
|
|
4
|
+
*
|
|
5
|
+
* Three load-bearing methods:
|
|
6
|
+
*
|
|
7
|
+
* 1. `start(name, input)` — INSERTs a new run row, dispatches the
|
|
8
|
+
* first `DurableAdvanceJob` for it inside the same transaction
|
|
9
|
+
* (queue-until-commit via `@strav/queue`'s `DatabaseQueue`).
|
|
10
|
+
* Returns the run id; the workflow runs asynchronously on the
|
|
11
|
+
* queue.
|
|
12
|
+
*
|
|
13
|
+
* 2. `advance(runId)` — the job handler. Acquires a row lock,
|
|
14
|
+
* decides what step is next, looks for a completed journal
|
|
15
|
+
* entry to short-circuit (idempotent replay), runs the
|
|
16
|
+
* handler, journals the result, and either re-enqueues itself
|
|
17
|
+
* for the next step or — on failure — schedules a retry or
|
|
18
|
+
* kicks off compensation. The whole step body runs inside a
|
|
19
|
+
* DB transaction so partial writes can't escape.
|
|
20
|
+
*
|
|
21
|
+
* 3. `compensate(runId)` — walks the journal in reverse order
|
|
22
|
+
* running each step's `compensate` callback. On clean
|
|
23
|
+
* completion the run lands in `failed`. Failures during
|
|
24
|
+
* compensation are logged but don't block the rest of the
|
|
25
|
+
* rollback (compensators must be idempotent).
|
|
26
|
+
*
|
|
27
|
+
* Apps don't usually call `advance` / `compensate` directly — the
|
|
28
|
+
* `DurableAdvanceJob` and `DurableCompensateJob` classes wrap them.
|
|
29
|
+
*/
|
|
30
|
+
|
|
31
|
+
import {
|
|
32
|
+
type Database,
|
|
33
|
+
PostgresDatabase,
|
|
34
|
+
type SchemaRegistry,
|
|
35
|
+
} from '@strav/database'
|
|
36
|
+
import { type Logger, ulid } from '@strav/kernel'
|
|
37
|
+
import type { JobClass, Queue } from '@strav/queue'
|
|
38
|
+
import { RunNotFoundError } from './durable_error.ts'
|
|
39
|
+
import type { DurableStep, DurableContext, RunSnapshot, RunStatus } from './types.ts'
|
|
40
|
+
import type { WorkflowRegistry } from './workflow_registry.ts'
|
|
41
|
+
|
|
42
|
+
/**
 * Raw shape of a `strav_workflow_runs` row as selected by the runner.
 *
 * The three JSON columns (`input`, `state`, `result`) are typed as
 * "decoded object OR string" because the runner always passes them
 * through `parseJson` before use, which accepts either form.
 */
interface RunRow {
  // ── identity ──
  id: string
  workflow_name: string

  // ── lifecycle ──
  status: RunStatus
  /** 0-based index into the workflow's step list of the next step to run. */
  current_step: number
  error: string | null

  // ── JSON columns ──
  input: Record<string, unknown> | string
  /** Per-step return values plus the in-flight retry counters. */
  state: { results?: Record<string, unknown>; stepAttempts?: Record<string, number> } | string
  result: Record<string, unknown> | string | null

  // ── bookkeeping ──
  created_at: Date
  updated_at: Date
}
|
|
54
|
+
|
|
55
|
+
/**
 * Raw shape of a `strav_workflow_journal` row as selected by the runner.
 * Each row records the outcome of one step of one run.
 */
interface JournalRow {
  // ── identity ──
  id: string
  run_id: string
  step_name: string

  // ── outcome ──
  status: 'completed' | 'failed'
  /** Handler return value (decoded object or JSON string); the runner writes NULL for failed steps. */
  result: Record<string, unknown> | string | null
  /** Failure message; the runner writes NULL for completed steps. */
  error: string | null
  /** Attempt number on which the outcome was recorded. */
  attempts: number
  completed_at: Date
}
|
|
65
|
+
|
|
66
|
+
/** Constructor dependencies for {@link DurableRunner}. */
export interface DurableRunnerOptions {
  /** Database the runner opens its transactions on and reads run rows from. */
  db: PostgresDatabase

  /** Queue used to dispatch (and delay-dispatch) the advance / compensate jobs. */
  queue: Queue

  /** Registry the runner resolves workflow definitions from by name. */
  registry: WorkflowRegistry

  /**
   * Job class dispatched to advance a run by one step.
   *
   * The job classes are injected rather than imported: the Jobs
   * import the runner for DI, so importing them here would create a
   * cycle. `DurableProvider` wires the default `DurableAdvanceJob`;
   * apps that subclass it (custom logging, custom dead-letter
   * routing) pass their subclass instead.
   */
  advanceJob: JobClass

  /**
   * Job class dispatched to run saga compensation for a run. Injected
   * for the same reason as `advanceJob`; the default wired by
   * `DurableProvider` is `DurableCompensateJob`.
   */
  compensateJob: JobClass

  /** Optional logger — picked up via `LogManager.channel('durable')` when wired by `DurableProvider`. */
  logger?: Logger

  /**
   * Optional SchemaRegistry — when supplied, callers can read it to
   * find the runs / journal schemas during boot DDL emission.
   */
  schemas?: SchemaRegistry
}
|
|
92
|
+
|
|
93
|
+
/**
 * Engine that owns the durable-execution state machine.
 *
 * - `start` creates a run row and enqueues the first advance job.
 * - `advance` executes at most one step per invocation and journals
 *   its outcome.
 * - `compensate` rolls back completed steps once a step has exhausted
 *   its retries, then marks the run `failed`.
 */
export class DurableRunner {
  private readonly db: PostgresDatabase
  private readonly queue: Queue
  private readonly registry: WorkflowRegistry
  private readonly advanceJob: JobClass
  private readonly compensateJob: JobClass
  private readonly logger: Logger | undefined

  constructor(options: DurableRunnerOptions) {
    this.db = options.db
    this.queue = options.queue
    this.registry = options.registry
    this.advanceJob = options.advanceJob
    this.compensateJob = options.compensateJob
    this.logger = options.logger
  }

  /** Register a workflow on the embedded registry. Sugar for `runner.registry.register(...)`. */
  register(workflow: Parameters<WorkflowRegistry['register']>[0]): this {
    this.registry.register(workflow)
    return this
  }

  /**
   * Start a new durable run.
   *
   * INSERTs the run row (status `pending`, cursor 0, empty state) and
   * dispatches the first advance job from inside the same transaction
   * callback, so the two are intended to commit together.
   *
   * @param workflowName Registry key of the workflow to run.
   * @param input        Input object stored on the run row; defaults to `{}`.
   * @returns The ULID of the newly created run.
   */
  async start(workflowName: string, input: Record<string, unknown> = {}): Promise<string> {
    // Validate workflow registration up-front so the caller sees an
    // error immediately rather than a never-advancing run row.
    this.registry.get(workflowName)
    const runId = ulid()
    await this.db.transaction(async (tx) => {
      await tx.execute(
        `INSERT INTO "strav_workflow_runs"
           (id, workflow_name, input, status, state, current_step, result, error, created_at, updated_at)
         VALUES ($1, $2, $3::jsonb, 'pending', $4::jsonb, 0, NULL, NULL, now(), now())`,
        [runId, workflowName, JSON.stringify(input), JSON.stringify({ results: {}, stepAttempts: {} })],
      )
      await this.queue.dispatch(this.advanceJob, { runId })
    })
    return runId
  }

  /**
   * Read a run by id.
   *
   * @throws RunNotFoundError when no row has that id.
   */
  async find(runId: string): Promise<RunSnapshot> {
    const row = await this.db.queryOne<RunRow>(
      `SELECT id, workflow_name, input, status, state, current_step, result, error, created_at, updated_at
       FROM "strav_workflow_runs" WHERE id = $1`,
      [runId],
    )
    if (!row) throw new RunNotFoundError(runId)
    return toSnapshot(row)
  }

  /**
   * Advance handler. Processes at most ONE step inside one transaction:
   *
   *   1. SELECT ... FOR UPDATE the run row (serializes concurrent advances).
   *   2. Bail out if the run is `completed`, `failed` or `compensating`.
   *   3. Resolve the workflow and the step at `current_step`; if the
   *      cursor is past the last step, mark the run `completed`.
   *   4. If a `completed` journal row already exists for the step, reuse
   *      its result and bump the cursor without calling the handler.
   *   5. Otherwise call the handler. On success: journal, bump the
   *      cursor. On throw: schedule a delayed retry if attempts remain,
   *      else journal the failure and kick off compensation.
   *
   * After the transaction commits, a follow-up advance job is dispatched
   * whenever the cursor moved, so the next step runs in its own job and
   * its own transaction.
   *
   * @throws RunNotFoundError when no row has that id.
   */
  async advance(runId: string): Promise<void> {
    const cursorMoved = await this.db.transaction(async (tx) => {
      const row = await tx.queryOne<RunRow>(
        `SELECT id, workflow_name, input, status, state, current_step, result, error, created_at, updated_at
         FROM "strav_workflow_runs" WHERE id = $1 FOR UPDATE`,
        [runId],
      )
      if (!row) throw new RunNotFoundError(runId)

      // `compensating` is included on purpose: a redelivered or stale
      // advance job must not re-run the failed step's handler, nor let
      // `advanceCursor` flip the status back to `running` mid-rollback.
      if (row.status === 'completed' || row.status === 'failed' || row.status === 'compensating') {
        return false
      }

      const wf = this.registry.get(row.workflow_name)
      const state = this.readState(row.state)
      const input = parseJson(row.input) as Record<string, unknown>

      if (row.current_step >= wf.steps.length) {
        await this.markCompleted(tx, runId, state.results)
        return false
      }

      const step = wf.steps[row.current_step]!

      // Idempotent replay — if we already journaled this step as
      // completed, skip the handler and just advance the cursor.
      const journaled = await tx.queryOne<JournalRow>(
        `SELECT id, run_id, step_name, status, result, error, attempts, completed_at
         FROM "strav_workflow_journal" WHERE run_id = $1 AND step_name = $2`,
        [runId, step.name],
      )
      if (journaled?.status === 'completed') {
        state.results[step.name] = parseJson(journaled.result)
        await this.advanceCursor(tx, runId, row.current_step + 1, state)
        return true
      }

      const attempt = (state.stepAttempts[step.name] ?? 0) + 1
      const ctx: DurableContext = {
        input,
        results: state.results,
        runId,
        attempt,
      }

      // Only the handler call is guarded. A database error while
      // journaling must propagate and roll the transaction back: it is
      // not a step failure, and Postgres rejects further statements on
      // a transaction that has already errored.
      let result: unknown
      try {
        result = await step.handler(ctx)
      } catch (err) {
        await this.handleStepFailure(tx, runId, step, state, attempt, err)
        return false
      }

      await tx.execute(
        `INSERT INTO "strav_workflow_journal"
           (id, run_id, step_name, status, result, error, attempts, completed_at, created_at, updated_at)
         VALUES ($1, $2, $3, 'completed', $4::jsonb, NULL, $5, now(), now(), now())`,
        [ulid(), runId, step.name, JSON.stringify(result ?? null), attempt],
      )
      state.results[step.name] = result
      delete state.stepAttempts[step.name]
      await this.advanceCursor(tx, runId, row.current_step + 1, state)
      return true
    })

    // The follow-up job is dispatched after the transaction callback
    // has returned, so the next step is processed by a separate job.
    if (cursorMoved) {
      await this.queue.dispatch(this.advanceJob, { runId })
    }
  }

  /**
   * Compensate handler. Runs inside one transaction with the run row
   * locked; a no-op unless the run is in `compensating`.
   *
   * Collects the step names journaled as `completed` (oldest first by
   * `completed_at`) and invokes their compensators in the reverse of
   * that order. Compensators that throw are logged but don't halt the
   * rollback — the rest still run. When the walk finishes the run
   * lands in `failed`.
   *
   * @throws RunNotFoundError when no row has that id.
   */
  async compensate(runId: string): Promise<void> {
    await this.db.transaction(async (tx) => {
      const row = await tx.queryOne<RunRow>(
        `SELECT id, workflow_name, input, status, state, current_step, result, error, created_at, updated_at
         FROM "strav_workflow_runs" WHERE id = $1 FOR UPDATE`,
        [runId],
      )
      if (!row) throw new RunNotFoundError(runId)
      if (row.status !== 'compensating') return

      const wf = this.registry.get(row.workflow_name)
      const state = this.readState(row.state)
      const input = parseJson(row.input) as Record<string, unknown>

      const journal = await tx.query<JournalRow>(
        `SELECT id, run_id, step_name, status, result, error, attempts, completed_at
         FROM "strav_workflow_journal" WHERE run_id = $1 ORDER BY completed_at ASC`,
        [runId],
      )
      // Names of successfully-completed steps in journal order. Failed
      // rows are skipped — they hold no committed work to roll back.
      // A Set iterates in insertion order, so reversing it below walks
      // the journal newest-first.
      const completedNames = new Set(
        journal.filter((j) => j.status === 'completed').map((j) => j.step_name),
      )
      const stepsByName = new Map<string, DurableStep>(wf.steps.map((s) => [s.name, s]))

      for (const name of [...completedNames].reverse()) {
        const step = stepsByName.get(name)
        // Skip steps without a compensator, and journal rows whose
        // step is no longer present in the registered workflow.
        if (!step?.compensate) continue
        try {
          await step.compensate({
            input,
            results: state.results,
            runId,
            attempt: 1,
          })
        } catch (err) {
          this.logger?.error('Durable compensator threw', {
            runId,
            step: name,
            error: err instanceof Error ? err.message : String(err),
          })
        }
      }

      await tx.execute(
        `UPDATE "strav_workflow_runs"
         SET status = 'failed', updated_at = now()
         WHERE id = $1`,
        [runId],
      )
    })
  }

  // ─── Internal helpers ────────────────────────────────────────────────────

  /**
   * Decode the `state` column and guarantee both bags exist. `RunRow`
   * declares `results` and `stepAttempts` as optional, so a row that
   * lacks either gets an empty object instead of an undefined access.
   * Any other keys present on the stored state are carried through.
   */
  private readState(raw: RunRow['state']): {
    results: Record<string, unknown>
    stepAttempts: Record<string, number>
  } {
    const parsed = (parseJson(raw) ?? {}) as {
      results?: Record<string, unknown>
      stepAttempts?: Record<string, number>
    }
    return {
      ...parsed,
      results: parsed.results ?? {},
      stepAttempts: parsed.stepAttempts ?? {},
    }
  }

  /**
   * React to a throwing step handler. While attempts remain, persist
   * the attempt counter and enqueue a delayed advance; otherwise journal
   * the terminal failure, switch the run to `compensating` and enqueue
   * the compensate job.
   */
  private async handleStepFailure(
    tx: { execute: (s: string, p: unknown[]) => Promise<number> },
    runId: string,
    step: DurableStep,
    state: { results: Record<string, unknown>; stepAttempts: Record<string, number> },
    attempt: number,
    err: unknown,
  ): Promise<void> {
    const message = err instanceof Error ? err.message : String(err)
    this.logger?.warn('Durable step failed', {
      runId,
      step: step.name,
      attempt,
      error: message,
    })

    if (attempt < step.maxAttempts) {
      state.stepAttempts[step.name] = attempt
      await tx.execute(
        `UPDATE "strav_workflow_runs" SET state = $1::jsonb, updated_at = now() WHERE id = $2`,
        [JSON.stringify(state), runId],
      )
      // Clamp so a custom backoff returning a negative value cannot
      // produce a negative delay.
      const delaySec = Math.max(0, step.backoff(attempt))
      await this.queue.dispatchLater(delaySec, this.advanceJob, { runId })
      return
    }

    // Terminal — journal the failure, mark compensating, kick off
    // compensation. Compensation only visits steps journaled as
    // `completed`, so the step that just failed is never compensated.
    await tx.execute(
      `INSERT INTO "strav_workflow_journal"
         (id, run_id, step_name, status, result, error, attempts, completed_at, created_at, updated_at)
       VALUES ($1, $2, $3, 'failed', NULL, $4, $5, now(), now(), now())`,
      [ulid(), runId, step.name, message, attempt],
    )
    await tx.execute(
      `UPDATE "strav_workflow_runs"
       SET status = 'compensating', state = $1::jsonb, error = $2, updated_at = now()
       WHERE id = $3`,
      [JSON.stringify(state), message, runId],
    )
    await this.queue.dispatch(this.compensateJob, { runId })
  }

  /**
   * Mark the run `completed`, storing `results` both inside `state`
   * (with the retry counters reset) and in the `result` column.
   */
  private async markCompleted(
    tx: Database | { execute: (s: string, p: unknown[]) => Promise<number> },
    runId: string,
    results: Record<string, unknown>,
  ): Promise<void> {
    await tx.execute(
      `UPDATE "strav_workflow_runs"
       SET status = 'completed', state = $1::jsonb, result = $2::jsonb, updated_at = now()
       WHERE id = $3`,
      [
        JSON.stringify({ results, stepAttempts: {} }),
        JSON.stringify(results),
        runId,
      ],
    )
  }

  /** Persist the new cursor and state, and set the run's status to `running`. */
  private async advanceCursor(
    tx: { execute: (s: string, p: unknown[]) => Promise<number> },
    runId: string,
    nextStep: number,
    state: { results: Record<string, unknown>; stepAttempts: Record<string, number> },
  ): Promise<void> {
    await tx.execute(
      `UPDATE "strav_workflow_runs"
       SET current_step = $1, state = $2::jsonb, status = 'running', updated_at = now()
       WHERE id = $3`,
      [nextStep, JSON.stringify(state), runId],
    )
  }
}
|
|
368
|
+
|
|
369
|
+
// ─── Pure helpers ────────────────────────────────────────────────────────
|
|
370
|
+
|
|
371
|
+
/**
 * Decode a JSON column value that may arrive either as a JSON string
 * or as an already-decoded value.
 *
 * Strings are run through `JSON.parse` (which throws on malformed
 * input); everything else — including `null` and `undefined` — is
 * returned untouched.
 */
function parseJson(value: unknown): unknown {
  return typeof value === 'string' ? JSON.parse(value) : value
}
|
|
376
|
+
|
|
377
|
+
/**
 * Convert a raw run row into the public `RunSnapshot` shape: snake_case
 * columns become camelCase fields and the JSON columns are decoded.
 * `results` falls back to `{}` when the stored state is null or has no
 * `results` bag.
 */
function toSnapshot(row: RunRow): RunSnapshot {
  const decodedState = parseJson(row.state) as { results?: Record<string, unknown> } | null
  const decodedInput = parseJson(row.input) as Record<string, unknown>
  const decodedResult = parseJson(row.result) as Record<string, unknown> | null

  const snapshot: RunSnapshot = {
    id: row.id,
    workflowName: row.workflow_name,
    status: row.status,
    input: decodedInput,
    results: decodedState?.results ?? {},
    currentStep: row.current_step,
    result: decodedResult,
    error: row.error,
    createdAt: row.created_at,
    updatedAt: row.updated_at,
  }
  return snapshot
}
|
|
392
|
+
|
|
393
|
+
// Re-export for tests that want to construct a Database stub without
|
|
394
|
+
// depending on the parent module's import order.
|
|
395
|
+
export type { DurableError } from './durable_error.ts'
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `DurableWorkflow` — the builder apps use to declare a named,
|
|
3
|
+
* registered, crash-resumable workflow.
|
|
4
|
+
*
|
|
5
|
+
* Mirrors the `.step(name, handler, { compensate?, maxAttempts? })`
|
|
6
|
+
* surface from `@strav/workflow` so simple migrations are mostly
|
|
7
|
+
* copy-paste, but the semantics differ in three important ways:
|
|
8
|
+
*
|
|
9
|
+
* 1. Workflows are *named* and live in a registry. Steps are looked
|
|
10
|
+
* up by name when an `advance` job picks them off the queue —
|
|
11
|
+
* apps don't pass closures to `runner.start()`.
|
|
12
|
+
*
|
|
13
|
+
* 2. Each step is its own crash boundary. A step that's already
|
|
14
|
+
* journaled completed is skipped on replay; a step that throws
|
|
15
|
+
* is retried up to `maxAttempts` with `backoff` (default
|
|
16
|
+
* exponential, capped at 60s); a step that exhausts its
|
|
17
|
+
* attempts triggers reverse-order saga compensation.
|
|
18
|
+
*
|
|
19
|
+
* 3. Step handlers must be *resolvable across processes*. The
|
|
20
|
+
* registry holds the handler function; the queue payload carries
|
|
21
|
+
* only the run id. Handlers can close over module-
|
|
22
|
+
* level state but NOT request-scoped variables — the
|
|
23
|
+
* `advance` job may run in a worker that never saw the request
|
|
24
|
+
* that started the workflow.
|
|
25
|
+
*
|
|
26
|
+
* V1 ships sequential `.step()` only. V2 adds `.parallel` / `.route`
|
|
27
|
+
* / `.loop` / `.sleep` / `.waitForSignal` / `.childWorkflow`.
|
|
28
|
+
*/
|
|
29
|
+
|
|
30
|
+
import { DurableError } from './durable_error.ts'
|
|
31
|
+
import type {
|
|
32
|
+
DurableStep,
|
|
33
|
+
DurableStepHandler,
|
|
34
|
+
DurableStepOptions,
|
|
35
|
+
} from './types.ts'
|
|
36
|
+
|
|
37
|
+
/** Attempts per step (first try included) when the step sets no `maxAttempts`. */
const DEFAULT_MAX_ATTEMPTS = 3

/** Upper bound, in seconds, on the default retry delay. */
const MAX_BACKOFF_SECONDS = 60

/**
 * Default retry delay: `2 ** failedAttempt` seconds, capped at
 * `MAX_BACKOFF_SECONDS` (2s, 4s, 8s, … 60s).
 */
const defaultBackoff = (failedAttempt: number): number => {
  const exponential = 2 ** failedAttempt
  return exponential > MAX_BACKOFF_SECONDS ? MAX_BACKOFF_SECONDS : exponential
}
|
|
41
|
+
|
|
42
|
+
/**
 * Builder for a named workflow: an ordered list of uniquely-named
 * steps, each with its handler, retry policy and optional compensator.
 */
export class DurableWorkflow {
  readonly name: string
  private readonly _steps: DurableStep[] = []
  private readonly _names = new Set<string>()

  /**
   * @param name Workflow name.
   * @throws DurableError when `name` is empty.
   */
  constructor(name: string) {
    if (!name) throw new DurableError('DurableWorkflow: name must be a non-empty string.')
    this.name = name
  }

  /**
   * The steps declared so far, in declaration order, typed read-only.
   * This is the builder's own array, not a copy.
   */
  get steps(): readonly DurableStep[] {
    return this._steps
  }

  /**
   * Append a sequential step.
   *
   * @param name    Step name, unique within this workflow.
   * @param handler Function invoked to perform the step.
   * @param options Optional `maxAttempts` (first try included; default
   *                3), `backoff(failedAttempt)` returning seconds until
   *                the next attempt (default exponential capped at
   *                60s), and `compensate`, a saga rollback callback.
   * @returns `this`, for chaining.
   * @throws DurableError when `name` is empty or already used.
   */
  step(name: string, handler: DurableStepHandler, options?: DurableStepOptions): this {
    this.claim(name)

    const maxAttempts = options?.maxAttempts ?? DEFAULT_MAX_ATTEMPTS
    const backoff = options?.backoff ?? defaultBackoff
    const entry: DurableStep = { type: 'step', name, handler, maxAttempts, backoff }

    // Only attach `compensate` when one was supplied, so the key is
    // absent rather than present-but-undefined.
    const rollback = options?.compensate
    if (rollback) entry.compensate = rollback

    this._steps.push(entry)
    return this
  }

  /** Reserve a step name; throws if it is empty or already taken in this workflow. */
  private claim(name: string): void {
    if (!name) {
      throw new DurableError(`DurableWorkflow("${this.name}"): step name must be non-empty.`)
    }
    const alreadyUsed = this._names.has(name)
    if (alreadyUsed) {
      throw new DurableError(
        `DurableWorkflow("${this.name}"): duplicate step name "${name}". Steps are journaled by name; collisions would break replay.`,
      )
    }
    this._names.add(name)
  }
}
|
package/src/index.ts
CHANGED
|
@@ -1,37 +1,36 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
export {
|
|
1
|
+
// Public API of @strav/durable.
|
|
2
|
+
//
|
|
3
|
+
// Crash-resumable workflows on top of @strav/queue + Postgres. V1
|
|
4
|
+
// ships sequential `.step()` with per-step retries and saga
|
|
5
|
+
// compensation. V2 layers in parallel / route / loop / sleep /
|
|
6
|
+
// waitForSignal / childWorkflow.
|
|
7
|
+
|
|
8
|
+
export { defineDurable } from './define_durable.ts'
|
|
9
|
+
export {
|
|
10
|
+
DurableAdvanceJob,
|
|
11
|
+
type DurableAdvancePayload,
|
|
12
|
+
} from './durable_advance_job.ts'
|
|
13
|
+
export {
|
|
14
|
+
DurableCompensateJob,
|
|
15
|
+
type DurableCompensatePayload,
|
|
16
|
+
} from './durable_compensate_job.ts'
|
|
9
17
|
export {
|
|
10
18
|
DurableError,
|
|
11
19
|
RunNotFoundError,
|
|
12
20
|
WorkflowNotRegisteredError,
|
|
13
|
-
} from './
|
|
14
|
-
export {
|
|
21
|
+
} from './durable_error.ts'
|
|
22
|
+
export { DurableProvider } from './durable_provider.ts'
|
|
23
|
+
export { DurableRunner, type DurableRunnerOptions } from './durable_runner.ts'
|
|
24
|
+
export { DurableWorkflow } from './durable_workflow.ts'
|
|
25
|
+
export { JOURNAL_UNIQUE_INDEX, workflowJournalSchema } from './journal_schema.ts'
|
|
26
|
+
export { workflowRunsSchema } from './runs_schema.ts'
|
|
15
27
|
export type {
|
|
28
|
+
DurableCompensator,
|
|
16
29
|
DurableContext,
|
|
17
30
|
DurableStep,
|
|
18
31
|
DurableStepHandler,
|
|
19
|
-
DurableLoopHandler,
|
|
20
|
-
DurableRouteResolver,
|
|
21
|
-
DurableParallelEntry,
|
|
22
|
-
DurableCompensator,
|
|
23
32
|
DurableStepOptions,
|
|
24
|
-
|
|
25
|
-
SequentialStep,
|
|
26
|
-
ParallelStep,
|
|
27
|
-
RouteStep,
|
|
28
|
-
LoopStep,
|
|
29
|
-
SleepStep,
|
|
30
|
-
SignalStep,
|
|
31
|
-
ChildStep,
|
|
33
|
+
RunSnapshot,
|
|
32
34
|
RunStatus,
|
|
33
|
-
JournalStatus,
|
|
34
|
-
StartResult,
|
|
35
|
-
ResumeResult,
|
|
36
|
-
RunStatusSnapshot,
|
|
37
35
|
} from './types.ts'
|
|
36
|
+
export { WorkflowRegistry } from './workflow_registry.ts'
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `strav_workflow_journal` — per-step checkpoint log.
|
|
3
|
+
*
|
|
4
|
+
* One row per step *completion* (success or terminal failure). The
|
|
5
|
+
* `UNIQUE (run_id, step_name)` constraint is the load-bearing
|
|
6
|
+
* idempotency mechanism: when the queue redelivers an `advance` job
|
|
7
|
+
* after a worker crash mid-step, the handler can re-INSERT under a
|
|
8
|
+
* conflict and detect "we already completed this step" without
|
|
9
|
+
* re-running the handler.
|
|
10
|
+
*
|
|
11
|
+
* Columns:
|
|
12
|
+
* - `id` — ULID PK
|
|
13
|
+
* - `run_id` — FK to `strav_workflow_runs`
|
|
14
|
+
* - `step_name` — step identifier (workflow.steps[i].name)
|
|
15
|
+
* - `status` — `completed` (handler returned) | `failed` (terminal)
|
|
16
|
+
* - `result` — jsonb of the handler's return; null on failure
|
|
17
|
+
* - `error` — terminal failure message; null on success
|
|
18
|
+
* - `attempts` — total attempts the step took (1 = succeeded on first try)
|
|
19
|
+
* - `completed_at` — wall-clock timestamp the row was inserted
|
|
20
|
+
*
|
|
21
|
+
* In-flight retries are tracked on the *run* row (a `state.stepAttempts`
|
|
22
|
+
* counter per step name), not here — the journal is append-only and
|
|
23
|
+
* carries only terminal step outcomes.
|
|
24
|
+
*/
|
|
25
|
+
|
|
26
|
+
import { Archetype, defineSchema } from '@strav/database'
|
|
27
|
+
|
|
28
|
+
/**
 * Schema definition for the `strav_workflow_journal` table: an ID, the
 * owning run id, the step name, the outcome (`status`, nullable
 * `result` / `error`, `attempts`), a `completed_at` timestamp and the
 * standard timestamps.
 */
export const workflowJournalSchema = defineSchema(
  'strav_workflow_journal',
  Archetype.Event,
  (table) => {
    table.id()
    table.string('run_id').max(26)
    table.string('step_name').max(255)
    table.string('status').max(32)
    table.json('result').nullable()
    table.text('error').nullable()
    table.integer('attempts').default(1)
    table.timestamp('completed_at')
    table.timestamps()
  },
)

/**
 * Name of the UNIQUE index on `(run_id, step_name)` that
 * `DurableProvider` provisions at boot.
 *
 * The schema builder has no table-level unique in V1 (see `t.unique`),
 * so the provider emits the index itself via
 * `CREATE UNIQUE INDEX IF NOT EXISTS` once the journal table exists.
 * It is a second line of defence against duplicate writes: the advance
 * handler's row lock on the run already serializes journal INSERTs for
 * a given (run_id, step_name).
 */
export const JOURNAL_UNIQUE_INDEX = 'strav_workflow_journal_run_step_unique_idx'
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `strav_workflow_runs` — the durable record of a single workflow
|
|
3
|
+
* execution. One row per `runner.start()` call; mutated as the
|
|
4
|
+
* `advance` handler walks the step list.
|
|
5
|
+
*
|
|
6
|
+
* Columns:
|
|
7
|
+
* - `id` — ULID PK
|
|
8
|
+
* - `workflow_name` — registry key the run was started against
|
|
9
|
+
* - `input` — the original input object (jsonb, never mutated post-start)
|
|
10
|
+
* - `status` — `pending` / `running` / `compensating` / `completed` / `failed`
|
|
11
|
+
* - `state` — jsonb bag carrying `results` (the per-step return values) and `stepAttempts` (in-flight retry counters per step name)
|
|
12
|
+
* - `current_step` — 0-based cursor pointing at the next step to advance
|
|
13
|
+
* - `result` — set to `state.results` on completion; null otherwise
|
|
14
|
+
* - `error` — terminal failure message; null on success
|
|
15
|
+
* - timestamps
|
|
16
|
+
*
|
|
17
|
+
* The hot path (advance / compensate) writes via raw SQL for atomicity;
|
|
18
|
+
* application code that polls a run reads via the standard Repository
|
|
19
|
+
* surface or via `DurableRunner.find`.
|
|
20
|
+
*/
|
|
21
|
+
|
|
22
|
+
import { Archetype, defineSchema } from '@strav/database'
|
|
23
|
+
|
|
24
|
+
/**
 * Schema definition for the `strav_workflow_runs` table: an ID, the
 * workflow name, the JSON `input` and `state`, a `status` defaulting
 * to `pending`, a `current_step` cursor defaulting to 0, nullable
 * `result` / `error`, and the standard timestamps.
 */
export const workflowRunsSchema = defineSchema(
  'strav_workflow_runs',
  Archetype.Event,
  (table) => {
    table.id()
    table.string('workflow_name').max(255)
    table.json('input')
    table.string('status').max(32).default('pending')
    table.json('state')
    table.integer('current_step').default(0)
    table.json('result').nullable()
    table.text('error').nullable()
    table.timestamps()
  },
)
|