@strav/durable 0.4.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +26 -0
- package/package.json +49 -0
- package/src/builder.ts +158 -0
- package/src/config.ts +36 -0
- package/src/durable.ts +268 -0
- package/src/engine/advance_handler.ts +154 -0
- package/src/engine/compensate_handler.ts +70 -0
- package/src/engine/compensation_driver.ts +61 -0
- package/src/engine/context.ts +36 -0
- package/src/engine/enqueue.ts +62 -0
- package/src/engine/finalize.ts +111 -0
- package/src/engine/index.ts +20 -0
- package/src/engine/run_store.ts +42 -0
- package/src/engine/step_driver.ts +291 -0
- package/src/engine/suspended_run.ts +24 -0
- package/src/errors.ts +21 -0
- package/src/helpers.ts +16 -0
- package/src/index.ts +37 -0
- package/src/models/index.ts +3 -0
- package/src/models/journal.ts +54 -0
- package/src/models/run_machine.ts +39 -0
- package/src/models/workflow_run.ts +36 -0
- package/src/providers/durable_provider.ts +31 -0
- package/src/providers/index.ts +2 -0
- package/src/registry.ts +35 -0
- package/src/schema.ts +70 -0
- package/src/types.ts +216 -0
- package/src/util.ts +25 -0
- package/tsconfig.json +5 -0
package/CHANGELOG.md
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# @strav/durable
|
|
2
|
+
|
|
3
|
+
## 0.4.26
|
|
4
|
+
|
|
5
|
+
Initial release — durable, crash-resumable workflow execution.
|
|
6
|
+
|
|
7
|
+
- `durable(name)` builder mirroring the `@strav/workflow` authoring API
|
|
8
|
+
(`.step` / `.parallel` / `.route` / `.loop` + `compensate`), plus durable-only
|
|
9
|
+
step types `.sleep`, `.waitForSignal`, and `.childWorkflow`.
|
|
10
|
+
- Explicit-journal execution model (DBOS/Inngest style, no determinism sandbox):
|
|
11
|
+
every step is checkpointed to `_strav_workflow_journal`; `UNIQUE (run_id, step_id)`
|
|
12
|
+
makes redelivery idempotent.
|
|
13
|
+
- Queue-driven progression — one top-level step = one `durable:advance`
|
|
14
|
+
`@strav/queue` job; the journal write + run-row update + next-job enqueue
|
|
15
|
+
commit in a single transaction, so the engine inherits crash-safety from the
|
|
16
|
+
queue. A workflow killed mid-execution resumes from the first incomplete step.
|
|
17
|
+
- Suspend/resume: `waitForSignal` parks a run for an unbounded time holding no
|
|
18
|
+
process; `Durable.resume()` delivers the signal.
|
|
19
|
+
- Durable timers via `.sleep` (survives process restarts).
|
|
20
|
+
- Independently durable, parent-linked child workflows.
|
|
21
|
+
- Journaled saga compensation — rollback is itself crash-safe.
|
|
22
|
+
- Composes with `@strav/brain` — a step returning a `SuspendedRun` suspends the
|
|
23
|
+
run; resume re-enters the step (duck-typed, no `@strav/brain` dependency).
|
|
24
|
+
- `WorkflowRun` — a `stateful()` ORM model over the run record, with a
|
|
25
|
+
`@strav/machine` run-status lifecycle.
|
|
26
|
+
- `Durable.start` / `resume` / `status` / `list` / `cancel` / `recover`.
|
package/package.json
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@strav/durable",
|
|
3
|
+
"version": "0.4.27",
|
|
4
|
+
"type": "module",
|
|
5
|
+
"description": "Durable, crash-resumable workflow execution for the Strav framework",
|
|
6
|
+
"license": "MIT",
|
|
7
|
+
"keywords": [
|
|
8
|
+
"bun",
|
|
9
|
+
"framework",
|
|
10
|
+
"typescript",
|
|
11
|
+
"strav",
|
|
12
|
+
"durable",
|
|
13
|
+
"workflow",
|
|
14
|
+
"orchestration"
|
|
15
|
+
],
|
|
16
|
+
"exports": {
|
|
17
|
+
".": "./src/index.ts",
|
|
18
|
+
"./engine": "./src/engine/index.ts",
|
|
19
|
+
"./engine/*": "./src/engine/*.ts",
|
|
20
|
+
"./models": "./src/models/index.ts",
|
|
21
|
+
"./models/*": "./src/models/*.ts",
|
|
22
|
+
"./providers": "./src/providers/index.ts",
|
|
23
|
+
"./providers/*": "./src/providers/*.ts",
|
|
24
|
+
"./*": "./src/*.ts"
|
|
25
|
+
},
|
|
26
|
+
"files": [
|
|
27
|
+
"src/",
|
|
28
|
+
"package.json",
|
|
29
|
+
"tsconfig.json",
|
|
30
|
+
"CHANGELOG.md"
|
|
31
|
+
],
|
|
32
|
+
"peerDependencies": {
|
|
33
|
+
"@strav/kernel": "0.4.27",
|
|
34
|
+
"@strav/database": "0.4.27"
|
|
35
|
+
},
|
|
36
|
+
"dependencies": {
|
|
37
|
+
"@strav/queue": "0.4.27",
|
|
38
|
+
"@strav/machine": "0.4.27",
|
|
39
|
+
"@strav/workflow": "0.4.27",
|
|
40
|
+
"luxon": "^3.7.2"
|
|
41
|
+
},
|
|
42
|
+
"devDependencies": {
|
|
43
|
+
"@types/luxon": "^3.7.1"
|
|
44
|
+
},
|
|
45
|
+
"scripts": {
|
|
46
|
+
"test": "bun test tests/",
|
|
47
|
+
"typecheck": "tsc --noEmit"
|
|
48
|
+
}
|
|
49
|
+
}
|
package/src/builder.ts
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
import { DurableError } from './errors.ts'
|
|
2
|
+
import type {
|
|
3
|
+
DurableStep,
|
|
4
|
+
DurableStepHandler,
|
|
5
|
+
DurableLoopHandler,
|
|
6
|
+
DurableRouteResolver,
|
|
7
|
+
DurableParallelEntry,
|
|
8
|
+
DurableStepOptions,
|
|
9
|
+
DurableLoopOptions,
|
|
10
|
+
} from './types.ts'
|
|
11
|
+
|
|
12
|
+
const DEFAULT_MAX_RETRIES = 3
|
|
13
|
+
|
|
14
|
+
/**
|
|
15
|
+
* Durable workflow builder.
|
|
16
|
+
*
|
|
17
|
+
* Mirrors the `@strav/workflow` authoring API (`.step` / `.parallel` /
|
|
18
|
+
* `.route` / `.loop` + `compensate`) so plain workflow code is copy-paste
|
|
19
|
+
* portable, and adds durable-only step types: `.sleep`, `.waitForSignal`,
|
|
20
|
+
* and `.childWorkflow`.
|
|
21
|
+
*
|
|
22
|
+
* Built workflows are flat ordered step lists — there is no re-runnable
|
|
23
|
+
* function body, so durability needs no determinism sandbox.
|
|
24
|
+
*
|
|
25
|
+
* @example
|
|
26
|
+
* durable('milestone')
|
|
27
|
+
* .step('discover', async (ctx) => discover(ctx.input))
|
|
28
|
+
* .parallel('build', [
|
|
29
|
+
* { name: 'api', handler: async (ctx) => buildApi(ctx) },
|
|
30
|
+
* { name: 'ui', handler: async (ctx) => buildUi(ctx) },
|
|
31
|
+
* ])
|
|
32
|
+
* .waitForSignal('signoff', 'founder-signoff')
|
|
33
|
+
* .step('ship', async (ctx) => ship(ctx))
|
|
34
|
+
*/
|
|
35
|
+
export class DurableWorkflow {
  /** Workflow name; quoted in the builder's duplicate-step error message. */
  readonly name: string
  private readonly _steps: DurableStep[] = []
  private readonly _names = new Set<string>()

  constructor(name: string) {
    this.name = name
  }

  /**
   * The ordered step list, in the order the builder methods were called.
   * This is a read-only *typed* view of the live internal array, not a copy.
   */
  get steps(): readonly DurableStep[] {
    return this._steps
  }

  /**
   * Reserve a top-level step name.
   *
   * @throws DurableError if the name was already used in this workflow.
   */
  private claim(name: string): void {
    if (this._names.has(name)) {
      throw new DurableError(
        `Duplicate step name "${name}" in durable workflow "${this.name}".`
      )
    }
    this._names.add(name)
  }

  /** Add a sequential step. Its result is stored in `ctx.results[name]`. */
  step(name: string, handler: DurableStepHandler, options?: DurableStepOptions): this {
    this.claim(name)
    this._steps.push({
      type: 'step',
      name,
      handler,
      compensate: options?.compensate,
      maxRetries: options?.maxRetries ?? DEFAULT_MAX_RETRIES,
      retryBackoff: options?.retryBackoff ?? 'exponential',
    })
    return this
  }

  /**
   * Run multiple handlers in parallel; each result is stored under its entry name.
   *
   * @throws DurableError if `entries` is empty or `name` is already taken.
   */
  parallel(
    name: string,
    entries: DurableParallelEntry[],
    options?: Pick<DurableStepOptions, 'maxRetries' | 'retryBackoff'>
  ): this {
    // Validate before claiming: a rejected call must not leave `name`
    // reserved, otherwise a corrected retry fails with "Duplicate step name".
    if (entries.length === 0) {
      throw new DurableError(`Parallel step "${name}" must have at least one entry.`)
    }
    this.claim(name)
    // NOTE(review): entry names are not checked for uniqueness here, neither
    // against each other nor against top-level step names — confirm whether
    // the step driver tolerates collisions.
    this._steps.push({
      type: 'parallel',
      name,
      entries,
      maxRetries: options?.maxRetries ?? DEFAULT_MAX_RETRIES,
      retryBackoff: options?.retryBackoff ?? 'exponential',
    })
    return this
  }

  /** Route to a branch based on a resolver's return value. */
  route(
    name: string,
    resolver: DurableRouteResolver,
    branches: Record<string, DurableStepHandler>,
    options?: Pick<DurableStepOptions, 'maxRetries' | 'retryBackoff'>
  ): this {
    this.claim(name)
    this._steps.push({
      type: 'route',
      name,
      resolver,
      branches,
      maxRetries: options?.maxRetries ?? DEFAULT_MAX_RETRIES,
      retryBackoff: options?.retryBackoff ?? 'exponential',
    })
    return this
  }

  /** Run a handler in a loop until a condition is met or max iterations reached. */
  loop(name: string, handler: DurableLoopHandler, options: DurableLoopOptions): this {
    this.claim(name)
    this._steps.push({
      type: 'loop',
      name,
      handler,
      maxIterations: options.maxIterations,
      until: options.until,
      feedback: options.feedback,
      mapInput: options.mapInput,
      maxRetries: options.maxRetries ?? DEFAULT_MAX_RETRIES,
      retryBackoff: options.retryBackoff ?? 'exponential',
    })
    return this
  }

  /** Durable timer — suspend the run and resume after `duration` (ms or a Date). */
  sleep(name: string, duration: number | Date): this {
    this.claim(name)
    this._steps.push({ type: 'sleep', name, duration })
    return this
  }

  /**
   * Suspend the run until `Durable.resume(runId, signal, data)` is called.
   * Holds no process while suspended — resumes exactly, even days later.
   *
   * `options.timeout` is recorded on the step as-is.
   * NOTE(review): its unit and enforcement are not visible here — confirm.
   */
  waitForSignal(name: string, signal: string, options?: { timeout?: number }): this {
    this.claim(name)
    this._steps.push({ type: 'signal', name, signal, timeout: options?.timeout })
    return this
  }

  /**
   * Spawn an independently durable child workflow and wait for it to finish.
   * The child result is stored in `ctx.results[name]`.
   *
   * @param name      Step name within this (parent) workflow.
   * @param childName Name of the workflow to start as the child.
   * @param mapInput  Optional mapper from the parent context to the child's input.
   */
  childWorkflow(
    name: string,
    childName: string,
    mapInput?: (ctx: import('./types.ts').DurableContext) => Record<string, unknown>
  ): this {
    this.claim(name)
    this._steps.push({ type: 'child', name, childName, mapInput })
    return this
  }
}
|
package/src/config.ts
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
/** Settings shared by the whole `@strav/durable` engine. */
export interface DurableConfig {
  /**
   * Name of the `@strav/queue` queue that durable jobs are dispatched on.
   * Start a `Worker({ queue })` for this name to process durable workflows;
   * a dedicated queue lets durable work scale separately from app jobs.
   */
  queue: string
  /**
   * Timeout, in milliseconds, applied to each `durable:advance` /
   * `durable:compensate` job. Set it well above the slowest single step
   * (for example an LLM call).
   */
  jobTimeout: number
  /**
   * Maximum attempts the queue makes for a durable job. This only covers
   * *process crashes*; retries of a failing step are driven separately by the
   * engine through the journal's `attempt` count.
   */
  maxAttempts: number
}
|
|
21
|
+
|
|
22
|
+
/** Module-level engine configuration, initialised with the built-in defaults. */
const config: DurableConfig = {
  queue: 'durable',
  jobTimeout: 600_000,
  maxAttempts: 5,
}

/**
 * Read the current durable engine configuration.
 *
 * @returns The live configuration object (not a copy).
 */
export function getConfig(): DurableConfig {
  return config
}

/**
 * Override durable engine configuration. Call before booting workers.
 *
 * Keys whose value is `undefined` are ignored, so a patch assembled from
 * optional sources (e.g. `{ queue: env.DURABLE_QUEUE }`) cannot erase a
 * default. Every other value — including `0` — is applied as given.
 *
 * @param patch Subset of {@link DurableConfig} to apply.
 */
export function configureDurable(patch: Partial<DurableConfig>): void {
  const defined = Object.entries(patch).filter(([, value]) => value !== undefined)
  Object.assign(config, Object.fromEntries(defined))
}
|
package/src/durable.ts
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
import { sql, transaction } from '@strav/database'
|
|
2
|
+
import { Queue } from '@strav/queue'
|
|
3
|
+
import { registry } from './registry.ts'
|
|
4
|
+
import { ensureTables } from './schema.ts'
|
|
5
|
+
import { configureDurable, type DurableConfig } from './config.ts'
|
|
6
|
+
import { RunNotFoundError } from './errors.ts'
|
|
7
|
+
import type {
|
|
8
|
+
ResumeResult,
|
|
9
|
+
RunStatus,
|
|
10
|
+
RunStatusSnapshot,
|
|
11
|
+
StartResult,
|
|
12
|
+
} from './types.ts'
|
|
13
|
+
import { writeJournal } from './models/journal.ts'
|
|
14
|
+
import {
|
|
15
|
+
advanceHandler,
|
|
16
|
+
applyPatch,
|
|
17
|
+
compensateHandler,
|
|
18
|
+
enqueueAdvance,
|
|
19
|
+
enqueueCompensate,
|
|
20
|
+
loadRun,
|
|
21
|
+
lockRun,
|
|
22
|
+
type AdvancePayload,
|
|
23
|
+
type CompensatePayload,
|
|
24
|
+
type Tx,
|
|
25
|
+
} from './engine/index.ts'
|
|
26
|
+
|
|
27
|
+
/**
 * Drop engine-internal keys so `results` reads like a `@strav/workflow` context.
 *
 * Returns a new object holding every own enumerable entry of `state` whose key
 * does not start with `__strav_`; `state` itself is not modified.
 *
 * Built with `Object.fromEntries` so each key becomes an own data property.
 * Plain `out[key] = value` would treat an own `"__proto__"` key (possible in
 * JSON-parsed state) as a prototype assignment and silently drop the entry.
 */
function publicResults(state: Record<string, unknown>): Record<string, unknown> {
  return Object.fromEntries(
    Object.entries(state).filter(([key]) => !key.startsWith('__strav_'))
  )
}
|
|
35
|
+
|
|
36
|
+
/**
 * Cancel `runId` and, depth-first, every run reachable through `parent_run_id`.
 *
 * A run's status is only changed to `canceled` when it is currently `pending`,
 * `running`, `suspended` or `compensating`; its children are visited either way.
 * All statements go through the supplied `trx`.
 */
async function cancelRecursive(trx: Tx, runId: number): Promise<void> {
  await trx`
    UPDATE "_strav_workflow_runs"
    SET "status" = 'canceled', "updated_at" = NOW()
    WHERE "id" = ${runId}
      AND "status" IN ('pending', 'running', 'suspended', 'compensating')
  `

  const childRows = (await trx`
    SELECT "id" FROM "_strav_workflow_runs" WHERE "parent_run_id" = ${runId}
  `) as Record<string, unknown>[]

  const childIds = childRows.map((row) => Number(row.id))
  for (const childId of childIds) {
    await cancelRecursive(trx, childId)
  }
}
|
|
50
|
+
|
|
51
|
+
/**
|
|
52
|
+
* Static facade for the durable execution engine.
|
|
53
|
+
*
|
|
54
|
+
* @example
|
|
55
|
+
* await Durable.start('milestone', { projectId: 42 })
|
|
56
|
+
* const snapshot = await Durable.status(runId)
|
|
57
|
+
* await Durable.resume(runId, 'founder-signoff', { approved: true })
|
|
58
|
+
*/
|
|
59
|
+
export class Durable {
  /** Create the engine's tables (`_strav_workflow_runs`, `_strav_workflow_journal`). */
  static async ensureTables(): Promise<void> {
    await ensureTables()
  }

  /** Override engine configuration (queue name, job timeout, max attempts). */
  static configure(patch: Partial<DurableConfig>): void {
    configureDurable(patch)
  }

  /**
   * Register the `durable:advance` / `durable:compensate` queue handlers.
   * Called by `DurableProvider.boot`.
   *
   * NOTE(review): repeated calls are presumed safe, but that depends on how
   * `Queue.handle` treats re-registration of the same job name — confirm.
   */
  static registerHandlers(): void {
    Queue.handle('durable:advance', async (payload: unknown) => {
      await advanceHandler(payload as AdvancePayload)
    })
    Queue.handle('durable:compensate', async (payload: unknown) => {
      await compensateHandler(payload as CompensatePayload)
    })
  }

  /**
   * Start a new durable run. Inserts the run row and enqueues the first
   * step's `durable:advance` job in a single transaction. Returns immediately
   * — the workflow runs on the queue.
   *
   * @param workflowName Name of a registered workflow.
   * @param input        Run input, stored JSON-serialised on the run row.
   * @param opts         Optional parent linkage (`parent_run_id` / `parent_step_id`).
   * @returns The new run id with status `running`.
   * @throws If `workflowName` is not registered (raised by `registry.get`).
   */
  static async start(
    workflowName: string,
    input: Record<string, unknown> = {},
    opts?: { parentRunId?: number; parentStepId?: string }
  ): Promise<StartResult> {
    registry.get(workflowName) // throws if not registered

    return await transaction(async (trx: Tx) => {
      const rows = (await trx`
        INSERT INTO "_strav_workflow_runs"
          ("workflow_name", "input", "status", "state", "current_step",
           "parent_run_id", "parent_step_id")
        VALUES (
          ${workflowName}, ${JSON.stringify(input)}, 'running', '{}', 0,
          ${opts?.parentRunId ?? null}, ${opts?.parentStepId ?? null}
        )
        RETURNING "id"
      `) as Record<string, unknown>[]
      const runId = Number(rows[0]!.id)
      await enqueueAdvance(trx, runId, 0)
      return { runId, status: 'running' as RunStatus }
    })
  }

  /**
   * Deliver a signal to a suspended run.
   *
   * - `waitForSignal` step → journals the payload, advances past the step.
   * - A suspended brain-agent `.step` → stores the payload and re-enters the
   *   step so the handler can call `runner.resume(...)`.
   *
   * Returns `{ accepted: false }` if the run is not suspended on a matching
   * signal (idempotent — a duplicate or mismatched signal is a no-op).
   *
   * @throws RunNotFoundError if no run with `runId` exists.
   */
  static async resume(
    runId: number,
    signal: string,
    data?: unknown
  ): Promise<ResumeResult> {
    return await transaction(async (trx: Tx) => {
      const run = await lockRun(trx, runId)
      if (!run) throw new RunNotFoundError(runId)
      if (run.status !== 'suspended' || run.awaitingSignal !== signal) {
        return { accepted: false, status: run.status }
      }

      const step = registry.get(run.workflowName).steps[run.currentStep]
      if (!step) return { accepted: false, status: run.status }

      if (step.type === 'signal') {
        // The signal payload becomes the step's journaled result.
        await writeJournal(trx, runId, [
          { stepId: step.name, status: 'completed', result: data ?? null, attempt: 1 },
        ])
        const newState = applyPatch(run.state, { [step.name]: data ?? null })
        const next = run.currentStep + 1
        await trx`
          UPDATE "_strav_workflow_runs"
          SET "status" = 'running', "awaiting_signal" = NULL,
              "state" = ${JSON.stringify(newState)}, "current_step" = ${next},
              "updated_at" = NOW()
          WHERE "id" = ${runId}
        `
        await enqueueAdvance(trx, runId, next)
        return { accepted: true, status: 'running' as RunStatus }
      }

      if (step.type === 'step') {
        // A suspended brain agent — re-enter the same step with the resume data.
        const newState = applyPatch(run.state, { __strav_resume__: data ?? null })
        await trx`
          UPDATE "_strav_workflow_runs"
          SET "status" = 'running', "awaiting_signal" = NULL,
              "state" = ${JSON.stringify(newState)}, "updated_at" = NOW()
          WHERE "id" = ${runId}
        `
        await enqueueAdvance(trx, runId, run.currentStep)
        return { accepted: true, status: 'running' as RunStatus }
      }

      // Any other step type cannot be resumed by a signal.
      return { accepted: false, status: run.status }
    })
  }

  /**
   * Snapshot a run's live state together with its child runs.
   *
   * Children are snapshotted by calling `status` on each of them, so the
   * `children` tree covers all descendants, not just direct children.
   * `totalSteps` is `0` when the run's workflow is not in the registry.
   *
   * @throws RunNotFoundError if no run with `runId` exists.
   */
  static async status(runId: number): Promise<RunStatusSnapshot> {
    const run = await loadRun(runId)
    if (!run) throw new RunNotFoundError(runId)

    const totalSteps = registry.has(run.workflowName)
      ? registry.get(run.workflowName).steps.length
      : 0

    const childRows = (await sql`
      SELECT "id" FROM "_strav_workflow_runs"
      WHERE "parent_run_id" = ${runId} ORDER BY "id"
    `) as Record<string, unknown>[]
    const children: RunStatusSnapshot[] = []
    for (const child of childRows) {
      children.push(await Durable.status(Number(child.id)))
    }

    return {
      runId: run.id,
      workflowName: run.workflowName,
      status: run.status,
      currentStep: run.currentStep,
      totalSteps,
      awaitingSignal: run.awaitingSignal,
      wakeAt: run.wakeAt ? run.wakeAt.toISOString() : null,
      results: publicResults(run.state),
      error: run.error,
      parentRunId: run.parentRunId,
      children,
    }
  }

  /**
   * List runs, most recent first, optionally filtered by status / parent.
   * Each entry is a full `status()` snapshot, loaded one run at a time.
   */
  static async list(filter?: {
    status?: RunStatus
    parentRunId?: number
  }): Promise<RunStatusSnapshot[]> {
    const status = filter?.status ?? null
    const parentRunId = filter?.parentRunId ?? null
    const rows = (await sql`
      SELECT "id" FROM "_strav_workflow_runs"
      WHERE (${status}::text IS NULL OR "status" = ${status})
        AND (${parentRunId}::bigint IS NULL OR "parent_run_id" = ${parentRunId})
      ORDER BY "id" DESC
    `) as Record<string, unknown>[]

    const snapshots: RunStatusSnapshot[] = []
    for (const row of rows) {
      snapshots.push(await Durable.status(Number(row.id)))
    }
    return snapshots
  }

  /** Cancel a run and all of its descendant child runs. */
  static async cancel(runId: number): Promise<void> {
    await transaction(async (trx: Tx) => {
      await cancelRecursive(trx, runId)
    })
  }

  /**
   * Re-enqueue runs that are `running` / `compensating` but have no live
   * `_strav_jobs` row — e.g. a job that dead-lettered. Returns the count
   * recovered. Safe to run periodically (e.g. via the `@strav/queue` Scheduler).
   *
   * Only runs for which a job was actually enqueued are counted: a
   * `compensating` run with no `compensation_cursor` has nothing to
   * re-enqueue and is skipped.
   */
  static async recover(): Promise<number> {
    const rows = (await sql`
      SELECT r."id", r."status", r."current_step", r."compensation_cursor"
      FROM "_strav_workflow_runs" r
      WHERE r."status" IN ('running', 'compensating')
        AND NOT EXISTS (
          SELECT 1 FROM "_strav_jobs" j
          WHERE j."job" IN ('durable:advance', 'durable:compensate')
            AND (j."payload"->>'runId')::bigint = r."id"
        )
    `) as Record<string, unknown>[]

    let recovered = 0
    for (const row of rows) {
      const runId = Number(row.id)
      if (row.status === 'running') {
        const stepIndex = Number(row.current_step)
        await transaction(async (trx: Tx) => {
          await enqueueAdvance(trx, runId, stepIndex)
        })
      } else if (row.compensation_cursor != null) {
        const cursor = Number(row.compensation_cursor)
        await transaction(async (trx: Tx) => {
          await enqueueCompensate(trx, runId, cursor)
        })
      } else {
        // Nothing to enqueue for this run — do not report it as recovered.
        continue
      }
      recovered++
    }
    return recovered
  }

  /** Clear the workflow registry. For testing only. */
  static reset(): void {
    registry.reset()
  }
}
|
package/src/engine/advance_handler.ts
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
import { transaction } from '@strav/database'
|
|
2
|
+
import { registry } from '../registry.ts'
|
|
3
|
+
import { loadJournal, writeJournal } from '../models/journal.ts'
|
|
4
|
+
import { buildContext } from './context.ts'
|
|
5
|
+
import { enqueueAdvance } from './enqueue.ts'
|
|
6
|
+
import { applyPatch, beginCompensation, completeRun } from './finalize.ts'
|
|
7
|
+
import { loadRun, lockRun, type Tx } from './run_store.ts'
|
|
8
|
+
import { runDurableStep, type StepOutcome } from './step_driver.ts'
|
|
9
|
+
|
|
10
|
+
/** Shape of the data carried by a `durable:advance` queue job. */
export interface AdvancePayload {
  /** Id of the run to advance. */
  runId: number
  /** Index of the top-level step this job is meant to execute. */
  stepIndex: number
  /** Attempt number for the step; the handler treats a missing value as 1. */
  attempt?: number
}
|
|
16
|
+
|
|
17
|
+
/**
|
|
18
|
+
* The `durable:advance` queue handler — runs one top-level step of a run.
|
|
19
|
+
*
|
|
20
|
+
* Phase A (no lock): load the run, guard against stale redelivery, execute the
|
|
21
|
+
* step. The step handler may take minutes — no row lock is held during it.
|
|
22
|
+
* Phase B (`applyOutcome`, row-locked transaction): apply the outcome —
|
|
23
|
+
* journal, advance/suspend/retry/compensate, enqueue the continuation — atomically.
|
|
24
|
+
*/
|
|
25
|
+
export async function advanceHandler(payload: AdvancePayload): Promise<void> {
  const runId = payload.runId
  const stepIndex = payload.stepIndex
  const attempt = payload.attempt ?? 1

  const run = await loadRun(runId)

  // Nothing to do when the run is gone, is not `running` (suspended, terminal
  // and compensating runs are driven elsewhere or not at all), or has already
  // moved past this step (stale redelivery).
  if (!run || run.status !== 'running' || run.currentStep !== stepIndex) {
    return
  }

  const { steps } = registry.get(run.workflowName)

  // An index beyond the final step means every step has finished.
  if (stepIndex >= steps.length) {
    await completeRun(runId)
    return
  }

  const step = steps[stepIndex]!
  const journal = await loadJournal(runId)
  const ctx = buildContext(run, attempt, step.name)

  // The step runs without any row lock; its outcome is applied afterwards.
  const outcome = await runDurableStep(step, ctx, journal)
  await applyOutcome(runId, stepIndex, outcome)
}
|
|
52
|
+
|
|
53
|
+
/**
 * Phase B — persist the result of a step inside one transaction, holding the
 * run row obtained from `lockRun`.
 *
 * The run is re-validated first: if it no longer exists, is not `running`, or
 * is no longer positioned on `stepIndex`, the outcome is dropped.
 *
 * @param runId     Run the outcome belongs to.
 * @param stepIndex Index of the step that produced the outcome.
 * @param outcome   What the step driver decided should happen next.
 */
async function applyOutcome(
  runId: number,
  stepIndex: number,
  outcome: StepOutcome
): Promise<void> {
  await transaction(async (trx: Tx) => {
    const locked = await lockRun(trx, runId)
    // A resume, a cancel, or a concurrent duplicate job may have changed the
    // run between step execution and this point.
    if (!locked || locked.status !== 'running' || locked.currentStep !== stepIndex) return

    switch (outcome.kind) {
      // Step finished: journal it, merge its results, move to the next step.
      case 'advance': {
        await writeJournal(trx, runId, outcome.journal)
        const merged = applyPatch(locked.state, outcome.resultPatch)
        // Resume data is single-use; remove it once the step has completed.
        delete merged['__strav_resume__']
        const nextIndex = stepIndex + 1
        const stateJson = JSON.stringify(merged)
        await trx`
          UPDATE "_strav_workflow_runs"
          SET "state" = ${stateJson}, "current_step" = ${nextIndex},
              "updated_at" = NOW()
          WHERE "id" = ${runId}
        `
        await enqueueAdvance(trx, runId, nextIndex)
        break
      }

      // Timer: move to the next step now, but delay its job until `wakeAt`.
      case 'sleep': {
        await writeJournal(trx, runId, outcome.journal)
        const merged = applyPatch(locked.state, outcome.resultPatch)
        const nextIndex = stepIndex + 1
        // A wake time already in the past yields an immediate job.
        const delay = Math.max(0, outcome.wakeAt.getTime() - Date.now())
        const stateJson = JSON.stringify(merged)
        await trx`
          UPDATE "_strav_workflow_runs"
          SET "state" = ${stateJson}, "current_step" = ${nextIndex},
              "wake_at" = ${outcome.wakeAt}, "updated_at" = NOW()
          WHERE "id" = ${runId}
        `
        await enqueueAdvance(trx, runId, nextIndex, { delay })
        break
      }

      // Park the run on a named signal; no job is enqueued.
      case 'suspend-signal': {
        await trx`
          UPDATE "_strav_workflow_runs"
          SET "status" = 'suspended', "awaiting_signal" = ${outcome.signal},
              "updated_at" = NOW()
          WHERE "id" = ${runId}
        `
        break
      }

      // Park the run on its own step name and keep the agent snapshot in state.
      case 'suspend-agent': {
        const merged = applyPatch(locked.state, { [outcome.stepName]: outcome.snapshot })
        const stateJson = JSON.stringify(merged)
        await trx`
          UPDATE "_strav_workflow_runs"
          SET "status" = 'suspended', "awaiting_signal" = ${outcome.stepName},
              "state" = ${stateJson}, "updated_at" = NOW()
          WHERE "id" = ${runId}
        `
        break
      }

      // Create the child run linked to this one, start it, suspend the parent.
      case 'await-child': {
        const inputJson = JSON.stringify(outcome.childInput)
        const inserted = (await trx`
          INSERT INTO "_strav_workflow_runs"
            ("workflow_name", "input", "status", "state", "current_step",
             "parent_run_id", "parent_step_id")
          VALUES (
            ${outcome.childName}, ${inputJson},
            'running', '{}', 0, ${runId}, ${outcome.childStepId}
          )
          RETURNING "id"
        `) as Record<string, unknown>[]
        const childRunId = Number(inserted[0]!.id)
        await enqueueAdvance(trx, childRunId, 0)
        await trx`
          UPDATE "_strav_workflow_runs"
          SET "status" = 'suspended', "updated_at" = NOW()
          WHERE "id" = ${runId}
        `
        break
      }

      // Journal the failed attempt and re-run the same step after a backoff.
      case 'retry': {
        await writeJournal(trx, runId, outcome.journal)
        await enqueueAdvance(trx, runId, stepIndex, {
          attempt: outcome.attempt,
          delay: outcome.backoffMs,
        })
        break
      }

      // Journal the failure and hand the run over to compensation.
      case 'compensate': {
        await writeJournal(trx, runId, outcome.journal)
        await beginCompensation(trx, locked, stepIndex, outcome.failure)
        break
      }
    }
  })
}
|