@effect-app/infra 4.0.0-beta.257 → 4.0.0-beta.258
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +11 -0
- package/dist/WorkflowEngineSqlite.d.ts +24 -0
- package/dist/WorkflowEngineSqlite.d.ts.map +1 -0
- package/dist/WorkflowEngineSqlite.js +550 -0
- package/package.json +7 -206
- package/src/WorkflowEngineCosmos.ts +719 -0
- package/src/WorkflowEngineSqlite.ts +813 -0
- package/test/dist/workflow-engine-sqlite.test.d.ts.map +1 -0
- package/test/workflow-engine-cosmos.test.ts +354 -0
- package/test/workflow-engine-sqlite.test.ts +299 -0
|
@@ -0,0 +1,719 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cosmos DB backed {@link WorkflowEngine} implementation.
|
|
3
|
+
*
|
|
4
|
+
* Persists workflow state in a single container partitioned by `executionId`
|
|
5
|
+
* so per-execution writes share a partition key (eligible for Cosmos
|
|
6
|
+
* TransactionalBatch). Optimistic concurrency is enforced with `_etag` +
|
|
7
|
+
* `IfMatch` on Replace, and create-only batch ops give first-writer-wins
|
|
8
|
+
* semantics for activity results and durable-deferred completions.
|
|
9
|
+
*
|
|
10
|
+
* Durability — everything that crosses the storage boundary is round-tripped
|
|
11
|
+
* through schema codecs (`S.fromJsonString(S.toCodecJson(...))`), exactly like
|
|
12
|
+
* the cluster engine, instead of dumping live runtime objects as JSON:
|
|
13
|
+
*
|
|
14
|
+
* - The workflow payload and the top-level workflow result are encoded with the
|
|
15
|
+
* workflow's own `payloadSchema` / `successSchema` / `errorSchema`, so typed
|
|
16
|
+
* values (dates, branded ids, schema classes) survive a restart.
|
|
17
|
+
* - Activity results flow through the engine already encoded, so they are
|
|
18
|
+
* persisted with an opaque `Workflow.Result({ success: AnyOrVoid, error:
|
|
19
|
+
* AnyOrVoid })` codec — same trick the cluster `ActivityRpc` uses.
|
|
20
|
+
* - Durable-deferred exits and clock completions use an opaque `Exit` codec.
|
|
21
|
+
*
|
|
22
|
+
* Crash recovery: each driver holds a time-bound lease (`worker` +
|
|
23
|
+
* `leaseExpiresAt`) on the exec doc and renews it via a heartbeat fiber. A
|
|
24
|
+
* scope-bound recovery poller queries for exec docs whose lease has lapsed and
|
|
25
|
+
* re-drives them in the local process, decoding the persisted payload and
|
|
26
|
+
* picking up persisted activity results from where the crashed driver left off.
|
|
27
|
+
*
|
|
28
|
+
* Durable clocks: `scheduleClock` writes a clock doc (`fireAt`, `deferredName`)
|
|
29
|
+
* and arms an in-process timer. A cross-partition clock poller fires any clock
|
|
30
|
+
* whose `fireAt` is due, completing the deferred idempotently (create-only) and
|
|
31
|
+
* deleting the doc. Survives restarts.
|
|
32
|
+
*/
|
|
33
|
+
import * as Effect from "effect-app/Effect"
|
|
34
|
+
import * as Layer from "effect-app/Layer"
|
|
35
|
+
import * as Option from "effect-app/Option"
|
|
36
|
+
import * as S from "effect-app/Schema"
|
|
37
|
+
import * as Duration from "effect/Duration"
|
|
38
|
+
import * as Exit from "effect/Exit"
|
|
39
|
+
import * as Fiber from "effect/Fiber"
|
|
40
|
+
import * as FiberMap from "effect/FiberMap"
|
|
41
|
+
import * as Redacted from "effect/Redacted"
|
|
42
|
+
import * as Schedule from "effect/Schedule"
|
|
43
|
+
import type * as Scope from "effect/Scope"
|
|
44
|
+
import * as Workflow from "effect/unstable/workflow/Workflow"
|
|
45
|
+
import { type Encoded, makeUnsafe, WorkflowEngine, WorkflowInstance } from "effect/unstable/workflow/WorkflowEngine"
|
|
46
|
+
import { randomUUID } from "node:crypto"
|
|
47
|
+
import { CosmosClient, CosmosClientLayer } from "./cosmos-client.js"
|
|
48
|
+
import { OptimisticConcurrencyException } from "./errors.js"
|
|
49
|
+
import { annotateCosmosResponse, annotateDb } from "./otel.js"
|
|
50
|
+
|
|
51
|
+
/**
 * Options accepted by {@link layerCosmos} to build the Cosmos-backed workflow engine.
 */
export interface WorkflowEngineCosmosConfig {
  /** Redacted value that is unwrapped and passed to `CosmosClientLayer` together with `dbName`. */
  readonly url: Redacted.Redacted<string>
  /** Database name passed to `CosmosClientLayer`. */
  readonly dbName: string
  /** Optional string prepended to the `workflow-engine` container id. */
  readonly prefix?: string
  /** How long a claimed lease stays valid before it is treated as stale. Defaults to 30 seconds. */
  readonly leaseTtl?: Duration.Duration
  /** Interval between lease renewals; keep it below `leaseTtl`. Defaults to 10 seconds. */
  readonly heartbeatInterval?: Duration.Duration
  /** Interval between stale-lease scans. Defaults to 15 seconds; `Duration.zero` turns the scan off. */
  readonly recoveryInterval?: Duration.Duration
  /** Interval between due-clock scans. Defaults to 5 seconds; `Duration.zero` turns the scan off. */
  readonly clockPollInterval?: Duration.Duration
  /** Stable identity of this worker; a random UUID is generated when omitted. */
  readonly workerId?: string
}
|
|
66
|
+
|
|
67
|
+
/** Values allowed in the `status` field of an exec document. */
type ExecStatus = "running" | "complete" | "interrupted"
|
|
68
|
+
|
|
69
|
+
/**
 * Per-execution state document. It is stored under the fixed id `"exec"` inside
 * the partition of its execution.
 */
interface ExecDoc {
  readonly id: "exec"
  /** Partition key; the engine writes the workflow execution id here. */
  readonly _partitionKey: string
  readonly type: "exec"
  readonly workflowName: string
  /** Workflow payload as a schema-encoded JSON string. */
  readonly payload: string
  /** Execution id of the parent workflow, when there is one. */
  readonly parent: string | undefined
  status: ExecStatus
  suspended: boolean
  interrupted: boolean
  /** Top-level `Workflow.Result` as a schema-encoded JSON string; written on completion. */
  completedResult?: string | undefined
  /** Id of the worker that currently holds the lease. */
  worker?: string | undefined
  /** ISO timestamp at which the current lease lapses. */
  leaseExpiresAt?: string | undefined
  /** ETag sent as the `IfMatch` condition when the document is replaced. */
  readonly _etag?: string
}
|
|
86
|
+
|
|
87
|
+
/** Persisted outcome of one activity attempt, stored in the partition of its execution. */
interface ActivityDoc {
  /** Document id; produced by `activityKey(name, attempt)`. */
  readonly id: string
  /** Partition key; the engine writes the workflow execution id here. */
  readonly _partitionKey: string
  readonly type: "activity"
  /** The activity's `Workflow.Result` as a schema-encoded JSON string. */
  readonly result: string
}
|
|
94
|
+
|
|
95
|
+
/** Persisted completion of a durable deferred, stored in the partition of its execution. */
interface DeferredDoc {
  /** Document id; produced by `deferredKey(name)`. */
  readonly id: string
  /** Partition key; the engine writes the workflow execution id here. */
  readonly _partitionKey: string
  readonly type: "deferred"
  /** The deferred's `Exit` as a schema-encoded JSON string. */
  readonly exit: string
}
|
|
102
|
+
|
|
103
|
+
/** Persisted durable clock, stored in the partition of its execution. */
interface ClockDoc {
  /** Document id; produced by `clockKey(name)`. */
  readonly id: string
  /** Partition key; the engine writes the workflow execution id here. */
  readonly _partitionKey: string
  readonly type: "clock"
  readonly workflowName: string
  /** Name of the deferred that is completed when the clock fires. */
  readonly deferredName: string
  /** ISO timestamp from which the clock is due. */
  readonly fireAt: string
}
|
|
111
|
+
|
|
112
|
+
/** Fixed document id shared by every execution's exec document (one per partition). */
const execId = "exec" as const
|
|
113
|
+
/** Document id under which the result of attempt `attempt` of activity `name` is stored. */
const activityKey = (name: string, attempt: number): string => {
  return `activity::${name}::${attempt}`
}
|
|
114
|
+
/** Document id under which the completion of the deferred called `name` is stored. */
const deferredKey = (name: string): string => {
  return `deferred::${name}`
}
|
|
115
|
+
/** Document id under which the durable clock called `name` is stored. */
const clockKey = (name: string): string => {
  return `clock::${name}`
}
|
|
116
|
+
|
|
117
|
+
/**
 * True for the status codes this module reports as an optimistic-concurrency
 * conflict: 404, 409 and 412.
 */
const isOptimisticStatus = (code: number): boolean => {
  switch (code) {
    case 404:
    case 409:
    case 412:
      return true
    default:
      return false
  }
}
|
|
118
|
+
|
|
119
|
+
// --- Storage codecs ----------------------------------------------------------
// Activity results and deferred exits reach the engine already schema-encoded.
// These codecs therefore only round-trip the surrounding structure and keep
// the inner values opaque (the same approach as the cluster engine's
// `AnyOrVoid` usage).
const AnyOrVoid = S.Union([S.Any, S.Void])
const OpaqueActivityResult = Workflow.Result({ success: AnyOrVoid, error: AnyOrVoid })
const OpaqueDeferredExit = S.Exit(AnyOrVoid, AnyOrVoid, S.Defect)
const ActivityResultCodec = S.fromJsonString(S.toCodecJson(OpaqueActivityResult))
const DeferredExitCodec = S.fromJsonString(S.toCodecJson(OpaqueDeferredExit))
|
|
126
|
+
|
|
127
|
+
/** Encodes an activity `Workflow.Result` into its stored JSON string; codec failures become defects. */
const encodeActivityResult = (r: Workflow.Result<unknown, unknown>) =>
  S.encodeEffect(ActivityResultCodec)(r).pipe(Effect.orDie)

/** Decodes a stored activity-result JSON string; codec failures become defects. */
const decodeActivityResult = (s: string) => S.decodeEffect(ActivityResultCodec)(s).pipe(Effect.orDie)

/** Encodes a deferred `Exit` into its stored JSON string; codec failures become defects. */
const encodeDeferredExit = (e: Exit.Exit<unknown, unknown>) => S.encodeEffect(DeferredExitCodec)(e).pipe(Effect.orDie)

/** Decodes a stored deferred-exit JSON string; codec failures become defects. */
const decodeDeferredExit = (s: string) => S.decodeEffect(DeferredExitCodec)(s).pipe(Effect.orDie)
|
|
132
|
+
|
|
133
|
+
const makeCosmosWorkflowEngine = Effect.fnUntraced(function*(cfg: WorkflowEngineCosmosConfig) {
|
|
134
|
+
// Resolve the database handle and make sure the engine's container exists
// before anything else touches it.
const cosmos = yield* CosmosClient
const db = cosmos.db
const containerId = (cfg.prefix ?? "") + "workflow-engine"
yield* Effect.promise(() =>
  db.containers.createIfNotExists({ id: containerId, partitionKey: { paths: ["/_partitionKey"], version: 2 } })
)
const container = db.container(containerId)
// Scope the engine is built in; background fibers (heartbeats, pollers) are forked into it.
const scope = yield* Effect.scope

// Effective settings: each optional config value falls back to its default.
const workerId = cfg.workerId ?? randomUUID()
const leaseTtl = cfg.leaseTtl ?? Duration.seconds(30)
const heartbeatInterval = cfg.heartbeatInterval ?? Duration.seconds(10)
const recoveryInterval = cfg.recoveryInterval ?? Duration.seconds(15)
const clockPollInterval = cfg.clockPollInterval ?? Duration.seconds(5)
|
|
150
|
+
|
|
151
|
+
// Builds the `annotateDb` wrapper for one container operation. When an
// execution id is given it is attached both as the partition key and as the
// entity id.
const annotate = (operation: string, executionId?: string) => {
  const extra = executionId === undefined
    ? undefined
    : { "azure.cosmosdb.operation.partition_key": executionId, "app.entity.id": executionId }
  return annotateDb({
    operation,
    system: "cosmosdb",
    collection: containerId,
    entity: "workflow",
    extra
  })
}
|
|
161
|
+
|
|
162
|
+
// A workflow registered with this engine, together with its executor and the
// scope its execution fibers are forked into.
interface Registered {
  readonly workflow: Workflow.Any
  readonly execute: (
    payload: object,
    executionId: string
  ) => Effect.Effect<unknown, unknown, WorkflowInstance | WorkflowEngine>
  readonly scope: Scope.Scope
}
// Registered workflows, keyed by workflow name.
const workflows = new Map<string, Registered>()

// In-process bookkeeping for an execution this process has driven.
interface LocalExec {
  instance: WorkflowInstance["Service"]
  fiber: Fiber.Fiber<Workflow.Result<unknown, unknown>> | undefined
  parent: string | undefined
}
// Local executions, keyed by execution id.
const locals = new Map<string, LocalExec>()
// In-process clock timers, keyed by `${executionId}/${clockName}`.
const clocks = yield* FiberMap.make<string>()
|
|
179
|
+
|
|
180
|
+
// Per-workflow codecs for the typed payload and the top-level result. They are
// built from the workflow's own schemas, so typed values (dates, branded ids,
// schema classes) survive the storage round-trip, and are cached by workflow
// name.
const makePayloadCodec = (workflow: Workflow.Any) => S.fromJsonString(S.toCodecJson(workflow.payloadSchema))
const payloadCodecCache = new Map<string, ReturnType<typeof makePayloadCodec>>()
const payloadCodecFor = (workflow: Workflow.Any) => {
  const cached = payloadCodecCache.get(workflow.name)
  if (cached) return cached
  const fresh = makePayloadCodec(workflow)
  payloadCodecCache.set(workflow.name, fresh)
  return fresh
}
|
|
193
|
+
|
|
194
|
+
// Codec for a workflow's top-level `Workflow.Result`, built from its success
// and error schemas and cached by workflow name.
const makeResultCodec = (workflow: Workflow.Any) => {
  const resultSchema = Workflow.Result({ success: workflow.successSchema, error: workflow.errorSchema })
  return S.fromJsonString(S.toCodecJson(resultSchema))
}
const resultCodecCache = new Map<string, ReturnType<typeof makeResultCodec>>()
const resultCodecFor = (workflow: Workflow.Any) => {
  const cached = resultCodecCache.get(workflow.name)
  if (cached) return cached
  const fresh = makeResultCodec(workflow)
  resultCodecCache.set(workflow.name, fresh)
  return fresh
}
|
|
205
|
+
|
|
206
|
+
// Typed payload / result (de)serialisation through the per-workflow codecs.
// A codec failure is turned into a defect in all four helpers.
const encodePayload = (workflow: Workflow.Any, payload: object) =>
  S.encodeEffect(payloadCodecFor(workflow))(payload).pipe(Effect.orDie) as Effect.Effect<string>
const decodePayload = (workflow: Workflow.Any, s: string) =>
  S.decodeEffect(payloadCodecFor(workflow))(s).pipe(Effect.orDie) as Effect.Effect<object>
const encodeResult = (workflow: Workflow.Any, r: Workflow.Result<unknown, unknown>) =>
  S.encodeEffect(resultCodecFor(workflow))(r).pipe(Effect.orDie) as Effect.Effect<string>
const decodeResult = (workflow: Workflow.Any, s: string) =>
  S.decodeEffect(resultCodecFor(workflow))(s).pipe(Effect.orDie) as Effect.Effect<Workflow.Result<unknown, unknown>>
|
|
214
|
+
|
|
215
|
+
// --- Cosmos primitives -------------------------------------------------
|
|
216
|
+
|
|
217
|
+
// Point-reads the exec document of `executionId`. Yields `None` when the
// response carries no resource; otherwise the document with `_etag` taken from
// the response, so it can be used as the `IfMatch` condition of a later replace.
const readExec = (executionId: string) =>
  Effect
    .gen(function*() {
      const response = yield* Effect.promise(() => container.item(execId, executionId).read<ExecDoc>())
      yield* annotateCosmosResponse({ requestCharge: response.requestCharge, statusCode: response.statusCode })
      const found = Option.fromNullishOr(response.resource)
      return Option.map(found, (doc) => ({ ...doc, _etag: response.etag }))
    })
    .pipe(annotate("readExec", executionId))
|
|
227
|
+
|
|
228
|
+
// Replaces the exec document, conditional on `doc._etag` (`IfMatch`). Fails
// with `OptimisticConcurrencyException` when the response status is 404, 409 or
// 412; otherwise yields the document with the ETag of the response.
// NOTE(review): this relies on the SDK resolving (not rejecting) for those
// status codes — a rejection would surface as a defect via `Effect.promise`. Confirm.
const replaceExec = (doc: ExecDoc) => {
  const executionId = doc._partitionKey
  return Effect
    .gen(function*() {
      const response = yield* Effect.promise(() =>
        container
          .item(execId, executionId)
          .replace<ExecDoc>(doc, { accessCondition: { type: "IfMatch", condition: doc._etag ?? "" } })
      )
      yield* annotateCosmosResponse({ requestCharge: response.requestCharge, statusCode: response.statusCode })
      if (!isOptimisticStatus(response.statusCode)) {
        return { ...doc, _etag: response.etag }
      }
      return yield* new OptimisticConcurrencyException({
        type: "workflow.exec",
        id: executionId,
        code: response.statusCode
      })
    })
    .pipe(annotate("replaceExec", executionId))
}
|
|
247
|
+
|
|
248
|
+
// Create-only write issued as a single-operation batch against the document's
// partition. Yields `true` on 201 (created), `false` on 409 (already exists)
// and dies with an error for any other status code.
const createIfMissing = <T extends { readonly id: string; readonly _partitionKey: string }>(
  body: T
): Effect.Effect<boolean> =>
  Effect.gen(function*() {
    const batch = yield* Effect.promise(() =>
      container.items.batch([{ operationType: "Create" as const, resourceBody: body }], body._partitionKey)
    )
    // A missing result entry is treated as status 0, which falls into the failure branch.
    const code = batch.result?.[0]?.statusCode ?? 0
    switch (code) {
      case 201:
        return true
      case 409:
        return false
      default:
        return yield* Effect.die(
          new Error(`workflow-engine cosmos createIfMissing for ${body.id} failed: ${code}`)
        )
    }
  })
|
|
267
|
+
|
|
268
|
+
// Unconditional (last-writer-wins) upsert. Used to overwrite a persisted
// *suspended* activity result once the activity finally completes, which a
// create-only write cannot do.
const upsert = <T extends { readonly id: string; readonly _partitionKey: string }>(
  body: T
): Effect.Effect<void> =>
  Effect.gen(function*() {
    const { requestCharge, statusCode } = yield* Effect.promise(() => container.items.upsert<T>(body))
    yield* annotateCosmosResponse({ requestCharge, statusCode })
  })
|
|
277
|
+
|
|
278
|
+
// Point-reads document `id` from the partition of `executionId`; yields `None`
// when the response carries no resource.
const readPoint = <T extends { id: string }>(id: string, executionId: string) =>
  Effect.map(
    Effect.promise(() => container.item(id, executionId).read<T>()),
    (response) => Option.fromNullishOr(response.resource)
  )
|
|
282
|
+
|
|
283
|
+
// --- Workflow result helpers ------------------------------------------
|
|
284
|
+
|
|
285
|
+
// Decodes the stored top-level result of a completed execution. Yields `None`
// unless the doc is `complete` and carries a non-empty `completedResult`.
const completeResult = (
  workflow: Workflow.Any,
  state: ExecDoc
): Effect.Effect<Option.Option<Workflow.Result<unknown, unknown>>> => {
  if (state.status !== "complete" || !state.completedResult) {
    return Effect.succeedNone
  }
  return Effect.map(decodeResult(workflow, state.completedResult), Option.some)
}
|
|
292
|
+
|
|
293
|
+
// --- Lease / claim ----------------------------------------------------
|
|
294
|
+
|
|
295
|
+
// True when a *different* worker holds a lease on `state` that has not yet
// expired at `now` (epoch milliseconds). A doc without worker or expiry, or one
// leased by this worker, is never considered actively leased.
const leaseActive = (state: ExecDoc, now: number): boolean => {
  if (state.worker === undefined || state.worker === workerId) return false
  if (state.leaseExpiresAt === undefined) return false
  return Date.parse(state.leaseExpiresAt) > now
}
|
|
300
|
+
|
|
301
|
+
/**
 * Attempts to take the lease on `state` for this worker.
 *
 * Yields the replaced document on success. Yields `None` when another worker
 * still holds an unexpired lease, or when the conditional replace fails with an
 * optimistic-concurrency conflict (the caller may re-read and try again).
 */
const tryClaim = (state: ExecDoc): Effect.Effect<Option.Option<ExecDoc>> =>
  Effect.gen(function*() {
    const now = Date.now()
    if (leaseActive(state, now)) return Option.none<ExecDoc>()
    const expiry = new Date(now + Duration.toMillis(leaseTtl)).toISOString()
    const claim = replaceExec({ ...state, worker: workerId, leaseExpiresAt: expiry })
    return yield* claim.pipe(
      Effect.map(Option.some),
      Effect.catchTag("OptimisticConcurrencyException", () => Effect.succeed(Option.none<ExecDoc>()))
    )
  })
|
|
320
|
+
|
|
321
|
+
/**
 * Lease-renewal loop for `executionId`.
 *
 * Every `heartbeatInterval` it pushes `leaseExpiresAt` forward by `leaseTtl`.
 * The loop ends when there is no local fiber (or it has finished), when the
 * exec doc is complete, or when the doc's `worker` is no longer this worker.
 * Best-effort: read and write failures are swallowed and the next tick retries.
 */
const heartbeat = (executionId: string): Effect.Effect<void> =>
  Effect.gen(function*() {
    for (;;) {
      yield* Effect.sleep(heartbeatInterval)

      const fiber = locals.get(executionId)?.fiber
      if (fiber === undefined || fiber.pollUnsafe()) return

      const current = yield* readExec(executionId).pipe(
        Effect.catchCause(() => Effect.succeed(Option.none<ExecDoc>()))
      )
      // Unreadable or missing right now: skip this tick, try again on the next one.
      if (Option.isNone(current)) continue

      const { status, worker } = current.value
      const ownedByUs = worker === workerId
      if (status === "complete" || !ownedByUs) return

      const renewed = {
        ...current.value,
        leaseExpiresAt: new Date(Date.now() + Duration.toMillis(leaseTtl)).toISOString()
      }
      yield* replaceExec(renewed).pipe(
        Effect.catchTag("OptimisticConcurrencyException", () => Effect.void),
        Effect.catchCause(() => Effect.void)
      )
    }
  })
|
|
348
|
+
|
|
349
|
+
// --- Drive logic -------------------------------------------------------
|
|
350
|
+
|
|
351
|
+
/**
 * Starts (or restarts) the local fiber that runs `executionId`.
 *
 * No-op when a local fiber is still running or already finished with a
 * `Complete` result, when the exec doc is missing, or when it is already
 * `complete`. Otherwise it tries to claim the lease, builds a fresh
 * `WorkflowInstance`, forks the workflow into `entry.scope`, and — only if the
 * claim succeeded — forks a heartbeat that keeps the lease alive.
 *
 * @param executionId Execution to drive (also the partition key).
 * @param payload Decoded workflow payload handed to `entry.execute`.
 * @param parent Execution id of the parent workflow, re-driven once this one completes.
 * @param entry Registration of the workflow being executed.
 */
const drive = (
  executionId: string,
  payload: object,
  parent: string | undefined,
  entry: Registered
): Effect.Effect<void> =>
  Effect.gen(function*() {
    let local = locals.get(executionId)
    if (local?.fiber) {
      const polled = local.fiber.pollUnsafe()
      const stillRunning = !polled
      const completedNotResume = polled && polled._tag === "Success" && polled.value._tag === "Complete"
      // Only a fiber that ended without a `Complete` result is started again.
      if (stillRunning || completedNotResume) return
    }

    const stateOpt = yield* readExec(executionId)
    if (Option.isNone(stateOpt) || stateOpt.value.status === "complete") return

    // Best-effort claim: takes lease so recovery poller leaves us alone.
    // Failure is tolerated — local fiber still drives; OCC guards persisted
    // state so split-brain stays correct.
    const claimed = yield* tryClaim(stateOpt.value)
    const state = Option.isSome(claimed) ? claimed.value : stateOpt.value

    const instance = WorkflowInstance.initial(entry.workflow, executionId)
    instance.interrupted = state.interrupted
    if (!local) {
      local = { instance, fiber: undefined, parent }
      locals.set(executionId, local)
    } else {
      local.instance = instance
    }

    // Upper bound on read-modify-write rounds when persisting the outcome.
    const maxPersistAttempts = 5

    // Persists the outcome of a run on the exec doc and, on completion,
    // re-drives the parent. The write is a conditional replace, and the
    // heartbeat renews the lease on the same document, so a conflict here is
    // expected now and then. A conflict is retried against a fresh read instead
    // of being dropped: a dropped completion would leave the doc `running`
    // with no fiber left to finish it.
    const onComplete = Effect.fnUntraced(function*(result: Workflow.Result<unknown, unknown>) {
      const isComplete = result._tag === "Complete"
      let completedResult: string | undefined
      for (let attempt = 0; attempt < maxPersistAttempts; attempt++) {
        const current = yield* readExec(executionId)
        if (Option.isNone(current) || current.value.status === "complete") return
        // Encode once, and only after the doc is known to still need the result.
        if (isComplete && completedResult === undefined) {
          completedResult = yield* encodeResult(entry.workflow, result)
        }
        const persisted = yield* replaceExec({
          ...current.value,
          status: isComplete ? "complete" : current.value.status,
          suspended: result._tag === "Suspended",
          interrupted: instance.interrupted,
          completedResult,
          // Release lease on completion so the doc isn't seen as orphaned.
          worker: isComplete ? undefined : current.value.worker,
          leaseExpiresAt: isComplete ? undefined : current.value.leaseExpiresAt
        })
          .pipe(
            Effect.map(() => true),
            Effect.catchTag("OptimisticConcurrencyException", () => Effect.succeed(false))
          )
        if (persisted) break
      }
      if (parent && isComplete) {
        yield* Effect.forkIn(driveById(parent), scope)
      }
    })

    local.fiber = yield* entry.execute(payload, executionId).pipe(
      Effect.onExit(() => {
        // When the instance was flagged as interrupted, clear `suspended` and
        // interrupt the running fiber itself.
        if (!instance.interrupted) return Effect.void
        instance.suspended = false
        return Effect.withFiber((fiber) => Effect.interruptible(Fiber.interrupt(fiber)))
      }),
      Workflow.intoResult,
      Effect.provideService(WorkflowInstance, instance),
      Effect.provideService(WorkflowEngine, engine),
      Effect.tap(onComplete),
      Effect.forkIn(entry.scope)
    )

    // Only the lease holder renews the lease.
    if (Option.isSome(claimed)) {
      yield* Effect.forkIn(heartbeat(executionId), scope)
    }
  })
|
|
422
|
+
|
|
423
|
+
// Looks up the persisted exec doc, decodes its stored payload with the
// registered workflow's codec and hands over to `drive`. Does nothing when the
// doc is missing or the workflow is not registered in this process.
const driveById = (executionId: string): Effect.Effect<void> =>
  Effect.gen(function*() {
    const found = yield* readExec(executionId)
    if (Option.isNone(found)) return
    const { parent, payload: storedPayload, workflowName } = found.value
    const entry = workflows.get(workflowName)
    if (entry === undefined) return
    const payload = yield* decodePayload(entry.workflow, storedPayload)
    yield* drive(executionId, payload, parent, entry)
  })
|
|
433
|
+
|
|
434
|
+
// --- Clock firing -----------------------------------------------------
|
|
435
|
+
// Fires a clock: stores the deferred's completion (`Exit.void`) with a
// create-only write so the first writer wins, resumes the workflow when this
// call was the one that created it, and finally removes the clock document
// (delete failures are ignored).
const fireClock = (doc: ClockDoc): Effect.Effect<void> =>
  Effect.gen(function*() {
    const executionId = doc._partitionKey
    const exit = yield* encodeDeferredExit(Exit.void)
    const created = yield* createIfMissing<DeferredDoc>({
      id: deferredKey(doc.deferredName),
      _partitionKey: executionId,
      type: "deferred",
      exit
    })
      .pipe(annotate("clockFire", executionId))
    if (created) {
      yield* driveById(executionId)
    }
    const removeClock = Effect.promise(() => container.item(doc.id, executionId).delete())
    yield* removeClock.pipe(Effect.catchCause(() => Effect.void))
  })
|
|
451
|
+
|
|
452
|
+
// --- Encoded engine ----------------------------------------------------
|
|
453
|
+
|
|
454
|
+
// Persists `interrupted: true` on the exec doc of `executionId`.
// Yields `false` when there is no doc or it is already complete (nothing to
// interrupt), `true` otherwise. The write is a conditional replace that can
// lose against a concurrent lease renewal, so a conflict is retried against a
// fresh read a bounded number of times instead of silently dropping the flag.
// Still best-effort: after the last attempt the caller proceeds regardless.
const markInterrupted = (executionId: string): Effect.Effect<boolean> =>
  Effect.gen(function*() {
    const maxAttempts = 5
    for (let attempt = 0; attempt < maxAttempts; attempt++) {
      const current = yield* readExec(executionId)
      if (Option.isNone(current) || current.value.status === "complete") return false
      const written = yield* replaceExec({ ...current.value, interrupted: true }).pipe(
        Effect.map(() => true),
        Effect.catchTag("OptimisticConcurrencyException", () => Effect.succeed(false))
      )
      if (written) return true
    }
    return true
  })

// Storage-facing implementation of the engine; wrapped by `makeUnsafe` below.
const encoded: Encoded = {
  // Records the workflow, its executor and the scope active at registration.
  register: Effect.fnUntraced(function*(workflow, execute) {
    workflows.set(workflow.name, {
      workflow,
      execute,
      scope: yield* Effect.scope
    })
  }),
  // Creates the exec doc if it does not exist yet, drives the execution locally
  // when needed, and (unless `discard` is set) waits for its result.
  execute: Effect.fnUntraced(function*(workflow, options) {
    const entry = workflows.get(workflow.name)
    if (!entry) {
      return yield* Effect.orDie(Effect.fail(`Workflow ${workflow.name} is not registered`))
    }

    const initial: ExecDoc = {
      id: execId,
      _partitionKey: options.executionId,
      type: "exec",
      workflowName: workflow.name,
      payload: yield* encodePayload(workflow, options.payload),
      parent: options.parent?.executionId,
      status: "running",
      suspended: false,
      interrupted: false
    }
    const created = yield* createIfMissing(initial).pipe(annotate("execute.claim", options.executionId))

    if (created || !locals.has(options.executionId)) {
      yield* drive(options.executionId, options.payload, options.parent?.executionId, entry)
    }

    if (options.discard) return undefined as any

    const local = locals.get(options.executionId)
    if (local?.fiber) {
      return (yield* Fiber.join(local.fiber)) as any
    }

    // Foreign-owned execution: poll until exec doc reports complete.
    while (true) {
      const cur = yield* readExec(options.executionId)
      if (Option.isSome(cur)) {
        const c = yield* completeResult(workflow, cur.value)
        if (Option.isSome(c)) return c.value as any
      }
      yield* Effect.sleep(Duration.millis(500))
    }
  }),
  // Non-blocking result lookup: the local fiber's exit when there is one,
  // otherwise the stored result of a completed exec doc.
  poll: (workflow, executionId) =>
    Effect.gen(function*() {
      const local = locals.get(executionId)
      if (local?.fiber) {
        const exit = local.fiber.pollUnsafe()
        if (!exit) return Option.none<Workflow.Result<unknown, unknown>>()
        if (exit._tag !== "Success") return yield* Effect.die(exit.cause)
        return Option.some(exit.value)
      }
      const state = yield* readExec(executionId)
      if (Option.isNone(state)) return Option.none<Workflow.Result<unknown, unknown>>()
      return yield* completeResult(workflow, state.value)
    }),
  // Flags the execution as interrupted (in memory and on the exec doc) and
  // re-drives it. Does nothing further when the doc is missing or complete.
  interrupt: Effect.fnUntraced(function*(_workflow, executionId) {
    const local = locals.get(executionId)
    if (local) local.instance.interrupted = true
    const pending = yield* markInterrupted(executionId)
    if (!pending) return
    yield* driveById(executionId)
  }),
  // Flags the execution as interrupted and interrupts the local fiber directly.
  interruptUnsafe: Effect.fnUntraced(function*(_workflow, executionId) {
    const local = locals.get(executionId)
    if (local) local.instance.interrupted = true
    yield* markInterrupted(executionId)
    if (local?.fiber) yield* Fiber.interrupt(local.fiber)
  }),
  resume: (_workflow, executionId) => driveById(executionId),
  // Runs one activity attempt, replaying a persisted `Complete` result instead
  // of re-running it.
  activityExecute: Effect.fnUntraced(function*(activity, attempt) {
    const instance = yield* WorkflowInstance
    const id = activityKey(activity.name, attempt)
    const existing = yield* readPoint<ActivityDoc>(id, instance.executionId).pipe(
      annotate("activityRead", instance.executionId)
    )
    if (Option.isSome(existing)) {
      const prev = yield* decodeActivityResult(existing.value.result)
      // A completed activity is replayed from its persisted result; a
      // suspended one must re-run (it parked on a clock/deferred).
      if (prev._tag === "Complete") return prev
    }

    const activityInstance = WorkflowInstance.initial(instance.workflow, instance.executionId)
    activityInstance.interrupted = instance.interrupted

    const result = yield* activity.executeEncoded.pipe(
      Workflow.intoResult,
      Effect.provideService(WorkflowInstance, activityInstance)
    )
    const doc: ActivityDoc = {
      id,
      _partitionKey: instance.executionId,
      type: "activity",
      result: yield* encodeActivityResult(result)
    }

    if (Option.isSome(existing)) {
      // Overwrite the previously persisted *suspended* doc with the new result.
      yield* upsert(doc).pipe(annotate("activityPersist", instance.executionId))
      return result
    }

    // First-writer-wins: if persistence loses the race, use the persisted result.
    const persisted = yield* createIfMissing(doc).pipe(annotate("activityPersist", instance.executionId))
    if (persisted) return result
    const winner = yield* readPoint<ActivityDoc>(id, instance.executionId)
    if (Option.isSome(winner)) {
      const w = yield* decodeActivityResult(winner.value.result)
      if (w._tag === "Complete") return w
    }
    return result
  }),
  // Reads the persisted exit of a deferred, if it has been completed.
  deferredResult: Effect.fnUntraced(function*(deferred) {
    const instance = yield* WorkflowInstance
    const got = yield* readPoint<DeferredDoc>(deferredKey(deferred.name), instance.executionId).pipe(
      annotate("deferredRead", instance.executionId)
    )
    if (Option.isNone(got)) return Option.none<Exit.Exit<unknown, unknown>>()
    return Option.some(yield* decodeDeferredExit(got.value.exit))
  }),
  // Persists a deferred's exit (create-only) and resumes the workflow only
  // when this call was the first to complete it.
  deferredDone: Effect.fnUntraced(function*(options) {
    const created = yield* createIfMissing<DeferredDoc>({
      id: deferredKey(options.deferredName),
      _partitionKey: options.executionId,
      type: "deferred",
      exit: yield* encodeDeferredExit(options.exit)
    })
      .pipe(annotate("deferredPersist", options.executionId))
    if (!created) return
    yield* driveById(options.executionId)
  }),
  // Persists a clock doc and arms an in-process timer for it.
  scheduleClock: (workflow, options) => {
    const fireAt = new Date(Date.now() + Duration.toMillis(options.clock.duration)).toISOString()
    const clockDoc: ClockDoc = {
      id: clockKey(options.clock.name),
      _partitionKey: options.executionId,
      type: "clock",
      workflowName: workflow.name,
      deferredName: options.clock.deferred.name,
      fireAt
    }
    return Effect.gen(function*() {
      yield* createIfMissing(clockDoc).pipe(annotate("clockPersist", options.executionId))
      // Fast-path in-process timer. If this process dies, the clock poller
      // picks up the persisted doc and fires the deferred.
      yield* fireClock(clockDoc).pipe(
        Effect.delay(options.clock.duration),
        FiberMap.run(clocks, `${options.executionId}/${options.clock.name}`, { onlyIfMissing: true }),
        Effect.asVoid
      )
    })
  }
}
|
|
621
|
+
|
|
622
|
+
// Engine built from the encoded implementation via `makeUnsafe`. It is provided
// as `WorkflowEngine` to driven workflows and is the value this constructor returns.
const engine = makeUnsafe(encoded)
|
|
623
|
+
|
|
624
|
+
// --- Recovery poller --------------------------------------------------
|
|
625
|
+
// Recovery scan (skipped when `recoveryInterval` is zero): finds `running`
// exec docs whose lease has lapsed or was never set and re-drives them in this
// process. `driveById` then claims the lease and forks the fiber, which
// resumes activities from their persisted results. Scan failures are swallowed
// and the scan repeats every `recoveryInterval`.
if (Duration.toMillis(recoveryInterval) > 0) {
  type StaleRow = { readonly _partitionKey: string; readonly workflowName: string }
  const recoverStep = Effect
    .gen(function*() {
      const nowIso = new Date().toISOString()
      const { resources: staleRows } = yield* Effect.promise(() =>
        container
          .items
          .query<StaleRow>({
            query:
              "SELECT c._partitionKey, c.workflowName FROM c WHERE c.type = 'exec' AND c.status = 'running' AND (NOT IS_DEFINED(c.leaseExpiresAt) OR c.leaseExpiresAt <= @now)",
            parameters: [{ name: "@now", value: nowIso }]
          })
          .fetchAll()
      )
      for (const { _partitionKey: executionId, workflowName } of staleRows) {
        // Workflows not registered in this process cannot be driven here.
        if (!workflows.has(workflowName)) continue
        // A local fiber that is still running does not need recovery.
        const fiber = locals.get(executionId)?.fiber
        if (fiber && !fiber.pollUnsafe()) continue
        yield* Effect.forkIn(driveById(executionId), scope)
      }
    })
    .pipe(annotate("recoveryScan"), Effect.catchCause(() => Effect.void))

  yield* recoverStep.pipe(
    Effect.repeat(Schedule.spaced(recoveryInterval)),
    Effect.forkIn(scope)
  )
}
|
|
657
|
+
|
|
658
|
+
// --- Clock poller -----------------------------------------------------
|
|
659
|
+
// Clock scan (skipped when `clockPollInterval` is zero): queries every clock
// doc whose `fireAt` is due and fires it. Firing goes through
// `createIfMissing`, so several pollers in different processes converge on a
// single completion. This is also how clocks scheduled before a crash get
// fired after a restart. Scan failures are swallowed and the scan repeats
// every `clockPollInterval`.
if (Duration.toMillis(clockPollInterval) > 0) {
  type DueClock = {
    readonly id: string
    readonly _partitionKey: string
    readonly workflowName: string
    readonly deferredName: string
  }
  const clockStep = Effect
    .gen(function*() {
      const nowIso = new Date().toISOString()
      const { resources: dueClocks } = yield* Effect.promise(() =>
        container
          .items
          .query<DueClock>({
            query:
              "SELECT c.id, c._partitionKey, c.workflowName, c.deferredName FROM c WHERE c.type = 'clock' AND c.fireAt <= @now",
            parameters: [{ name: "@now", value: nowIso }]
          })
          .fetchAll()
      )
      for (const row of dueClocks) {
        // The query does not select `fireAt`; the scan time stands in for it.
        const doc: ClockDoc = {
          id: row.id,
          _partitionKey: row._partitionKey,
          type: "clock",
          workflowName: row.workflowName,
          deferredName: row.deferredName,
          fireAt: nowIso
        }
        yield* Effect.forkIn(fireClock(doc), scope)
      }
    })
    .pipe(annotate("clockScan"), Effect.catchCause(() => Effect.void))

  yield* clockStep.pipe(
    Effect.repeat(Schedule.spaced(clockPollInterval)),
    Effect.forkIn(scope)
  )
}
|
|
704
|
+
|
|
705
|
+
return engine
|
|
706
|
+
})
|
|
707
|
+
|
|
708
|
+
/**
 * `WorkflowEngine` layer backed by Cosmos DB.
 *
 * All documents of one execution share a partition key, which makes
 * per-execution writes eligible for TransactionalBatch. Exec-state replaces are
 * guarded by `_etag` / `IfMatch`, and activity results and durable-deferred
 * completions are written create-only, giving first-writer-wins semantics.
 * Every persisted payload, result and exit is round-tripped through a schema
 * codec.
 *
 * @param cfg Connection settings and optional timing overrides.
 * @returns A layer providing `WorkflowEngine`, with the Cosmos client layer already supplied.
 */
export const layerCosmos = (cfg: WorkflowEngineCosmosConfig): Layer.Layer<WorkflowEngine> => {
  const engineLayer = Layer.effect(WorkflowEngine)(makeCosmosWorkflowEngine(cfg))
  const clientLayer = CosmosClientLayer(Redacted.value(cfg.url), cfg.dbName)
  return engineLayer.pipe(Layer.provide(clientLayer))
}
|