@redflow/client 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/worker.ts ADDED
@@ -0,0 +1,1042 @@
1
+ import type { RedisClient } from "bun";
2
+ import { RedisClient as BunRedisClient, redis as defaultRedis } from "bun";
3
+ import { Cron } from "croner";
4
+ import {
5
+ computeRetryDelayMs,
6
+ createClient,
7
+ defaultPrefix,
8
+ isRetryableError,
9
+ makeErrorJson,
10
+ validateInputWithSchema,
11
+ type RedflowClient,
12
+ } from "./client";
13
+ import { CanceledError, OutputSerializationError, TimeoutError, UnknownWorkflowError } from "./internal/errors";
14
+ import { keys } from "./internal/keys";
15
+ import { safeJsonParse, safeJsonStringify, safeJsonTryParse } from "./internal/json";
16
+ import { sleep } from "./internal/sleep";
17
+ import { nowMs } from "./internal/time";
18
+ import { getDefaultRegistry, type WorkflowRegistry } from "./registry";
19
+ import type { OnFailureContext, RunStatus, StepApi, StepStatus } from "./types";
20
+
21
/** Options for {@link startWorker}. Every field is optional. */
export type StartWorkerOptions = {
  /** Existing Bun Redis connection used as the base client; it is duplicated for each internal loop. */
  redis?: RedisClient;
  /** Redis URL; when given (and `redis` is not), a dedicated client is created and closed on stop. */
  url?: string;
  /** Key prefix for all Redis keys; defaults to the library's default prefix. */
  prefix?: string;
  /** Workflow registry to serve; defaults to the process-wide default registry. */
  registry?: WorkflowRegistry;

  /** If omitted, derived from workflows' `queue` options. */
  queues?: string[];
  /** Number of concurrent worker loops (clamped to a minimum of 1). Default: 1 */
  concurrency?: number;

  /** Runtime tuning. Most users don't need this. */
  runtime?: {
    /** Lease TTL in ms for crash recovery (clamped to a minimum of 100). Default: 5000 */
    leaseMs?: number;
    /** BLMOVE block timeout in seconds. Default: 1 */
    blmoveTimeoutSec?: number;
    /** How often reaper scans processing lists. Default: 500 */
    reaperIntervalMs?: number;
  };
};
41
+
42
/** Handle returned by {@link startWorker}. */
export type WorkerHandle = {
  /** Aborts all worker/background loops, awaits them, and closes every Redis connection the worker created. */
  stop(): Promise<void>;
};
45
+
46
// Compare-and-delete: removes KEYS[1] only while its value still equals our
// token (ARGV[1]), so we never delete a lease another worker has re-acquired.
const UNLOCK_LUA = `
if redis.call("get", KEYS[1]) == ARGV[1] then
return redis.call("del", KEYS[1])
else
return 0
end
`;
53
+
54
// Compare-and-renew: refreshes the PX expiry (ARGV[2]) of KEYS[1] only while
// its value still equals our token (ARGV[1]); returns nil when ownership is lost.
const RENEW_LEASE_LUA = `
if redis.call("get", KEYS[1]) == ARGV[1] then
return redis.call("set", KEYS[1], ARGV[1], "PX", ARGV[2])
else
return nil
end
`;
61
+
62
// Atomically claims a still-"queued" run for inline execution:
//   KEYS[1] = ready list, KEYS[2] = processing list, KEYS[3] = run hash,
//   ARGV[1] = run id.
// Verifies the run hash status, removes the id from the ready list, and pushes
// it onto the processing list. Returns 1 on success, 0 otherwise.
const CLAIM_QUEUED_RUN_FOR_INLINE_LUA = `
if redis.call("hget", KEYS[3], "status") ~= "queued" then
return 0
end

if redis.call("lrem", KEYS[1], 1, ARGV[1]) <= 0 then
return 0
end

redis.call("lpush", KEYS[2], ARGV[1])
return 1
`;
74
+
75
/**
 * Starts a workflow worker: `concurrency` blocking dequeue loops plus three
 * background loops (scheduled-run promoter, crashed-run reaper, cron
 * scheduler), each on its own duplicated Redis connection.
 *
 * Returns a handle whose `stop()` aborts every loop, awaits their completion,
 * and closes all connections this function created. If startup fails partway,
 * everything created so far is torn down before the error is rethrown.
 */
export async function startWorker(options?: StartWorkerOptions): Promise<WorkerHandle> {
  const registry = options?.registry ?? getDefaultRegistry();
  const prefix = options?.prefix ?? defaultPrefix();
  // We only close the base connection on stop when we created it from `url`;
  // a caller-supplied `redis` (or the module-level default) stays open.
  const ownsBaseRedis = !options?.redis && !!options?.url;
  const baseRedis = options?.redis ?? (options?.url ? new BunRedisClient(options.url) : defaultRedis);
  const syncClient = createClient({ redis: baseRedis, prefix });

  const queues = options?.queues ?? deriveQueuesFromRegistry(registry);
  const concurrency = Math.max(1, options?.concurrency ?? 1);
  const leaseMs = Math.max(100, options?.runtime?.leaseMs ?? 5000);
  const blmoveTimeoutSec = options?.runtime?.blmoveTimeoutSec ?? 1;
  const reaperIntervalMs = options?.runtime?.reaperIntervalMs ?? 500;

  const abort = new AbortController();
  const tasks: Promise<void>[] = [];
  const ownedRedis: RedisClient[] = [];

  // Shared teardown used both by stop() and by the startup failure path.
  const closeOwnedResources = async (): Promise<void> => {
    abort.abort();
    await Promise.allSettled(tasks);
    for (const r of ownedRedis) {
      try {
        r.close();
      } catch {
        // Ignore close errors during shutdown.
      }
    }
    if (ownsBaseRedis) {
      try {
        baseRedis.close();
      } catch {
        // Ignore close errors during shutdown.
      }
    }
  };

  try {
    await syncClient.syncRegistry(registry);

    // Worker loops (blocking BLMOVE). Use dedicated connections per slot.
    for (let i = 0; i < concurrency; i++) {
      const r = await baseRedis.duplicate();
      const client = createClient({ redis: r, prefix });
      ownedRedis.push(r);
      tasks.push(workerLoop({
        redis: r,
        client,
        registry,
        prefix,
        queues,
        blmoveTimeoutSec,
        leaseMs,
        signal: abort.signal,
      }));
    }

    // Background loops use dedicated Redis connections to avoid response interleaving.
    const promoterRedis = await baseRedis.duplicate();
    const promoterClient = createClient({ redis: promoterRedis, prefix });
    ownedRedis.push(promoterRedis);
    tasks.push(scheduledPromoterLoop({ redis: promoterRedis, client: promoterClient, prefix, queues, signal: abort.signal }));

    const reaperRedis = await baseRedis.duplicate();
    ownedRedis.push(reaperRedis);
    tasks.push(
      reaperLoop({
        redis: reaperRedis,
        prefix,
        queues,
        intervalMs: reaperIntervalMs,
        signal: abort.signal,
      }),
    );

    const cronRedis = await baseRedis.duplicate();
    const cronClient = createClient({ redis: cronRedis, prefix });
    ownedRedis.push(cronRedis);
    tasks.push(cronSchedulerLoop({ redis: cronRedis, client: cronClient, prefix, signal: abort.signal }));

    return {
      async stop() {
        await closeOwnedResources();
      },
    };
  } catch (err) {
    await closeOwnedResources();
    throw err;
  }
}
164
+
165
+ function deriveQueuesFromRegistry(registry: WorkflowRegistry): string[] {
166
+ const set = new Set<string>();
167
+ for (const def of registry.list()) set.add(def.options.queue ?? "default");
168
+ return [...set].sort();
169
+ }
170
+
171
+ function parseRedisScore(value: unknown): number | null {
172
+ if (typeof value === "number" && Number.isFinite(value)) return value;
173
+ if (typeof value === "string" && value.trim() !== "") {
174
+ const parsed = Number(value);
175
+ if (Number.isFinite(parsed)) return parsed;
176
+ }
177
+ return null;
178
+ }
179
+
180
+ function parseZPopMinEntry(value: unknown): { member: string; score: number } | null {
181
+ if (!Array.isArray(value) || value.length === 0) return null;
182
+
183
+ if (value.length === 1 && Array.isArray(value[0])) {
184
+ return parseZPopMinEntry(value[0]);
185
+ }
186
+
187
+ const member = value[0];
188
+ const score = parseRedisScore(value[1]);
189
+ if (typeof member !== "string" || score == null) return null;
190
+ return { member, score };
191
+ }
192
+
193
+ /** Parse ZPOPMIN response with count > 1.
194
+ * Bun returns [[member, score], [member, score], ...] for batch zpopmin. */
195
+ function parseZPopMinBatch(value: unknown): Array<{ member: string; score: number }> {
196
+ if (!Array.isArray(value) || value.length === 0) return [];
197
+
198
+ const results: Array<{ member: string; score: number }> = [];
199
+
200
+ for (const item of value) {
201
+ // Each item is [member, score]
202
+ const entry = parseZPopMinEntry(item);
203
+ if (entry) {
204
+ results.push(entry);
205
+ continue;
206
+ }
207
+
208
+ // Fallback: flat format [member1, score1, member2, score2, ...]
209
+ // Try parsing value itself as a single entry
210
+ if (typeof item === "string" && results.length === 0) {
211
+ const flat = parseZPopMinEntry(value);
212
+ if (flat) return [flat];
213
+ break;
214
+ }
215
+ }
216
+ return results;
217
+ }
218
+
219
+ function encodeIdempotencyPart(value: string): string {
220
+ return `${value.length}:${value}`;
221
+ }
222
+
223
+ function defaultStepWorkflowIdempotencyKey(parentRunId: string, stepName: string, childWorkflowName: string): string {
224
+ return `stepwf:${encodeIdempotencyPart(parentRunId)}:${encodeIdempotencyPart(stepName)}:${encodeIdempotencyPart(childWorkflowName)}`;
225
+ }
226
+
227
+ function defaultStepEventIdempotencyKey(parentRunId: string, stepName: string, eventName: string): string {
228
+ return `stepev:${encodeIdempotencyPart(parentRunId)}:${encodeIdempotencyPart(stepName)}:${encodeIdempotencyPart(eventName)}`;
229
+ }
230
+
231
+ async function claimQueuedRunForInlineExecution(args: {
232
+ redis: RedisClient;
233
+ prefix: string;
234
+ queue: string;
235
+ runId: string;
236
+ }): Promise<boolean> {
237
+ const { redis, prefix, queue, runId } = args;
238
+ const result = await redis.send("EVAL", [
239
+ CLAIM_QUEUED_RUN_FOR_INLINE_LUA,
240
+ "3",
241
+ keys.queueReady(prefix, queue),
242
+ keys.queueProcessing(prefix, queue),
243
+ keys.run(prefix, runId),
244
+ runId,
245
+ ]);
246
+
247
+ return result === 1 || result === "1";
248
+ }
249
+
250
/**
 * One worker slot: repeatedly claims a run id from the ready lists and
 * executes it via `processRun`.
 *
 * Claim strategy: first a non-blocking LMOVE probe across all queues
 * (round-robin from `queueCursor`), then — when nothing was ready — a single
 * BLMOVE on one queue, rotating which queue gets the blocking slot each pass
 * so no queue is permanently starved of it. Errors never escape the loop; it
 * only exits when `signal` is aborted.
 */
async function workerLoop(args: {
  redis: RedisClient;
  client: RedflowClient;
  registry: WorkflowRegistry;
  prefix: string;
  queues: string[];
  blmoveTimeoutSec: number;
  leaseMs: number;
  signal: AbortSignal;
}): Promise<void> {
  const { redis, queues, blmoveTimeoutSec, signal } = args;
  let queueCursor = 0;

  while (!signal.aborted) {
    try {
      if (queues.length === 0) {
        await sleep(25);
        continue;
      }

      let claim: { queue: string; processingKey: string; runId: string } | null = null;

      // First probe all queues without blocking so busy queues don't wait behind empty ones.
      for (let i = 0; i < queues.length; i++) {
        if (signal.aborted) break;
        const idx = (queueCursor + i) % queues.length;
        const queue = queues[idx]!;
        const readyKey = keys.queueReady(args.prefix, queue);
        const processingKey = keys.queueProcessing(args.prefix, queue);
        const runId = await redis.lmove(readyKey, processingKey, "RIGHT", "LEFT");
        if (!runId) continue;

        claim = { queue, processingKey, runId };
        // Resume the round-robin after the queue we just claimed from.
        queueCursor = (idx + 1) % queues.length;
        break;
      }

      // Nothing ready right now - block on one queue, rotating the queue every time.
      if (!claim && !signal.aborted) {
        const idx = queueCursor % queues.length;
        const queue = queues[idx]!;
        queueCursor = (idx + 1) % queues.length;

        const readyKey = keys.queueReady(args.prefix, queue);
        const processingKey = keys.queueProcessing(args.prefix, queue);
        const runId = await redis.blmove(readyKey, processingKey, "RIGHT", "LEFT", blmoveTimeoutSec);
        if (runId) {
          claim = { queue, processingKey, runId };
        }
      }

      if (!claim) {
        await sleep(10);
        continue;
      }

      try {
        await processRun({ ...args, queue: claim.queue, runId: claim.runId, processingKey: claim.processingKey });
      } catch {
        // Defensive: never let a single job crash the loop.
      }
    } catch {
      if (signal.aborted) break;
      await sleep(50);
    }
  }
}
317
+
318
/**
 * Executes a single claimed run end to end:
 *  1. acquires an exclusive, expiring lease (NX + PX) and renews it at half
 *     the TTL so a crashed worker's run can be reclaimed by the reaper;
 *  2. validates run status and transitions "queued" → "running";
 *  3. builds the step API (memoized `step.run`, child-workflow execution with
 *     inline assist, event emit/schedule) and invokes the workflow handler;
 *  4. finalizes the run as succeeded / failed / canceled, or schedules a
 *     retry for retryable errors with attempts remaining.
 * Always releases the lease and removes its processing-list entry (except
 * when lease ownership was lost, in which case recovery is left to others).
 */
async function processRun(args: {
  redis: RedisClient;
  client: RedflowClient;
  registry: WorkflowRegistry;
  prefix: string;
  queues: string[];
  queue: string;
  runId: string;
  processingKey: string;
  leaseMs: number;
  signal: AbortSignal;
}): Promise<void> {
  const { redis, client, registry, prefix, queues, queue, runId, processingKey, leaseMs, signal } = args;
  const runKey = keys.run(prefix, runId);
  const stepsKey = keys.runSteps(prefix, runId);
  const leaseKey = keys.runLease(prefix, runId);
  const leaseToken = `lease_${crypto.randomUUID()}`;

  // Take the exclusive lease; NX fails when another worker already owns the run.
  const leaseOk = await redis.set(leaseKey, leaseToken, "NX", "PX", String(leaseMs));
  if (leaseOk === null) {
    // Someone else owns this run. Remove the duplicate processing entry.
    await redis.lrem(processingKey, 1, runId);
    return;
  }

  const runAbort = new AbortController();
  let leaseLost = false;
  const markLeaseLost = () => {
    if (leaseLost) return;
    leaseLost = true;
    runAbort.abort();
  };

  let leaseInterval: Timer | null = null;
  let renewFailures = 0;
  // Renew at half the TTL. A nil reply means our token is gone (ownership
  // lost); two consecutive transport failures are treated the same way.
  leaseInterval = setInterval(() => {
    // Best-effort renewal; if we lose ownership, abort this attempt so another
    // worker can safely recover the run.
    redis
      .send("EVAL", [RENEW_LEASE_LUA, "1", leaseKey, leaseToken, String(leaseMs)])
      .then((renewed) => {
        if (renewed == null) {
          markLeaseLost();
          return;
        }

        renewFailures = 0;
      })
      .catch(() => {
        renewFailures += 1;
        if (renewFailures >= 2) {
          markLeaseLost();
        }
      });
  }, Math.floor(leaseMs / 2));

  try {
    const run = await redis.hgetall(runKey);
    if (!run || Object.keys(run).length === 0) {
      // Run hash vanished (e.g. deleted); nothing to execute.
      await redis.lrem(processingKey, 0, runId);
      return;
    }

    const currentStatus = (run.status as RunStatus) ?? "queued";
    if (currentStatus === "succeeded" || currentStatus === "failed" || currentStatus === "canceled") {
      // Defensive: if a terminal run is accidentally re-queued, do not re-execute.
      await redis.lrem(processingKey, 0, runId);
      return;
    }

    if (currentStatus === "scheduled") {
      // Defensive: scheduled runs are promoted by the promoter loop only.
      await redis.lrem(processingKey, 0, runId);
      return;
    }

    if (currentStatus !== "queued" && currentStatus !== "running") {
      // Unknown status, do not execute the run.
      await redis.lrem(processingKey, 0, runId);
      return;
    }

    const workflowName = run.workflow ?? "";
    const maxAttempts = Number(run.maxAttempts ?? "1");
    const cancelRequestedAt = run.cancelRequestedAt ? Number(run.cancelRequestedAt) : 0;
    if (cancelRequestedAt > 0) {
      // Cancellation was requested before we started: finalize without running.
      await client.finalizeRun(runId, { status: "canceled", finishedAt: nowMs() });
      await redis.lrem(processingKey, 0, runId);
      return;
    }

    const startedAt = run.startedAt && run.startedAt !== "" ? Number(run.startedAt) : nowMs();

    if (currentStatus === "queued") {
      const movedToRunning = await client.transitionRunStatusIfCurrent(runId, "queued", "running", startedAt);
      if (!movedToRunning) {
        // Most likely canceled between dequeue and start transition.
        await redis.lrem(processingKey, 0, runId);
        return;
      }
    }

    if (!run.startedAt || run.startedAt === "") {
      await redis.hset(runKey, { startedAt: String(startedAt) });
    }

    // Only increment attempt on fresh pick-up (queued → running).
    // On crash recovery the status is already "running" and attempt was
    // already incremented before the previous worker crashed.
    const attempt = currentStatus === "queued"
      ? await redis.hincrby(runKey, "attempt", 1)
      : Math.max(1, Number(run.attempt ?? "0"));

    if (!workflowName) {
      const errorJson = makeErrorJson(new UnknownWorkflowError(""));
      await client.finalizeRun(runId, { status: "failed", errorJson, finishedAt: nowMs() });
      await redis.lrem(processingKey, 0, runId);
      return;
    }

    const def = registry.get(workflowName);
    if (!def) {
      const errorJson = makeErrorJson(new UnknownWorkflowError(workflowName));
      await client.finalizeRun(runId, { status: "failed", errorJson, finishedAt: nowMs() });
      await redis.lrem(processingKey, 0, runId);
      return;
    }

    let input: unknown;
    try {
      const inputRaw = safeJsonParse(run.inputJson ?? "null");
      input = def.options.schema ? validateInputWithSchema(def.options.schema, inputRaw) : inputRaw;
    } catch (err) {
      // Input is not valid for this workflow. This is deterministic, so don't retry.
      await invokeOnFailure(def.options.onFailure, {
        error: err, input: undefined, run: { id: runId, workflow: workflowName, queue, attempt, maxAttempts },
      });
      await client.finalizeRun(runId, { status: "failed", errorJson: makeErrorJson(err), finishedAt: nowMs() });
      await redis.lrem(processingKey, 0, runId);
      return;
    }

    const onWorkerAbort = () => {
      runAbort.abort();
    };
    // Use { once: true } so the listener auto-removes, and add it
    // unconditionally — if the signal is already aborted, manually abort.
    signal.addEventListener("abort", onWorkerAbort, { once: true });
    if (signal.aborted) {
      runAbort.abort();
    }

    const cancelPoll = setInterval(() => {
      // Best-effort: if cancellation is requested mid-step, abort the run.
      redis
        .hget(runKey, "cancelRequestedAt")
        .then((cancel) => {
          if (cancel && cancel !== "") runAbort.abort();
        })
        .catch(() => {});
    }, 100);

    try {
      // Throws CanceledError if the run was aborted locally or a cancel
      // request is visible in the run hash.
      const ensureNotCanceled = async () => {
        if (runAbort.signal.aborted) throw new CanceledError();
        const cancel = await redis.hget(runKey, "cancelRequestedAt");
        if (cancel && cancel !== "") {
          runAbort.abort();
          throw new CanceledError();
        }
      };

      // Seed the step sequence counter from previously persisted steps so
      // new steps on a retry continue the numbering.
      const existingStepsRaw = await redis.hgetall(stepsKey);
      let stepSeq = 0;
      for (const raw of Object.values(existingStepsRaw)) {
        try {
          const parsed = safeJsonParse<any>(raw);
          const seq = parsed?.seq;
          if (typeof seq === "number" && Number.isFinite(seq) && seq > stepSeq) {
            stepSeq = seq;
          }
        } catch {
          // Ignore malformed cached step data.
        }
      }

      // Track step names within a single execution to catch accidental
      // duplicates. On retries cached steps are already in Redis, so we
      // only guard against re-use within the *current* handler invocation.
      const usedStepNames = new Set<string>();

      // step.run(): memoized step execution. A previously succeeded step
      // returns its cached output; otherwise the step function is raced
      // against run cancellation and an optional per-step timeout, and its
      // outcome is persisted to the steps hash.
      const runStep: StepApi["run"] = async <T>(
        options: { name: string; timeoutMs?: number },
        fn: (ctx: { signal: AbortSignal }) => Promise<T>,
      ): Promise<T> => {
        await ensureNotCanceled();

        if (usedStepNames.has(options.name)) {
          throw new Error(`Duplicate step name '${options.name}' in workflow '${workflowName}'. Step names must be unique within a run.`);
        }
        usedStepNames.add(options.name);

        const existingRaw = await redis.hget(stepsKey, options.name);
        let seq = 0;
        if (existingRaw) {
          const existing = safeJsonParse<any>(existingRaw);
          const existingSeq = typeof existing.seq === "number" && Number.isFinite(existing.seq) ? existing.seq : undefined;
          if (existingSeq != null && existingSeq > stepSeq) {
            stepSeq = existingSeq;
          }

          if (existing.status === "succeeded") {
            return safeJsonTryParse(existing.outputJson ?? null) as T;
          }

          seq = existingSeq ?? (stepSeq += 1);
        } else {
          seq = (stepSeq += 1);
        }

        const stepStartedAt = nowMs();
        await redis.hset(stepsKey, {
          [options.name]: safeJsonStringify({
            status: "running" satisfies StepStatus,
            seq,
            startedAt: stepStartedAt,
          }),
        });

        const stepAbort = new AbortController();
        let timeout: Timer | null = null;
        let runAbortListener: (() => void) | null = null;

        try {
          const fnPromise = fn({ signal: stepAbort.signal });

          const cancelPromise = new Promise<never>((_, reject) => {
            const onAbort = () => {
              stepAbort.abort();
              reject(new CanceledError());
            };

            if (runAbort.signal.aborted) return onAbort();
            runAbortListener = onAbort;
            runAbort.signal.addEventListener("abort", onAbort);
          });

          const timeoutPromise =
            options.timeoutMs && options.timeoutMs > 0
              ? new Promise<never>((_, reject) => {
                  timeout = setTimeout(() => {
                    stepAbort.abort();
                    reject(new TimeoutError(`Step timed out: ${options.name}`));
                  }, options.timeoutMs);
                })
              : null;

          const value = (await Promise.race([fnPromise, cancelPromise, timeoutPromise].filter(Boolean))) as T;
          const stepFinishedAt = nowMs();
          let outputJson: string;
          try {
            outputJson = safeJsonStringify(value);
          } catch (err) {
            throw new OutputSerializationError(`Step output is not JSON-serializable: ${options.name}`, err);
          }

          await redis.hset(stepsKey, {
            [options.name]: safeJsonStringify({
              status: "succeeded" satisfies StepStatus,
              seq,
              startedAt: stepStartedAt,
              finishedAt: stepFinishedAt,
              outputJson,
            }),
          });
          return value;
        } catch (err) {
          const stepFinishedAt = nowMs();
          const error = err;
          await redis.hset(stepsKey, {
            [options.name]: safeJsonStringify({
              status: "failed" satisfies StepStatus,
              seq,
              startedAt: stepStartedAt,
              finishedAt: stepFinishedAt,
              errorJson: makeErrorJson(error),
            }),
          });
          throw error;
        } finally {
          if (runAbortListener) runAbort.signal.removeEventListener("abort", runAbortListener);
          if (timeout) clearTimeout(timeout);
        }
      };

      // step.runWorkflow(): starts a child workflow idempotently and waits for
      // its result. While waiting, a child run that is still "queued" on one of
      // this worker's queues may be claimed and executed inline (recursively)
      // to avoid deadlocking when all worker slots are busy waiting.
      const runWorkflowStep: StepApi["runWorkflow"] = async (options, workflow, workflowInput) => {
        const idempotencyKey =
          options.idempotencyKey ?? defaultStepWorkflowIdempotencyKey(runId, options.name, workflow.name);

        const waitForRunResultWithInlineAssist = async <TOutput>(
          childRunId: string,
          resultOptions: { timeoutMs?: number; pollMs?: number } | undefined,
          stepSignal: AbortSignal,
        ): Promise<TOutput> => {
          const timeoutMs = resultOptions?.timeoutMs ?? 30_000;
          const pollMs = resultOptions?.pollMs ?? 250;
          const deadline = nowMs() + timeoutMs;
          // Tolerate briefly-missing run state before declaring it gone.
          const missingGraceMs = Math.max(250, Math.min(2000, pollMs * 4));

          let missingSince: number | null = null;
          let seenState = false;

          while (true) {
            if (stepSignal.aborted) throw new CanceledError();

            const state = await client.getRun(childRunId);
            if (!state) {
              const t = nowMs();
              if (missingSince == null) missingSince = t;

              if (t - missingSince >= missingGraceMs) {
                if (seenState) {
                  throw new Error(`Run state unavailable: ${childRunId}`);
                }
                throw new Error(`Run not found: ${childRunId}`);
              }

              if (t > deadline) throw new TimeoutError(`Timed out waiting for run result (${childRunId})`);
              await sleep(pollMs);
              continue;
            }

            seenState = true;
            missingSince = null;

            if (state.status === "succeeded") return state.output as TOutput;
            if (state.status === "failed") throw new Error(`Run failed: ${safeJsonStringify(state.error)}`);
            if (state.status === "canceled") {
              throw new CanceledError(state.cancelReason ? `Run canceled: ${state.cancelReason}` : "Run canceled");
            }

            if (state.status === "queued" && queues.includes(state.queue)) {
              const claimed = await claimQueuedRunForInlineExecution({
                redis,
                prefix,
                queue: state.queue,
                runId: childRunId,
              });

              if (claimed) {
                await processRun({
                  redis,
                  client,
                  registry,
                  prefix,
                  queues,
                  queue: state.queue,
                  runId: childRunId,
                  processingKey: keys.queueProcessing(prefix, state.queue),
                  leaseMs,
                  signal,
                });
                continue;
              }
            }

            if (nowMs() > deadline) throw new TimeoutError(`Timed out waiting for run result (${childRunId})`);
            await sleep(pollMs);
          }
        };

        return await runStep(
          { name: options.name, timeoutMs: options.timeoutMs },
          async ({ signal: stepSignal }) => {
            const handle = await workflow.run(workflowInput, {
              ...(options.run ?? {}),
              idempotencyKey,
            });

            return await waitForRunResultWithInlineAssist(handle.id, options.result, stepSignal);
          },
        );
      };

      // step.emitEvent(): event emission wrapped in a memoized step with a
      // deterministic idempotency key.
      const emitEventStep: StepApi["emitEvent"] = async (options, payload) => {
        const idempotencyKey =
          options.idempotencyKey ?? defaultStepEventIdempotencyKey(runId, options.name, options.event);

        return await runStep(
          { name: options.name, timeoutMs: options.timeoutMs },
          async () => {
            return await client.emitEvent(options.event, payload, {
              ...(options.emit ?? {}),
              idempotencyKey,
            });
          },
        );
      };

      // step.scheduleEvent(): same as emitEvent, but for delayed delivery.
      const scheduleEventStep: StepApi["scheduleEvent"] = async (options, payload) => {
        const idempotencyKey =
          options.idempotencyKey ?? defaultStepEventIdempotencyKey(runId, options.name, options.event);

        return await runStep(
          { name: options.name, timeoutMs: options.timeoutMs },
          async () => {
            return await client.scheduleEvent(options.event, payload, {
              ...(options.schedule ?? {}),
              idempotencyKey,
            });
          },
        );
      };

      const step: StepApi = {
        run: runStep,
        runWorkflow: runWorkflowStep,
        emitEvent: emitEventStep,
        scheduleEvent: scheduleEventStep,
      };

      let outputJson: string;
      try {
        await ensureNotCanceled();
        let runAbortListener: (() => void) | null = null;
        const cancelPromise = new Promise<never>((_, reject) => {
          const onAbort = () => reject(new CanceledError());
          if (runAbort.signal.aborted) return onAbort();
          runAbortListener = onAbort;
          runAbort.signal.addEventListener("abort", onAbort);
        });

        let output: unknown;
        try {
          // Race the workflow handler against run cancellation.
          output = await Promise.race([
            def.handler({
              input,
              run: {
                id: runId,
                workflow: workflowName,
                queue,
                attempt,
                maxAttempts,
              },
              step,
              signal: runAbort.signal,
            }),
            cancelPromise,
          ]);
        } finally {
          if (runAbortListener) {
            runAbort.signal.removeEventListener("abort", runAbortListener);
          }
        }

        await ensureNotCanceled();

        try {
          outputJson = safeJsonStringify(output);
        } catch (err) {
          throw new OutputSerializationError("Workflow output is not JSON-serializable", err);
        }
      } catch (err) {
        if (err instanceof CanceledError) {
          if (leaseLost) {
            // Lease ownership moved away from this worker. Do not finalize or
            // mutate queue state here - recovery worker/reaper will continue.
            return;
          }
          await client.finalizeRun(runId, { status: "canceled", finishedAt: nowMs() });
          await redis.lrem(processingKey, 0, runId);
          return;
        }

        const retryable = isRetryableError(err);
        const hasAttemptsLeft = attempt < maxAttempts;
        const finishedAt = nowMs();

        if (retryable && hasAttemptsLeft) {
          // Schedule a delayed retry instead of failing the run.
          const delay = computeRetryDelayMs(attempt);
          const nextAt = finishedAt + delay;
          await client.scheduleRetry(runId, {
            queue,
            nextAt,
            errorJson: makeErrorJson(err),
            updatedAt: finishedAt,
          });
          await redis.lrem(processingKey, 0, runId);
          return;
        }

        await invokeOnFailure(def.options.onFailure, {
          error: err, input, run: { id: runId, workflow: workflowName, queue, attempt, maxAttempts },
        });
        await client.finalizeRun(runId, { status: "failed", errorJson: makeErrorJson(err), finishedAt });
        await redis.lrem(processingKey, 0, runId);
        return;
      }

      await client.finalizeRun(runId, { status: "succeeded", outputJson, finishedAt: nowMs() });
      await redis.lrem(processingKey, 0, runId);
    } finally {
      clearInterval(cancelPoll);
      signal.removeEventListener("abort", onWorkerAbort);
    }
  } finally {
    // Stop renewing and release the lease (compare-and-delete, best-effort).
    if (leaseInterval) clearInterval(leaseInterval);
    await redis.send("EVAL", [UNLOCK_LUA, "1", leaseKey, leaseToken]).catch(() => {});
  }
}
828
+
829
+ async function scheduledPromoterLoop(args: {
830
+ redis: RedisClient;
831
+ client: RedflowClient;
832
+ prefix: string;
833
+ queues: string[];
834
+ signal: AbortSignal;
835
+ }): Promise<void> {
836
+ const { redis, prefix, queues, signal } = args;
837
+
838
+ while (!signal.aborted) {
839
+ try {
840
+ const now = nowMs();
841
+ let nextWake: number | null = null;
842
+
843
+ for (const queue of queues) {
844
+ if (signal.aborted) break;
845
+ const schedKey = keys.queueScheduled(prefix, queue);
846
+ const readyKey = keys.queueReady(prefix, queue);
847
+
848
+ // Pop up to 100 entries at a time to avoid starvation under burst load.
849
+ const batch = parseZPopMinBatch(await redis.send("ZPOPMIN", [schedKey, "100"]));
850
+ if (batch.length === 0) continue;
851
+
852
+ for (const { member: runId, score } of batch) {
853
+ if (signal.aborted) break;
854
+
855
+ if (score <= now) {
856
+ const queued = await args.client.transitionRunStatusIfCurrent(runId, "scheduled", "queued", now);
857
+ if (queued) {
858
+ await redis.lpush(readyKey, runId);
859
+ }
860
+ } else {
861
+ // Not yet due — put it back and compute next wake time.
862
+ await redis.zadd(schedKey, score, runId);
863
+ nextWake = nextWake == null ? score : Math.min(nextWake, score);
864
+ }
865
+ }
866
+ }
867
+
868
+ if (nextWake != null) {
869
+ await sleep(Math.min(1000, Math.max(10, nextWake - nowMs())));
870
+ } else {
871
+ await sleep(50);
872
+ }
873
+ } catch {
874
+ if (signal.aborted) break;
875
+ await sleep(100);
876
+ }
877
+ }
878
+ }
879
+
880
/**
 * Crash recovery: periodically scans every queue's processing list and moves
 * any run id that has no live lease key back onto the ready list.
 *
 * NOTE(review): between a worker's LMOVE claim and its NX lease SET there is a
 * brief window where a run sits in processing without a lease. If requeued
 * here during that window, the duplicate claim fails the NX lease acquisition
 * in processRun and the extra processing entry is removed, so this appears
 * self-healing — confirm under load.
 */
async function reaperLoop(args: {
  redis: RedisClient;
  prefix: string;
  queues: string[];
  intervalMs: number;
  signal: AbortSignal;
}): Promise<void> {
  const { redis, prefix, queues, intervalMs, signal } = args;

  while (!signal.aborted) {
    try {
      for (const queue of queues) {
        if (signal.aborted) break;
        const processingKey = keys.queueProcessing(prefix, queue);
        const readyKey = keys.queueReady(prefix, queue);
        const runIds = await redis.lrange(processingKey, 0, -1);

        for (const runId of runIds) {
          const leaseKey = keys.runLease(prefix, runId);
          const lease = await redis.get(leaseKey);
          if (lease) continue;
          // Not leased => assume worker crashed.
          // LREM count 1 so racing reapers cannot requeue the same entry twice.
          const removed = await redis.lrem(processingKey, 1, runId);
          if (removed > 0) await redis.lpush(readyKey, runId);
        }
      }

      await sleep(intervalMs);
    } catch {
      if (signal.aborted) break;
      await sleep(Math.max(100, intervalMs));
    }
  }
}
914
+
915
+ async function cronSchedulerLoop(args: {
916
+ redis: RedisClient;
917
+ client: RedflowClient;
918
+ prefix: string;
919
+ signal: AbortSignal;
920
+ }): Promise<void> {
921
+ const { redis, client, prefix, signal } = args;
922
+ const lockKey = keys.lockCron(prefix);
923
+ const token = `cronlock_${crypto.randomUUID()}`;
924
+ const lockMs = 2000;
925
+
926
+ while (!signal.aborted) {
927
+ try {
928
+ const acquired = await redis.set(lockKey, token, "NX", "PX", String(lockMs));
929
+ if (acquired === null) {
930
+ await sleep(250);
931
+ continue;
932
+ }
933
+
934
+ // We are the active scheduler until we stop renewing.
935
+ while (!signal.aborted) {
936
+ try {
937
+ // Renew lock (best-effort, safe).
938
+ const renewed = await redis.send("EVAL", [
939
+ `if redis.call("get", KEYS[1]) == ARGV[1] then return redis.call("set", KEYS[1], ARGV[1], "PX", ARGV[2]) else return nil end`,
940
+ "1",
941
+ lockKey,
942
+ token,
943
+ String(lockMs),
944
+ ]);
945
+ if (renewed == null) break;
946
+
947
+ const now = nowMs();
948
+ const next = parseZPopMinEntry(await redis.zpopmin(keys.cronNext(prefix)));
949
+ if (!next) {
950
+ await sleep(250);
951
+ continue;
952
+ }
953
+ const { member: cronId, score } = next;
954
+ if (score > now) {
955
+ await redis.zadd(keys.cronNext(prefix), score, cronId);
956
+ await sleep(Math.min(1000, Math.max(10, score - now)));
957
+ continue;
958
+ }
959
+
960
+ let requeueCurrent = true;
961
+ try {
962
+ const defRaw = await redis.hget(keys.cronDef(prefix), cronId);
963
+ if (!defRaw) {
964
+ requeueCurrent = false;
965
+ continue;
966
+ }
967
+
968
+ let def: any;
969
+ try {
970
+ def = safeJsonParse<any>(defRaw);
971
+ } catch {
972
+ // Corrupted definition, drop it to avoid a tight error loop.
973
+ requeueCurrent = false;
974
+ continue;
975
+ }
976
+
977
+ const workflow = def.workflow as string;
978
+ const queue = def.queue as string;
979
+
980
+ let input: unknown;
981
+ try {
982
+ input = safeJsonParse(def.inputJson);
983
+ } catch {
984
+ // Corrupted payload encoding, drop it.
985
+ requeueCurrent = false;
986
+ continue;
987
+ }
988
+
989
+ await client.runByName(workflow, input, { queueOverride: queue });
990
+
991
+ // Schedule next run.
992
+ let nextAt: number | null = null;
993
+ try {
994
+ const job = new Cron(def.expression, { timezone: def.timezone, catch: true });
995
+ // Reschedule from current time so stale cron scores do not create
996
+ // catch-up bursts after clock drift/temporary pauses.
997
+ const next = job.nextRun(new Date(nowMs()));
998
+ job.stop();
999
+ nextAt = next ? next.getTime() : null;
1000
+ } catch {
1001
+ // Invalid expression in metadata. Drop this trigger.
1002
+ requeueCurrent = false;
1003
+ continue;
1004
+ }
1005
+
1006
+ if (nextAt != null) {
1007
+ await redis.zadd(keys.cronNext(prefix), nextAt, cronId);
1008
+ }
1009
+ requeueCurrent = false;
1010
+ } catch {
1011
+ // Keep the current cron id alive on transient failures.
1012
+ } finally {
1013
+ if (requeueCurrent) {
1014
+ const retryAt = Math.max(score, nowMs() + 250);
1015
+ await redis.zadd(keys.cronNext(prefix), retryAt, cronId).catch(() => {});
1016
+ }
1017
+ }
1018
+ } catch {
1019
+ if (signal.aborted) break;
1020
+ await sleep(250);
1021
+ break;
1022
+ }
1023
+ }
1024
+ } catch {
1025
+ if (signal.aborted) break;
1026
+ await sleep(250);
1027
+ }
1028
+ }
1029
+ }
1030
+
1031
+ /** Best-effort onFailure invocation — errors are swallowed so they never break finalization. */
1032
+ async function invokeOnFailure(
1033
+ handler: ((ctx: OnFailureContext) => void | Promise<void>) | undefined,
1034
+ ctx: OnFailureContext,
1035
+ ): Promise<void> {
1036
+ if (!handler) return;
1037
+ try {
1038
+ await handler(ctx);
1039
+ } catch {
1040
+ // onFailure must not interfere with the run lifecycle.
1041
+ }
1042
+ }