@redflow/client 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/INTERNALS.md +238 -0
- package/README.md +24 -3
- package/package.json +1 -1
- package/src/client.ts +36 -32
- package/src/types.ts +9 -1
- package/src/worker.ts +88 -16
- package/tests/bugfixes.test.ts +11 -11
- package/tests/fixtures/worker-crash.ts +1 -0
- package/tests/fixtures/worker-recover.ts +1 -0
- package/tests/redflow.e2e.test.ts +142 -73
package/INTERNALS.md
ADDED
@@ -0,0 +1,238 @@

# redflow internals

This document describes how `@redflow/client` works internally in production terms.

## Design model

- Durable state lives in Redis.
- Handlers and workflow code live in process memory (per worker process).
- The runtime is queue-based and crash-recoverable.
- Delivery semantics are at-least-once at run level.
- The step API provides deterministic replay/caching to avoid repeating completed work.

## Main components

- **Workflow registry (in-memory):** built via `defineWorkflow(...)`.
- **Client (`RedflowClient`):** enqueue runs, inspect state, cancel runs, sync metadata.
- **Worker runtime:** executes queued runs, retries failures, promotes scheduled runs.
- **Cron scheduler:** leader-elected loop that creates cron runs.

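Pulled together from the README hunks later in this diff, a minimal setup of these components looks roughly like the sketch below (the `defineWorkflow` import path is an assumption; the option values are taken from the README examples):

```ts
import { defineWorkflow, startWorker } from "@redflow/client";

// In-memory registry: defineWorkflow(...) registers the handler and its options.
defineWorkflow(
  "heavy-sync",
  { queue: "ops", maxConcurrency: 1 },
  async () => ({ ok: true }),
);

// Worker runtime: syncs registry metadata for `app`, then starts the queue,
// reaper, promoter, and cron loops.
const worker = await startWorker({
  app: "billing-worker",
  url: process.env.REDIS_URL,
  prefix: "redflow:prod",
  concurrency: 4,
});
```

Runs are then enqueued and inspected through `RedflowClient` (see the README's `handle.result({ timeoutMs: ... })` example and the `src/client.ts` hunks below).
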
## Registry and metadata sync

`startWorker({ app, ... })` always calls `syncRegistry(registry, { app })` before loops start.

What `syncRegistry` writes per workflow:

- `workflow:<name>` hash:
  - `name`
  - `queue`
  - `maxConcurrency` (default `1`)
  - `app` (required ownership scope for cleanup)
  - `updatedAt`
  - `cronJson`
  - `retriesJson`
  - `cronIdsJson`
- `workflows` set (all known workflow names)
- cron definitions in `cron:def` and schedule in `cron:next`

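Workers do this automatically on startup. A service can also sync explicitly, as in the README hunk further down; the snippet below only sketches that call (how `client` is constructed is not shown here):

```ts
import { getDefaultRegistry } from "@redflow/client";
import "./workflows"; // side-effect imports that call defineWorkflow(...)

// `client` is a RedflowClient instance; its construction is out of scope here.
declare const client: {
  syncRegistry(registry: ReturnType<typeof getDefaultRegistry>, opts: { app: string }): Promise<void>;
};

await client.syncRegistry(getDefaultRegistry(), { app: "billing-service" });
```
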
### Stale cleanup

Before writing new metadata, sync removes stale workflow metadata when all of the following are true:

- the workflow exists in Redis,
- the workflow is missing in the current registry,
- the workflow `app` equals the current `app`,
- the workflow is older than the grace period (`30s`).

Cleanup removes:

- the `workflow:<name>` metadata hash,
- the `workflows` set membership,
- associated cron entries (`cron:def`, `cron:next`).

It does **not** delete historical runs.

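A minimal sketch of the stale-cleanup predicate, assuming the grace period is checked against the stored `updatedAt` field (only the `hget`-based `app` comparison mirrors the `src/client.ts` hunk later in this diff):

```ts
// Sketch only: field access mirrors the metadata hash described above;
// evaluating the grace period against `updatedAt` is an assumption.
declare const redis: { hget(key: string, field: string): Promise<string | null> };

const GRACE_MS = 30_000;

async function isStaleWorkflow(
  workflowKey: string,            // e.g. keys.workflow(prefix, name)
  name: string,
  registeredNames: Set<string>,
  app: string,
  now: number,
): Promise<boolean> {
  if (registeredNames.has(name)) return false;          // still registered locally
  const ownerApp = await redis.hget(workflowKey, "app");
  if (!ownerApp || ownerApp !== app) return false;       // missing or owned by another app
  const updatedAt = Number((await redis.hget(workflowKey, "updatedAt")) ?? "0");
  return now - updatedAt > GRACE_MS;                     // older than the 30s grace period
}
```
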
## Redis keyspace

Key builders are in `src/internal/keys.ts`.

- `workflows`
- `workflow:<name>`
- `workflow-runs:<name>`
- `runs:created`
- `runs:status:<status>`
- `run:<runId>`
- `run:<runId>:steps`
- `run:<runId>:lease`
- `q:<queue>:ready`
- `q:<queue>:processing`
- `q:<queue>:scheduled`
- `cron:def`
- `cron:next`
- `lock:cron`
- `idempo:<encodedWorkflow>:<encodedKey>`

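The call sites visible in the client and worker hunks (`keys.workflows`, `keys.workflow`, `keys.run`, `keys.runsStatus`, `keys.queueReady`) suggest builders along these lines; the prefix separator and exact signatures are assumptions:

```ts
// Illustrative key builders; not the package's actual src/internal/keys.ts.
export const keys = {
  workflows: (prefix: string) => `${prefix}:workflows`,
  workflow: (prefix: string, name: string) => `${prefix}:workflow:${name}`,
  workflowRuns: (prefix: string, name: string) => `${prefix}:workflow-runs:${name}`,
  run: (prefix: string, runId: string) => `${prefix}:run:${runId}`,
  runsStatus: (prefix: string, status: string) => `${prefix}:runs:status:${status}`,
  queueReady: (prefix: string, queue: string) => `${prefix}:q:${queue}:ready`,
  queueProcessing: (prefix: string, queue: string) => `${prefix}:q:${queue}:processing`,
  queueScheduled: (prefix: string, queue: string) => `${prefix}:q:${queue}:scheduled`,
};
```
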
## Run lifecycle

Statuses:

- `scheduled`
- `queued`
- `running`
- terminal: `succeeded`, `failed`, `canceled`

### Enqueue

Enqueue uses `ENQUEUE_RUN_LUA` atomically:

- creates the run hash,
- writes indexes (`runs:created`, `runs:status:*`, `workflow-runs:*`),
- pushes to the ready queue or the scheduled ZSET,
- applies the idempotency mapping if a key was provided.

The idempotency key TTL defaults to `7 days`.

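A non-atomic TypeScript illustration of what the Lua script does in one call; index data types and key shapes are assumptions consistent with the keyspace above, and prefixes are omitted for brevity:

```ts
declare const redis: { send(cmd: string, args: string[]): Promise<unknown> };

async function enqueueIllustration(runId: string, workflow: string, queue: string, availableAt?: number) {
  const now = Date.now();
  const status = availableAt ? "scheduled" : "queued";

  // Run hash + indexes (assumed to be ZSETs scored by creation time).
  await redis.send("HSET", [`run:${runId}`, "workflow", workflow, "status", status, "createdAt", String(now)]);
  await redis.send("ZADD", ["runs:created", String(now), runId]);
  await redis.send("ZADD", [`runs:status:${status}`, String(now), runId]);
  await redis.send("ZADD", [`workflow-runs:${workflow}`, String(now), runId]);

  if (availableAt) {
    await redis.send("ZADD", [`q:${queue}:scheduled`, String(availableAt), runId]); // delayed run
  } else {
    await redis.send("LPUSH", [`q:${queue}:ready`, runId]);                          // immediately claimable
  }
  // Plus, when a key was provided: idempo:<workflow>:<key> -> runId with a 7-day TTL.
}
```
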
### Processing

The worker loop uses `LMOVE`/`BLMOVE` from `ready` -> `processing`.

For each claimed run:

1. Acquire lease (`run:<id>:lease`) with periodic renewal.
2. Validate current run status.
3. If `queued`, enforce `maxConcurrency` for that workflow.
4. Transition `queued -> running` atomically.
5. Execute handler with step engine.
6. Finalize to terminal status atomically.
7. Remove from `processing`.

If the lease is lost, the current worker aborts and does not finalize.

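A sketch of one worker slot following the steps above; the `BLMOVE` direction and the `acquireLease`/`processRun` helpers are placeholders, not the package's actual internals:

```ts
declare const redis: { send(cmd: string, args: string[]): Promise<unknown> };
declare function acquireLease(runId: string): Promise<{ release(): Promise<void> } | null>;
declare function processRun(runId: string, queue: string): Promise<void>;

async function workerSlotLoop(prefix: string, queue: string, signal: AbortSignal) {
  const ready = `${prefix}:q:${queue}:ready`;
  const processing = `${prefix}:q:${queue}:processing`;

  while (!signal.aborted) {
    // Claim: blocking move of one run id from ready to processing.
    const runId = (await redis.send("BLMOVE", [ready, processing, "RIGHT", "LEFT", "1"])) as string | null;
    if (!runId) continue;

    const lease = await acquireLease(runId);              // step 1: lease with renewal
    if (!lease) continue;                                  // another worker owns it
    try {
      await processRun(runId, queue);                      // steps 2-6
    } finally {
      await redis.send("LREM", [processing, "1", runId]);  // step 7: leave processing
      await lease.release();
    }
  }
}
```
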
### Reaper

The reaper scans `processing` lists. For runs without an active lease it:

- removes them from `processing`,
- pushes them back to `ready`.

This recovers from worker crashes.

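One reaper pass over a queue, sketched under the assumption that an expired lease simply means the `run:<id>:lease` key is gone; the real implementation does the remove-and-requeue step inside a Lua script (visible in the `src/worker.ts` context below):

```ts
declare const redis: { send(cmd: string, args: string[]): Promise<unknown> };

async function reapQueue(prefix: string, queue: string) {
  const processing = `${prefix}:q:${queue}:processing`;
  const ready = `${prefix}:q:${queue}:ready`;
  const runIds = (await redis.send("LRANGE", [processing, "0", "-1"])) as string[];

  for (const runId of runIds) {
    const leased = await redis.send("EXISTS", [`${prefix}:run:${runId}:lease`]);
    if (Number(leased) === 1) continue;                   // still leased, leave it alone
    await redis.send("LREM", [processing, "1", runId]);   // drop from processing
    await redis.send("LPUSH", [ready, runId]);            // requeue for another worker
  }
}
```
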
### Scheduled promoter

The promoter pops due items from `q:<queue>:scheduled` (`ZPOPMIN` batch), then:

- transitions `scheduled -> queued`,
- pushes to `ready`.

Future items are put back.

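A sketch of one promoter pass, assuming `ZPOPMIN` with a count returns flat `member, score` pairs and that the status transition is handled elsewhere:

```ts
declare const redis: { send(cmd: string, args: string[]): Promise<unknown> };
declare function transitionScheduledToQueued(runId: string): Promise<boolean>;

async function promoteDue(prefix: string, queue: string, batch = 16) {
  const scheduled = `${prefix}:q:${queue}:scheduled`;
  const popped = (await redis.send("ZPOPMIN", [scheduled, String(batch)])) as string[];

  for (let i = 0; i < popped.length; i += 2) {
    const runId = popped[i];
    const dueAt = Number(popped[i + 1]);
    if (dueAt > Date.now()) {
      await redis.send("ZADD", [scheduled, String(dueAt), runId]);      // future item: put back
      continue;
    }
    if (await transitionScheduledToQueued(runId)) {
      await redis.send("LPUSH", [`${prefix}:q:${queue}:ready`, runId]); // now claimable
    }
  }
}
```
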
## maxConcurrency

`maxConcurrency` is per workflow, default `1`.

### For regular queued runs

When a worker picks a `queued` run:

- it counts current `running` runs for the same workflow,
- if the count >= `maxConcurrency`, the run is atomically moved from `processing` back to the end of `ready`.

So non-cron runs are delayed (not dropped).

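Condensed from the `src/worker.ts` hunk later in this diff, the check looks roughly like this (the `countRunningRunsForWorkflow` signature is simplified here):

```ts
declare const redis: { send(cmd: string, args: string[]): Promise<unknown> };
declare function countRunningRunsForWorkflow(args: { workflowName: string; stopAt: number }): Promise<number>;

// Lua copied from the worker hunk: remove from processing, push to the tail of ready.
const REQUEUE_DUE_TO_CONCURRENCY_LUA = `
if redis.call("lrem", KEYS[1], 1, ARGV[1]) <= 0 then
  return 0
end

redis.call("rpush", KEYS[2], ARGV[1])
return 1
`;

async function maybeRequeue(runId: string, workflowName: string, maxConcurrency: number, processingKey: string, readyKey: string) {
  const running = await countRunningRunsForWorkflow({ workflowName, stopAt: maxConcurrency });
  if (running < maxConcurrency) return false;
  await redis.send("EVAL", [REQUEUE_DUE_TO_CONCURRENCY_LUA, "2", processingKey, readyKey, runId]);
  return true; // delayed, not dropped
}
```
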
### For cron runs

The cron loop also checks the running count before enqueueing:

- if the count >= `maxConcurrency`, that cron tick is skipped,
- the next cron tick is still scheduled normally.

## Cron scheduler

- Leader election via the Redis lock `lock:cron`.
- Only the lock holder schedules cron runs.
- The loop pops the earliest `cronId` from `cron:next`.
- If due:
  - parses the `cron:def` payload,
  - enforces `maxConcurrency`,
  - enqueues a run via `runByName` (or skips),
  - computes the next fire time and stores it in `cron:next`.

Cron uses "reschedule from now" behavior (no catch-up burst if a stale timestamp was in the past).

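One leader tick, sketched with the cron definition shape that appears in the `src/client.ts` hunk (`id`, `workflow`, `queue`, `maxConcurrency`, `expression`, `timezone`, `inputJson`); `cron:next` is assumed to be a ZSET keyed by fire time, and `computeNextFireTime` is a placeholder:

```ts
declare const redis: { send(cmd: string, args: string[]): Promise<unknown> };
declare const client: { runByName(workflow: string, input: unknown, opts?: { queueOverride?: string }): Promise<unknown> };
declare function countRunningRunsForWorkflow(args: { workflowName: string; stopAt: number }): Promise<number>;
declare function computeNextFireTime(expression: string, timezone: string | undefined, from: number): number;

async function cronTick(prefix: string, def: {
  id: string; workflow: string; queue: string; maxConcurrency: number;
  expression: string; timezone?: string; inputJson: string;
}) {
  const running = await countRunningRunsForWorkflow({ workflowName: def.workflow, stopAt: def.maxConcurrency });
  if (running < def.maxConcurrency) {
    await client.runByName(def.workflow, JSON.parse(def.inputJson), { queueOverride: def.queue });
  } // else: this tick is skipped

  // "Reschedule from now": the next fire time is computed from the current time.
  const nextAt = computeNextFireTime(def.expression, def.timezone, Date.now());
  await redis.send("ZADD", [`${prefix}:cron:next`, String(nextAt), def.id]);
}
```
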
## Step engine semantics

Inside a handler, the `step` API has three primitives.

### `step.run(...)`

- Step state is persisted in the `run:<id>:steps` hash under `step.name`.
- If the step already `succeeded`, the cached output is returned.
- Duplicate step names in one execution are rejected.
- Step timeout and cancellation are supported.

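A usage sketch. The handler context shape and the `step.run(name, fn, opts)` signature are assumptions; only the cache-by-step-name semantics come from the description above, and the helpers are hypothetical:

```ts
import { defineWorkflow } from "@redflow/client";

// Assumed step interface, for illustration only.
type StepLike = {
  run<T>(name: string, fn: () => Promise<T>, opts?: { timeoutMs?: number }): Promise<T>;
};

declare function fetchInvoices(): Promise<string[]>;  // hypothetical helpers
declare function pushToLedger(ids: string[]): Promise<void>;

defineWorkflow(
  "invoice-sync",
  { queue: "ops" },
  async ({ step }: { step: StepLike }) => {
    // On a retried attempt, a step that already succeeded returns its cached output.
    const invoices = await step.run("fetch-invoices", () => fetchInvoices());
    await step.run("push-to-ledger", () => pushToLedger(invoices), { timeoutMs: 30_000 });
    return { count: invoices.length };
  },
);
```
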
### `step.runWorkflow(...)`

- Enqueues a child workflow with deterministic idempotency by default:
  - `parentRunId + stepName + childWorkflowName`.
- Waits for child completion.
- Waiting is bounded by the step `timeoutMs` (if set), otherwise unbounded until cancellation.
- Inline assist: if the child is queued on a queue this worker handles, the worker may execute the child inline to avoid self-deadlock with low concurrency.

### `step.emitWorkflow(...)`

- Enqueues a child workflow and returns the child `runId`.
- Supports the child as a workflow object or a workflow name string.
- Uses a deterministic idempotency default based on the parent run and step name.

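The deterministic child-run idempotency key appears verbatim in the `src/worker.ts` hunk later in this diff; length-prefixing each part keeps the composite key unambiguous even when values contain `:`:

```ts
function encodeIdempotencyPart(value: string): string {
  return `${value.length}:${value}`;
}

function defaultStepWorkflowIdempotencyKey(parentRunId: string, stepName: string, childWorkflowName: string): string {
  return `stepwf:${encodeIdempotencyPart(parentRunId)}:${encodeIdempotencyPart(stepName)}:${encodeIdempotencyPart(childWorkflowName)}`;
}

// defaultStepWorkflowIdempotencyKey("run_1", "sync", "child")
// -> "stepwf:5:run_1:4:sync:5:child"
```
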
## Retry model

- `maxAttempts` is workflow-level (`retries.maxAttempts`), default `3` (`DEFAULT_MAX_ATTEMPTS` in `src/types.ts`).
- Retry delay uses exponential backoff + jitter.
- Non-retryable classes:
  - input validation errors,
  - unknown workflow,
  - output serialization errors,
  - cancellation,
  - explicit `NonRetriableError`.
- Retry scheduling is atomic (`scheduleRetry` Lua): status/index update + scheduled-ZSET write in one script.

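Retries are configured per workflow, and `maxAttempts` counts the first attempt (see the `src/types.ts` hunk below). The sketch assumes `defineWorkflow` is exported from the package root as in the README examples:

```ts
import { defineWorkflow } from "@redflow/client";

defineWorkflow(
  "send-receipt",
  {
    queue: "io",
    retries: { maxAttempts: 5 },  // first attempt + up to 4 retries
  },
  async () => {
    // Throwing here takes the backoff + jitter retry path, unless the error
    // falls into one of the non-retryable classes listed above.
    return { ok: true };
  },
);
```
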
## Cancellation

`cancelRun(runId)`:

- sets `cancelRequestedAt` + an optional reason,
- if the run is `queued`/`scheduled`, attempts an immediate transition to `canceled` and cleanup,
- if the run is `running`, cancellation is cooperative via `AbortSignal` polling in the worker.

The terminal finalize script ensures consistent indexes and a terminal status.

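From the client side this is a single call; the run id below is a placeholder, and how the optional reason is passed is not shown in this diff:

```ts
declare const client: { cancelRun(runId: string): Promise<void> };

// Queued/scheduled runs flip to `canceled` immediately; running runs stop at
// the worker's next cooperative AbortSignal check.
await client.cancelRun("run_abc123");
```
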
## Idempotency vs step cache

- **Idempotency:** deduplicates run creation (`key -> runId`) with a TTL.
- **Step cache:** deduplicates completed step execution within one parent run.

They solve different failure windows and are intentionally both used.

## Multi-worker behavior

- Many workers can process the same prefix/queues.
- Cron scheduling is single-leader.
- Processing/recovery is shared via Redis lists + leases.
- `maxConcurrency` is enforced globally against the Redis `running` index.

## Operational notes

Recommended for production:

- Use a stable `prefix` per environment.
- Use an explicit `app` per service role for safe metadata cleanup.
- Set `maxConcurrency` intentionally for long workflows.
- Keep queue ownership clear (avoid workers consuming queues for workflows they do not register).
- Use idempotency keys for external trigger endpoints.

## Current guarantees and limitations

- Run execution is at-least-once.
- The step cache reduces replay but cannot provide globally exactly-once side effects.
- `maxConcurrency` is enforced via runtime checks against Redis state; it is robust in practice but not a strict distributed semaphore.
- The `handle.result({ timeoutMs })` timeout affects only the caller's wait, not run execution itself.

package/README.md
CHANGED
@@ -2,6 +2,8 @@

 Redis-backed workflow runtime for Bun.

+Deep internal details: `INTERNALS.md`
+
 ## Warning

 This project is still in early alpha stage.

@@ -147,13 +149,14 @@ const output = await handle.result({ timeoutMs: 90_000 });

 ## Start a worker

-Import workflows, then run `startWorker()`.
+Import workflows, then run `startWorker({ app: ... })`.

 ```ts
 import { startWorker } from "@redflow/client";
 import "./workflows";

 const worker = await startWorker({
+  app: "billing-worker",
   url: process.env.REDIS_URL,
   prefix: "redflow:prod",
   concurrency: 4,

@@ -164,6 +167,7 @@ Explicit queues + runtime tuning:

 ```ts
 const worker = await startWorker({
+  app: "billing-worker",
   url: process.env.REDIS_URL,
   prefix: "redflow:prod",
   queues: ["critical", "io", "analytics"],

@@ -178,6 +182,21 @@ const worker = await startWorker({

 ## Workflow options examples

+### maxConcurrency
+
+`maxConcurrency` limits concurrent `running` runs per workflow. Default is `1`.
+
+```ts
+defineWorkflow(
+  "heavy-sync",
+  {
+    queue: "ops",
+    maxConcurrency: 1,
+  },
+  async () => ({ ok: true }),
+);
+```
+
 ### Cron

 ```ts

@@ -194,6 +213,8 @@ defineWorkflow(
 );
 ```

+Cron respects `maxConcurrency`: if the limit is reached, that cron tick is skipped.
+
 ### onFailure

 ```ts

@@ -281,10 +302,10 @@ const output = await handle.result({ timeoutMs: 30_000 });
 console.log(output);
 ```

-### Registry sync
+### Registry sync app id

 ```ts
 import { getDefaultRegistry } from "@redflow/client";

-await client.syncRegistry(getDefaultRegistry(), {
+await client.syncRegistry(getDefaultRegistry(), { app: "billing-service" });
 ```

package/package.json
CHANGED
package/src/client.ts
CHANGED
@@ -16,16 +16,17 @@ import { keys } from "./internal/keys";
 import { safeJsonParse, safeJsonStringify, safeJsonTryParse } from "./internal/json";
 import { nowMs } from "./internal/time";
 import { sleep } from "./internal/sleep";
-import
-
-
-
-
-
-
-
-
-
+import {
+  DEFAULT_MAX_ATTEMPTS,
+  type EmitWorkflowOptions,
+  type ListedRun,
+  type ListRunsParams,
+  type RunHandle,
+  type RunOptions,
+  type RunState,
+  type RunStatus,
+  type StepState,
+  type WorkflowMeta,
 } from "./types";
 import type { WorkflowRegistry } from "./registry";

@@ -37,10 +38,10 @@ export type CreateClientOptions = {

 export type SyncRegistryOptions = {
   /**
-   *
-   *
+   * Stable application id used for stale workflow metadata cleanup.
+   * Workflows are pruned only when they were last synced by the same app.
    */
-
+  app: string;
 };

 export function defaultPrefix(): string {

@@ -249,16 +250,6 @@ function encodeCompositePart(value: string): string {
   return `${value.length}:${value}`;
 }

-function defaultRegistryOwner(): string {
-  const envOwner = process.env.REDFLOW_SYNC_OWNER?.trim();
-  if (envOwner) return envOwner;
-
-  const argvOwner = process.argv[1]?.trim();
-  if (argvOwner) return argvOwner;
-
-  return "redflow:unknown-owner";
-}
-
 function parseEnqueueScriptResult(value: unknown): { kind: "created" | "existing"; runId: string } | null {
   if (Array.isArray(value) && value.length === 1 && Array.isArray(value[0])) {
     return parseEnqueueScriptResult(value[0]);

@@ -309,6 +300,11 @@ function isValidDate(value: Date): boolean {
   return value instanceof Date && Number.isFinite(value.getTime());
 }

+function normalizeMaxConcurrency(value: unknown): number {
+  if (typeof value !== "number" || !Number.isFinite(value) || value <= 0) return 1;
+  return Math.floor(value);
+}
+
 export class RedflowClient {
   constructor(
     public readonly redis: RedisClient,

@@ -356,9 +352,11 @@ export class RedflowClient {
     const retries = safeJsonTryParse<any>(data.retriesJson ?? null) as any;
     const updatedAt = Number(data.updatedAt ?? "0");
     const queue = data.queue ?? "default";
+    const maxConcurrency = normalizeMaxConcurrency(Number(data.maxConcurrency ?? "1"));
     return {
       name,
       queue,
+      maxConcurrency,
       cron: Array.isArray(cron) && cron.length > 0 ? cron : undefined,
       retries,
       updatedAt,

@@ -395,7 +393,7 @@ export class RedflowClient {
       ? Math.floor(options.__maxAttemptsOverride)
       : null;

-    const maxAttempts = maxAttemptsOverride ?? (await this.getMaxAttemptsForWorkflow(workflowName)) ??
+    const maxAttempts = maxAttemptsOverride ?? (await this.getMaxAttemptsForWorkflow(workflowName)) ?? DEFAULT_MAX_ATTEMPTS;

     return await this.enqueueRun<TOutput>({
       workflowName,

@@ -471,7 +469,7 @@ export class RedflowClient {
       output,
       error,
       attempt: Number(data.attempt ?? "0"),
-      maxAttempts: Number(data.maxAttempts ??
+      maxAttempts: Number(data.maxAttempts ?? String(DEFAULT_MAX_ATTEMPTS)),
       createdAt: Number(data.createdAt ?? "0"),
       availableAt: data.availableAt ? Number(data.availableAt) : undefined,
       startedAt: data.startedAt ? Number(data.startedAt) : undefined,

@@ -606,17 +604,21 @@ export class RedflowClient {
     }
   }

-  async syncRegistry(registry: WorkflowRegistry, options
+  async syncRegistry(registry: WorkflowRegistry, options: SyncRegistryOptions): Promise<void> {
     const defs = registry.list();
     const syncStartedAt = nowMs();
-    const
+    const app = options.app.trim();
+    if (!app) {
+      throw new Error("syncRegistry requires a non-empty options.app");
+    }
     const registeredNames = new Set(defs.map((def) => def.options.name));

-    await this.cleanupStaleWorkflows(registeredNames, syncStartedAt,
+    await this.cleanupStaleWorkflows(registeredNames, syncStartedAt, app);

     for (const def of defs) {
       const name = def.options.name;
       const queue = def.options.queue ?? "default";
+      const maxConcurrency = normalizeMaxConcurrency(def.options.maxConcurrency);
       const cron = def.options.cron ?? [];
       const retries = def.options.retries ?? {};
       const updatedAt = nowMs();

@@ -653,6 +655,7 @@ export class RedflowClient {
         id: cronId,
         workflow: name,
         queue,
+        maxConcurrency,
         expression: c.expression,
         timezone: c.timezone,
         inputJson: safeJsonStringify(cronInput),

@@ -671,7 +674,8 @@ export class RedflowClient {
       const meta: Record<string, string> = {
         name,
         queue,
-
+        maxConcurrency: String(maxConcurrency),
+        app,
         updatedAt: String(updatedAt),
         cronJson: safeJsonStringify(cron),
         retriesJson: safeJsonStringify(retries),

@@ -722,7 +726,7 @@ export class RedflowClient {
   private async cleanupStaleWorkflows(
     registeredNames: Set<string>,
     syncStartedAt: number,
-
+    app: string,
   ): Promise<void> {
     const existingNames = await this.redis.smembers(keys.workflows(this.prefix));

@@ -730,8 +734,8 @@ export class RedflowClient {
       if (registeredNames.has(existingName)) continue;

       const workflowKey = keys.workflow(this.prefix, existingName);
-      const
-      if (!
+      const workflowApp = (await this.redis.hget(workflowKey, "app")) ?? "";
+      if (!workflowApp || workflowApp !== app) {
         continue;
       }

package/src/types.ts
CHANGED
@@ -12,8 +12,10 @@ export type CronTrigger = {
   id?: string;
 };

+export const DEFAULT_MAX_ATTEMPTS = 3;
+
 export type WorkflowRetries = {
-  /** Total attempts including the first one. Default:
+  /** Total attempts including the first one. Default: 3. */
   maxAttempts?: number;
 };

@@ -32,6 +34,11 @@ export type OnFailureContext = {
 export type DefineWorkflowOptions<TSchema extends ZodTypeAny | undefined = ZodTypeAny | undefined> = {
   name: string;
   queue?: string;
+  /**
+   * Maximum concurrently running runs for this workflow.
+   * Default: 1.
+   */
+  maxConcurrency?: number;
   schema?: TSchema;
   cron?: CronTrigger[];
   retries?: WorkflowRetries;

@@ -167,6 +174,7 @@ export type ListedRun = {
 export type WorkflowMeta = {
   name: string;
   queue: string;
+  maxConcurrency: number;
   cron?: CronTrigger[];
   retries?: WorkflowRetries;
   updatedAt: number;

package/src/worker.ts
CHANGED
@@ -16,9 +16,11 @@ import { safeJsonParse, safeJsonStringify, safeJsonTryParse } from "./internal/j
 import { sleep } from "./internal/sleep";
 import { nowMs } from "./internal/time";
 import { getDefaultRegistry, type WorkflowRegistry } from "./registry";
-import type
+import { DEFAULT_MAX_ATTEMPTS, type OnFailureContext, type RunStatus, type StepApi, type StepStatus } from "./types";

 export type StartWorkerOptions = {
+  /** Stable application id used for registry sync stale-cleanup scoping. */
+  app: string;
   redis?: RedisClient;
   url?: string;
   prefix?: string;

@@ -74,18 +76,32 @@ redis.call("lpush", KEYS[2], ARGV[1])
 return 1
 `;

-
-
-
-
-
+const REQUEUE_DUE_TO_CONCURRENCY_LUA = `
+if redis.call("lrem", KEYS[1], 1, ARGV[1]) <= 0 then
+  return 0
+end
+
+redis.call("rpush", KEYS[2], ARGV[1])
+return 1
+`;
+
+export async function startWorker(options: StartWorkerOptions): Promise<WorkerHandle> {
+  const app = options.app.trim();
+  if (!app) {
+    throw new Error("startWorker requires a non-empty options.app");
+  }
+
+  const registry = options.registry ?? getDefaultRegistry();
+  const prefix = options.prefix ?? defaultPrefix();
+  const ownsBaseRedis = !options.redis && !!options.url;
+  const baseRedis = options.redis ?? (options.url ? new BunRedisClient(options.url) : defaultRedis);
   const syncClient = createClient({ redis: baseRedis, prefix });

-  const queues = options
-  const concurrency = Math.max(1, options
-  const leaseMs = Math.max(100, options
-  const blmoveTimeoutSec = options
-  const reaperIntervalMs = options
+  const queues = options.queues ?? deriveQueuesFromRegistry(registry);
+  const concurrency = Math.max(1, options.concurrency ?? 1);
+  const leaseMs = Math.max(100, options.runtime?.leaseMs ?? 5000);
+  const blmoveTimeoutSec = options.runtime?.blmoveTimeoutSec ?? 1;
+  const reaperIntervalMs = options.runtime?.reaperIntervalMs ?? 500;

   const abort = new AbortController();
   const tasks: Promise<void>[] = [];

@@ -111,7 +127,7 @@ export async function startWorker(options?: StartWorkerOptions): Promise<WorkerH
   };

   try {
-    await syncClient.syncRegistry(registry);
+    await syncClient.syncRegistry(registry, { app });

     // Worker loops (blocking BLMOVE). Use dedicated connections per slot.
     for (let i = 0; i < concurrency; i++) {

@@ -222,6 +238,11 @@ function encodeIdempotencyPart(value: string): string {
   return `${value.length}:${value}`;
 }

+function normalizeMaxConcurrency(value: unknown): number {
+  if (typeof value !== "number" || !Number.isFinite(value) || value <= 0) return 1;
+  return Math.floor(value);
+}
+
 function defaultStepWorkflowIdempotencyKey(parentRunId: string, stepName: string, childWorkflowName: string): string {
   return `stepwf:${encodeIdempotencyPart(parentRunId)}:${encodeIdempotencyPart(stepName)}:${encodeIdempotencyPart(childWorkflowName)}`;
 }

@@ -396,7 +417,9 @@ async function processRun(args: {
   }

   const workflowName = run.workflow ?? "";
-  const
+  const def = workflowName ? registry.get(workflowName) : undefined;
+  const maxConcurrency = normalizeMaxConcurrency(def?.options.maxConcurrency);
+  const maxAttempts = Number(run.maxAttempts ?? String(DEFAULT_MAX_ATTEMPTS));
   const cancelRequestedAt = run.cancelRequestedAt ? Number(run.cancelRequestedAt) : 0;
   if (cancelRequestedAt > 0) {
     await client.finalizeRun(runId, { status: "canceled", finishedAt: nowMs() });

@@ -406,7 +429,26 @@ async function processRun(args: {

   const startedAt = run.startedAt && run.startedAt !== "" ? Number(run.startedAt) : nowMs();

-  if (currentStatus === "queued") {
+  if (currentStatus === "queued" && def) {
+    const runningCount = await countRunningRunsForWorkflow({
+      redis,
+      prefix,
+      workflowName,
+      stopAt: maxConcurrency,
+    });
+
+    if (runningCount >= maxConcurrency) {
+      await redis.send("EVAL", [
+        REQUEUE_DUE_TO_CONCURRENCY_LUA,
+        "2",
+        processingKey,
+        keys.queueReady(prefix, queue),
+        runId,
+      ]);
+      await sleep(25);
+      return;
+    }
+
     const movedToRunning = await client.transitionRunStatusIfCurrent(runId, "queued", "running", startedAt);
     if (!movedToRunning) {
       // Most likely canceled between dequeue and start transition.

@@ -433,7 +475,6 @@ async function processRun(args: {
     return;
   }

-  const def = registry.get(workflowName);
   if (!def) {
     const errorJson = makeErrorJson(new UnknownWorkflowError(workflowName));
     await client.finalizeRun(runId, { status: "failed", errorJson, finishedAt: nowMs() });

@@ -902,6 +943,27 @@ async function reaperLoop(args: {
   }
 }

+async function countRunningRunsForWorkflow(args: {
+  redis: RedisClient;
+  prefix: string;
+  workflowName: string;
+  stopAt?: number;
+}): Promise<number> {
+  const { redis, prefix, workflowName, stopAt } = args;
+  const runningRunIds = await redis.zrevrange(keys.runsStatus(prefix, "running"), 0, -1);
+  let count = 0;
+
+  for (const runId of runningRunIds) {
+    const runWorkflow = await redis.hget(keys.run(prefix, runId), "workflow");
+    if (runWorkflow !== workflowName) continue;
+
+    count += 1;
+    if (typeof stopAt === "number" && count >= stopAt) return count;
+  }
+
+  return count;
+}
+
 async function cronSchedulerLoop(args: {
   redis: RedisClient;
   client: RedflowClient;

@@ -976,7 +1038,17 @@ async function cronSchedulerLoop(args: {
       continue;
     }

-
+    const cronMaxConcurrency = normalizeMaxConcurrency(def.maxConcurrency);
+    const runningCount = await countRunningRunsForWorkflow({
+      redis,
+      prefix,
+      workflowName: workflow,
+      stopAt: cronMaxConcurrency,
+    });
+
+    if (runningCount < cronMaxConcurrency) {
+      await client.runByName(workflow, input, { queueOverride: queue });
+    }

     // Schedule next run.
     let nextAt: number | null = null;