queasy 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/.github/workflows/check.yml +50 -0
  2. package/.github/workflows/publish.yml +44 -0
  3. package/AGENTS.md +2 -1
  4. package/CLAUDE.md +1 -1
  5. package/Readme.md +32 -22
  6. package/docker-compose.yml +0 -2
  7. package/fuzztest/Readme.md +185 -0
  8. package/fuzztest/fuzz.js +354 -0
  9. package/fuzztest/handlers/cascade-a.js +94 -0
  10. package/fuzztest/handlers/cascade-b.js +72 -0
  11. package/fuzztest/handlers/fail-handler.js +52 -0
  12. package/fuzztest/handlers/periodic.js +93 -0
  13. package/fuzztest/process.js +100 -0
  14. package/fuzztest/shared/chaos.js +28 -0
  15. package/fuzztest/shared/stream.js +40 -0
  16. package/package.json +7 -4
  17. package/plans/redis-options.md +279 -0
  18. package/src/client.js +100 -30
  19. package/src/constants.js +3 -0
  20. package/src/pool.js +4 -7
  21. package/src/queasy.lua +33 -40
  22. package/src/queue.js +4 -11
  23. package/src/types.ts +15 -0
  24. package/src/utils.js +26 -0
  25. package/test/client.test.js +39 -41
  26. package/test/errors.test.js +12 -12
  27. package/test/fixtures/always-fail-handler.js +3 -3
  28. package/test/fixtures/data-logger-handler.js +5 -0
  29. package/test/fixtures/failure-handler.js +2 -2
  30. package/test/fixtures/permanent-error-handler.js +3 -3
  31. package/test/fixtures/slow-handler.js +2 -2
  32. package/test/fixtures/success-handler.js +3 -3
  33. package/test/fixtures/with-failure-handler.js +3 -3
  34. package/test/guards.test.js +131 -0
  35. package/test/manager.test.js +217 -70
  36. package/test/pool.test.js +153 -57
  37. package/test/queue.test.js +6 -5
  38. package/test/redis-functions.test.js +18 -12
  39. package/test/utils.test.js +52 -0
  40. package/.claude/settings.local.json +0 -27
  41. package/.zed/settings.json +0 -39
  42. package/doc/Implementation.md +0 -70
  43. package/test/index.test.js +0 -55
@@ -0,0 +1,50 @@
1
+ name: Check
2
+
3
+ on:
4
+ pull_request:
5
+ branches: [master]
6
+
7
+ jobs:
8
+ check:
9
+ runs-on: ubuntu-latest
10
+
11
+ services:
12
+ redis:
13
+ image: redis:7
14
+ ports:
15
+ - 6379:6379
16
+ options: >-
17
+ --health-cmd "redis-cli ping"
18
+ --health-interval 10s
19
+ --health-timeout 5s
20
+ --health-retries 5
21
+
22
+ steps:
23
+ - uses: actions/checkout@v4
24
+ with:
25
+ fetch-depth: 0
26
+
27
+ - uses: actions/setup-node@v4
28
+ with:
29
+ node-version: 22
30
+
31
+ - run: npm ci
32
+
33
+ - name: Lint
34
+ run: npm run lint
35
+
36
+ - name: Typecheck
37
+ run: npm run typecheck
38
+
39
+ - name: Test with coverage
40
+ run: npm run test:coverage
41
+
42
+ - name: Check version is not already tagged
43
+ run: |
44
+ git fetch --tags
45
+ VERSION=$(node -e "console.log(require('./package.json').version)")
46
+ if [ -n "$(git tag -l "v$VERSION")" ]; then
47
+ echo "::error::Tag v$VERSION already exists. Bump the version in package.json."
48
+ exit 1
49
+ fi
50
+ echo "Version v$VERSION is not yet tagged"
@@ -0,0 +1,44 @@
1
+ name: Publish
2
+
3
+ on:
4
+ push:
5
+ branches: [master]
6
+
7
+ permissions:
8
+ contents: write
9
+ id-token: write
10
+
11
+ jobs:
12
+ publish:
13
+ runs-on: ubuntu-latest
14
+
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+
18
+ - uses: actions/setup-node@v4
19
+ with:
20
+ node-version: 22
21
+ registry-url: https://registry.npmjs.org
22
+
23
+ - run: npm install -g npm@latest
24
+
25
+ - run: npm ci
26
+
27
+ - name: Extract version
28
+ id: version
29
+ run: echo "version=$(node -e "console.log(require('./package.json').version)")" >> "$GITHUB_OUTPUT"
30
+
31
+ - name: Publish to npm
32
+ run: npm publish --provenance --access public
33
+ env:
34
+ NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
35
+
36
+ - name: Create and push tag
37
+ run: |
38
+ git tag "v${{ steps.version.outputs.version }}"
39
+ git push origin "v${{ steps.version.outputs.version }}"
40
+
41
+ - name: Create GitHub release
42
+ run: gh release create "v${{ steps.version.outputs.version }}" --generate-notes
43
+ env:
44
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
package/AGENTS.md CHANGED
@@ -24,7 +24,8 @@ Queasy is a Redis-backed job queue for Node.js with **at-least-once** delivery s
24
24
  The JS side is split across several modules:
25
25
 
26
26
  - **`src/client.js`** (`Client` class): Top-level entry point. Wraps a `node-redis` connection, loads the Lua script into Redis via `FUNCTION LOAD REPLACE` on construction, and manages named `Queue` instances. Generates a unique `clientId` for heartbeats. All Redis `fCall` invocations live here (`dispatch`, `cancel`, `dequeue`, `finish`, `fail`, `retry`, `bump`). Exported from `src/index.js`.
27
- - **`src/queue.js`** (`Queue` class): Represents a single named queue. Holds dequeue options and handler path. `listen()` attaches a handler and starts a `setInterval` polling loop that calls `dequeue()`. `dequeue()` checks pool capacity, fetches jobs from Redis, and processes each via the pool. Handles retry/fail logic (backoff calculation, stall-count checks) on the JS side.
27
+ - **`src/queue.js`** (`Queue` class): Represents a single named queue. `listen()` attaches a handler path and options, optionally sets up a fail queue (`{key}-fail`), then registers itself with the `Manager` via `addQueue()`. `dequeue(count)` fetches jobs from Redis, processes each via the pool, and handles outcomes: finishes on success, retries with exponential backoff on retriable errors, and dispatches to the fail queue on permanent errors or when `maxRetries`/`maxStalls` limits are exceeded. Returns `{ count, promise }` so the manager can track whether the queue is saturated.
28
+ - **`src/manager.js`** (`Manager` class): Centralized dequeue scheduler shared across all queues on a client. When a queue calls `listen()`, it registers itself via `addQueue()`. The manager runs a single `next()` loop that round-robins through queues, calling `queue.dequeue(batchSize)` on each. Batch size is computed from pool capacity, the number of busy queues, and the handler's `size` option. After each dequeue, queues are re-sorted by a priority function (`compareQueueEntries`): busy queues first, then by `priority` (higher first), then by `lastDequeuedAt` (oldest first), then by `size` (larger first). The loop schedules the next tick immediately if the top queue is busy, otherwise waits `DEQUEUE_INTERVAL` ms from the last dequeue time.
28
29
  - **`src/pool.js`** (`Pool` class): Manages a set of `Worker` threads. Each worker has a `capacity` (default 100 units). `process()` picks the worker with the most spare capacity, posts the job, and returns a promise. Handles job timeouts: a timed-out job marks the worker as unhealthy, replaces it with a fresh one, and terminates the old worker once only stalled jobs remain.
29
30
  - **`src/worker.js`**: Runs inside a `Worker` thread. Receives `exec` messages, dynamically imports the handler module, calls `handle(data, job)`, and posts back `done` messages (with optional error info).
30
31
  - **`src/constants.js`**: Default retry options, heartbeat/timeout intervals, worker capacity, dequeue polling interval.
package/CLAUDE.md CHANGED
@@ -21,7 +21,7 @@ Tests require a running Redis instance. Use `docker:up` first if needed.
21
21
 
22
22
  Queasy is a Redis-backed job queue with **at-least-once** delivery semantics. The core logic lives in two layers:
23
23
 
24
- - **JS layer** (`src/queue.js`): The `queue()` factory returns `{ dispatch, cancel, listen }`. On first use, it uploads the Lua script to Redis via `FUNCTION LOAD REPLACE`. A `WeakSet` (`initializedClients`) tracks which Redis clients have already had the functions loaded. `listen()` is currently a TODO stub.
24
+ - **JS layer** (`src/client.js`): The `Client` class accepts a `RedisOptions` object and constructs its own node-redis connection via `createClient` (plain object) or `createCluster` (object with `rootNodes`). On construction it connects, then uploads the Lua script to Redis via `FUNCTION LOAD REPLACE`. The connection is torn down in `close()` via `destroy()`.
25
25
  - **Lua layer** (`src/queasy.lua`): All queue state mutations are atomic Redis functions registered under the `queasy` library. No queue logic should be duplicated in JS — the Lua functions are the single source of truth for state transitions.
26
26
 
27
27
  ### Redis data structures
package/Readme.md CHANGED
@@ -2,18 +2,18 @@
2
2
 
3
3
  A Redis-backed job queue for Node.js, featuring (in comparison with design inspiration BullMQ):
4
4
 
5
- - **Singleton jobs**: Guarantees that no more than one job with a given ID is be processed at a time, without trampolines or dropping jobs (“unsafe deduplication”).
6
- - **Fail handlers**: Guaranteed at-least-once handlers for failed or stalled jobs, which permits reliable periodic jobs without a external scheduling or “reviver” systems.
5
+ - **Singleton jobs**: Guarantees that no more than one job with a given ID is being processed at any time, without trampolines or dropping jobs (“unsafe deduplication”).
6
+ - **Fail handlers**: Guaranteed at-least-once handlers for failed or stalled jobs, enabling reliable periodic jobs without external scheduling or “reviver” systems.
7
7
  - **Instant config changes**: Most configuration changes take effect immediately no matter the queue length, as they apply at dequeue time.
8
8
  - **Worker threads**: Jobs are processed in worker threads, preventing main process stalling and failing health checks due to CPU-bound jobs
9
9
 - **Capacity model**: Worker capacity flexibly shared between heterogeneous queues based on priority and demand, rather than queue-specific “concurrency”.
10
- - **Job timeout**: Enforced by draining and terminating worker threads with timed out jobs
11
- - **Zombie protection**: Clients that have lost locks detect this and exit at next heartbeat
10
+ - **Job timeout**: Timed out jobs are killed by draining and terminating the worker thread they run on
11
+ - **Zombie protection**: Clients that lost their locks while stalled detect this upon recovery and terminate themselves immediately
12
12
  - **Fine-grained updates**: Control over individual attributes when one job updates another with the same ID
13
13
 
14
14
  ### Terminology
15
15
 
16
- A _client_ is an instance of Quesy that connects to a Redis database. A _job_ is the basic unit of work that is _dispatched_ into a _queue_.
16
+ A _client_ is an instance of Queasy. It manages its own Redis connection. A _job_ is the basic unit of work that is _dispatched_ into a _queue_.
17
17
 
18
18
  A _handler_ is JavaScript code that performs work. There are two kinds of handlers: _task handlers_, which process jobs, and _fail handlers_, which are invoked when a job fails permanently. Handlers run on _workers_, which are Node.js worker threads. By default, a Queasy client automatically creates one worker per CPU.
19
19
 
@@ -22,8 +22,9 @@ A _handler_ is JavaScript code that performs work. There are two kinds of handle
22
22
  - `id`: string; generated if unspecified. See _update semantics_ below for more information.
23
23
  - `data`: a JSON-serializable value passed to handlers
24
24
  - `runAt`: number; a unix timestamp, to delay job execution until at least that time
25
- - `stallCount`: number; how many times has this job caused the client or worker to stall?
26
- - `retryCount`: number; how many times has this job caused the handler to throw an error?
25
+ - `retryCount`: number; how many times has this job been retried for any reason?
26
+ - `stallCount`: number; how many times did the client processing this job stop sending heartbeats?
27
+ - `timeoutCount`: number; how many times did this job fail to complete in the allocated time?
27
28
 
28
29
  ### Job lifecycle
29
30
 
@@ -42,23 +43,29 @@ Queues are dequeued based on their priority and the ratio of available capacity
42
43
 
43
44
 When a worker starts processing a job, a timer is started; if the job completes or throws, the timer is cleared. If the timeout occurs, the job is marked stalled and the worker is removed from the pool so it no longer receives new jobs. A new worker is also created and added to the pool to replace it.
44
45
 
45
- The unhealthy worker (with stalled jobs) continues to run until it has *only* stalled jobs remaining. When this happens, the worker is terminated, and all its stalled jobs are retried.
46
+ The unhealthy worker (with at least one stalled job) continues to run until it has *only* stalled jobs remaining. When this happens, the worker is terminated, and all its stalled jobs are retried.
46
47
 
47
48
  ### Stall handling
48
49
 
49
50
  The client (in the main thread) sends periodic heartbeats to Redis for each queue it’s processing. If heartbeats from a client stop, a Lua script in Redis removes this client and returns all its active jobs into the waiting state with their stall count property incremented.
50
51
 
51
- When a job is dequeued, if its stall count exceeds the configured maximum, it is immediately considered permanently failed and its handler is not invoked.
52
+ When a job is dequeued, if its stall count exceeds the configured maximum, it is immediately considered permanently failed; its task handler is not invoked.
52
53
 
53
54
  The response of the heartbeat Lua function indicates whether the client had been removed due to an earlier stall; if it receives this response, the client terminates all its worker threads immediately and re-initializes the pool and queues.
54
55
 
55
56
  ## API
56
57
 
57
- ### `client(redisConnection, workerCount)`
58
- Returns a Queasy client.
59
- - `redisConnection`: a node-redis connection object.
58
+ ### `new Client(options, workerCount)`
59
+ Returns a Queasy client. Queasy creates and manages its own Redis connection internally.
60
+ - `options`: connection options. Two forms are accepted:
61
+ - **Single node** (plain object): passed to node-redis `createClient`. Accepts `url`, `socket`, `username`, `password`, and `database`. Defaults to `{}` (connects to `localhost:6379`).
62
+ - **Cluster** (object with `rootNodes`): passed to node-redis `createCluster`. Accepts:
63
+ - `rootNodes`: array of per-node connection options (same fields as single-node form); at least three nodes are recommended.
64
+ - `defaults`: options shared across all nodes (e.g. auth and TLS).
65
+ - `nodeAddressMap`: address translation map for NAT environments.
60
66
  - `workerCount`: number; Size of the worker pool. If 0, or if called in a queasy worker thread, no pool is created. Defaults to the number of CPUs.
61
67
 
68
+ The client object returned is an EventEmitter, which emits a 'disconnect' event when it fails permanently for any reason, such as a library version mismatch between different workers connected to the same Redis instance, or a lost-locks situation. When this happens, in general the application should exit the worker process and allow the supervisor to restart it.
62
69
 
63
70
  ### `client.queue(name)`
64
71
 
@@ -74,8 +81,7 @@ Adds a job to the queue. `data` may be any JSON value, which will be passed unch
74
81
  The following options take effect if an `id` is provided, and it matches that of a job already in the queue.
75
82
  - `updateData`: boolean; whether to replace the data of any waiting job with the same ID; default: true
76
83
  - `updateRunAt`: boolean | 'ifLater' | 'ifEarlier'; default: true
77
- - `updateRetryStrategy`: boolean; whether to replace `maxRetries`, `maxStalls`, `minBackoff` and `maxBackoff`
78
- - `resetCounts`: boolean; Whether to reset the internal failure and stall counts to 0; default: same as updateData
84
+ - `resetCounts`: boolean; Whether to reset the retry, timeout and stall counts to 0; default: same as updateData
79
85
 
80
86
  Returns a promise that resolves to the job ID when the job has been added to Redis.
81
87
 
@@ -92,10 +98,12 @@ Attaches handlers to a queue to process jobs that are added to it.
92
98
  The following options control retry behavior:
93
99
  - `maxRetries`: number; default: 10
94
100
  - `maxStalls`: number; default: 3
101
+ - `maxTimeouts`: number; default: 3
95
102
  - `minBackoff`: number; in milliseconds; default: 2,000
96
103
  - `maxBackoff`: number; default: 300,000
97
104
  - `size`: number; default: 10
98
105
  - `timeout`: number; in milliseconds; default: 60,000
106
+ - `priority`: number; higher values are given preference; default: 100
99
107
 
100
108
  Additional options affect failure handling:
101
109
  - `failHandler`: The path to a JavaScript module that exports the handler for failure jobs
@@ -107,13 +115,13 @@ Every handler module must have a named export `handle`, a function that is calle
107
115
 
108
116
  ### Task handlers
109
117
 
110
- It receives two arguments:
118
+ They receive two arguments:
111
119
  - `data`, the JSON value passed to dispatch
112
- - `job`, a Job object contains the job attributes except data
120
+ - `job`, a Job object containing other job attributes (excluding data)
113
121
 
114
- This function may throw (or return a Promise that rejects) to indicate job failure. If the thrown error is an
115
- instance of `PermanentError`, or if `maxRetries` has been reached, the job is not retried. Otherwise, the job
116
- is queued to be retried with `maxRetries` incremented.
122
+ This function may throw (or return a Promise that rejects) to indicate job failure. If the thrown error contains
123
+ a property `kind` with the value `permanent`, or if `maxRetries` has been reached, the job is not retried.
124
+ Otherwise, the job is queued to be retried with `retryCount` incremented.
117
125
 
118
126
  If the thrown error has a property `retryAt`, the job’s `runAt` is set to this value; otherwise, it’s set using
119
127
  the exponential backoff algorithm.
@@ -123,8 +131,10 @@ If it returns any value apart from a Promise that rejects, the job is considered
123
131
  ### Failure handlers
124
132
 
125
133
  This function receives three arguments:
126
- - `data`, the JSON value passed to dispatch
127
- - `job`
128
- - `error`, a JSON object with a copy of the enumerable properties of the error thrown by the final call to handle, or an instance of `StallError` if the final call to handle didn’t return or throw.
134
+ - `data`, a tuple (array) containing three items:
135
+ - `originalData`
136
+ - `originalJob`
137
+ - `error`, a JSON object with the name, message and kind properties of the error thrown by the final call to handle. Kind might be `permanent`, `retriable` or `stall`. In case of stall, the name property is either `StallError` or `TimeoutError`.
138
+ - `job`, details of the failure handling job
129
139
 
130
140
  If this function throws an error (or returns a Promise that rejects), it is retried using exponential backoff.
@@ -7,8 +7,6 @@ services:
7
7
  ports:
8
8
  - '6379:6379'
9
9
  command: redis-server --save 60 1 --loglevel warning
10
- volumes:
11
- - redis-data:/data
12
10
  healthcheck:
13
11
  test: ['CMD', 'redis-cli', 'ping']
14
12
  interval: 5s
@@ -0,0 +1,185 @@
1
+ # Queasy Fuzz Test Plan
2
+
3
+ A long-running end-to-end fuzz test that simulates random failures and continuously verifies core system invariants.
4
+
5
+ ## Invariants Verified
6
+
7
+ 1. **Mutual exclusion**: Two jobs with the same Job ID are never processed by different clients or worker threads simultaneously.
8
+ 2. **No re-processing of successful jobs**: A job that has succeeded is never processed again.
9
+ 3. **Scheduling**: No job is processed before its `run_at` time.
10
+ 4. **Priority ordering within a queue**: No job starts processing while another job in the same queue with a lower `run_at` is still waiting (i.e., eligible jobs are dequeued in order).
11
+ 5. **Fail handler completeness**: If a fail handler is registered, every job that does not eventually succeed MUST result in the fail handler being invoked.
12
+ 6. **Queue progress (priority starvation prevention)**: Non-empty queues at the highest priority level always make progress. When they drain, queues at the next priority level begin making progress.
13
+
14
+ ## Structure Overview
15
+
16
+ ```
17
+ fuzztest/
18
+ Readme.md # This file
19
+ fuzz.js # Orchestrator: spawns child processes, monitors shared state
20
+ process.js # Child process: sets up clients and listens on all queues
21
+ handlers/
22
+ periodic.js # Re-queues itself; dispatches cascade jobs; occasionally stalls/crashes
23
+ cascade-a.js # Dispatched by periodic; dispatches into cascade-b
24
+ cascade-b.js # Dispatched by cascade-a; final handler
25
+ fail-handler.js # Shared fail handler for all queues; records invocations
26
+ shared/
27
+ chaos.js # Shared chaos-behavior helpers (random failure injection)
28
+ stream.js # Helpers for writing events to the fuzz:events Redis stream
29
+ ```
30
+
31
+ ## Process Architecture
32
+
33
+ The orchestrator (`fuzz.js`) spawns **N child processes** (default: 4). Each child process creates one Redis client and calls `listen()` on every queue. The orchestrator itself does not process jobs — it only monitors invariants and manages the lifecycle.
34
+
35
+ Handlers write events (job start, finish, fail, stall) directly to a Redis stream (`fuzz:events`). The orchestrator reads from this stream and maintains a shared in-memory log of events, checking invariants after each one. Child processes do not need to forward events to the orchestrator themselves — the stream is the shared channel.
36
+
37
+ Child processes are deliberately killed and restarted periodically to simulate crashes. A killed process's checked-out jobs will be swept and retried/failed by the remaining processes.
38
+
39
+ ## Queue Configuration
40
+
41
+ Three queues at different priority levels, all listened on by every child process. Parameters are kept small to produce many events quickly:
42
+
43
+ | Parameter | `{fuzz}:periodic` | `{fuzz}:cascade-a` | `{fuzz}:cascade-b` |
44
+ |---|---|---|---|
45
+ | Handler | `periodic.js` | `cascade-a.js` | `cascade-b.js` |
46
+ | Priority | 300 | 200 | 100 |
47
+ | `maxRetries` | 3 | 3 | 3 |
48
+ | `maxStalls` | 2 | 2 | 2 |
49
+ | `minBackoff` | 200 ms | 200 ms | 200 ms |
50
+ | `maxBackoff` | 2 000 ms | 2 000 ms | 2 000 ms |
51
+ | `timeout` | 3 000 ms | 3 000 ms | 3 000 ms |
52
+ | `size` | 10 | 10 | 10 |
53
+ | `failHandler` | `fail-handler.js` | `fail-handler.js` | `fail-handler.js` |
54
+ | `failRetryOptions.maxRetries` | 5 | 5 | 5 |
55
+ | `failRetryOptions.minBackoff` | 200 ms | 200 ms | 200 ms |
56
+
57
+ The short `timeout` (3 s) means stalling jobs are detected and swept quickly. The short `minBackoff` / `maxBackoff` window (200 ms – 2 s) means retries cycle fast. With `maxRetries: 3` and `maxStalls: 2`, most failed jobs reach the fail handler within seconds.
58
+
59
+ ## Periodic Jobs (Seed)
60
+
61
+ A fixed set of periodic job IDs (e.g., `periodic-0` through `periodic-4`) are dispatched by the orchestrator at startup. Each periodic handler:
62
+
63
+ 1. Records the current processing event by writing `{ type: 'start', queue, id, threadId, clientId, startedAt }` to the `fuzz:events` Redis stream.
64
+ 2. Optionally sleeps for a random short delay.
65
+ 3. Dispatches a cascade-a job with a unique ID and a `runAt` randomly up to 2 seconds in the future.
66
+ 4. Re-dispatches itself (same job ID, `updateRunAt: true`) with a delay of 1–5 seconds, so the job continues to fire periodically.
67
+ 5. On success, writes `{ type: 'finish', queue, id, threadId, clientId, finishedAt }` to the `fuzz:events` stream.
68
+
69
+ The fail handler for periodic jobs also re-dispatches the same periodic job ID (with a delay), ensuring periodic jobs survive permanent failures. This lets the orchestrator assert that periodic jobs keep running indefinitely.
70
+
71
+ ## Cascade Jobs
72
+
73
+ `cascade-a.js`:
74
+ - Records start/finish events.
75
+ - Dispatches one or two `cascade-b` jobs with unique IDs.
76
+ - Subject to all chaos behaviors (see below).
77
+
78
+ `cascade-b.js`:
79
+ - Records start/finish events.
80
+ - Terminal handler; does not dispatch further jobs.
81
+ - Subject to all chaos behaviors (see below).
82
+
83
+ ## Chaos Behaviors
84
+
85
+ All handlers are subject to all chaos behaviors. The probabilities below are per-invocation and apply uniformly across `periodic.js`, `cascade-a.js`, and `cascade-b.js`:
86
+
87
+ | Behavior | Probability | Notes |
88
+ |---|---|---|
89
+ | Normal completion | ~65% | Dispatches downstream jobs (if any), then returns |
90
+ | Retriable error (throws `Error`) | ~15% | No downstream dispatch |
91
+ | Permanent error (throws `PermanentError`) | ~5% | No downstream dispatch |
92
+ | Stall (returns a never-resolving promise) | ~10% | Detected after `timeout` (3 s); counts as a stall |
93
+ | CPU spin (blocks the worker thread) | ~3% | Tight loop until the process detects the hang and kills the thread (via `timeout`) |
94
+ | Crash (causes the child process to exit) | ~2% | Handler posts a crash message to the main thread, which calls `process.exit()` |
95
+
96
+ With `timeout: 3000`, stalling and spinning jobs are swept within ~3–13 seconds (timeout + heartbeat sweep interval). With `maxStalls: 2`, two stalls exhaust the stall budget and the job is sent to the fail handler, cycling fast.
97
+
98
+ When a child process crashes, the orchestrator detects the exit event and restarts a new child process after a short delay.
99
+
100
+ ## Event Logging and Invariant Checking
101
+
102
+ The orchestrator maintains an append-only in-memory event log. Each entry contains:
103
+ ```js
104
+ { type, queue, id, threadId, clientId, timestamp }
105
+ ```
106
+ where `type` is one of: `start`, `finish`, `fail`, `stall`, `cancel`.
107
+
108
+ After each event is appended, the orchestrator runs incremental invariant checks:
109
+
110
+ ### Invariant 1: Mutual Exclusion
111
+ Maintain a `Map<jobId, { clientId, threadId, startedAt }>` of currently-active jobs. On `start`, check that the job ID is not already in the map. On `finish`/`fail`/`stall`, remove it.
112
+
113
+ If a `start` event arrives for a job ID already in the map → **VIOLATION**.
114
+
115
+ ### Invariant 2: No Re-processing of Succeeded Jobs
116
+ Maintain a `Set<jobId>` of successfully finished job IDs. On `start`, check that the ID is not in this set.
117
+
118
+ If a `start` event arrives for a job ID in the succeeded set → **VIOLATION**.
119
+
120
+ Note: Re-processing after a stall or retry is expected and must not be flagged.
121
+
122
+ ### Invariant 3: Scheduling (No Early Processing)
123
+ Each `start` event includes `startedAt` (wall clock). Each job dispatch records an intended `runAt`. On `start`, verify `startedAt >= runAt - CLOCK_TOLERANCE_MS`.
124
+
125
+ If `startedAt < runAt - CLOCK_TOLERANCE_MS` → **VIOLATION**.
126
+
127
+ `CLOCK_TOLERANCE_MS` accounts for clock skew between the orchestrator, child processes, and Redis (default: 100ms).
128
+
129
+ ### Invariant 4: Priority Ordering
130
+ Track the earliest-known `runAt` for jobs dispatched into each queue but not yet started. When a `start` event arrives for a job in that queue, verify no other eligible job (with `runAt <= now`) in the same queue has a lower `runAt` that has been waiting longer.
131
+
132
+ This invariant is best-effort and checked with a configurable lag (e.g., 200ms) to account for the inherent race between dequeue polling and dispatch. A violation is only flagged when the ordering difference exceeds this lag.
133
+
134
+ ### Invariant 5: Fail Handler Completeness
135
+ Track every job that has been dispatched (by ID). When a job exceeds its `maxRetries` or receives a permanent error, a fail event should be observed. Maintain a map `{ jobId → { exhausted: bool, failSeen: bool } }`. After a configurable drain period (e.g., 30 seconds after a queue goes quiet), check that every exhausted job has a corresponding `fail` event.
136
+
137
+ ### Invariant 6: Queue Progress
138
+ The orchestrator monitors the time since the last `start` event per queue. If a queue is known to be non-empty (based on dispatched vs finished counts) and no `start` event has been seen for more than a configurable `STALL_THRESHOLD_MS` (e.g., 30 seconds), flag a progress violation.
139
+
140
+ Priority starvation is checked by verifying that the low-priority queue does not process jobs while the high-priority queue has outstanding jobs older than the dequeue poll interval.
141
+
142
+ ## Output and Reporting
143
+
144
+ Violations are logged to stdout and to `fuzz-output.log` with full context. The process does **not** exit on a violation — it logs and continues, accumulating a count of violations. A summary is printed periodically (every 60 seconds) and on `SIGINT`.
145
+
146
+ Log format (newline-delimited JSON):
147
+ ```json
148
+ { "time": "...", "level": "info|warn|error", "msg": "...", "data": { ... } }
149
+ ```
150
+
151
+ Violation entries use level `"error"` and include the invariant name, the offending event, and relevant recent history.
152
+
153
+ ## Configuration
154
+
155
+ All tunable parameters live at the top of `fuzz.js` as named constants:
156
+
157
+ ```js
158
+ const NUM_PROCESSES = 4; // Child processes
159
+ const NUM_PERIODIC_JOBS = 5; // Fixed periodic job IDs
160
+ const PERIODIC_MIN_DELAY = 1000; // ms before re-queuing self
161
+ const PERIODIC_MAX_DELAY = 5000;
162
+ const CRASH_INTERVAL_MS = 30000; // Orchestrator kills a random child process this often
163
+ const CLOCK_TOLERANCE_MS = 100;
164
+ const STALL_THRESHOLD_MS = 30000;
165
+ const PRIORITY_LAG_MS = 200;
166
+ const LOG_FILE = 'fuzz-output.log';
167
+ ```
168
+
169
+ ## Running
170
+
171
+ The fuzz test is separate from the default test suite and is never run by `npm test`. It is started manually:
172
+
173
+ ```sh
174
+ node fuzztest/fuzz.js
175
+ ```
176
+
177
+ It runs indefinitely. Stop with `Ctrl+C`. A summary of violations and events processed will be printed on exit.
178
+
179
+ ## Notes on Implementation
180
+
181
+ - Child processes use the `queasy` library's public API (`queue()`, `dispatch()`, `listen()`). They do not talk directly to Redis.
182
+ - The orchestrator does not import from `src/`; it only spawns child processes and learns about child-process lifecycle from the `spawn` and `exit` events.
183
+ - All handler modules in `fuzztest/handlers/` must be self-contained ESM modules that can be passed as `handlerPath` to `queue.listen()`.
184
+ - Handlers write events to the `fuzz:events` Redis stream using a dedicated Redis client created at handler module load time. The orchestrator reads from this stream via `XREAD BLOCK`. This is the only communication channel between handlers and the orchestrator — no IPC is used.
185
+ - The chaos crash behavior must be triggered from the child process's main thread, not from inside a handler's worker thread. To simulate a crash, the handler uses `postMessage` to send a `{ type: 'crash' }` message to the main thread, which listens for it and calls `process.exit()`.