groupmq-plus 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/LICENSE +59 -0
  2. package/README.md +722 -0
  3. package/dist/index.cjs +2567 -0
  4. package/dist/index.cjs.map +1 -0
  5. package/dist/index.d.cts +1300 -0
  6. package/dist/index.d.ts +1300 -0
  7. package/dist/index.js +2557 -0
  8. package/dist/index.js.map +1 -0
  9. package/dist/lua/change-delay.lua +62 -0
  10. package/dist/lua/check-stalled.lua +86 -0
  11. package/dist/lua/clean-status.lua +64 -0
  12. package/dist/lua/cleanup-poisoned-group.lua +46 -0
  13. package/dist/lua/cleanup.lua +46 -0
  14. package/dist/lua/complete-and-reserve-next-with-metadata.lua +221 -0
  15. package/dist/lua/complete-with-metadata.lua +190 -0
  16. package/dist/lua/complete.lua +51 -0
  17. package/dist/lua/dead-letter.lua +86 -0
  18. package/dist/lua/enqueue-batch.lua +149 -0
  19. package/dist/lua/enqueue-flow.lua +107 -0
  20. package/dist/lua/enqueue.lua +154 -0
  21. package/dist/lua/get-active-count.lua +6 -0
  22. package/dist/lua/get-active-jobs.lua +6 -0
  23. package/dist/lua/get-delayed-count.lua +5 -0
  24. package/dist/lua/get-delayed-jobs.lua +5 -0
  25. package/dist/lua/get-unique-groups-count.lua +13 -0
  26. package/dist/lua/get-unique-groups.lua +15 -0
  27. package/dist/lua/get-waiting-count.lua +11 -0
  28. package/dist/lua/get-waiting-jobs.lua +15 -0
  29. package/dist/lua/heartbeat.lua +22 -0
  30. package/dist/lua/is-empty.lua +35 -0
  31. package/dist/lua/promote-delayed-jobs.lua +40 -0
  32. package/dist/lua/promote-delayed-one.lua +44 -0
  33. package/dist/lua/promote-staged.lua +70 -0
  34. package/dist/lua/record-job-result.lua +143 -0
  35. package/dist/lua/remove.lua +55 -0
  36. package/dist/lua/reserve-atomic.lua +114 -0
  37. package/dist/lua/reserve-batch.lua +141 -0
  38. package/dist/lua/reserve.lua +161 -0
  39. package/dist/lua/retry.lua +53 -0
  40. package/package.json +92 -0
-- Check for stalled jobs and move them back to waiting or fail them.
-- KEYS[1] = namespace
-- ARGV:   1 currentTime (ms), 2 gracePeriod (ms, default 0), 3 maxStalledCount (default 1)
-- Returns: flat array of [jobId, groupId, action] triples for each stalled job handled
-- action: "recovered" or "failed"

local ns = KEYS[1]
local now = tonumber(ARGV[1])
local gracePeriod = tonumber(ARGV[2]) or 0
local maxStalledCount = tonumber(ARGV[3]) or 1

-- Circuit breaker for high concurrency: at most one stalled-check sweep per
-- 2s window across all workers; the key auto-expires after 3s so a crashed
-- checker cannot block sweeps forever.
local circuitBreakerKey = ns .. ":stalled:circuit"
local lastCheck = redis.call("GET", circuitBreakerKey)
if lastCheck then
  local lastCheckTime = tonumber(lastCheck)
  local circuitBreakerInterval = 2000
  if lastCheckTime and (now - lastCheckTime) < circuitBreakerInterval then
    return {}
  end
end
redis.call("SET", circuitBreakerKey, now, "PX", 3000)

local processingKey = ns .. ":processing"
local groupsKey = ns .. ":groups"

-- Candidates: jobs whose processing deadline (zset score) is at least
-- gracePeriod in the past; capped at 100 per sweep to bound script runtime.
local candidates = redis.call("ZRANGEBYSCORE", processingKey, 0, now - gracePeriod, "LIMIT", 0, 100)
if not candidates or #candidates == 0 then
  return {}
end

local results = {}

for _, jobId in ipairs(candidates) do
  local jobKey = ns .. ":job:" .. jobId
  local h = redis.call("HMGET", jobKey, "groupId","stalledCount","maxAttempts","attempts","status","finishedOn","score")
  local groupId = h[1]
  -- Skip entries whose job hash is gone (groupId is false for missing hashes).
  if groupId then
    local stalledCount = tonumber(h[2]) or 0
    local maxAttempts = tonumber(h[3]) or 3
    local status = h[5]
    -- NOTE(review): finishedOn is read but never used below — candidate for removal.
    local finishedOn = tonumber(h[6] or "0")
    -- CRITICAL: Don't recover jobs that are completing (prevents race with completion).
    -- "completing" is a temporary state set by complete-with-metadata.lua to prevent races.
    if status == "processing" then
      stalledCount = stalledCount + 1
      redis.call("HSET", jobKey, "stalledCount", stalledCount)
      -- BullMQ-style: remove from the per-group active list.
      local groupActiveKey = ns .. ":g:" .. groupId .. ":active"
      redis.call("LREM", groupActiveKey, 1, jobId)

      -- maxStalledCount <= 0 disables hard-failing: jobs are recovered forever.
      if stalledCount >= maxStalledCount and maxStalledCount > 0 then
        -- Stalled too many times: fail permanently and record the reason.
        redis.call("ZREM", processingKey, jobId)
        local groupKey = ns .. ":g:" .. groupId
        redis.call("ZREM", groupKey, jobId)
        redis.call("DEL", ns .. ":processing:" .. jobId)
        redis.call("HSET", jobKey, "status","failed","finishedOn", now,
          "failedReason", "Job stalled " .. stalledCount .. " times (max: " .. maxStalledCount .. ")")
        redis.call("ZADD", ns .. ":failed", now, jobId)
        table.insert(results, jobId); table.insert(results, groupId); table.insert(results, "failed")
      else
        -- Recover: move back to the group's waiting zset at its original score.
        local stillInProcessing = redis.call("ZSCORE", processingKey, jobId)
        if stillInProcessing then
          redis.call("ZREM", processingKey, jobId)
          redis.call("DEL", ns .. ":processing:" .. jobId)
          local score = tonumber(h[7])
          if score then
            local groupKey2 = ns .. ":g:" .. groupId
            redis.call("ZADD", groupKey2, score, jobId)
            -- Re-expose the group's head in the ready queue.
            local head = redis.call("ZRANGE", groupKey2, 0, 0, "WITHSCORES")
            if head and #head >= 2 then
              local headScore = tonumber(head[2])
              redis.call("ZADD", ns .. ":ready", headScore, groupId)
            end
            redis.call("SADD", groupsKey, groupId)
          end
          -- NOTE(review): when the stored score is missing, the job is marked
          -- "waiting" here but was NOT re-added to any group zset above, so it
          -- becomes unreachable — confirm whether that can happen in practice.
          redis.call("HSET", jobKey, "status", "waiting")
          table.insert(results, jobId); table.insert(results, groupId); table.insert(results, "recovered")
        end
      end
    end
  end
end

return results
-- Clean terminal/delayed jobs older than a cutoff timestamp.
-- KEYS[1] = namespace
-- ARGV:   1 status ("completed" | "failed" | "delayed"), 2 graceAtMs, 3 limit
-- Returns the number of job records removed (0 for unsupported statuses).
local namespace = KEYS[1]
local requestedStatus = ARGV[1]
local cutoff = tonumber(ARGV[2]) or 0
local maxToClean = tonumber(ARGV[3]) or 1000

-- Only these statuses have a backing sorted set that can be cleaned.
local statusSets = {
  completed = namespace .. ':completed',
  failed = namespace .. ':failed',
  delayed = namespace .. ':delayed',
}
local sourceSet = statusSets[requestedStatus]
if not sourceSet then
  return 0
end

-- Fetch up to 'limit' job ids whose score is <= the cutoff.
local candidates = redis.call('ZRANGEBYSCORE', sourceSet, '-inf', cutoff, 'LIMIT', 0, maxToClean)

local removedCount = 0
for _, jobId in ipairs(candidates) do
  local jobHashKey = namespace .. ':job:' .. jobId

  -- Drop from the source set first so the id cannot be picked up again.
  redis.call('ZREM', sourceSet, jobId)

  -- Keep the group zset and the ready queue consistent for ALL statuses;
  -- otherwise cleaning can leave a poisoned group that never drains.
  local groupId = redis.call('HGET', jobHashKey, 'groupId')
  if groupId then
    local groupZ = namespace .. ':g:' .. groupId
    local readyZ = namespace .. ':ready'
    redis.call('ZREM', groupZ, jobId)
    if redis.call('ZCARD', groupZ) == 0 then
      -- Group drained: remove all of its bookkeeping.
      redis.call('ZREM', readyZ, groupId)
      redis.call('DEL', groupZ)
      redis.call('SREM', namespace .. ':groups', groupId)
    elseif requestedStatus == 'delayed' then
      -- Only delayed jobs can change the group head's ready-queue score;
      -- completed/failed jobs are no longer part of ordering.
      local head = redis.call('ZRANGE', groupZ, 0, 0, 'WITHSCORES')
      if head and #head >= 2 then
        redis.call('ZADD', readyZ, tonumber(head[2]), groupId)
      end
    end
  end

  -- Remove the job hash plus its idempotence key and flow results.
  redis.call('DEL', jobHashKey)
  redis.call('DEL', namespace .. ':unique:' .. jobId)
  redis.call('DEL', namespace .. ':flow:results:' .. jobId)

  removedCount = removedCount + 1
end

return removedCount
-- Decide whether a group can still make progress.
-- KEYS[1] = namespace
-- ARGV:   1 groupId, 2 nowEpochMs (currently unused; kept for interface stability)
-- Returns one of: "empty", "locked", "poisoned", "ok".
local namespace = KEYS[1]
local groupId = ARGV[1]
local now = tonumber(ARGV[2])

local readyZ = namespace .. ":ready"
local groupZ = namespace .. ":g:" .. groupId
local groupLockKey = namespace .. ":lock:" .. groupId

-- A group with no queued jobs must not linger in the ready queue.
if redis.call("ZCARD", groupZ) == 0 then
  redis.call("ZREM", readyZ, groupId)
  return "empty"
end

-- A lock with remaining TTL means another worker owns the group right now.
if redis.call("GET", groupLockKey) then
  if redis.call("PTTL", groupLockKey) > 0 then
    return "locked"
  end
end

-- Count jobs that still have retry budget; a group where every job has
-- exhausted maxAttempts is "poisoned" and is pulled out of the ready queue.
local memberIds = redis.call("ZRANGE", groupZ, 0, -1)
local runnable = 0
for i = 1, #memberIds do
  local jobHashKey = namespace .. ":job:" .. memberIds[i]
  local attempts = tonumber(redis.call("HGET", jobHashKey, "attempts"))
  local maxAttempts = tonumber(redis.call("HGET", jobHashKey, "maxAttempts"))
  if attempts and maxAttempts and attempts < maxAttempts then
    runnable = runnable + 1
  end
end

if runnable == 0 then
  redis.call("ZREM", readyZ, groupId)
  return "poisoned"
end

return "ok"
-- Requeue jobs whose processing deadline has expired.
-- KEYS[1] = namespace
-- ARGV:   1 nowEpochMs
-- Returns how many expired jobs were moved back to their group.
local namespace = KEYS[1]
local now = tonumber(ARGV[1])

local readyZ = namespace .. ":ready"
local processingZ = namespace .. ":processing"
local recovered = 0

for _, jobId in ipairs(redis.call("ZRANGEBYSCORE", processingZ, 0, now)) do
  -- CRITICAL: verify the job is STILL in processing before requeueing;
  -- a job completed between the snapshot and now must not be re-added.
  if redis.call("ZSCORE", processingZ, jobId) then
    local procHashKey = namespace .. ":processing:" .. jobId
    local fields = redis.call("HMGET", procHashKey, "groupId", "deadlineAt")
    local groupId = fields[1]
    local deadlineAt = tonumber(fields[2])
    if groupId and deadlineAt and now > deadlineAt then
      local storedScore = redis.call("HGET", namespace .. ":job:" .. jobId, "score")
      if storedScore then
        -- Put the job back in its group at the original score and expose
        -- the group's head in the ready queue again.
        local groupZ = namespace .. ":g:" .. groupId
        redis.call("ZADD", groupZ, tonumber(storedScore), jobId)
        local head = redis.call("ZRANGE", groupZ, 0, 0, "WITHSCORES")
        if head and #head >= 2 then
          redis.call("ZADD", readyZ, tonumber(head[2]), groupId)
        end
        -- Release the group lock and clear per-job processing state.
        redis.call("DEL", namespace .. ":lock:" .. groupId)
        redis.call("DEL", procHashKey)
        redis.call("ZREM", processingZ, jobId)

        -- Counts are derived via ZCARD elsewhere; no counters to maintain.
        recovered = recovered + 1
      end
    end
  end
  -- If no longer in processing, the job completed meanwhile — leave it alone.
end

return recovered
-- Complete a job with metadata and atomically reserve the next job from the same group.
--
-- KEYS[1] = namespace
-- ARGV: 1 completedJobId, 2 groupId, 3 status ("completed" | "failed"),
--       4 timestamp (ms; score in the completed/failed set), 5 resultOrError (JSON string),
--       6 keepCompleted (entries to retain; 0 = delete the job hash immediately),
--       7 keepFailed (same for failures), 8 processedOn, 9 finishedOn,
--       10 attempts, 11 maxAttempts, 12 now (ms), 13 vt (visibility timeout, ms)
--
-- Returns nil when the completion was late/duplicate or no next job could be
-- chained; otherwise a "|||"-delimited record for the newly reserved job:
--   id|||groupId|||data|||attempts|||maxAttempts|||seq|||timestamp|||orderMs|||score|||deadline
local ns = KEYS[1]
local completedJobId = ARGV[1]
local gid = ARGV[2]
local status = ARGV[3]
local timestamp = tonumber(ARGV[4])
local resultOrError = ARGV[5]
local keepCompleted = tonumber(ARGV[6])
local keepFailed = tonumber(ARGV[7])
local processedOn = ARGV[8]
local finishedOn = ARGV[9]
local attempts = ARGV[10]
local maxAttempts = ARGV[11]
local now = tonumber(ARGV[12])
local vt = tonumber(ARGV[13])

local jobKey = ns .. ":job:" .. completedJobId

-- Capture parentId before the job hash may be deleted below (flow support).
local parentId = redis.call("HGET", jobKey, "parentId")

-- Part 1: atomically verify and mark completion (prevent duplicate processing).
local processingKey = ns .. ":processing"

-- CRITICAL: check both the job status AND processing-set membership; this
-- closes the race with stalled-job recovery, which flips status away from
-- "processing" before touching the processing zset.
local jobStatus = redis.call("HGET", jobKey, "status")
local stillInProcessing = redis.call("ZSCORE", processingKey, completedJobId)
if jobStatus ~= "processing" or not stillInProcessing then
  -- Late or duplicate completion; the job was already handled elsewhere.
  return nil
end

-- Temporary "completing" status blocks the stalled checker while we finish.
redis.call("HSET", jobKey, "status", "completing")
redis.call("DEL", ns .. ":processing:" .. completedJobId)
redis.call("ZREM", processingKey, completedJobId)

-- Part 2: record job metadata (completed or failed).
if status == "completed" then
  local completedKey = ns .. ":completed"

  -- Always set the final status first, even if the hash is deleted below,
  -- so concurrent readers never observe the transient "completing" state.
  redis.call("HSET", jobKey, "status", "completed")

  -- Flow support: report this child's result to its parent, and promote the
  -- parent to "waiting" once all children have finished.
  if parentId then
    local parentKey = ns .. ":job:" .. parentId
    -- Store the child result so the parent can read childrenValues.
    local flowResultsKey = ns .. ":flow:results:" .. parentId
    redis.call("HSET", flowResultsKey, completedJobId, resultOrError)

    local remaining = redis.call("HINCRBY", parentKey, "flowRemaining", -1)
    if remaining <= 0 then
      local parentStatus = redis.call("HGET", parentKey, "status")
      if parentStatus == "waiting-children" then
        redis.call("HSET", parentKey, "status", "waiting")
        local parentGroupId = redis.call("HGET", parentKey, "groupId")
        local parentScore = tonumber(redis.call("HGET", parentKey, "score"))
        if not parentScore then
          parentScore = tonumber(now)
        end

        local pGZ = ns .. ":g:" .. parentGroupId
        redis.call("ZADD", pGZ, parentScore, parentId)
        redis.call("SADD", ns .. ":groups", parentGroupId)

        -- Expose the parent group's head in the ready queue.
        local pHead = redis.call("ZRANGE", pGZ, 0, 0, "WITHSCORES")
        if pHead and #pHead >= 2 then
          redis.call("ZADD", ns .. ":ready", tonumber(pHead[2]), parentGroupId)
        end
      end
    end
  end

  if keepCompleted > 0 then
    -- Store full job metadata and add to the completed set.
    redis.call("HSET", jobKey,
      "processedOn", processedOn,
      "finishedOn", finishedOn,
      "attempts", attempts,
      "maxAttempts", maxAttempts,
      "returnvalue", resultOrError
    )
    redis.call("ZADD", completedKey, timestamp, completedJobId)

    -- Trim the completed set to keepCompleted entries, deleting trimmed jobs.
    local toRemove = redis.call("ZCARD", completedKey) - keepCompleted
    if toRemove > 0 then
      local oldIds = redis.call("ZRANGE", completedKey, 0, toRemove - 1)
      if #oldIds > 0 then
        redis.call("ZREMRANGEBYRANK", completedKey, 0, toRemove - 1)
        for i = 1, #oldIds do
          local oldId = oldIds[i]
          redis.call("DEL", ns .. ":job:" .. oldId)
          redis.call("DEL", ns .. ":unique:" .. oldId)
          redis.call("DEL", ns .. ":flow:results:" .. oldId)
        end
      end
    end
  else
    -- keepCompleted == 0: delete immediately (status already set above).
    redis.call("DEL", jobKey)
    redis.call("DEL", ns .. ":unique:" .. completedJobId)
    redis.call("DEL", ns .. ":flow:results:" .. completedJobId)
  end

elseif status == "failed" then
  local failedKey = ns .. ":failed"
  local errorInfo = cjson.decode(resultOrError)

  -- Always set the final status first, even if the hash is deleted below.
  redis.call("HSET", jobKey, "status", "failed")

  if keepFailed > 0 then
    redis.call("HSET", jobKey,
      "failedReason", errorInfo.message or "Error",
      "failedName", errorInfo.name or "Error",
      "stacktrace", errorInfo.stack or "",
      "processedOn", processedOn,
      "finishedOn", finishedOn,
      "attempts", attempts,
      "maxAttempts", maxAttempts
    )
    redis.call("ZADD", failedKey, timestamp, completedJobId)
  else
    -- Delete the job (status already set above).
    redis.call("DEL", jobKey)
    redis.call("DEL", ns .. ":unique:" .. completedJobId)
    redis.call("DEL", ns .. ":flow:results:" .. completedJobId)
  end
end

-- Part 3: pop this job from the group's active list and, if it was the head,
-- reserve the next job in the same group (BullMQ-style chaining).
local groupActiveKey = ns .. ":g:" .. gid .. ":active"
local activeJobId = redis.call("LINDEX", groupActiveKey, 0)

if activeJobId == completedJobId then
  -- Normal case: this job is at the head of the active list.
  redis.call("LPOP", groupActiveKey)
else
  -- Race: the job is not at the head (already removed, or a different job is
  -- active). Remove any stale entry, but do not chain a next job.
  redis.call("LREM", groupActiveKey, 1, completedJobId)
  return nil
end

local gZ = ns .. ":g:" .. gid
local zpop = redis.call("ZPOPMIN", gZ, 1)
if not zpop or #zpop == 0 then
  -- Group drained: clean up its bookkeeping.
  if redis.call("ZCARD", gZ) == 0 then
    redis.call("DEL", gZ)
    redis.call("SREM", ns .. ":groups", gid)
    redis.call("ZREM", ns .. ":ready", gid)
  end
  return nil
end

local nextJobId = zpop[1]
local nextJobKey = ns .. ":job:" .. nextJobId
local job = redis.call("HMGET", nextJobKey, "id","groupId","data","attempts","maxAttempts","seq","timestamp","orderMs","score")
-- Renamed from attempts/maxAttempts to avoid shadowing the ARGV locals above.
local id, groupId, payload, nextAttempts, nextMaxAttempts, seq, enq, orderMs, score =
  job[1], job[2], job[3], job[4], job[5], job[6], job[7], job[8], job[9]

-- Validate the job hash exists (handles corrupted/missing job data).
if not id or id == false then
  -- BUG FIX: re-add the group under `gid` (known from ARGV). The previous
  -- code used `groupId`, which was read from the MISSING hash and is false —
  -- ZADD rejects a boolean member and the whole script would error.
  local nextHead = redis.call("ZRANGE", gZ, 0, 0, "WITHSCORES")
  if nextHead and #nextHead >= 2 then
    redis.call("ZADD", ns .. ":ready", tonumber(nextHead[2]), gid)
  end
  -- No next job was reserved.
  return nil
end

-- Push the next job onto the active list (chaining).
redis.call("LPUSH", groupActiveKey, id)

-- Record per-job processing state and the visibility deadline.
local procKey = ns .. ":processing:" .. id
local deadline = now + vt
redis.call("HSET", procKey, "groupId", groupId, "deadlineAt", tostring(deadline))
redis.call("ZADD", processingKey, deadline, id)

-- Mark the next job as processing so stalled detection sees it.
redis.call("HSET", nextJobKey, "status", "processing")

-- Keep the ready queue pointing at the group's new head, if any.
local nextHead = redis.call("ZRANGE", gZ, 0, 0, "WITHSCORES")
if nextHead and #nextHead >= 2 then
  redis.call("ZADD", ns .. ":ready", tonumber(nextHead[2]), groupId)
end

return id .. "|||" .. groupId .. "|||" .. payload .. "|||" .. nextAttempts .. "|||" .. nextMaxAttempts .. "|||" .. seq .. "|||" .. enq .. "|||" .. orderMs .. "|||" .. score .. "|||" .. deadline
-- Complete a job: unlock its group AND record metadata atomically in one call.
--
-- KEYS[1] = namespace
-- ARGV: 1 jobId, 2 groupId, 3 status ("completed" | "failed"),
--       4 timestamp (ms; score in the completed/failed set), 5 resultOrError (JSON string),
--       6 keepCompleted (entries to retain; 0 = delete the job hash immediately),
--       7 keepFailed (same for failures), 8 processedOn, 9 finishedOn,
--       10 attempts, 11 maxAttempts
--
-- Returns 1 when the completion was recorded, 0 when it was a late/duplicate
-- completion that had to be ignored.
local ns = KEYS[1]
local jobId = ARGV[1]
local gid = ARGV[2]
local status = ARGV[3]
local timestamp = tonumber(ARGV[4])
local resultOrError = ARGV[5]
local keepCompleted = tonumber(ARGV[6])
local keepFailed = tonumber(ARGV[7])
local processedOn = ARGV[8]
local finishedOn = ARGV[9]
local attempts = ARGV[10]
local maxAttempts = ARGV[11]

local jobKey = ns .. ":job:" .. jobId

-- Capture parentId before the job hash may be deleted below (flow support).
local parentId = redis.call("HGET", jobKey, "parentId")

-- Part 1: atomically verify and mark completion (prevent duplicate processing).
local processingKey = ns .. ":processing"

-- CRITICAL: check both the job status AND processing-set membership; this
-- closes the race with stalled-job recovery, which flips status away from
-- "processing" before touching the processing zset.
local jobStatus = redis.call("HGET", jobKey, "status")
local stillInProcessing = redis.call("ZSCORE", processingKey, jobId)
if jobStatus ~= "processing" or not stillInProcessing then
  -- Already handled (recovered, failed, or completed by another worker).
  return 0
end

-- Temporary "completing" status blocks the stalled checker while we finish.
redis.call("HSET", jobKey, "status", "completing")
redis.call("DEL", ns .. ":processing:" .. jobId)
redis.call("ZREM", processingKey, jobId)

-- Always remove this job from the group's active list to prevent stale entries.
local groupActiveKey = ns .. ":g:" .. gid .. ":active"
local activeJobId = redis.call("LINDEX", groupActiveKey, 0)
if activeJobId == jobId then
  -- Normal case: remove from the head of the active list.
  redis.call("LPOP", groupActiveKey)
else
  -- Race: not at the head, but still remove any stale entry.
  redis.call("LREM", groupActiveKey, 1, jobId)
end

-- Check whether the group still has queued jobs.
local gZ = ns .. ":g:" .. gid
if redis.call("ZCARD", gZ) == 0 then
  -- Group drained: remove all of its bookkeeping.
  redis.call("DEL", gZ)
  redis.call("DEL", groupActiveKey)
  redis.call("SREM", ns .. ":groups", gid)
  redis.call("ZREM", ns .. ":ready", gid)
  redis.call("DEL", ns .. ":buffer:" .. gid)
  redis.call("ZREM", ns .. ":buffering", gid)
else
  -- Group has more jobs: re-expose its head in the ready queue unless the
  -- group is currently buffering.
  local groupBufferKey = ns .. ":buffer:" .. gid
  if redis.call("EXISTS", groupBufferKey) == 0 then
    local nextHead = redis.call("ZRANGE", gZ, 0, 0, "WITHSCORES")
    if nextHead and #nextHead >= 2 then
      redis.call("ZADD", ns .. ":ready", tonumber(nextHead[2]), gid)
    end
  end
end

-- Flow support: report this child's result to its parent, and promote the
-- parent to "waiting" once all children have finished.
if parentId then
  local parentKey = ns .. ":job:" .. parentId
  -- Store the child result under flow:results:{parentId}, field {childId},
  -- so the parent can read childrenValues.
  local flowResultsKey = ns .. ":flow:results:" .. parentId
  redis.call("HSET", flowResultsKey, jobId, resultOrError)

  local remaining = redis.call("HINCRBY", parentKey, "flowRemaining", -1)
  if remaining <= 0 then
    local parentStatus = redis.call("HGET", parentKey, "status")
    if parentStatus == "waiting-children" then
      redis.call("HSET", parentKey, "status", "waiting")

      -- Add the parent to its group and the ready queue.
      local parentGroupId = redis.call("HGET", parentKey, "groupId")
      local parentScore = tonumber(redis.call("HGET", parentKey, "score"))
      if not parentScore then
        -- BUG FIX: the previous fallback called the non-deterministic TIME
        -- command and then issued more writes; Redis rejects write commands
        -- after non-deterministic ones under verbatim script replication,
        -- aborting the whole script. Use the caller-supplied completion
        -- timestamp (ms) instead, matching the sibling
        -- complete-and-reserve-next-with-metadata.lua which uses a caller clock.
        parentScore = timestamp
      end

      local pGZ = ns .. ":g:" .. parentGroupId
      redis.call("ZADD", pGZ, parentScore, parentId)
      redis.call("SADD", ns .. ":groups", parentGroupId)

      -- Expose the parent group's head in the ready queue.
      local pHead = redis.call("ZRANGE", pGZ, 0, 0, "WITHSCORES")
      if pHead and #pHead >= 2 then
        redis.call("ZADD", ns .. ":ready", tonumber(pHead[2]), parentGroupId)
      end
    end
  end
end

-- Part 2: record job metadata (completed or failed).
if status == "completed" then
  local completedKey = ns .. ":completed"

  -- Always set the final status first, even if the hash is deleted below,
  -- so concurrent readers never observe the transient "completing" state.
  redis.call("HSET", jobKey, "status", "completed")

  if keepCompleted > 0 then
    -- Store full job metadata and add to the completed set.
    redis.call("HSET", jobKey,
      "processedOn", processedOn,
      "finishedOn", finishedOn,
      "attempts", attempts,
      "maxAttempts", maxAttempts,
      "returnvalue", resultOrError
    )
    redis.call("ZADD", completedKey, timestamp, jobId)

    -- Trim the completed set to keepCompleted entries, deleting trimmed jobs.
    local toRemove = redis.call("ZCARD", completedKey) - keepCompleted
    if toRemove > 0 then
      local oldIds = redis.call("ZRANGE", completedKey, 0, toRemove - 1)
      if #oldIds > 0 then
        redis.call("ZREMRANGEBYRANK", completedKey, 0, toRemove - 1)
        for i = 1, #oldIds do
          local oldId = oldIds[i]
          redis.call("DEL", ns .. ":job:" .. oldId)
          redis.call("DEL", ns .. ":unique:" .. oldId)
          redis.call("DEL", ns .. ":flow:results:" .. oldId)
        end
      end
    end
  else
    -- keepCompleted == 0: delete immediately (status already set above).
    redis.call("DEL", jobKey)
    redis.call("DEL", ns .. ":unique:" .. jobId)
    redis.call("DEL", ns .. ":flow:results:" .. jobId)
  end

elseif status == "failed" then
  local failedKey = ns .. ":failed"
  local errorInfo = cjson.decode(resultOrError)

  -- Always set the final status first, even if the hash is deleted below.
  redis.call("HSET", jobKey, "status", "failed")

  if keepFailed > 0 then
    redis.call("HSET", jobKey,
      "failedReason", errorInfo.message or "Error",
      "failedName", errorInfo.name or "Error",
      "stacktrace", errorInfo.stack or "",
      "processedOn", processedOn,
      "finishedOn", finishedOn,
      "attempts", attempts,
      "maxAttempts", maxAttempts
    )
    redis.call("ZADD", failedKey, timestamp, jobId)
  else
    -- Delete the job (status already set above).
    redis.call("DEL", jobKey)
    redis.call("DEL", ns .. ":unique:" .. jobId)
    redis.call("DEL", ns .. ":flow:results:" .. jobId)
  end
end

return 1